From: Tim Northover
Date: Sat, 24 May 2014 12:50:23 +0000 (+0000)
Subject: AArch64/ARM64: move ARM64 into AArch64's place
X-Git-Url: http://plrg.eecs.uci.edu/git/?p=oota-llvm.git;a=commitdiff_plain;h=29f94c72014eaa5d0d3b920686e689e79759cacb

AArch64/ARM64: move ARM64 into AArch64's place

This commit starts with a "git mv ARM64 AArch64" and continues out from there,
renaming the C++ classes, intrinsics, and other target-local objects for
consistency.

"ARM64" test directories are also moved, and tests that began their life in
ARM64 use an arm64 triple, while those from AArch64 use an aarch64 triple.
Both should be equivalent though.

This finishes the AArch64 merge, and everyone should feel free to continue
committing as normal now.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@209577 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b19ab0271ab..0d6eead42f6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -127,7 +127,7 @@ set(LLVM_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/include)
 set(LLVM_LIBDIR_SUFFIX "" CACHE STRING "Define suffix of library directory name (32/64)" )
 
 set(LLVM_ALL_TARGETS
-  ARM64
+  AArch64
   ARM
   CppBackend
   Hexagon
@@ -143,7 +143,7 @@ set(LLVM_ALL_TARGETS
   )
 
 # List of targets with JIT support:
-set(LLVM_TARGETS_WITH_JIT X86 PowerPC ARM64 ARM Mips SystemZ)
+set(LLVM_TARGETS_WITH_JIT X86 PowerPC AArch64 ARM Mips SystemZ)
 
 set(LLVM_TARGETS_TO_BUILD "all"
     CACHE STRING "Semicolon-separated list of targets to build, or \"all\".")
diff --git a/autoconf/configure.ac b/autoconf/configure.ac
index 344e66af65d..08f756c9214 100644
--- a/autoconf/configure.ac
+++ b/autoconf/configure.ac
@@ -419,9 +419,9 @@ AC_CACHE_CHECK([target architecture],[llvm_cv_target_arch],
   amd64-* | x86_64-*)     llvm_cv_target_arch="x86_64" ;;
   sparc*-*)               llvm_cv_target_arch="Sparc" ;;
   powerpc*-*)             llvm_cv_target_arch="PowerPC" ;;
-  arm64*-*)               llvm_cv_target_arch="ARM64" ;;
+  arm64*-*)               llvm_cv_target_arch="AArch64" ;;
   arm*-*)                 llvm_cv_target_arch="ARM" ;;
-  aarch64*-*)             llvm_cv_target_arch="ARM64" ;;
+  aarch64*-*)             llvm_cv_target_arch="AArch64" ;;
   mips-* | mips64-*)      llvm_cv_target_arch="Mips" ;;
   mipsel-* | mips64el-*)  llvm_cv_target_arch="Mips" ;;
   xcore-*)                llvm_cv_target_arch="XCore" ;;
@@ -455,9 +455,9 @@ case $host in
   amd64-* | x86_64-*)     host_arch="x86_64" ;;
   sparc*-*)               host_arch="Sparc" ;;
   powerpc*-*)             host_arch="PowerPC" ;;
-  arm64*-*)               host_arch="ARM64" ;;
+  arm64*-*)               host_arch="AArch64" ;;
   arm*-*)                 host_arch="ARM" ;;
-  aarch64*-*)             host_arch="ARM64" ;;
+  aarch64*-*)             host_arch="AArch64" ;;
   mips-* | mips64-*)      host_arch="Mips" ;;
   mipsel-* | mips64el-*)  host_arch="Mips" ;;
   xcore-*)                host_arch="XCore" ;;
@@ -796,7 +796,7 @@ else
 esac
 fi
 
-TARGETS_WITH_JIT="ARM ARM64 Mips PowerPC SystemZ X86"
+TARGETS_WITH_JIT="ARM AArch64 Mips PowerPC SystemZ X86"
 AC_SUBST(TARGETS_WITH_JIT,$TARGETS_WITH_JIT)
 
 dnl Allow enablement of building and installing docs
@@ -949,7 +949,7 @@ if test "$llvm_cv_enable_crash_overrides" = "yes" ; then
 fi
 
 dnl List all possible targets
-ALL_TARGETS="X86 Sparc PowerPC ARM ARM64 Mips XCore MSP430 CppBackend NVPTX Hexagon SystemZ R600"
+ALL_TARGETS="X86 Sparc PowerPC ARM AArch64 Mips XCore MSP430 CppBackend NVPTX Hexagon SystemZ R600"
 AC_SUBST(ALL_TARGETS,$ALL_TARGETS)
 
 dnl Allow specific targets to be specified for building (or not)
@@ -970,8 +970,8 @@ case "$enableval" in
     x86_64)   TARGETS_TO_BUILD="X86 $TARGETS_TO_BUILD" ;;
    sparc)    TARGETS_TO_BUILD="Sparc $TARGETS_TO_BUILD" ;;
    powerpc)  TARGETS_TO_BUILD="PowerPC $TARGETS_TO_BUILD" ;;
-    aarch64)  TARGETS_TO_BUILD="ARM64
$TARGETS_TO_BUILD" ;; - arm64) TARGETS_TO_BUILD="ARM64 $TARGETS_TO_BUILD" ;; + aarch64) TARGETS_TO_BUILD="AArch64 $TARGETS_TO_BUILD" ;; + arm64) TARGETS_TO_BUILD="AArch64 $TARGETS_TO_BUILD" ;; arm) TARGETS_TO_BUILD="ARM $TARGETS_TO_BUILD" ;; mips) TARGETS_TO_BUILD="Mips $TARGETS_TO_BUILD" ;; mipsel) TARGETS_TO_BUILD="Mips $TARGETS_TO_BUILD" ;; @@ -989,7 +989,7 @@ case "$enableval" in x86_64) TARGETS_TO_BUILD="X86 $TARGETS_TO_BUILD" ;; Sparc) TARGETS_TO_BUILD="Sparc $TARGETS_TO_BUILD" ;; PowerPC) TARGETS_TO_BUILD="PowerPC $TARGETS_TO_BUILD" ;; - AArch64) TARGETS_TO_BUILD="ARM64 $TARGETS_TO_BUILD" ;; + AArch64) TARGETS_TO_BUILD="AArch64 $TARGETS_TO_BUILD" ;; ARM) TARGETS_TO_BUILD="ARM $TARGETS_TO_BUILD" ;; Mips) TARGETS_TO_BUILD="Mips $TARGETS_TO_BUILD" ;; XCore) TARGETS_TO_BUILD="XCore $TARGETS_TO_BUILD" ;; diff --git a/cmake/config-ix.cmake b/cmake/config-ix.cmake index ca4af73d92c..1325e790c80 100755 --- a/cmake/config-ix.cmake +++ b/cmake/config-ix.cmake @@ -372,7 +372,7 @@ elseif (LLVM_NATIVE_ARCH MATCHES "powerpc") elseif (LLVM_NATIVE_ARCH MATCHES "aarch64") set(LLVM_NATIVE_ARCH AArch64) elseif (LLVM_NATIVE_ARCH MATCHES "arm64") - set(LLVM_NATIVE_ARCH ARM64) + set(LLVM_NATIVE_ARCH AArch64) elseif (LLVM_NATIVE_ARCH MATCHES "arm") set(LLVM_NATIVE_ARCH ARM) elseif (LLVM_NATIVE_ARCH MATCHES "mips") diff --git a/configure b/configure index a5babe9c230..e1959dfee6c 100755 --- a/configure +++ b/configure @@ -4151,9 +4151,9 @@ else amd64-* | x86_64-*) llvm_cv_target_arch="x86_64" ;; sparc*-*) llvm_cv_target_arch="Sparc" ;; powerpc*-*) llvm_cv_target_arch="PowerPC" ;; - arm64*-*) llvm_cv_target_arch="ARM64" ;; + arm64*-*) llvm_cv_target_arch="AArch64" ;; arm*-*) llvm_cv_target_arch="ARM" ;; - aarch64*-*) llvm_cv_target_arch="ARM64" ;; + aarch64*-*) llvm_cv_target_arch="AArch64" ;; mips-* | mips64-*) llvm_cv_target_arch="Mips" ;; mipsel-* | mips64el-*) llvm_cv_target_arch="Mips" ;; xcore-*) llvm_cv_target_arch="XCore" ;; @@ -4188,9 +4188,9 @@ case $host in amd64-* | x86_64-*) host_arch="x86_64" ;; sparc*-*) host_arch="Sparc" ;; powerpc*-*) host_arch="PowerPC" ;; - arm64*-*) host_arch="ARM64" ;; + arm64*-*) host_arch="AArch64" ;; arm*-*) host_arch="ARM" ;; - aarch64*-*) host_arch="ARM64" ;; + aarch64*-*) host_arch="AArch64" ;; mips-* | mips64-*) host_arch="Mips" ;; mipsel-* | mips64el-*) host_arch="Mips" ;; xcore-*) host_arch="XCore" ;; @@ -5120,7 +5120,7 @@ else esac fi -TARGETS_WITH_JIT="ARM ARM64 Mips PowerPC SystemZ X86" +TARGETS_WITH_JIT="ARM AArch64 Mips PowerPC SystemZ X86" TARGETS_WITH_JIT=$TARGETS_WITH_JIT @@ -5357,7 +5357,7 @@ _ACEOF fi -ALL_TARGETS="X86 Sparc PowerPC ARM ARM64 Mips XCore MSP430 CppBackend NVPTX Hexagon SystemZ R600" +ALL_TARGETS="X86 Sparc PowerPC ARM AArch64 Mips XCore MSP430 CppBackend NVPTX Hexagon SystemZ R600" ALL_TARGETS=$ALL_TARGETS @@ -5380,8 +5380,8 @@ case "$enableval" in x86_64) TARGETS_TO_BUILD="X86 $TARGETS_TO_BUILD" ;; sparc) TARGETS_TO_BUILD="Sparc $TARGETS_TO_BUILD" ;; powerpc) TARGETS_TO_BUILD="PowerPC $TARGETS_TO_BUILD" ;; - aarch64) TARGETS_TO_BUILD="ARM64 $TARGETS_TO_BUILD" ;; - arm64) TARGETS_TO_BUILD="ARM64 $TARGETS_TO_BUILD" ;; + aarch64) TARGETS_TO_BUILD="AArch64 $TARGETS_TO_BUILD" ;; + arm64) TARGETS_TO_BUILD="AArch64 $TARGETS_TO_BUILD" ;; arm) TARGETS_TO_BUILD="ARM $TARGETS_TO_BUILD" ;; mips) TARGETS_TO_BUILD="Mips $TARGETS_TO_BUILD" ;; mipsel) TARGETS_TO_BUILD="Mips $TARGETS_TO_BUILD" ;; @@ -5399,7 +5399,7 @@ case "$enableval" in x86_64) TARGETS_TO_BUILD="X86 $TARGETS_TO_BUILD" ;; Sparc) TARGETS_TO_BUILD="Sparc $TARGETS_TO_BUILD" ;; PowerPC) 
TARGETS_TO_BUILD="PowerPC $TARGETS_TO_BUILD" ;; - AArch64) TARGETS_TO_BUILD="ARM64 $TARGETS_TO_BUILD" ;; + AArch64) TARGETS_TO_BUILD="AArch64 $TARGETS_TO_BUILD" ;; ARM) TARGETS_TO_BUILD="ARM $TARGETS_TO_BUILD" ;; Mips) TARGETS_TO_BUILD="Mips $TARGETS_TO_BUILD" ;; XCore) TARGETS_TO_BUILD="XCore $TARGETS_TO_BUILD" ;; diff --git a/docs/LangRef.rst b/docs/LangRef.rst index fa8d3c0b75f..9b72eca7de5 100644 --- a/docs/LangRef.rst +++ b/docs/LangRef.rst @@ -6877,7 +6877,7 @@ register in surrounding code, including inline assembly. Because of that, allocatable registers are not supported. Warning: So far it only works with the stack pointer on selected -architectures (ARM, ARM64, AArch64, PowerPC and x86_64). Significant amount of +architectures (ARM, AArch64, PowerPC and x86_64). Significant amount of work is needed to support other registers and even more so, allocatable registers. diff --git a/include/llvm/IR/Intrinsics.td b/include/llvm/IR/Intrinsics.td index b133b4e4096..edd1621ef25 100644 --- a/include/llvm/IR/Intrinsics.td +++ b/include/llvm/IR/Intrinsics.td @@ -533,7 +533,7 @@ def int_clear_cache : Intrinsic<[], [llvm_ptr_ty, llvm_ptr_ty], include "llvm/IR/IntrinsicsPowerPC.td" include "llvm/IR/IntrinsicsX86.td" include "llvm/IR/IntrinsicsARM.td" -include "llvm/IR/IntrinsicsARM64.td" +include "llvm/IR/IntrinsicsAArch64.td" include "llvm/IR/IntrinsicsXCore.td" include "llvm/IR/IntrinsicsHexagon.td" include "llvm/IR/IntrinsicsNVVM.td" diff --git a/include/llvm/IR/IntrinsicsAArch64.td b/include/llvm/IR/IntrinsicsAArch64.td new file mode 100644 index 00000000000..23757aaef5c --- /dev/null +++ b/include/llvm/IR/IntrinsicsAArch64.td @@ -0,0 +1,636 @@ +//===- IntrinsicsAARCH64.td - Defines AARCH64 intrinsics ---*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines all of the AARCH64-specific intrinsics. +// +//===----------------------------------------------------------------------===// + +let TargetPrefix = "aarch64" in { + +def int_aarch64_ldxr : Intrinsic<[llvm_i64_ty], [llvm_anyptr_ty]>; +def int_aarch64_ldaxr : Intrinsic<[llvm_i64_ty], [llvm_anyptr_ty]>; +def int_aarch64_stxr : Intrinsic<[llvm_i32_ty], [llvm_i64_ty, llvm_anyptr_ty]>; +def int_aarch64_stlxr : Intrinsic<[llvm_i32_ty], [llvm_i64_ty, llvm_anyptr_ty]>; + +def int_aarch64_ldxp : Intrinsic<[llvm_i64_ty, llvm_i64_ty], [llvm_ptr_ty]>; +def int_aarch64_ldaxp : Intrinsic<[llvm_i64_ty, llvm_i64_ty], [llvm_ptr_ty]>; +def int_aarch64_stxp : Intrinsic<[llvm_i32_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_ptr_ty]>; +def int_aarch64_stlxp : Intrinsic<[llvm_i32_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_ptr_ty]>; + +def int_aarch64_clrex : Intrinsic<[]>; + +def int_aarch64_sdiv : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, + LLVMMatchType<0>], [IntrNoMem]>; +def int_aarch64_udiv : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, + LLVMMatchType<0>], [IntrNoMem]>; +} + +//===----------------------------------------------------------------------===// +// Advanced SIMD (NEON) + +let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.". 
+ class AdvSIMD_2Scalar_Float_Intrinsic + : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], + [IntrNoMem]>; + + class AdvSIMD_FPToIntRounding_Intrinsic + : Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty], [IntrNoMem]>; + + class AdvSIMD_1IntArg_Intrinsic + : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrNoMem]>; + class AdvSIMD_1FloatArg_Intrinsic + : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; + class AdvSIMD_1VectorArg_Intrinsic + : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>], [IntrNoMem]>; + class AdvSIMD_1VectorArg_Expand_Intrinsic + : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty], [IntrNoMem]>; + class AdvSIMD_1VectorArg_Long_Intrinsic + : Intrinsic<[llvm_anyvector_ty], [LLVMTruncatedType<0>], [IntrNoMem]>; + class AdvSIMD_1IntArg_Narrow_Intrinsic + : Intrinsic<[llvm_anyint_ty], [llvm_anyint_ty], [IntrNoMem]>; + class AdvSIMD_1VectorArg_Narrow_Intrinsic + : Intrinsic<[llvm_anyint_ty], [LLVMExtendedType<0>], [IntrNoMem]>; + class AdvSIMD_1VectorArg_Int_Across_Intrinsic + : Intrinsic<[llvm_anyint_ty], [llvm_anyvector_ty], [IntrNoMem]>; + class AdvSIMD_1VectorArg_Float_Across_Intrinsic + : Intrinsic<[llvm_anyfloat_ty], [llvm_anyvector_ty], [IntrNoMem]>; + + class AdvSIMD_2IntArg_Intrinsic + : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], + [IntrNoMem]>; + class AdvSIMD_2FloatArg_Intrinsic + : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], + [IntrNoMem]>; + class AdvSIMD_2VectorArg_Intrinsic + : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>], + [IntrNoMem]>; + class AdvSIMD_2VectorArg_Compare_Intrinsic + : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty, LLVMMatchType<1>], + [IntrNoMem]>; + class AdvSIMD_2Arg_FloatCompare_Intrinsic + : Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty, LLVMMatchType<1>], + [IntrNoMem]>; + class AdvSIMD_2VectorArg_Long_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMTruncatedType<0>, LLVMTruncatedType<0>], + [IntrNoMem]>; + class AdvSIMD_2VectorArg_Wide_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, LLVMTruncatedType<0>], + [IntrNoMem]>; + class AdvSIMD_2VectorArg_Narrow_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMExtendedType<0>, LLVMExtendedType<0>], + [IntrNoMem]>; + class AdvSIMD_2Arg_Scalar_Narrow_Intrinsic + : Intrinsic<[llvm_anyint_ty], + [LLVMExtendedType<0>, llvm_i32_ty], + [IntrNoMem]>; + class AdvSIMD_2VectorArg_Scalar_Expand_BySize_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [llvm_anyvector_ty], + [IntrNoMem]>; + class AdvSIMD_2VectorArg_Scalar_Wide_BySize_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMTruncatedType<0>], + [IntrNoMem]>; + class AdvSIMD_2VectorArg_Scalar_Wide_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMTruncatedType<0>, llvm_i32_ty], + [IntrNoMem]>; + class AdvSIMD_2VectorArg_Tied_Narrow_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMHalfElementsVectorType<0>, llvm_anyvector_ty], + [IntrNoMem]>; + + class AdvSIMD_3VectorArg_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], + [IntrNoMem]>; + class AdvSIMD_3VectorArg_Scalar_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty], + [IntrNoMem]>; + class AdvSIMD_3VectorArg_Tied_Narrow_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMHalfElementsVectorType<0>, llvm_anyvector_ty, + LLVMMatchType<1>], [IntrNoMem]>; + class AdvSIMD_3VectorArg_Scalar_Tied_Narrow_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + 
[LLVMHalfElementsVectorType<0>, llvm_anyvector_ty, llvm_i32_ty], + [IntrNoMem]>; + class AdvSIMD_CvtFxToFP_Intrinsic + : Intrinsic<[llvm_anyfloat_ty], [llvm_anyint_ty, llvm_i32_ty], + [IntrNoMem]>; + class AdvSIMD_CvtFPToFx_Intrinsic + : Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty, llvm_i32_ty], + [IntrNoMem]>; +} + +// Arithmetic ops + +let Properties = [IntrNoMem] in { + // Vector Add Across Lanes + def int_aarch64_neon_saddv : AdvSIMD_1VectorArg_Int_Across_Intrinsic; + def int_aarch64_neon_uaddv : AdvSIMD_1VectorArg_Int_Across_Intrinsic; + def int_aarch64_neon_faddv : AdvSIMD_1VectorArg_Float_Across_Intrinsic; + + // Vector Long Add Across Lanes + def int_aarch64_neon_saddlv : AdvSIMD_1VectorArg_Int_Across_Intrinsic; + def int_aarch64_neon_uaddlv : AdvSIMD_1VectorArg_Int_Across_Intrinsic; + + // Vector Halving Add + def int_aarch64_neon_shadd : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_uhadd : AdvSIMD_2VectorArg_Intrinsic; + + // Vector Rounding Halving Add + def int_aarch64_neon_srhadd : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_urhadd : AdvSIMD_2VectorArg_Intrinsic; + + // Vector Saturating Add + def int_aarch64_neon_sqadd : AdvSIMD_2IntArg_Intrinsic; + def int_aarch64_neon_suqadd : AdvSIMD_2IntArg_Intrinsic; + def int_aarch64_neon_usqadd : AdvSIMD_2IntArg_Intrinsic; + def int_aarch64_neon_uqadd : AdvSIMD_2IntArg_Intrinsic; + + // Vector Add High-Half + // FIXME: this is a legacy intrinsic for aarch64_simd.h. Remove it when that + // header is no longer supported. + def int_aarch64_neon_addhn : AdvSIMD_2VectorArg_Narrow_Intrinsic; + + // Vector Rounding Add High-Half + def int_aarch64_neon_raddhn : AdvSIMD_2VectorArg_Narrow_Intrinsic; + + // Vector Saturating Doubling Multiply High + def int_aarch64_neon_sqdmulh : AdvSIMD_2IntArg_Intrinsic; + + // Vector Saturating Rounding Doubling Multiply High + def int_aarch64_neon_sqrdmulh : AdvSIMD_2IntArg_Intrinsic; + + // Vector Polynominal Multiply + def int_aarch64_neon_pmul : AdvSIMD_2VectorArg_Intrinsic; + + // Vector Long Multiply + def int_aarch64_neon_smull : AdvSIMD_2VectorArg_Long_Intrinsic; + def int_aarch64_neon_umull : AdvSIMD_2VectorArg_Long_Intrinsic; + def int_aarch64_neon_pmull : AdvSIMD_2VectorArg_Long_Intrinsic; + + // 64-bit polynomial multiply really returns an i128, which is not legal. Fake + // it with a v16i8. + def int_aarch64_neon_pmull64 : + Intrinsic<[llvm_v16i8_ty], [llvm_i64_ty, llvm_i64_ty], [IntrNoMem]>; + + // Vector Extending Multiply + def int_aarch64_neon_fmulx : AdvSIMD_2FloatArg_Intrinsic { + let Properties = [IntrNoMem, Commutative]; + } + + // Vector Saturating Doubling Long Multiply + def int_aarch64_neon_sqdmull : AdvSIMD_2VectorArg_Long_Intrinsic; + def int_aarch64_neon_sqdmulls_scalar + : Intrinsic<[llvm_i64_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + + // Vector Halving Subtract + def int_aarch64_neon_shsub : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_uhsub : AdvSIMD_2VectorArg_Intrinsic; + + // Vector Saturating Subtract + def int_aarch64_neon_sqsub : AdvSIMD_2IntArg_Intrinsic; + def int_aarch64_neon_uqsub : AdvSIMD_2IntArg_Intrinsic; + + // Vector Subtract High-Half + // FIXME: this is a legacy intrinsic for aarch64_simd.h. Remove it when that + // header is no longer supported. 
+ def int_aarch64_neon_subhn : AdvSIMD_2VectorArg_Narrow_Intrinsic; + + // Vector Rounding Subtract High-Half + def int_aarch64_neon_rsubhn : AdvSIMD_2VectorArg_Narrow_Intrinsic; + + // Vector Compare Absolute Greater-than-or-equal + def int_aarch64_neon_facge : AdvSIMD_2Arg_FloatCompare_Intrinsic; + + // Vector Compare Absolute Greater-than + def int_aarch64_neon_facgt : AdvSIMD_2Arg_FloatCompare_Intrinsic; + + // Vector Absolute Difference + def int_aarch64_neon_sabd : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_uabd : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_fabd : AdvSIMD_2VectorArg_Intrinsic; + + // Scalar Absolute Difference + def int_aarch64_sisd_fabd : AdvSIMD_2Scalar_Float_Intrinsic; + + // Vector Max + def int_aarch64_neon_smax : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_umax : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_fmax : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_fmaxnmp : AdvSIMD_2VectorArg_Intrinsic; + + // Vector Max Across Lanes + def int_aarch64_neon_smaxv : AdvSIMD_1VectorArg_Int_Across_Intrinsic; + def int_aarch64_neon_umaxv : AdvSIMD_1VectorArg_Int_Across_Intrinsic; + def int_aarch64_neon_fmaxv : AdvSIMD_1VectorArg_Float_Across_Intrinsic; + def int_aarch64_neon_fmaxnmv : AdvSIMD_1VectorArg_Float_Across_Intrinsic; + + // Vector Min + def int_aarch64_neon_smin : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_umin : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_fmin : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_fminnmp : AdvSIMD_2VectorArg_Intrinsic; + + // Vector Min/Max Number + def int_aarch64_neon_fminnm : AdvSIMD_2FloatArg_Intrinsic; + def int_aarch64_neon_fmaxnm : AdvSIMD_2FloatArg_Intrinsic; + + // Vector Min Across Lanes + def int_aarch64_neon_sminv : AdvSIMD_1VectorArg_Int_Across_Intrinsic; + def int_aarch64_neon_uminv : AdvSIMD_1VectorArg_Int_Across_Intrinsic; + def int_aarch64_neon_fminv : AdvSIMD_1VectorArg_Float_Across_Intrinsic; + def int_aarch64_neon_fminnmv : AdvSIMD_1VectorArg_Float_Across_Intrinsic; + + // Pairwise Add + def int_aarch64_neon_addp : AdvSIMD_2VectorArg_Intrinsic; + + // Long Pairwise Add + // FIXME: In theory, we shouldn't need intrinsics for saddlp or + // uaddlp, but tblgen's type inference currently can't handle the + // pattern fragments this ends up generating. 
+ def int_aarch64_neon_saddlp : AdvSIMD_1VectorArg_Expand_Intrinsic; + def int_aarch64_neon_uaddlp : AdvSIMD_1VectorArg_Expand_Intrinsic; + + // Folding Maximum + def int_aarch64_neon_smaxp : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_umaxp : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_fmaxp : AdvSIMD_2VectorArg_Intrinsic; + + // Folding Minimum + def int_aarch64_neon_sminp : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_uminp : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_fminp : AdvSIMD_2VectorArg_Intrinsic; + + // Reciprocal Estimate/Step + def int_aarch64_neon_frecps : AdvSIMD_2FloatArg_Intrinsic; + def int_aarch64_neon_frsqrts : AdvSIMD_2FloatArg_Intrinsic; + + // Reciprocal Exponent + def int_aarch64_neon_frecpx : AdvSIMD_1FloatArg_Intrinsic; + + // Vector Saturating Shift Left + def int_aarch64_neon_sqshl : AdvSIMD_2IntArg_Intrinsic; + def int_aarch64_neon_uqshl : AdvSIMD_2IntArg_Intrinsic; + + // Vector Rounding Shift Left + def int_aarch64_neon_srshl : AdvSIMD_2IntArg_Intrinsic; + def int_aarch64_neon_urshl : AdvSIMD_2IntArg_Intrinsic; + + // Vector Saturating Rounding Shift Left + def int_aarch64_neon_sqrshl : AdvSIMD_2IntArg_Intrinsic; + def int_aarch64_neon_uqrshl : AdvSIMD_2IntArg_Intrinsic; + + // Vector Signed->Unsigned Shift Left by Constant + def int_aarch64_neon_sqshlu : AdvSIMD_2IntArg_Intrinsic; + + // Vector Signed->Unsigned Narrowing Saturating Shift Right by Constant + def int_aarch64_neon_sqshrun : AdvSIMD_2Arg_Scalar_Narrow_Intrinsic; + + // Vector Signed->Unsigned Rounding Narrowing Saturating Shift Right by Const + def int_aarch64_neon_sqrshrun : AdvSIMD_2Arg_Scalar_Narrow_Intrinsic; + + // Vector Narrowing Shift Right by Constant + def int_aarch64_neon_sqshrn : AdvSIMD_2Arg_Scalar_Narrow_Intrinsic; + def int_aarch64_neon_uqshrn : AdvSIMD_2Arg_Scalar_Narrow_Intrinsic; + + // Vector Rounding Narrowing Shift Right by Constant + def int_aarch64_neon_rshrn : AdvSIMD_2Arg_Scalar_Narrow_Intrinsic; + + // Vector Rounding Narrowing Saturating Shift Right by Constant + def int_aarch64_neon_sqrshrn : AdvSIMD_2Arg_Scalar_Narrow_Intrinsic; + def int_aarch64_neon_uqrshrn : AdvSIMD_2Arg_Scalar_Narrow_Intrinsic; + + // Vector Shift Left + def int_aarch64_neon_sshl : AdvSIMD_2IntArg_Intrinsic; + def int_aarch64_neon_ushl : AdvSIMD_2IntArg_Intrinsic; + + // Vector Widening Shift Left by Constant + def int_aarch64_neon_shll : AdvSIMD_2VectorArg_Scalar_Wide_BySize_Intrinsic; + def int_aarch64_neon_sshll : AdvSIMD_2VectorArg_Scalar_Wide_Intrinsic; + def int_aarch64_neon_ushll : AdvSIMD_2VectorArg_Scalar_Wide_Intrinsic; + + // Vector Shift Right by Constant and Insert + def int_aarch64_neon_vsri : AdvSIMD_3VectorArg_Scalar_Intrinsic; + + // Vector Shift Left by Constant and Insert + def int_aarch64_neon_vsli : AdvSIMD_3VectorArg_Scalar_Intrinsic; + + // Vector Saturating Narrow + def int_aarch64_neon_scalar_sqxtn: AdvSIMD_1IntArg_Narrow_Intrinsic; + def int_aarch64_neon_scalar_uqxtn : AdvSIMD_1IntArg_Narrow_Intrinsic; + def int_aarch64_neon_sqxtn : AdvSIMD_1VectorArg_Narrow_Intrinsic; + def int_aarch64_neon_uqxtn : AdvSIMD_1VectorArg_Narrow_Intrinsic; + + // Vector Saturating Extract and Unsigned Narrow + def int_aarch64_neon_scalar_sqxtun : AdvSIMD_1IntArg_Narrow_Intrinsic; + def int_aarch64_neon_sqxtun : AdvSIMD_1VectorArg_Narrow_Intrinsic; + + // Vector Absolute Value + def int_aarch64_neon_abs : AdvSIMD_1IntArg_Intrinsic; + + // Vector Saturating Absolute Value + def int_aarch64_neon_sqabs : AdvSIMD_1IntArg_Intrinsic; + + // Vector Saturating 
Negation + def int_aarch64_neon_sqneg : AdvSIMD_1IntArg_Intrinsic; + + // Vector Count Leading Sign Bits + def int_aarch64_neon_cls : AdvSIMD_1VectorArg_Intrinsic; + + // Vector Reciprocal Estimate + def int_aarch64_neon_urecpe : AdvSIMD_1VectorArg_Intrinsic; + def int_aarch64_neon_frecpe : AdvSIMD_1FloatArg_Intrinsic; + + // Vector Square Root Estimate + def int_aarch64_neon_ursqrte : AdvSIMD_1VectorArg_Intrinsic; + def int_aarch64_neon_frsqrte : AdvSIMD_1FloatArg_Intrinsic; + + // Vector Bitwise Reverse + def int_aarch64_neon_rbit : AdvSIMD_1VectorArg_Intrinsic; + + // Vector Conversions Between Half-Precision and Single-Precision. + def int_aarch64_neon_vcvtfp2hf + : Intrinsic<[llvm_v4i16_ty], [llvm_v4f32_ty], [IntrNoMem]>; + def int_aarch64_neon_vcvthf2fp + : Intrinsic<[llvm_v4f32_ty], [llvm_v4i16_ty], [IntrNoMem]>; + + // Vector Conversions Between Floating-point and Fixed-point. + def int_aarch64_neon_vcvtfp2fxs : AdvSIMD_CvtFPToFx_Intrinsic; + def int_aarch64_neon_vcvtfp2fxu : AdvSIMD_CvtFPToFx_Intrinsic; + def int_aarch64_neon_vcvtfxs2fp : AdvSIMD_CvtFxToFP_Intrinsic; + def int_aarch64_neon_vcvtfxu2fp : AdvSIMD_CvtFxToFP_Intrinsic; + + // Vector FP->Int Conversions + def int_aarch64_neon_fcvtas : AdvSIMD_FPToIntRounding_Intrinsic; + def int_aarch64_neon_fcvtau : AdvSIMD_FPToIntRounding_Intrinsic; + def int_aarch64_neon_fcvtms : AdvSIMD_FPToIntRounding_Intrinsic; + def int_aarch64_neon_fcvtmu : AdvSIMD_FPToIntRounding_Intrinsic; + def int_aarch64_neon_fcvtns : AdvSIMD_FPToIntRounding_Intrinsic; + def int_aarch64_neon_fcvtnu : AdvSIMD_FPToIntRounding_Intrinsic; + def int_aarch64_neon_fcvtps : AdvSIMD_FPToIntRounding_Intrinsic; + def int_aarch64_neon_fcvtpu : AdvSIMD_FPToIntRounding_Intrinsic; + def int_aarch64_neon_fcvtzs : AdvSIMD_FPToIntRounding_Intrinsic; + def int_aarch64_neon_fcvtzu : AdvSIMD_FPToIntRounding_Intrinsic; + + // Vector FP Rounding: only ties to even is unrepresented by a normal + // intrinsic. + def int_aarch64_neon_frintn : AdvSIMD_1FloatArg_Intrinsic; + + // Scalar FP->Int conversions + + // Vector FP Inexact Narrowing + def int_aarch64_neon_fcvtxn : AdvSIMD_1VectorArg_Expand_Intrinsic; + + // Scalar FP Inexact Narrowing + def int_aarch64_sisd_fcvtxn : Intrinsic<[llvm_float_ty], [llvm_double_ty], + [IntrNoMem]>; +} + +let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.". + class AdvSIMD_2Vector2Index_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [llvm_anyvector_ty, llvm_i64_ty, LLVMMatchType<0>, llvm_i64_ty], + [IntrNoMem]>; +} + +// Vector element to element moves +def int_aarch64_neon_vcopy_lane: AdvSIMD_2Vector2Index_Intrinsic; + +let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.". 
+  class AdvSIMD_1Vec_Load_Intrinsic
+    : Intrinsic<[llvm_anyvector_ty], [LLVMAnyPointerType<LLVMMatchType<0>>],
+                [IntrReadArgMem]>;
+  class AdvSIMD_1Vec_Store_Lane_Intrinsic
+    : Intrinsic<[], [llvm_anyvector_ty, llvm_i64_ty, llvm_anyptr_ty],
+                [IntrReadWriteArgMem, NoCapture<2>]>;
+
+  class AdvSIMD_2Vec_Load_Intrinsic
+    : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
+                [LLVMAnyPointerType<LLVMMatchType<0>>],
+                [IntrReadArgMem]>;
+  class AdvSIMD_2Vec_Load_Lane_Intrinsic
+    : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
+                [LLVMMatchType<0>, LLVMMatchType<0>,
+                 llvm_i64_ty, llvm_anyptr_ty],
+                [IntrReadArgMem]>;
+  class AdvSIMD_2Vec_Store_Intrinsic
+    : Intrinsic<[], [llvm_anyvector_ty, LLVMMatchType<0>,
+                     LLVMAnyPointerType<LLVMMatchType<0>>],
+                [IntrReadWriteArgMem, NoCapture<2>]>;
+  class AdvSIMD_2Vec_Store_Lane_Intrinsic
+    : Intrinsic<[], [llvm_anyvector_ty, LLVMMatchType<0>,
+                     llvm_i64_ty, llvm_anyptr_ty],
+                [IntrReadWriteArgMem, NoCapture<3>]>;
+
+  class AdvSIMD_3Vec_Load_Intrinsic
+    : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>],
+                [LLVMAnyPointerType<LLVMMatchType<0>>],
+                [IntrReadArgMem]>;
+  class AdvSIMD_3Vec_Load_Lane_Intrinsic
+    : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>],
+                [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>,
+                 llvm_i64_ty, llvm_anyptr_ty],
+                [IntrReadArgMem]>;
+  class AdvSIMD_3Vec_Store_Intrinsic
+    : Intrinsic<[], [llvm_anyvector_ty, LLVMMatchType<0>,
+                     LLVMMatchType<0>, LLVMAnyPointerType<LLVMMatchType<0>>],
+                [IntrReadWriteArgMem, NoCapture<3>]>;
+  class AdvSIMD_3Vec_Store_Lane_Intrinsic
+    : Intrinsic<[], [llvm_anyvector_ty,
+                     LLVMMatchType<0>, LLVMMatchType<0>,
+                     llvm_i64_ty, llvm_anyptr_ty],
+                [IntrReadWriteArgMem, NoCapture<4>]>;
+
+  class AdvSIMD_4Vec_Load_Intrinsic
+    : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
+                 LLVMMatchType<0>, LLVMMatchType<0>],
+                [LLVMAnyPointerType<LLVMMatchType<0>>],
+                [IntrReadArgMem]>;
+  class AdvSIMD_4Vec_Load_Lane_Intrinsic
+    : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
+                 LLVMMatchType<0>, LLVMMatchType<0>],
+                [LLVMMatchType<0>, LLVMMatchType<0>,
+                 LLVMMatchType<0>, LLVMMatchType<0>,
+                 llvm_i64_ty, llvm_anyptr_ty],
+                [IntrReadArgMem]>;
+  class AdvSIMD_4Vec_Store_Intrinsic
+    : Intrinsic<[], [llvm_anyvector_ty, LLVMMatchType<0>,
+                     LLVMMatchType<0>, LLVMMatchType<0>,
+                     LLVMAnyPointerType<LLVMMatchType<0>>],
+                [IntrReadWriteArgMem, NoCapture<4>]>;
+  class AdvSIMD_4Vec_Store_Lane_Intrinsic
+    : Intrinsic<[], [llvm_anyvector_ty, LLVMMatchType<0>,
+                     LLVMMatchType<0>, LLVMMatchType<0>,
+                     llvm_i64_ty, llvm_anyptr_ty],
+                [IntrReadWriteArgMem, NoCapture<5>]>;
+}
+
+// Memory ops
+
+def int_aarch64_neon_ld1x2 : AdvSIMD_2Vec_Load_Intrinsic;
+def int_aarch64_neon_ld1x3 : AdvSIMD_3Vec_Load_Intrinsic;
+def int_aarch64_neon_ld1x4 : AdvSIMD_4Vec_Load_Intrinsic;
+
+def int_aarch64_neon_st1x2 : AdvSIMD_2Vec_Store_Intrinsic;
+def int_aarch64_neon_st1x3 : AdvSIMD_3Vec_Store_Intrinsic;
+def int_aarch64_neon_st1x4 : AdvSIMD_4Vec_Store_Intrinsic;
+
+def int_aarch64_neon_ld2 : AdvSIMD_2Vec_Load_Intrinsic;
+def int_aarch64_neon_ld3 : AdvSIMD_3Vec_Load_Intrinsic;
+def int_aarch64_neon_ld4 : AdvSIMD_4Vec_Load_Intrinsic;
+
+def int_aarch64_neon_ld2lane : AdvSIMD_2Vec_Load_Lane_Intrinsic;
+def int_aarch64_neon_ld3lane : AdvSIMD_3Vec_Load_Lane_Intrinsic;
+def int_aarch64_neon_ld4lane : AdvSIMD_4Vec_Load_Lane_Intrinsic;
+
+def int_aarch64_neon_ld2r : AdvSIMD_2Vec_Load_Intrinsic;
+def int_aarch64_neon_ld3r : AdvSIMD_3Vec_Load_Intrinsic;
+def int_aarch64_neon_ld4r : AdvSIMD_4Vec_Load_Intrinsic;
+
+def int_aarch64_neon_st2 : AdvSIMD_2Vec_Store_Intrinsic;
+def int_aarch64_neon_st3 : AdvSIMD_3Vec_Store_Intrinsic;
+def
int_aarch64_neon_st4 : AdvSIMD_4Vec_Store_Intrinsic; + +def int_aarch64_neon_st2lane : AdvSIMD_2Vec_Store_Lane_Intrinsic; +def int_aarch64_neon_st3lane : AdvSIMD_3Vec_Store_Lane_Intrinsic; +def int_aarch64_neon_st4lane : AdvSIMD_4Vec_Store_Lane_Intrinsic; + +let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.". + class AdvSIMD_Tbl1_Intrinsic + : Intrinsic<[llvm_anyvector_ty], [llvm_v16i8_ty, LLVMMatchType<0>], + [IntrNoMem]>; + class AdvSIMD_Tbl2_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [llvm_v16i8_ty, llvm_v16i8_ty, LLVMMatchType<0>], [IntrNoMem]>; + class AdvSIMD_Tbl3_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty, + LLVMMatchType<0>], + [IntrNoMem]>; + class AdvSIMD_Tbl4_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty, + LLVMMatchType<0>], + [IntrNoMem]>; + + class AdvSIMD_Tbx1_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, llvm_v16i8_ty, LLVMMatchType<0>], + [IntrNoMem]>; + class AdvSIMD_Tbx2_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, llvm_v16i8_ty, llvm_v16i8_ty, + LLVMMatchType<0>], + [IntrNoMem]>; + class AdvSIMD_Tbx3_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, llvm_v16i8_ty, llvm_v16i8_ty, + llvm_v16i8_ty, LLVMMatchType<0>], + [IntrNoMem]>; + class AdvSIMD_Tbx4_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, llvm_v16i8_ty, llvm_v16i8_ty, + llvm_v16i8_ty, llvm_v16i8_ty, LLVMMatchType<0>], + [IntrNoMem]>; +} +def int_aarch64_neon_tbl1 : AdvSIMD_Tbl1_Intrinsic; +def int_aarch64_neon_tbl2 : AdvSIMD_Tbl2_Intrinsic; +def int_aarch64_neon_tbl3 : AdvSIMD_Tbl3_Intrinsic; +def int_aarch64_neon_tbl4 : AdvSIMD_Tbl4_Intrinsic; + +def int_aarch64_neon_tbx1 : AdvSIMD_Tbx1_Intrinsic; +def int_aarch64_neon_tbx2 : AdvSIMD_Tbx2_Intrinsic; +def int_aarch64_neon_tbx3 : AdvSIMD_Tbx3_Intrinsic; +def int_aarch64_neon_tbx4 : AdvSIMD_Tbx4_Intrinsic; + +let TargetPrefix = "aarch64" in { + class Crypto_AES_DataKey_Intrinsic + : Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; + + class Crypto_AES_Data_Intrinsic + : Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty], [IntrNoMem]>; + + // SHA intrinsic taking 5 words of the hash (v4i32, i32) and 4 of the schedule + // (v4i32). + class Crypto_SHA_5Hash4Schedule_Intrinsic + : Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty, llvm_v4i32_ty], + [IntrNoMem]>; + + // SHA intrinsic taking 5 words of the hash (v4i32, i32) and 4 of the schedule + // (v4i32). + class Crypto_SHA_1Hash_Intrinsic + : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; + + // SHA intrinsic taking 8 words of the schedule + class Crypto_SHA_8Schedule_Intrinsic + : Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; + + // SHA intrinsic taking 12 words of the schedule + class Crypto_SHA_12Schedule_Intrinsic + : Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], + [IntrNoMem]>; + + // SHA intrinsic taking 8 words of the hash and 4 of the schedule. 
+ class Crypto_SHA_8Hash4Schedule_Intrinsic + : Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], + [IntrNoMem]>; +} + +// AES +def int_aarch64_crypto_aese : Crypto_AES_DataKey_Intrinsic; +def int_aarch64_crypto_aesd : Crypto_AES_DataKey_Intrinsic; +def int_aarch64_crypto_aesmc : Crypto_AES_Data_Intrinsic; +def int_aarch64_crypto_aesimc : Crypto_AES_Data_Intrinsic; + +// SHA1 +def int_aarch64_crypto_sha1c : Crypto_SHA_5Hash4Schedule_Intrinsic; +def int_aarch64_crypto_sha1p : Crypto_SHA_5Hash4Schedule_Intrinsic; +def int_aarch64_crypto_sha1m : Crypto_SHA_5Hash4Schedule_Intrinsic; +def int_aarch64_crypto_sha1h : Crypto_SHA_1Hash_Intrinsic; + +def int_aarch64_crypto_sha1su0 : Crypto_SHA_12Schedule_Intrinsic; +def int_aarch64_crypto_sha1su1 : Crypto_SHA_8Schedule_Intrinsic; + +// SHA256 +def int_aarch64_crypto_sha256h : Crypto_SHA_8Hash4Schedule_Intrinsic; +def int_aarch64_crypto_sha256h2 : Crypto_SHA_8Hash4Schedule_Intrinsic; +def int_aarch64_crypto_sha256su0 : Crypto_SHA_8Schedule_Intrinsic; +def int_aarch64_crypto_sha256su1 : Crypto_SHA_12Schedule_Intrinsic; + +//===----------------------------------------------------------------------===// +// CRC32 + +let TargetPrefix = "aarch64" in { + +def int_aarch64_crc32b : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], + [IntrNoMem]>; +def int_aarch64_crc32cb : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], + [IntrNoMem]>; +def int_aarch64_crc32h : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], + [IntrNoMem]>; +def int_aarch64_crc32ch : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], + [IntrNoMem]>; +def int_aarch64_crc32w : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], + [IntrNoMem]>; +def int_aarch64_crc32cw : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], + [IntrNoMem]>; +def int_aarch64_crc32x : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i64_ty], + [IntrNoMem]>; +def int_aarch64_crc32cx : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i64_ty], + [IntrNoMem]>; +} diff --git a/include/llvm/IR/IntrinsicsARM64.td b/include/llvm/IR/IntrinsicsARM64.td deleted file mode 100644 index 146ea5d970c..00000000000 --- a/include/llvm/IR/IntrinsicsARM64.td +++ /dev/null @@ -1,636 +0,0 @@ -//===- IntrinsicsARM64.td - Defines ARM64 intrinsics -------*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines all of the ARM64-specific intrinsics. 
-// -//===----------------------------------------------------------------------===// - -let TargetPrefix = "arm64" in { - -def int_arm64_ldxr : Intrinsic<[llvm_i64_ty], [llvm_anyptr_ty]>; -def int_arm64_ldaxr : Intrinsic<[llvm_i64_ty], [llvm_anyptr_ty]>; -def int_arm64_stxr : Intrinsic<[llvm_i32_ty], [llvm_i64_ty, llvm_anyptr_ty]>; -def int_arm64_stlxr : Intrinsic<[llvm_i32_ty], [llvm_i64_ty, llvm_anyptr_ty]>; - -def int_arm64_ldxp : Intrinsic<[llvm_i64_ty, llvm_i64_ty], [llvm_ptr_ty]>; -def int_arm64_ldaxp : Intrinsic<[llvm_i64_ty, llvm_i64_ty], [llvm_ptr_ty]>; -def int_arm64_stxp : Intrinsic<[llvm_i32_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_ptr_ty]>; -def int_arm64_stlxp : Intrinsic<[llvm_i32_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_ptr_ty]>; - -def int_arm64_clrex : Intrinsic<[]>; - -def int_arm64_sdiv : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, - LLVMMatchType<0>], [IntrNoMem]>; -def int_arm64_udiv : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, - LLVMMatchType<0>], [IntrNoMem]>; -} - -//===----------------------------------------------------------------------===// -// Advanced SIMD (NEON) - -let TargetPrefix = "arm64" in { // All intrinsics start with "llvm.arm64.". - class AdvSIMD_2Scalar_Float_Intrinsic - : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem]>; - - class AdvSIMD_FPToIntRounding_Intrinsic - : Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty], [IntrNoMem]>; - - class AdvSIMD_1IntArg_Intrinsic - : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrNoMem]>; - class AdvSIMD_1FloatArg_Intrinsic - : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; - class AdvSIMD_1VectorArg_Intrinsic - : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>], [IntrNoMem]>; - class AdvSIMD_1VectorArg_Expand_Intrinsic - : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty], [IntrNoMem]>; - class AdvSIMD_1VectorArg_Long_Intrinsic - : Intrinsic<[llvm_anyvector_ty], [LLVMTruncatedType<0>], [IntrNoMem]>; - class AdvSIMD_1IntArg_Narrow_Intrinsic - : Intrinsic<[llvm_anyint_ty], [llvm_anyint_ty], [IntrNoMem]>; - class AdvSIMD_1VectorArg_Narrow_Intrinsic - : Intrinsic<[llvm_anyint_ty], [LLVMExtendedType<0>], [IntrNoMem]>; - class AdvSIMD_1VectorArg_Int_Across_Intrinsic - : Intrinsic<[llvm_anyint_ty], [llvm_anyvector_ty], [IntrNoMem]>; - class AdvSIMD_1VectorArg_Float_Across_Intrinsic - : Intrinsic<[llvm_anyfloat_ty], [llvm_anyvector_ty], [IntrNoMem]>; - - class AdvSIMD_2IntArg_Intrinsic - : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem]>; - class AdvSIMD_2FloatArg_Intrinsic - : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem]>; - class AdvSIMD_2VectorArg_Intrinsic - : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem]>; - class AdvSIMD_2VectorArg_Compare_Intrinsic - : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty, LLVMMatchType<1>], - [IntrNoMem]>; - class AdvSIMD_2Arg_FloatCompare_Intrinsic - : Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty, LLVMMatchType<1>], - [IntrNoMem]>; - class AdvSIMD_2VectorArg_Long_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [LLVMTruncatedType<0>, LLVMTruncatedType<0>], - [IntrNoMem]>; - class AdvSIMD_2VectorArg_Wide_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [LLVMMatchType<0>, LLVMTruncatedType<0>], - [IntrNoMem]>; - class AdvSIMD_2VectorArg_Narrow_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [LLVMExtendedType<0>, LLVMExtendedType<0>], - [IntrNoMem]>; - class AdvSIMD_2Arg_Scalar_Narrow_Intrinsic - : 
Intrinsic<[llvm_anyint_ty], - [LLVMExtendedType<0>, llvm_i32_ty], - [IntrNoMem]>; - class AdvSIMD_2VectorArg_Scalar_Expand_BySize_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [llvm_anyvector_ty], - [IntrNoMem]>; - class AdvSIMD_2VectorArg_Scalar_Wide_BySize_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [LLVMTruncatedType<0>], - [IntrNoMem]>; - class AdvSIMD_2VectorArg_Scalar_Wide_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [LLVMTruncatedType<0>, llvm_i32_ty], - [IntrNoMem]>; - class AdvSIMD_2VectorArg_Tied_Narrow_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [LLVMHalfElementsVectorType<0>, llvm_anyvector_ty], - [IntrNoMem]>; - - class AdvSIMD_3VectorArg_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem]>; - class AdvSIMD_3VectorArg_Scalar_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty], - [IntrNoMem]>; - class AdvSIMD_3VectorArg_Tied_Narrow_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [LLVMHalfElementsVectorType<0>, llvm_anyvector_ty, - LLVMMatchType<1>], [IntrNoMem]>; - class AdvSIMD_3VectorArg_Scalar_Tied_Narrow_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [LLVMHalfElementsVectorType<0>, llvm_anyvector_ty, llvm_i32_ty], - [IntrNoMem]>; - class AdvSIMD_CvtFxToFP_Intrinsic - : Intrinsic<[llvm_anyfloat_ty], [llvm_anyint_ty, llvm_i32_ty], - [IntrNoMem]>; - class AdvSIMD_CvtFPToFx_Intrinsic - : Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty, llvm_i32_ty], - [IntrNoMem]>; -} - -// Arithmetic ops - -let Properties = [IntrNoMem] in { - // Vector Add Across Lanes - def int_arm64_neon_saddv : AdvSIMD_1VectorArg_Int_Across_Intrinsic; - def int_arm64_neon_uaddv : AdvSIMD_1VectorArg_Int_Across_Intrinsic; - def int_arm64_neon_faddv : AdvSIMD_1VectorArg_Float_Across_Intrinsic; - - // Vector Long Add Across Lanes - def int_arm64_neon_saddlv : AdvSIMD_1VectorArg_Int_Across_Intrinsic; - def int_arm64_neon_uaddlv : AdvSIMD_1VectorArg_Int_Across_Intrinsic; - - // Vector Halving Add - def int_arm64_neon_shadd : AdvSIMD_2VectorArg_Intrinsic; - def int_arm64_neon_uhadd : AdvSIMD_2VectorArg_Intrinsic; - - // Vector Rounding Halving Add - def int_arm64_neon_srhadd : AdvSIMD_2VectorArg_Intrinsic; - def int_arm64_neon_urhadd : AdvSIMD_2VectorArg_Intrinsic; - - // Vector Saturating Add - def int_arm64_neon_sqadd : AdvSIMD_2IntArg_Intrinsic; - def int_arm64_neon_suqadd : AdvSIMD_2IntArg_Intrinsic; - def int_arm64_neon_usqadd : AdvSIMD_2IntArg_Intrinsic; - def int_arm64_neon_uqadd : AdvSIMD_2IntArg_Intrinsic; - - // Vector Add High-Half - // FIXME: this is a legacy intrinsic for aarch64_simd.h. Remove it when that - // header is no longer supported. - def int_arm64_neon_addhn : AdvSIMD_2VectorArg_Narrow_Intrinsic; - - // Vector Rounding Add High-Half - def int_arm64_neon_raddhn : AdvSIMD_2VectorArg_Narrow_Intrinsic; - - // Vector Saturating Doubling Multiply High - def int_arm64_neon_sqdmulh : AdvSIMD_2IntArg_Intrinsic; - - // Vector Saturating Rounding Doubling Multiply High - def int_arm64_neon_sqrdmulh : AdvSIMD_2IntArg_Intrinsic; - - // Vector Polynominal Multiply - def int_arm64_neon_pmul : AdvSIMD_2VectorArg_Intrinsic; - - // Vector Long Multiply - def int_arm64_neon_smull : AdvSIMD_2VectorArg_Long_Intrinsic; - def int_arm64_neon_umull : AdvSIMD_2VectorArg_Long_Intrinsic; - def int_arm64_neon_pmull : AdvSIMD_2VectorArg_Long_Intrinsic; - - // 64-bit polynomial multiply really returns an i128, which is not legal. Fake - // it with a v16i8. 
- def int_arm64_neon_pmull64 : - Intrinsic<[llvm_v16i8_ty], [llvm_i64_ty, llvm_i64_ty], [IntrNoMem]>; - - // Vector Extending Multiply - def int_arm64_neon_fmulx : AdvSIMD_2FloatArg_Intrinsic { - let Properties = [IntrNoMem, Commutative]; - } - - // Vector Saturating Doubling Long Multiply - def int_arm64_neon_sqdmull : AdvSIMD_2VectorArg_Long_Intrinsic; - def int_arm64_neon_sqdmulls_scalar - : Intrinsic<[llvm_i64_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - - // Vector Halving Subtract - def int_arm64_neon_shsub : AdvSIMD_2VectorArg_Intrinsic; - def int_arm64_neon_uhsub : AdvSIMD_2VectorArg_Intrinsic; - - // Vector Saturating Subtract - def int_arm64_neon_sqsub : AdvSIMD_2IntArg_Intrinsic; - def int_arm64_neon_uqsub : AdvSIMD_2IntArg_Intrinsic; - - // Vector Subtract High-Half - // FIXME: this is a legacy intrinsic for aarch64_simd.h. Remove it when that - // header is no longer supported. - def int_arm64_neon_subhn : AdvSIMD_2VectorArg_Narrow_Intrinsic; - - // Vector Rounding Subtract High-Half - def int_arm64_neon_rsubhn : AdvSIMD_2VectorArg_Narrow_Intrinsic; - - // Vector Compare Absolute Greater-than-or-equal - def int_arm64_neon_facge : AdvSIMD_2Arg_FloatCompare_Intrinsic; - - // Vector Compare Absolute Greater-than - def int_arm64_neon_facgt : AdvSIMD_2Arg_FloatCompare_Intrinsic; - - // Vector Absolute Difference - def int_arm64_neon_sabd : AdvSIMD_2VectorArg_Intrinsic; - def int_arm64_neon_uabd : AdvSIMD_2VectorArg_Intrinsic; - def int_arm64_neon_fabd : AdvSIMD_2VectorArg_Intrinsic; - - // Scalar Absolute Difference - def int_arm64_sisd_fabd : AdvSIMD_2Scalar_Float_Intrinsic; - - // Vector Max - def int_arm64_neon_smax : AdvSIMD_2VectorArg_Intrinsic; - def int_arm64_neon_umax : AdvSIMD_2VectorArg_Intrinsic; - def int_arm64_neon_fmax : AdvSIMD_2VectorArg_Intrinsic; - def int_arm64_neon_fmaxnmp : AdvSIMD_2VectorArg_Intrinsic; - - // Vector Max Across Lanes - def int_arm64_neon_smaxv : AdvSIMD_1VectorArg_Int_Across_Intrinsic; - def int_arm64_neon_umaxv : AdvSIMD_1VectorArg_Int_Across_Intrinsic; - def int_arm64_neon_fmaxv : AdvSIMD_1VectorArg_Float_Across_Intrinsic; - def int_arm64_neon_fmaxnmv : AdvSIMD_1VectorArg_Float_Across_Intrinsic; - - // Vector Min - def int_arm64_neon_smin : AdvSIMD_2VectorArg_Intrinsic; - def int_arm64_neon_umin : AdvSIMD_2VectorArg_Intrinsic; - def int_arm64_neon_fmin : AdvSIMD_2VectorArg_Intrinsic; - def int_arm64_neon_fminnmp : AdvSIMD_2VectorArg_Intrinsic; - - // Vector Min/Max Number - def int_arm64_neon_fminnm : AdvSIMD_2FloatArg_Intrinsic; - def int_arm64_neon_fmaxnm : AdvSIMD_2FloatArg_Intrinsic; - - // Vector Min Across Lanes - def int_arm64_neon_sminv : AdvSIMD_1VectorArg_Int_Across_Intrinsic; - def int_arm64_neon_uminv : AdvSIMD_1VectorArg_Int_Across_Intrinsic; - def int_arm64_neon_fminv : AdvSIMD_1VectorArg_Float_Across_Intrinsic; - def int_arm64_neon_fminnmv : AdvSIMD_1VectorArg_Float_Across_Intrinsic; - - // Pairwise Add - def int_arm64_neon_addp : AdvSIMD_2VectorArg_Intrinsic; - - // Long Pairwise Add - // FIXME: In theory, we shouldn't need intrinsics for saddlp or - // uaddlp, but tblgen's type inference currently can't handle the - // pattern fragments this ends up generating. 
- def int_arm64_neon_saddlp : AdvSIMD_1VectorArg_Expand_Intrinsic; - def int_arm64_neon_uaddlp : AdvSIMD_1VectorArg_Expand_Intrinsic; - - // Folding Maximum - def int_arm64_neon_smaxp : AdvSIMD_2VectorArg_Intrinsic; - def int_arm64_neon_umaxp : AdvSIMD_2VectorArg_Intrinsic; - def int_arm64_neon_fmaxp : AdvSIMD_2VectorArg_Intrinsic; - - // Folding Minimum - def int_arm64_neon_sminp : AdvSIMD_2VectorArg_Intrinsic; - def int_arm64_neon_uminp : AdvSIMD_2VectorArg_Intrinsic; - def int_arm64_neon_fminp : AdvSIMD_2VectorArg_Intrinsic; - - // Reciprocal Estimate/Step - def int_arm64_neon_frecps : AdvSIMD_2FloatArg_Intrinsic; - def int_arm64_neon_frsqrts : AdvSIMD_2FloatArg_Intrinsic; - - // Reciprocal Exponent - def int_arm64_neon_frecpx : AdvSIMD_1FloatArg_Intrinsic; - - // Vector Saturating Shift Left - def int_arm64_neon_sqshl : AdvSIMD_2IntArg_Intrinsic; - def int_arm64_neon_uqshl : AdvSIMD_2IntArg_Intrinsic; - - // Vector Rounding Shift Left - def int_arm64_neon_srshl : AdvSIMD_2IntArg_Intrinsic; - def int_arm64_neon_urshl : AdvSIMD_2IntArg_Intrinsic; - - // Vector Saturating Rounding Shift Left - def int_arm64_neon_sqrshl : AdvSIMD_2IntArg_Intrinsic; - def int_arm64_neon_uqrshl : AdvSIMD_2IntArg_Intrinsic; - - // Vector Signed->Unsigned Shift Left by Constant - def int_arm64_neon_sqshlu : AdvSIMD_2IntArg_Intrinsic; - - // Vector Signed->Unsigned Narrowing Saturating Shift Right by Constant - def int_arm64_neon_sqshrun : AdvSIMD_2Arg_Scalar_Narrow_Intrinsic; - - // Vector Signed->Unsigned Rounding Narrowing Saturating Shift Right by Const - def int_arm64_neon_sqrshrun : AdvSIMD_2Arg_Scalar_Narrow_Intrinsic; - - // Vector Narrowing Shift Right by Constant - def int_arm64_neon_sqshrn : AdvSIMD_2Arg_Scalar_Narrow_Intrinsic; - def int_arm64_neon_uqshrn : AdvSIMD_2Arg_Scalar_Narrow_Intrinsic; - - // Vector Rounding Narrowing Shift Right by Constant - def int_arm64_neon_rshrn : AdvSIMD_2Arg_Scalar_Narrow_Intrinsic; - - // Vector Rounding Narrowing Saturating Shift Right by Constant - def int_arm64_neon_sqrshrn : AdvSIMD_2Arg_Scalar_Narrow_Intrinsic; - def int_arm64_neon_uqrshrn : AdvSIMD_2Arg_Scalar_Narrow_Intrinsic; - - // Vector Shift Left - def int_arm64_neon_sshl : AdvSIMD_2IntArg_Intrinsic; - def int_arm64_neon_ushl : AdvSIMD_2IntArg_Intrinsic; - - // Vector Widening Shift Left by Constant - def int_arm64_neon_shll : AdvSIMD_2VectorArg_Scalar_Wide_BySize_Intrinsic; - def int_arm64_neon_sshll : AdvSIMD_2VectorArg_Scalar_Wide_Intrinsic; - def int_arm64_neon_ushll : AdvSIMD_2VectorArg_Scalar_Wide_Intrinsic; - - // Vector Shift Right by Constant and Insert - def int_arm64_neon_vsri : AdvSIMD_3VectorArg_Scalar_Intrinsic; - - // Vector Shift Left by Constant and Insert - def int_arm64_neon_vsli : AdvSIMD_3VectorArg_Scalar_Intrinsic; - - // Vector Saturating Narrow - def int_arm64_neon_scalar_sqxtn: AdvSIMD_1IntArg_Narrow_Intrinsic; - def int_arm64_neon_scalar_uqxtn : AdvSIMD_1IntArg_Narrow_Intrinsic; - def int_arm64_neon_sqxtn : AdvSIMD_1VectorArg_Narrow_Intrinsic; - def int_arm64_neon_uqxtn : AdvSIMD_1VectorArg_Narrow_Intrinsic; - - // Vector Saturating Extract and Unsigned Narrow - def int_arm64_neon_scalar_sqxtun : AdvSIMD_1IntArg_Narrow_Intrinsic; - def int_arm64_neon_sqxtun : AdvSIMD_1VectorArg_Narrow_Intrinsic; - - // Vector Absolute Value - def int_arm64_neon_abs : AdvSIMD_1IntArg_Intrinsic; - - // Vector Saturating Absolute Value - def int_arm64_neon_sqabs : AdvSIMD_1IntArg_Intrinsic; - - // Vector Saturating Negation - def int_arm64_neon_sqneg : AdvSIMD_1IntArg_Intrinsic; - - // Vector 
Count Leading Sign Bits - def int_arm64_neon_cls : AdvSIMD_1VectorArg_Intrinsic; - - // Vector Reciprocal Estimate - def int_arm64_neon_urecpe : AdvSIMD_1VectorArg_Intrinsic; - def int_arm64_neon_frecpe : AdvSIMD_1FloatArg_Intrinsic; - - // Vector Square Root Estimate - def int_arm64_neon_ursqrte : AdvSIMD_1VectorArg_Intrinsic; - def int_arm64_neon_frsqrte : AdvSIMD_1FloatArg_Intrinsic; - - // Vector Bitwise Reverse - def int_arm64_neon_rbit : AdvSIMD_1VectorArg_Intrinsic; - - // Vector Conversions Between Half-Precision and Single-Precision. - def int_arm64_neon_vcvtfp2hf - : Intrinsic<[llvm_v4i16_ty], [llvm_v4f32_ty], [IntrNoMem]>; - def int_arm64_neon_vcvthf2fp - : Intrinsic<[llvm_v4f32_ty], [llvm_v4i16_ty], [IntrNoMem]>; - - // Vector Conversions Between Floating-point and Fixed-point. - def int_arm64_neon_vcvtfp2fxs : AdvSIMD_CvtFPToFx_Intrinsic; - def int_arm64_neon_vcvtfp2fxu : AdvSIMD_CvtFPToFx_Intrinsic; - def int_arm64_neon_vcvtfxs2fp : AdvSIMD_CvtFxToFP_Intrinsic; - def int_arm64_neon_vcvtfxu2fp : AdvSIMD_CvtFxToFP_Intrinsic; - - // Vector FP->Int Conversions - def int_arm64_neon_fcvtas : AdvSIMD_FPToIntRounding_Intrinsic; - def int_arm64_neon_fcvtau : AdvSIMD_FPToIntRounding_Intrinsic; - def int_arm64_neon_fcvtms : AdvSIMD_FPToIntRounding_Intrinsic; - def int_arm64_neon_fcvtmu : AdvSIMD_FPToIntRounding_Intrinsic; - def int_arm64_neon_fcvtns : AdvSIMD_FPToIntRounding_Intrinsic; - def int_arm64_neon_fcvtnu : AdvSIMD_FPToIntRounding_Intrinsic; - def int_arm64_neon_fcvtps : AdvSIMD_FPToIntRounding_Intrinsic; - def int_arm64_neon_fcvtpu : AdvSIMD_FPToIntRounding_Intrinsic; - def int_arm64_neon_fcvtzs : AdvSIMD_FPToIntRounding_Intrinsic; - def int_arm64_neon_fcvtzu : AdvSIMD_FPToIntRounding_Intrinsic; - - // Vector FP Rounding: only ties to even is unrepresented by a normal - // intrinsic. - def int_arm64_neon_frintn : AdvSIMD_1FloatArg_Intrinsic; - - // Scalar FP->Int conversions - - // Vector FP Inexact Narrowing - def int_arm64_neon_fcvtxn : AdvSIMD_1VectorArg_Expand_Intrinsic; - - // Scalar FP Inexact Narrowing - def int_arm64_sisd_fcvtxn : Intrinsic<[llvm_float_ty], [llvm_double_ty], - [IntrNoMem]>; -} - -let TargetPrefix = "arm64" in { // All intrinsics start with "llvm.arm64.". - class AdvSIMD_2Vector2Index_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [llvm_anyvector_ty, llvm_i64_ty, LLVMMatchType<0>, llvm_i64_ty], - [IntrNoMem]>; -} - -// Vector element to element moves -def int_arm64_neon_vcopy_lane: AdvSIMD_2Vector2Index_Intrinsic; - -let TargetPrefix = "arm64" in { // All intrinsics start with "llvm.arm64.". 
-  class AdvSIMD_1Vec_Load_Intrinsic
-    : Intrinsic<[llvm_anyvector_ty], [LLVMAnyPointerType<LLVMMatchType<0>>],
-                [IntrReadArgMem]>;
-  class AdvSIMD_1Vec_Store_Lane_Intrinsic
-    : Intrinsic<[], [llvm_anyvector_ty, llvm_i64_ty, llvm_anyptr_ty],
-                [IntrReadWriteArgMem, NoCapture<2>]>;
-
-  class AdvSIMD_2Vec_Load_Intrinsic
-    : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
-                [LLVMAnyPointerType<LLVMMatchType<0>>],
-                [IntrReadArgMem]>;
-  class AdvSIMD_2Vec_Load_Lane_Intrinsic
-    : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
-                [LLVMMatchType<0>, LLVMMatchType<0>,
-                 llvm_i64_ty, llvm_anyptr_ty],
-                [IntrReadArgMem]>;
-  class AdvSIMD_2Vec_Store_Intrinsic
-    : Intrinsic<[], [llvm_anyvector_ty, LLVMMatchType<0>,
-                     LLVMAnyPointerType<LLVMMatchType<0>>],
-                [IntrReadWriteArgMem, NoCapture<2>]>;
-  class AdvSIMD_2Vec_Store_Lane_Intrinsic
-    : Intrinsic<[], [llvm_anyvector_ty, LLVMMatchType<0>,
-                     llvm_i64_ty, llvm_anyptr_ty],
-                [IntrReadWriteArgMem, NoCapture<3>]>;
-
-  class AdvSIMD_3Vec_Load_Intrinsic
-    : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>],
-                [LLVMAnyPointerType<LLVMMatchType<0>>],
-                [IntrReadArgMem]>;
-  class AdvSIMD_3Vec_Load_Lane_Intrinsic
-    : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>],
-                [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>,
-                 llvm_i64_ty, llvm_anyptr_ty],
-                [IntrReadArgMem]>;
-  class AdvSIMD_3Vec_Store_Intrinsic
-    : Intrinsic<[], [llvm_anyvector_ty, LLVMMatchType<0>,
-                     LLVMMatchType<0>, LLVMAnyPointerType<LLVMMatchType<0>>],
-                [IntrReadWriteArgMem, NoCapture<3>]>;
-  class AdvSIMD_3Vec_Store_Lane_Intrinsic
-    : Intrinsic<[], [llvm_anyvector_ty,
-                     LLVMMatchType<0>, LLVMMatchType<0>,
-                     llvm_i64_ty, llvm_anyptr_ty],
-                [IntrReadWriteArgMem, NoCapture<4>]>;
-
-  class AdvSIMD_4Vec_Load_Intrinsic
-    : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
-                 LLVMMatchType<0>, LLVMMatchType<0>],
-                [LLVMAnyPointerType<LLVMMatchType<0>>],
-                [IntrReadArgMem]>;
-  class AdvSIMD_4Vec_Load_Lane_Intrinsic
-    : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
-                 LLVMMatchType<0>, LLVMMatchType<0>],
-                [LLVMMatchType<0>, LLVMMatchType<0>,
-                 LLVMMatchType<0>, LLVMMatchType<0>,
-                 llvm_i64_ty, llvm_anyptr_ty],
-                [IntrReadArgMem]>;
-  class AdvSIMD_4Vec_Store_Intrinsic
-    : Intrinsic<[], [llvm_anyvector_ty, LLVMMatchType<0>,
-                     LLVMMatchType<0>, LLVMMatchType<0>,
-                     LLVMAnyPointerType<LLVMMatchType<0>>],
-                [IntrReadWriteArgMem, NoCapture<4>]>;
-  class AdvSIMD_4Vec_Store_Lane_Intrinsic
-    : Intrinsic<[], [llvm_anyvector_ty, LLVMMatchType<0>,
-                     LLVMMatchType<0>, LLVMMatchType<0>,
-                     llvm_i64_ty, llvm_anyptr_ty],
-                [IntrReadWriteArgMem, NoCapture<5>]>;
-}
-
-// Memory ops
-
-def int_arm64_neon_ld1x2 : AdvSIMD_2Vec_Load_Intrinsic;
-def int_arm64_neon_ld1x3 : AdvSIMD_3Vec_Load_Intrinsic;
-def int_arm64_neon_ld1x4 : AdvSIMD_4Vec_Load_Intrinsic;
-
-def int_arm64_neon_st1x2 : AdvSIMD_2Vec_Store_Intrinsic;
-def int_arm64_neon_st1x3 : AdvSIMD_3Vec_Store_Intrinsic;
-def int_arm64_neon_st1x4 : AdvSIMD_4Vec_Store_Intrinsic;
-
-def int_arm64_neon_ld2 : AdvSIMD_2Vec_Load_Intrinsic;
-def int_arm64_neon_ld3 : AdvSIMD_3Vec_Load_Intrinsic;
-def int_arm64_neon_ld4 : AdvSIMD_4Vec_Load_Intrinsic;
-
-def int_arm64_neon_ld2lane : AdvSIMD_2Vec_Load_Lane_Intrinsic;
-def int_arm64_neon_ld3lane : AdvSIMD_3Vec_Load_Lane_Intrinsic;
-def int_arm64_neon_ld4lane : AdvSIMD_4Vec_Load_Lane_Intrinsic;
-
-def int_arm64_neon_ld2r : AdvSIMD_2Vec_Load_Intrinsic;
-def int_arm64_neon_ld3r : AdvSIMD_3Vec_Load_Intrinsic;
-def int_arm64_neon_ld4r : AdvSIMD_4Vec_Load_Intrinsic;
-
-def int_arm64_neon_st2 : AdvSIMD_2Vec_Store_Intrinsic;
-def int_arm64_neon_st3 : AdvSIMD_3Vec_Store_Intrinsic;
-def int_arm64_neon_st4 :
AdvSIMD_4Vec_Store_Intrinsic; - -def int_arm64_neon_st2lane : AdvSIMD_2Vec_Store_Lane_Intrinsic; -def int_arm64_neon_st3lane : AdvSIMD_3Vec_Store_Lane_Intrinsic; -def int_arm64_neon_st4lane : AdvSIMD_4Vec_Store_Lane_Intrinsic; - -let TargetPrefix = "arm64" in { // All intrinsics start with "llvm.arm64.". - class AdvSIMD_Tbl1_Intrinsic - : Intrinsic<[llvm_anyvector_ty], [llvm_v16i8_ty, LLVMMatchType<0>], - [IntrNoMem]>; - class AdvSIMD_Tbl2_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [llvm_v16i8_ty, llvm_v16i8_ty, LLVMMatchType<0>], [IntrNoMem]>; - class AdvSIMD_Tbl3_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty, - LLVMMatchType<0>], - [IntrNoMem]>; - class AdvSIMD_Tbl4_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty, - LLVMMatchType<0>], - [IntrNoMem]>; - - class AdvSIMD_Tbx1_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [LLVMMatchType<0>, llvm_v16i8_ty, LLVMMatchType<0>], - [IntrNoMem]>; - class AdvSIMD_Tbx2_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [LLVMMatchType<0>, llvm_v16i8_ty, llvm_v16i8_ty, - LLVMMatchType<0>], - [IntrNoMem]>; - class AdvSIMD_Tbx3_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [LLVMMatchType<0>, llvm_v16i8_ty, llvm_v16i8_ty, - llvm_v16i8_ty, LLVMMatchType<0>], - [IntrNoMem]>; - class AdvSIMD_Tbx4_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [LLVMMatchType<0>, llvm_v16i8_ty, llvm_v16i8_ty, - llvm_v16i8_ty, llvm_v16i8_ty, LLVMMatchType<0>], - [IntrNoMem]>; -} -def int_arm64_neon_tbl1 : AdvSIMD_Tbl1_Intrinsic; -def int_arm64_neon_tbl2 : AdvSIMD_Tbl2_Intrinsic; -def int_arm64_neon_tbl3 : AdvSIMD_Tbl3_Intrinsic; -def int_arm64_neon_tbl4 : AdvSIMD_Tbl4_Intrinsic; - -def int_arm64_neon_tbx1 : AdvSIMD_Tbx1_Intrinsic; -def int_arm64_neon_tbx2 : AdvSIMD_Tbx2_Intrinsic; -def int_arm64_neon_tbx3 : AdvSIMD_Tbx3_Intrinsic; -def int_arm64_neon_tbx4 : AdvSIMD_Tbx4_Intrinsic; - -let TargetPrefix = "arm64" in { - class Crypto_AES_DataKey_Intrinsic - : Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; - - class Crypto_AES_Data_Intrinsic - : Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty], [IntrNoMem]>; - - // SHA intrinsic taking 5 words of the hash (v4i32, i32) and 4 of the schedule - // (v4i32). - class Crypto_SHA_5Hash4Schedule_Intrinsic - : Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty, llvm_v4i32_ty], - [IntrNoMem]>; - - // SHA intrinsic taking 5 words of the hash (v4i32, i32) and 4 of the schedule - // (v4i32). - class Crypto_SHA_1Hash_Intrinsic - : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; - - // SHA intrinsic taking 8 words of the schedule - class Crypto_SHA_8Schedule_Intrinsic - : Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; - - // SHA intrinsic taking 12 words of the schedule - class Crypto_SHA_12Schedule_Intrinsic - : Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], - [IntrNoMem]>; - - // SHA intrinsic taking 8 words of the hash and 4 of the schedule. 
- class Crypto_SHA_8Hash4Schedule_Intrinsic - : Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], - [IntrNoMem]>; -} - -// AES -def int_arm64_crypto_aese : Crypto_AES_DataKey_Intrinsic; -def int_arm64_crypto_aesd : Crypto_AES_DataKey_Intrinsic; -def int_arm64_crypto_aesmc : Crypto_AES_Data_Intrinsic; -def int_arm64_crypto_aesimc : Crypto_AES_Data_Intrinsic; - -// SHA1 -def int_arm64_crypto_sha1c : Crypto_SHA_5Hash4Schedule_Intrinsic; -def int_arm64_crypto_sha1p : Crypto_SHA_5Hash4Schedule_Intrinsic; -def int_arm64_crypto_sha1m : Crypto_SHA_5Hash4Schedule_Intrinsic; -def int_arm64_crypto_sha1h : Crypto_SHA_1Hash_Intrinsic; - -def int_arm64_crypto_sha1su0 : Crypto_SHA_12Schedule_Intrinsic; -def int_arm64_crypto_sha1su1 : Crypto_SHA_8Schedule_Intrinsic; - -// SHA256 -def int_arm64_crypto_sha256h : Crypto_SHA_8Hash4Schedule_Intrinsic; -def int_arm64_crypto_sha256h2 : Crypto_SHA_8Hash4Schedule_Intrinsic; -def int_arm64_crypto_sha256su0 : Crypto_SHA_8Schedule_Intrinsic; -def int_arm64_crypto_sha256su1 : Crypto_SHA_12Schedule_Intrinsic; - -//===----------------------------------------------------------------------===// -// CRC32 - -let TargetPrefix = "arm64" in { - -def int_arm64_crc32b : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], - [IntrNoMem]>; -def int_arm64_crc32cb : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], - [IntrNoMem]>; -def int_arm64_crc32h : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], - [IntrNoMem]>; -def int_arm64_crc32ch : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], - [IntrNoMem]>; -def int_arm64_crc32w : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], - [IntrNoMem]>; -def int_arm64_crc32cw : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], - [IntrNoMem]>; -def int_arm64_crc32x : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i64_ty], - [IntrNoMem]>; -def int_arm64_crc32cx : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i64_ty], - [IntrNoMem]>; -} diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp index a70b03d95cf..2b425fbdd33 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp @@ -168,8 +168,9 @@ void RuntimeDyldMachO::resolveRelocation(const RelocationEntry &RE, case Triple::thumb: resolveARMRelocation(RE, Value); break; + case Triple::aarch64: case Triple::arm64: - resolveARM64Relocation(RE, Value); + resolveAArch64Relocation(RE, Value); break; } } @@ -289,8 +290,8 @@ bool RuntimeDyldMachO::resolveARMRelocation(const RelocationEntry &RE, return false; } -bool RuntimeDyldMachO::resolveARM64Relocation(const RelocationEntry &RE, - uint64_t Value) { +bool RuntimeDyldMachO::resolveAArch64Relocation(const RelocationEntry &RE, + uint64_t Value) { const SectionEntry &Section = Sections[RE.SectionID]; uint8_t* LocalAddress = Section.Address + RE.Offset; diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h index 08573eed5c8..060eb8c29a2 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h @@ -41,7 +41,7 @@ private: bool resolveI386Relocation(const RelocationEntry &RE, uint64_t Value); bool resolveX86_64Relocation(const RelocationEntry &RE, uint64_t Value); bool resolveARMRelocation(const RelocationEntry &RE, uint64_t Value); - bool resolveARM64Relocation(const RelocationEntry &RE, uint64_t Value); + bool resolveAArch64Relocation(const RelocationEntry &RE, 
uint64_t Value); // Populate stubs in __jump_table section. void populateJumpTable(MachOObjectFile &Obj, const SectionRef &JTSection, diff --git a/lib/LTO/LTOCodeGenerator.cpp b/lib/LTO/LTOCodeGenerator.cpp index 028c1912717..99236bd24ea 100644 --- a/lib/LTO/LTOCodeGenerator.cpp +++ b/lib/LTO/LTOCodeGenerator.cpp @@ -312,7 +312,8 @@ bool LTOCodeGenerator::determineTarget(std::string &errMsg) { MCpu = "core2"; else if (Triple.getArch() == llvm::Triple::x86) MCpu = "yonah"; - else if (Triple.getArch() == llvm::Triple::arm64) + else if (Triple.getArch() == llvm::Triple::arm64 || + Triple.getArch() == llvm::Triple::aarch64) MCpu = "cyclone"; } diff --git a/lib/LTO/LTOModule.cpp b/lib/LTO/LTOModule.cpp index 255951a7070..d1175142651 100644 --- a/lib/LTO/LTOModule.cpp +++ b/lib/LTO/LTOModule.cpp @@ -168,7 +168,8 @@ LTOModule *LTOModule::makeLTOModule(MemoryBuffer *buffer, CPU = "core2"; else if (Triple.getArch() == llvm::Triple::x86) CPU = "yonah"; - else if (Triple.getArch() == llvm::Triple::arm64) + else if (Triple.getArch() == llvm::Triple::arm64 || + Triple.getArch() == llvm::Triple::aarch64) CPU = "cyclone"; } diff --git a/lib/MC/MCObjectFileInfo.cpp b/lib/MC/MCObjectFileInfo.cpp index bb132799504..9d413afe5db 100644 --- a/lib/MC/MCObjectFileInfo.cpp +++ b/lib/MC/MCObjectFileInfo.cpp @@ -23,7 +23,8 @@ void MCObjectFileInfo::InitMachOMCObjectFileInfo(Triple T) { IsFunctionEHFrameSymbolPrivate = false; SupportsWeakOmittedEHFrame = false; - if (T.isOSDarwin() && T.getArch() == Triple::arm64) + if (T.isOSDarwin() && + (T.getArch() == Triple::arm64 || T.getArch() == Triple::aarch64)) SupportsCompactUnwindWithoutEHFrame = true; PersonalityEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel @@ -151,7 +152,8 @@ void MCObjectFileInfo::InitMachOMCObjectFileInfo(Triple T) { COFFDebugSymbolsSection = nullptr; if ((T.isMacOSX() && !T.isMacOSXVersionLT(10, 6)) || - (T.isOSDarwin() && T.getArch() == Triple::arm64)) { + (T.isOSDarwin() && + (T.getArch() == Triple::arm64 || T.getArch() == Triple::aarch64))) { CompactUnwindSection = Ctx->getMachOSection("__LD", "__compact_unwind", MachO::S_ATTR_DEBUG, @@ -159,7 +161,7 @@ void MCObjectFileInfo::InitMachOMCObjectFileInfo(Triple T) { if (T.getArch() == Triple::x86_64 || T.getArch() == Triple::x86) CompactUnwindDwarfEHFrameOnly = 0x04000000; - else if (T.getArch() == Triple::arm64) + else if (T.getArch() == Triple::arm64 || T.getArch() == Triple::aarch64) CompactUnwindDwarfEHFrameOnly = 0x03000000; } @@ -785,7 +787,7 @@ void MCObjectFileInfo::InitMCObjectFileInfo(StringRef TT, Reloc::Model relocm, // cellspu-apple-darwin. Perhaps we should fix in Triple? if ((Arch == Triple::x86 || Arch == Triple::x86_64 || Arch == Triple::arm || Arch == Triple::thumb || - Arch == Triple::arm64 || + Arch == Triple::arm64 || Arch == Triple::aarch64 || Arch == Triple::ppc || Arch == Triple::ppc64 || Arch == Triple::UnknownArch) && (T.isOSDarwin() || T.isOSBinFormatMachO())) { diff --git a/lib/Target/AArch64/AArch64.h b/lib/Target/AArch64/AArch64.h new file mode 100644 index 00000000000..1c022aaf86b --- /dev/null +++ b/lib/Target/AArch64/AArch64.h @@ -0,0 +1,49 @@ +//==-- AArch64.h - Top-level interface for AArch64 --------------*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
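(Illustrative check, not part of the patch: Triple::arm64 and Triple::aarch64 remain distinct ArchType values, which is why the RuntimeDyld, LTO and Mach-O checks above now accept either spelling.)

#include "llvm/ADT/Triple.h"
#include <cassert>

// Both triples now select the unified AArch64 backend, but they still parse
// to different ArchType enumerators, so target-independent code tests for both.
static bool isAArch64Arch(const llvm::Triple &T) {
  return T.getArch() == llvm::Triple::aarch64 ||
         T.getArch() == llvm::Triple::arm64;
}

int main() {
  assert(isAArch64Arch(llvm::Triple("arm64-apple-ios")));        // Darwin spelling
  assert(isAArch64Arch(llvm::Triple("aarch64-none-linux-gnu"))); // ELF spelling
  return 0;
}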
+// +//===----------------------------------------------------------------------===// +// +// This file contains the entry points for global functions defined in the LLVM +// AArch64 back-end. +// +//===----------------------------------------------------------------------===// + +#ifndef TARGET_AArch64_H +#define TARGET_AArch64_H + +#include "Utils/AArch64BaseInfo.h" +#include "MCTargetDesc/AArch64MCTargetDesc.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Support/DataTypes.h" + +namespace llvm { + +class AArch64TargetMachine; +class FunctionPass; +class MachineFunctionPass; + +FunctionPass *createAArch64DeadRegisterDefinitions(); +FunctionPass *createAArch64ConditionalCompares(); +FunctionPass *createAArch64AdvSIMDScalar(); +FunctionPass *createAArch64BranchRelaxation(); +FunctionPass *createAArch64ISelDag(AArch64TargetMachine &TM, + CodeGenOpt::Level OptLevel); +FunctionPass *createAArch64StorePairSuppressPass(); +FunctionPass *createAArch64ExpandPseudoPass(); +FunctionPass *createAArch64LoadStoreOptimizationPass(); +ModulePass *createAArch64PromoteConstantPass(); +FunctionPass *createAArch64AddressTypePromotionPass(); +/// \brief Creates an ARM-specific Target Transformation Info pass. +ImmutablePass * +createAArch64TargetTransformInfoPass(const AArch64TargetMachine *TM); + +FunctionPass *createAArch64CleanupLocalDynamicTLSPass(); + +FunctionPass *createAArch64CollectLOHPass(); +} // end namespace llvm + +#endif diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td new file mode 100644 index 00000000000..1ad5ac8c6f3 --- /dev/null +++ b/lib/Target/AArch64/AArch64.td @@ -0,0 +1,134 @@ +//=- AArch64.td - Describe the AArch64 Target Machine --------*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Target-independent interfaces which we are implementing +//===----------------------------------------------------------------------===// + +include "llvm/Target/Target.td" + +//===----------------------------------------------------------------------===// +// AArch64 Subtarget features. +// + +def FeatureFPARMv8 : SubtargetFeature<"fp-armv8", "HasFPARMv8", "true", + "Enable ARMv8 FP">; + +def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true", + "Enable Advanced SIMD instructions", [FeatureFPARMv8]>; + +def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true", + "Enable cryptographic instructions">; + +def FeatureCRC : SubtargetFeature<"crc", "HasCRC", "true", + "Enable ARMv8 CRC-32 checksum instructions">; + +/// Cyclone has register move instructions which are "free". +def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true", + "Has zero-cycle register moves">; + +/// Cyclone has instructions which zero registers for "free". 
+def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true", + "Has zero-cycle zeroing instructions">; + +//===----------------------------------------------------------------------===// +// Register File Description +//===----------------------------------------------------------------------===// + +include "AArch64RegisterInfo.td" +include "AArch64CallingConvention.td" + +//===----------------------------------------------------------------------===// +// Instruction Descriptions +//===----------------------------------------------------------------------===// + +include "AArch64Schedule.td" +include "AArch64InstrInfo.td" + +def AArch64InstrInfo : InstrInfo; + +//===----------------------------------------------------------------------===// +// AArch64 Processors supported. +// +include "AArch64SchedA53.td" +include "AArch64SchedCyclone.td" + +def ProcA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53", + "Cortex-A53 ARM processors", + [FeatureFPARMv8, + FeatureNEON, + FeatureCrypto, + FeatureCRC]>; + +def ProcA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57", + "Cortex-A57 ARM processors", + [FeatureFPARMv8, + FeatureNEON, + FeatureCrypto, + FeatureCRC]>; + +def ProcCyclone : SubtargetFeature<"cyclone", "ARMProcFamily", "Cyclone", + "Cyclone", + [FeatureFPARMv8, + FeatureNEON, + FeatureCrypto, + FeatureCRC, + FeatureZCRegMove, FeatureZCZeroing]>; + +def : ProcessorModel<"generic", NoSchedModel, [FeatureFPARMv8, + FeatureNEON, + FeatureCRC]>; + +def : ProcessorModel<"cortex-a53", CortexA53Model, [ProcA53]>; +def : ProcessorModel<"cortex-a57", NoSchedModel, [ProcA57]>; +def : ProcessorModel<"cyclone", CycloneModel, [ProcCyclone]>; + +//===----------------------------------------------------------------------===// +// Assembly parser +//===----------------------------------------------------------------------===// + +def GenericAsmParserVariant : AsmParserVariant { + int Variant = 0; + string Name = "generic"; +} + +def AppleAsmParserVariant : AsmParserVariant { + int Variant = 1; + string Name = "apple-neon"; +} + +//===----------------------------------------------------------------------===// +// Assembly printer +//===----------------------------------------------------------------------===// +// AArch64 Uses the MC printer for asm output, so make sure the TableGen +// AsmWriter bits get associated with the correct class. +def GenericAsmWriter : AsmWriter { + string AsmWriterClassName = "InstPrinter"; + int Variant = 0; + bit isMCAsmWriter = 1; +} + +def AppleAsmWriter : AsmWriter { + let AsmWriterClassName = "AppleInstPrinter"; + int Variant = 1; + int isMCAsmWriter = 1; +} + +//===----------------------------------------------------------------------===// +// Target Declaration +//===----------------------------------------------------------------------===// + +def AArch64 : Target { + let InstructionSet = AArch64InstrInfo; + let AssemblyParserVariants = [GenericAsmParserVariant, AppleAsmParserVariant]; + let AssemblyWriters = [GenericAsmWriter, AppleAsmWriter]; +} diff --git a/lib/Target/AArch64/AArch64AddressTypePromotion.cpp b/lib/Target/AArch64/AArch64AddressTypePromotion.cpp new file mode 100644 index 00000000000..04906f6078f --- /dev/null +++ b/lib/Target/AArch64/AArch64AddressTypePromotion.cpp @@ -0,0 +1,492 @@ +//===-- AArch64AddressTypePromotion.cpp --- Promote type for addr accesses -==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. 
See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass tries to promote the computations use to obtained a sign extended +// value used into memory accesses. +// E.g. +// a = add nsw i32 b, 3 +// d = sext i32 a to i64 +// e = getelementptr ..., i64 d +// +// => +// f = sext i32 b to i64 +// a = add nsw i64 f, 3 +// e = getelementptr ..., i64 a +// +// This is legal to do so if the computations are markers with either nsw or nuw +// markers. +// Moreover, the current heuristic is simple: it does not create new sext +// operations, i.e., it gives up when a sext would have forked (e.g., if +// a = add i32 b, c, two sexts are required to promote the computation). +// +// FIXME: This pass may be useful for other targets too. +// ===---------------------------------------------------------------------===// + +#include "AArch64.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Operator.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-type-promotion" + +static cl::opt +EnableAddressTypePromotion("aarch64-type-promotion", cl::Hidden, + cl::desc("Enable the type promotion pass"), + cl::init(true)); +static cl::opt +EnableMerge("aarch64-type-promotion-merge", cl::Hidden, + cl::desc("Enable merging of redundant sexts when one is dominating" + " the other."), + cl::init(true)); + +//===----------------------------------------------------------------------===// +// AArch64AddressTypePromotion +//===----------------------------------------------------------------------===// + +namespace llvm { +void initializeAArch64AddressTypePromotionPass(PassRegistry &); +} + +namespace { +class AArch64AddressTypePromotion : public FunctionPass { + +public: + static char ID; + AArch64AddressTypePromotion() + : FunctionPass(ID), Func(nullptr), ConsideredSExtType(nullptr) { + initializeAArch64AddressTypePromotionPass(*PassRegistry::getPassRegistry()); + } + + const char *getPassName() const override { + return "AArch64 Address Type Promotion"; + } + + /// Iterate over the functions and promote the computation of interesting + // sext instructions. + bool runOnFunction(Function &F) override; + +private: + /// The current function. + Function *Func; + /// Filter out all sexts that does not have this type. + /// Currently initialized with Int64Ty. + Type *ConsideredSExtType; + + // This transformation requires dominator info. + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired(); + AU.addPreserved(); + FunctionPass::getAnalysisUsage(AU); + } + + typedef SmallPtrSet SetOfInstructions; + typedef SmallVector Instructions; + typedef DenseMap ValueToInsts; + + /// Check if it is profitable to move a sext through this instruction. + /// Currently, we consider it is profitable if: + /// - Inst is used only once (no need to insert truncate). + /// - Inst has only one operand that will require a sext operation (we do + /// do not create new sext operation). + bool shouldGetThrough(const Instruction *Inst); + + /// Check if it is possible and legal to move a sext through this + /// instruction. 
+ /// Current heuristic considers that we can get through: + /// - Arithmetic operation marked with the nsw or nuw flag. + /// - Other sext operation. + /// - Truncate operation if it was just dropping sign extended bits. + bool canGetThrough(const Instruction *Inst); + + /// Move sext operations through safe to sext instructions. + bool propagateSignExtension(Instructions &SExtInsts); + + /// Is this sext should be considered for code motion. + /// We look for sext with ConsideredSExtType and uses in at least one + // GetElementPtrInst. + bool shouldConsiderSExt(const Instruction *SExt) const; + + /// Collect all interesting sext operations, i.e., the ones with the right + /// type and used in memory accesses. + /// More precisely, a sext instruction is considered as interesting if it + /// is used in a "complex" getelementptr or it exits at least another + /// sext instruction that sign extended the same initial value. + /// A getelementptr is considered as "complex" if it has more than 2 + // operands. + void analyzeSExtension(Instructions &SExtInsts); + + /// Merge redundant sign extension operations in common dominator. + void mergeSExts(ValueToInsts &ValToSExtendedUses, + SetOfInstructions &ToRemove); +}; +} // end anonymous namespace. + +char AArch64AddressTypePromotion::ID = 0; + +INITIALIZE_PASS_BEGIN(AArch64AddressTypePromotion, "aarch64-type-promotion", + "AArch64 Type Promotion Pass", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_END(AArch64AddressTypePromotion, "aarch64-type-promotion", + "AArch64 Type Promotion Pass", false, false) + +FunctionPass *llvm::createAArch64AddressTypePromotionPass() { + return new AArch64AddressTypePromotion(); +} + +bool AArch64AddressTypePromotion::canGetThrough(const Instruction *Inst) { + if (isa(Inst)) + return true; + + const BinaryOperator *BinOp = dyn_cast(Inst); + if (BinOp && isa(BinOp) && + (BinOp->hasNoUnsignedWrap() || BinOp->hasNoSignedWrap())) + return true; + + // sext(trunc(sext)) --> sext + if (isa(Inst) && isa(Inst->getOperand(0))) { + const Instruction *Opnd = cast(Inst->getOperand(0)); + // Check that the truncate just drop sign extended bits. + if (Inst->getType()->getIntegerBitWidth() >= + Opnd->getOperand(0)->getType()->getIntegerBitWidth() && + Inst->getOperand(0)->getType()->getIntegerBitWidth() <= + ConsideredSExtType->getIntegerBitWidth()) + return true; + } + + return false; +} + +bool AArch64AddressTypePromotion::shouldGetThrough(const Instruction *Inst) { + // If the type of the sext is the same as the considered one, this sext + // will become useless. + // Otherwise, we will have to do something to preserve the original value, + // unless it is used once. + if (isa(Inst) && + (Inst->getType() == ConsideredSExtType || Inst->hasOneUse())) + return true; + + // If the Inst is used more that once, we may need to insert truncate + // operations and we don't do that at the moment. + if (!Inst->hasOneUse()) + return false; + + // This truncate is used only once, thus if we can get thourgh, it will become + // useless. + if (isa(Inst)) + return true; + + // If both operands are not constant, a new sext will be created here. + // Current heuristic is: each step should be profitable. + // Therefore we don't allow to increase the number of sext even if it may + // be profitable later on. 
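(A small numeric aside, not part of the patch, showing why the nsw/nuw requirement above matters: if the narrow add may wrap, hoisting the sext changes the result.)

#include <cassert>
#include <cstdint>
#include <limits>

int main() {
  int32_t a = std::numeric_limits<int32_t>::max();
  int32_t b = 1;

  // sext(add i32 a, b): the 32-bit add wraps to INT32_MIN before extension.
  int32_t wrapped = static_cast<int32_t>(static_cast<uint32_t>(a) +
                                         static_cast<uint32_t>(b));
  int64_t sextOfAdd = static_cast<int64_t>(wrapped);

  // add i64 (sext a), (sext b): the form produced when the sext is hoisted.
  int64_t addOfSext = static_cast<int64_t>(a) + static_cast<int64_t>(b);

  assert(sextOfAdd == std::numeric_limits<int32_t>::min());
  assert(addOfSext == 2147483648LL);
  assert(sextOfAdd != addOfSext); // hence only nsw/nuw adds may be promoted
  return 0;
}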
+ if (isa(Inst) && isa(Inst->getOperand(1))) + return true; + + return false; +} + +static bool shouldSExtOperand(const Instruction *Inst, int OpIdx) { + if (isa(Inst) && OpIdx == 0) + return false; + return true; +} + +bool +AArch64AddressTypePromotion::shouldConsiderSExt(const Instruction *SExt) const { + if (SExt->getType() != ConsideredSExtType) + return false; + + for (const Use &U : SExt->uses()) { + if (isa(*U)) + return true; + } + + return false; +} + +// Input: +// - SExtInsts contains all the sext instructions that are use direclty in +// GetElementPtrInst, i.e., access to memory. +// Algorithm: +// - For each sext operation in SExtInsts: +// Let var be the operand of sext. +// while it is profitable (see shouldGetThrough), legal, and safe +// (see canGetThrough) to move sext through var's definition: +// * promote the type of var's definition. +// * fold var into sext uses. +// * move sext above var's definition. +// * update sext operand to use the operand of var that should be sign +// extended (by construction there is only one). +// +// E.g., +// a = ... i32 c, 3 +// b = sext i32 a to i64 <- is it legal/safe/profitable to get through 'a' +// ... +// = b +// => Yes, update the code +// b = sext i32 c to i64 +// a = ... i64 b, 3 +// ... +// = a +// Iterate on 'c'. +bool +AArch64AddressTypePromotion::propagateSignExtension(Instructions &SExtInsts) { + DEBUG(dbgs() << "*** Propagate Sign Extension ***\n"); + + bool LocalChange = false; + SetOfInstructions ToRemove; + ValueToInsts ValToSExtendedUses; + while (!SExtInsts.empty()) { + // Get through simple chain. + Instruction *SExt = SExtInsts.pop_back_val(); + + DEBUG(dbgs() << "Consider:\n" << *SExt << '\n'); + + // If this SExt has already been merged continue. + if (SExt->use_empty() && ToRemove.count(SExt)) { + DEBUG(dbgs() << "No uses => marked as delete\n"); + continue; + } + + // Now try to get through the chain of definitions. + while (isa(SExt->getOperand(0))) { + Instruction *Inst = dyn_cast(SExt->getOperand(0)); + DEBUG(dbgs() << "Try to get through:\n" << *Inst << '\n'); + if (!canGetThrough(Inst) || !shouldGetThrough(Inst)) { + // We cannot get through something that is not an Instruction + // or not safe to SExt. + DEBUG(dbgs() << "Cannot get through\n"); + break; + } + + LocalChange = true; + // If this is a sign extend, it becomes useless. + if (isa(Inst) || isa(Inst)) { + DEBUG(dbgs() << "SExt or trunc, mark it as to remove\n"); + // We cannot use replaceAllUsesWith here because we may trigger some + // assertion on the type as all involved sext operation may have not + // been moved yet. + while (!Inst->use_empty()) { + Value::use_iterator UseIt = Inst->use_begin(); + Instruction *UseInst = dyn_cast(*UseIt); + assert(UseInst && "Use of sext is not an Instruction!"); + UseInst->setOperand(UseIt->getOperandNo(), SExt); + } + ToRemove.insert(Inst); + SExt->setOperand(0, Inst->getOperand(0)); + SExt->moveBefore(Inst); + continue; + } + + // Get through the Instruction: + // 1. Update its type. + // 2. Replace the uses of SExt by Inst. + // 3. Sign extend each operand that needs to be sign extended. + + // Step #1. + Inst->mutateType(SExt->getType()); + // Step #2. + SExt->replaceAllUsesWith(Inst); + // Step #3. 
+ Instruction *SExtForOpnd = SExt; + + DEBUG(dbgs() << "Propagate SExt to operands\n"); + for (int OpIdx = 0, EndOpIdx = Inst->getNumOperands(); OpIdx != EndOpIdx; + ++OpIdx) { + DEBUG(dbgs() << "Operand:\n" << *(Inst->getOperand(OpIdx)) << '\n'); + if (Inst->getOperand(OpIdx)->getType() == SExt->getType() || + !shouldSExtOperand(Inst, OpIdx)) { + DEBUG(dbgs() << "No need to propagate\n"); + continue; + } + // Check if we can statically sign extend the operand. + Value *Opnd = Inst->getOperand(OpIdx); + if (const ConstantInt *Cst = dyn_cast(Opnd)) { + DEBUG(dbgs() << "Statically sign extend\n"); + Inst->setOperand(OpIdx, ConstantInt::getSigned(SExt->getType(), + Cst->getSExtValue())); + continue; + } + // UndefValue are typed, so we have to statically sign extend them. + if (isa(Opnd)) { + DEBUG(dbgs() << "Statically sign extend\n"); + Inst->setOperand(OpIdx, UndefValue::get(SExt->getType())); + continue; + } + + // Otherwise we have to explicity sign extend it. + assert(SExtForOpnd && + "Only one operand should have been sign extended"); + + SExtForOpnd->setOperand(0, Opnd); + + DEBUG(dbgs() << "Move before:\n" << *Inst << "\nSign extend\n"); + // Move the sign extension before the insertion point. + SExtForOpnd->moveBefore(Inst); + Inst->setOperand(OpIdx, SExtForOpnd); + // If more sext are required, new instructions will have to be created. + SExtForOpnd = nullptr; + } + if (SExtForOpnd == SExt) { + DEBUG(dbgs() << "Sign extension is useless now\n"); + ToRemove.insert(SExt); + break; + } + } + + // If the use is already of the right type, connect its uses to its argument + // and delete it. + // This can happen for an Instruction which all uses are sign extended. + if (!ToRemove.count(SExt) && + SExt->getType() == SExt->getOperand(0)->getType()) { + DEBUG(dbgs() << "Sign extension is useless, attach its use to " + "its argument\n"); + SExt->replaceAllUsesWith(SExt->getOperand(0)); + ToRemove.insert(SExt); + } else + ValToSExtendedUses[SExt->getOperand(0)].push_back(SExt); + } + + if (EnableMerge) + mergeSExts(ValToSExtendedUses, ToRemove); + + // Remove all instructions marked as ToRemove. + for (Instruction *I: ToRemove) + I->eraseFromParent(); + return LocalChange; +} + +void AArch64AddressTypePromotion::mergeSExts(ValueToInsts &ValToSExtendedUses, + SetOfInstructions &ToRemove) { + DominatorTree &DT = getAnalysis().getDomTree(); + + for (auto &Entry : ValToSExtendedUses) { + Instructions &Insts = Entry.second; + Instructions CurPts; + for (Instruction *Inst : Insts) { + if (ToRemove.count(Inst)) + continue; + bool inserted = false; + for (auto Pt : CurPts) { + if (DT.dominates(Inst, Pt)) { + DEBUG(dbgs() << "Replace all uses of:\n" << *Pt << "\nwith:\n" + << *Inst << '\n'); + (Pt)->replaceAllUsesWith(Inst); + ToRemove.insert(Pt); + Pt = Inst; + inserted = true; + break; + } + if (!DT.dominates(Pt, Inst)) + // Give up if we need to merge in a common dominator as the + // expermients show it is not profitable. + continue; + + DEBUG(dbgs() << "Replace all uses of:\n" << *Inst << "\nwith:\n" + << *Pt << '\n'); + Inst->replaceAllUsesWith(Pt); + ToRemove.insert(Inst); + inserted = true; + break; + } + if (!inserted) + CurPts.push_back(Inst); + } + } +} + +void AArch64AddressTypePromotion::analyzeSExtension(Instructions &SExtInsts) { + DEBUG(dbgs() << "*** Analyze Sign Extensions ***\n"); + + DenseMap SeenChains; + + for (auto &BB : *Func) { + for (auto &II : BB) { + Instruction *SExt = &II; + + // Collect all sext operation per type. 
+ if (!isa(SExt) || !shouldConsiderSExt(SExt)) + continue; + + DEBUG(dbgs() << "Found:\n" << (*SExt) << '\n'); + + // Cases where we actually perform the optimization: + // 1. SExt is used in a getelementptr with more than 2 operand => + // likely we can merge some computation if they are done on 64 bits. + // 2. The beginning of the SExt chain is SExt several time. => + // code sharing is possible. + + bool insert = false; + // #1. + for (const Use &U : SExt->uses()) { + const Instruction *Inst = dyn_cast(U); + if (Inst && Inst->getNumOperands() > 2) { + DEBUG(dbgs() << "Interesting use in GetElementPtrInst\n" << *Inst + << '\n'); + insert = true; + break; + } + } + + // #2. + // Check the head of the chain. + Instruction *Inst = SExt; + Value *Last; + do { + int OpdIdx = 0; + const BinaryOperator *BinOp = dyn_cast(Inst); + if (BinOp && isa(BinOp->getOperand(0))) + OpdIdx = 1; + Last = Inst->getOperand(OpdIdx); + Inst = dyn_cast(Last); + } while (Inst && canGetThrough(Inst) && shouldGetThrough(Inst)); + + DEBUG(dbgs() << "Head of the chain:\n" << *Last << '\n'); + DenseMap::iterator AlreadySeen = + SeenChains.find(Last); + if (insert || AlreadySeen != SeenChains.end()) { + DEBUG(dbgs() << "Insert\n"); + SExtInsts.push_back(SExt); + if (AlreadySeen != SeenChains.end() && AlreadySeen->second != nullptr) { + DEBUG(dbgs() << "Insert chain member\n"); + SExtInsts.push_back(AlreadySeen->second); + SeenChains[Last] = nullptr; + } + } else { + DEBUG(dbgs() << "Record its chain membership\n"); + SeenChains[Last] = SExt; + } + } + } +} + +bool AArch64AddressTypePromotion::runOnFunction(Function &F) { + if (!EnableAddressTypePromotion || F.isDeclaration()) + return false; + Func = &F; + ConsideredSExtType = Type::getInt64Ty(Func->getContext()); + + DEBUG(dbgs() << "*** " << getPassName() << ": " << Func->getName() << '\n'); + + Instructions SExtInsts; + analyzeSExtension(SExtInsts); + return propagateSignExtension(SExtInsts); +} diff --git a/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp b/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp new file mode 100644 index 00000000000..734fb215e6e --- /dev/null +++ b/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp @@ -0,0 +1,387 @@ +//===-- AArch64AdvSIMDScalar.cpp - Replace dead defs w/ zero reg --===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// When profitable, replace GPR targeting i64 instructions with their +// AdvSIMD scalar equivalents. Generally speaking, "profitable" is defined +// as minimizing the number of cross-class register copies. +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// TODO: Graph based predicate heuristics. +// Walking the instruction list linearly will get many, perhaps most, of +// the cases, but to do a truly thorough job of this, we need a more +// wholistic approach. +// +// This optimization is very similar in spirit to the register allocator's +// spill placement, only here we're determining where to place cross-class +// register copies rather than spills. As such, a similar approach is +// called for. +// +// We want to build up a set of graphs of all instructions which are candidates +// for transformation along with instructions which generate their inputs and +// consume their outputs. 
For each edge in the graph, we assign a weight +// based on whether there is a copy required there (weight zero if not) and +// the block frequency of the block containing the defining or using +// instruction, whichever is less. Our optimization is then a graph problem +// to minimize the total weight of all the graphs, then transform instructions +// and add or remove copy instructions as called for to implement the +// solution. +//===----------------------------------------------------------------------===// + +#include "AArch64.h" +#include "AArch64InstrInfo.h" +#include "AArch64RegisterInfo.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +#define DEBUG_TYPE "aarch64-simd-scalar" + +// Allow forcing all i64 operations with equivalent SIMD instructions to use +// them. For stress-testing the transformation function. +static cl::opt +TransformAll("aarch64-simd-scalar-force-all", + cl::desc("Force use of AdvSIMD scalar instructions everywhere"), + cl::init(false), cl::Hidden); + +STATISTIC(NumScalarInsnsUsed, "Number of scalar instructions used"); +STATISTIC(NumCopiesDeleted, "Number of cross-class copies deleted"); +STATISTIC(NumCopiesInserted, "Number of cross-class copies inserted"); + +namespace { +class AArch64AdvSIMDScalar : public MachineFunctionPass { + MachineRegisterInfo *MRI; + const AArch64InstrInfo *TII; + +private: + // isProfitableToTransform - Predicate function to determine whether an + // instruction should be transformed to its equivalent AdvSIMD scalar + // instruction. "add Xd, Xn, Xm" ==> "add Dd, Da, Db", for example. + bool isProfitableToTransform(const MachineInstr *MI) const; + + // transformInstruction - Perform the transformation of an instruction + // to its equivalant AdvSIMD scalar instruction. Update inputs and outputs + // to be the correct register class, minimizing cross-class copies. + void transformInstruction(MachineInstr *MI); + + // processMachineBasicBlock - Main optimzation loop. + bool processMachineBasicBlock(MachineBasicBlock *MBB); + +public: + static char ID; // Pass identification, replacement for typeid. 
+ explicit AArch64AdvSIMDScalar() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &F) override; + + const char *getPassName() const override { + return "AdvSIMD Scalar Operation Optimization"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; +char AArch64AdvSIMDScalar::ID = 0; +} // end anonymous namespace + +static bool isGPR64(unsigned Reg, unsigned SubReg, + const MachineRegisterInfo *MRI) { + if (SubReg) + return false; + if (TargetRegisterInfo::isVirtualRegister(Reg)) + return MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::GPR64RegClass); + return AArch64::GPR64RegClass.contains(Reg); +} + +static bool isFPR64(unsigned Reg, unsigned SubReg, + const MachineRegisterInfo *MRI) { + if (TargetRegisterInfo::isVirtualRegister(Reg)) + return (MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::FPR64RegClass) && + SubReg == 0) || + (MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::FPR128RegClass) && + SubReg == AArch64::dsub); + // Physical register references just check the register class directly. + return (AArch64::FPR64RegClass.contains(Reg) && SubReg == 0) || + (AArch64::FPR128RegClass.contains(Reg) && SubReg == AArch64::dsub); +} + +// getSrcFromCopy - Get the original source register for a GPR64 <--> FPR64 +// copy instruction. Return zero_reg if the instruction is not a copy. +static unsigned getSrcFromCopy(const MachineInstr *MI, + const MachineRegisterInfo *MRI, + unsigned &SubReg) { + SubReg = 0; + // The "FMOV Xd, Dn" instruction is the typical form. + if (MI->getOpcode() == AArch64::FMOVDXr || + MI->getOpcode() == AArch64::FMOVXDr) + return MI->getOperand(1).getReg(); + // A lane zero extract "UMOV.d Xd, Vn[0]" is equivalent. We shouldn't see + // these at this stage, but it's easy to check for. + if (MI->getOpcode() == AArch64::UMOVvi64 && MI->getOperand(2).getImm() == 0) { + SubReg = AArch64::dsub; + return MI->getOperand(1).getReg(); + } + // Or just a plain COPY instruction. This can be directly to/from FPR64, + // or it can be a dsub subreg reference to an FPR128. + if (MI->getOpcode() == AArch64::COPY) { + if (isFPR64(MI->getOperand(0).getReg(), MI->getOperand(0).getSubReg(), + MRI) && + isGPR64(MI->getOperand(1).getReg(), MI->getOperand(1).getSubReg(), MRI)) + return MI->getOperand(1).getReg(); + if (isGPR64(MI->getOperand(0).getReg(), MI->getOperand(0).getSubReg(), + MRI) && + isFPR64(MI->getOperand(1).getReg(), MI->getOperand(1).getSubReg(), + MRI)) { + SubReg = MI->getOperand(1).getSubReg(); + return MI->getOperand(1).getReg(); + } + } + + // Otherwise, this is some other kind of instruction. + return 0; +} + +// getTransformOpcode - For any opcode for which there is an AdvSIMD equivalent +// that we're considering transforming to, return that AdvSIMD opcode. For all +// others, return the original opcode. +static int getTransformOpcode(unsigned Opc) { + switch (Opc) { + default: + break; + // FIXME: Lots more possibilities. + case AArch64::ADDXrr: + return AArch64::ADDv1i64; + case AArch64::SUBXrr: + return AArch64::SUBv1i64; + } + // No AdvSIMD equivalent, so just return the original opcode. + return Opc; +} + +static bool isTransformable(const MachineInstr *MI) { + int Opc = MI->getOpcode(); + return Opc != getTransformOpcode(Opc); +} + +// isProfitableToTransform - Predicate function to determine whether an +// instruction should be transformed to its equivalent AdvSIMD scalar +// instruction. "add Xd, Xn, Xm" ==> "add Dd, Da, Db", for example. 
+bool +AArch64AdvSIMDScalar::isProfitableToTransform(const MachineInstr *MI) const { + // If this instruction isn't eligible to be transformed (no SIMD equivalent), + // early exit since that's the common case. + if (!isTransformable(MI)) + return false; + + // Count the number of copies we'll need to add and approximate the number + // of copies that a transform will enable us to remove. + unsigned NumNewCopies = 3; + unsigned NumRemovableCopies = 0; + + unsigned OrigSrc0 = MI->getOperand(1).getReg(); + unsigned OrigSrc1 = MI->getOperand(2).getReg(); + unsigned Src0 = 0, SubReg0; + unsigned Src1 = 0, SubReg1; + if (!MRI->def_empty(OrigSrc0)) { + MachineRegisterInfo::def_instr_iterator Def = + MRI->def_instr_begin(OrigSrc0); + assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!"); + Src0 = getSrcFromCopy(&*Def, MRI, SubReg0); + // If the source was from a copy, we don't need to insert a new copy. + if (Src0) + --NumNewCopies; + // If there are no other users of the original source, we can delete + // that instruction. + if (Src0 && MRI->hasOneNonDBGUse(OrigSrc0)) + ++NumRemovableCopies; + } + if (!MRI->def_empty(OrigSrc1)) { + MachineRegisterInfo::def_instr_iterator Def = + MRI->def_instr_begin(OrigSrc1); + assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!"); + Src1 = getSrcFromCopy(&*Def, MRI, SubReg1); + if (Src1) + --NumNewCopies; + // If there are no other users of the original source, we can delete + // that instruction. + if (Src1 && MRI->hasOneNonDBGUse(OrigSrc1)) + ++NumRemovableCopies; + } + + // If any of the uses of the original instructions is a cross class copy, + // that's a copy that will be removable if we transform. Likewise, if + // any of the uses is a transformable instruction, it's likely the tranforms + // will chain, enabling us to save a copy there, too. This is an aggressive + // heuristic that approximates the graph based cost analysis described above. + unsigned Dst = MI->getOperand(0).getReg(); + bool AllUsesAreCopies = true; + for (MachineRegisterInfo::use_instr_nodbg_iterator + Use = MRI->use_instr_nodbg_begin(Dst), + E = MRI->use_instr_nodbg_end(); + Use != E; ++Use) { + unsigned SubReg; + if (getSrcFromCopy(&*Use, MRI, SubReg) || isTransformable(&*Use)) + ++NumRemovableCopies; + // If the use is an INSERT_SUBREG, that's still something that can + // directly use the FPR64, so we don't invalidate AllUsesAreCopies. It's + // preferable to have it use the FPR64 in most cases, as if the source + // vector is an IMPLICIT_DEF, the INSERT_SUBREG just goes away entirely. + // Ditto for a lane insert. + else if (Use->getOpcode() == AArch64::INSERT_SUBREG || + Use->getOpcode() == AArch64::INSvi64gpr) + ; + else + AllUsesAreCopies = false; + } + // If all of the uses of the original destination register are copies to + // FPR64, then we won't end up having a new copy back to GPR64 either. + if (AllUsesAreCopies) + --NumNewCopies; + + // If a transform will not increase the number of cross-class copies required, + // return true. + if (NumNewCopies <= NumRemovableCopies) + return true; + + // Finally, even if we otherwise wouldn't transform, check if we're forcing + // transformation of everything. 
+ return TransformAll; +} + +static MachineInstr *insertCopy(const AArch64InstrInfo *TII, MachineInstr *MI, + unsigned Dst, unsigned Src, bool IsKill) { + MachineInstrBuilder MIB = + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AArch64::COPY), + Dst) + .addReg(Src, getKillRegState(IsKill)); + DEBUG(dbgs() << " adding copy: " << *MIB); + ++NumCopiesInserted; + return MIB; +} + +// transformInstruction - Perform the transformation of an instruction +// to its equivalant AdvSIMD scalar instruction. Update inputs and outputs +// to be the correct register class, minimizing cross-class copies. +void AArch64AdvSIMDScalar::transformInstruction(MachineInstr *MI) { + DEBUG(dbgs() << "Scalar transform: " << *MI); + + MachineBasicBlock *MBB = MI->getParent(); + int OldOpc = MI->getOpcode(); + int NewOpc = getTransformOpcode(OldOpc); + assert(OldOpc != NewOpc && "transform an instruction to itself?!"); + + // Check if we need a copy for the source registers. + unsigned OrigSrc0 = MI->getOperand(1).getReg(); + unsigned OrigSrc1 = MI->getOperand(2).getReg(); + unsigned Src0 = 0, SubReg0; + unsigned Src1 = 0, SubReg1; + if (!MRI->def_empty(OrigSrc0)) { + MachineRegisterInfo::def_instr_iterator Def = + MRI->def_instr_begin(OrigSrc0); + assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!"); + Src0 = getSrcFromCopy(&*Def, MRI, SubReg0); + // If there are no other users of the original source, we can delete + // that instruction. + if (Src0 && MRI->hasOneNonDBGUse(OrigSrc0)) { + assert(Src0 && "Can't delete copy w/o a valid original source!"); + Def->eraseFromParent(); + ++NumCopiesDeleted; + } + } + if (!MRI->def_empty(OrigSrc1)) { + MachineRegisterInfo::def_instr_iterator Def = + MRI->def_instr_begin(OrigSrc1); + assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!"); + Src1 = getSrcFromCopy(&*Def, MRI, SubReg1); + // If there are no other users of the original source, we can delete + // that instruction. + if (Src1 && MRI->hasOneNonDBGUse(OrigSrc1)) { + assert(Src1 && "Can't delete copy w/o a valid original source!"); + Def->eraseFromParent(); + ++NumCopiesDeleted; + } + } + // If we weren't able to reference the original source directly, create a + // copy. + if (!Src0) { + SubReg0 = 0; + Src0 = MRI->createVirtualRegister(&AArch64::FPR64RegClass); + insertCopy(TII, MI, Src0, OrigSrc0, true); + } + if (!Src1) { + SubReg1 = 0; + Src1 = MRI->createVirtualRegister(&AArch64::FPR64RegClass); + insertCopy(TII, MI, Src1, OrigSrc1, true); + } + + // Create a vreg for the destination. + // FIXME: No need to do this if the ultimate user expects an FPR64. + // Check for that and avoid the copy if possible. + unsigned Dst = MRI->createVirtualRegister(&AArch64::FPR64RegClass); + + // For now, all of the new instructions have the same simple three-register + // form, so no need to special case based on what instruction we're + // building. + BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(NewOpc), Dst) + .addReg(Src0, getKillRegState(true), SubReg0) + .addReg(Src1, getKillRegState(true), SubReg1); + + // Now copy the result back out to a GPR. + // FIXME: Try to avoid this if all uses could actually just use the FPR64 + // directly. + insertCopy(TII, MI, MI->getOperand(0).getReg(), Dst, true); + + // Erase the old instruction. + MI->eraseFromParent(); + + ++NumScalarInsnsUsed; +} + +// processMachineBasicBlock - Main optimzation loop. 
+bool AArch64AdvSIMDScalar::processMachineBasicBlock(MachineBasicBlock *MBB) { + bool Changed = false; + for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;) { + MachineInstr *MI = I; + ++I; + if (isProfitableToTransform(MI)) { + transformInstruction(MI); + Changed = true; + } + } + return Changed; +} + +// runOnMachineFunction - Pass entry point from PassManager. +bool AArch64AdvSIMDScalar::runOnMachineFunction(MachineFunction &mf) { + bool Changed = false; + DEBUG(dbgs() << "***** AArch64AdvSIMDScalar *****\n"); + + const TargetMachine &TM = mf.getTarget(); + MRI = &mf.getRegInfo(); + TII = static_cast(TM.getInstrInfo()); + + // Just check things on a one-block-at-a-time basis. + for (MachineFunction::iterator I = mf.begin(), E = mf.end(); I != E; ++I) + if (processMachineBasicBlock(I)) + Changed = true; + return Changed; +} + +// createAArch64AdvSIMDScalar - Factory function used by AArch64TargetMachine +// to add the pass to the PassManager. +FunctionPass *llvm::createAArch64AdvSIMDScalar() { + return new AArch64AdvSIMDScalar(); +} diff --git a/lib/Target/AArch64/AArch64AsmPrinter.cpp b/lib/Target/AArch64/AArch64AsmPrinter.cpp new file mode 100644 index 00000000000..8553a591fee --- /dev/null +++ b/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -0,0 +1,519 @@ +//===-- AArch64AsmPrinter.cpp - AArch64 LLVM assembly writer --------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains a printer that converts from our internal representation +// of machine-dependent LLVM code to the AArch64 assembly language. +// +//===----------------------------------------------------------------------===// + +#include "AArch64.h" +#include "AArch64MachineFunctionInfo.h" +#include "AArch64MCInstLower.h" +#include "AArch64RegisterInfo.h" +#include "AArch64Subtarget.h" +#include "InstPrinter/AArch64InstPrinter.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/ADT/Twine.h" +#include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/StackMaps.h" +#include "llvm/CodeGen/MachineModuleInfoImpls.h" +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugInfo.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstBuilder.h" +#include "llvm/MC/MCLinkerOptimizationHint.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/TargetRegistry.h" +using namespace llvm; + +#define DEBUG_TYPE "asm-printer" + +namespace { + +class AArch64AsmPrinter : public AsmPrinter { + /// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can + /// make the right decision when printing asm code for different targets. + const AArch64Subtarget *Subtarget; + + AArch64MCInstLower MCInstLowering; + StackMaps SM; + +public: + AArch64AsmPrinter(TargetMachine &TM, MCStreamer &Streamer) + : AsmPrinter(TM, Streamer), + Subtarget(&TM.getSubtarget()), + MCInstLowering(OutContext, *Mang, *this), SM(*this), AArch64FI(nullptr), + LOHLabelCounter(0) {} + + const char *getPassName() const override { + return "AArch64 Assembly Printer"; + } + + /// \brief Wrapper for MCInstLowering.lowerOperand() for the + /// tblgen'erated pseudo lowering. 
+ bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const { + return MCInstLowering.lowerOperand(MO, MCOp); + } + + void LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM, + const MachineInstr &MI); + void LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM, + const MachineInstr &MI); + /// \brief tblgen'erated driver function for lowering simple MI->MC + /// pseudo instructions. + bool emitPseudoExpansionLowering(MCStreamer &OutStreamer, + const MachineInstr *MI); + + void EmitInstruction(const MachineInstr *MI) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AsmPrinter::getAnalysisUsage(AU); + AU.setPreservesAll(); + } + + bool runOnMachineFunction(MachineFunction &F) override { + AArch64FI = F.getInfo(); + return AsmPrinter::runOnMachineFunction(F); + } + +private: + MachineLocation getDebugValueLocation(const MachineInstr *MI) const; + void printOperand(const MachineInstr *MI, unsigned OpNum, raw_ostream &O); + bool printAsmMRegister(const MachineOperand &MO, char Mode, raw_ostream &O); + bool printAsmRegInClass(const MachineOperand &MO, + const TargetRegisterClass *RC, bool isVector, + raw_ostream &O); + + bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, + unsigned AsmVariant, const char *ExtraCode, + raw_ostream &O) override; + bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum, + unsigned AsmVariant, const char *ExtraCode, + raw_ostream &O) override; + + void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS); + + void EmitFunctionBodyEnd() override; + + MCSymbol *GetCPISymbol(unsigned CPID) const override; + void EmitEndOfAsmFile(Module &M) override; + AArch64FunctionInfo *AArch64FI; + + /// \brief Emit the LOHs contained in AArch64FI. + void EmitLOHs(); + + typedef std::map MInstToMCSymbol; + MInstToMCSymbol LOHInstToLabel; + unsigned LOHLabelCounter; +}; + +} // end of anonymous namespace + +//===----------------------------------------------------------------------===// + +void AArch64AsmPrinter::EmitEndOfAsmFile(Module &M) { + if (Subtarget->isTargetMachO()) { + // Funny Darwin hack: This flag tells the linker that no global symbols + // contain code that falls through to other global symbols (e.g. the obvious + // implementation of multiple entry points). If this doesn't occur, the + // linker can safely perform dead code stripping. Since LLVM never + // generates code that does this, it is always safe to set. + OutStreamer.EmitAssemblerFlag(MCAF_SubsectionsViaSymbols); + SM.serializeToStackMapSection(); + } + + // Emit a .data.rel section containing any stubs that were created. + if (Subtarget->isTargetELF()) { + const TargetLoweringObjectFileELF &TLOFELF = + static_cast(getObjFileLowering()); + + MachineModuleInfoELF &MMIELF = MMI->getObjFileInfo(); + + // Output stubs for external and common global variables. + MachineModuleInfoELF::SymbolListTy Stubs = MMIELF.GetGVStubList(); + if (!Stubs.empty()) { + OutStreamer.SwitchSection(TLOFELF.getDataRelSection()); + const DataLayout *TD = TM.getDataLayout(); + + for (unsigned i = 0, e = Stubs.size(); i != e; ++i) { + OutStreamer.EmitLabel(Stubs[i].first); + OutStreamer.EmitSymbolValue(Stubs[i].second.getPointer(), + TD->getPointerSize(0)); + } + Stubs.clear(); + } + } + +} + +MachineLocation +AArch64AsmPrinter::getDebugValueLocation(const MachineInstr *MI) const { + MachineLocation Location; + assert(MI->getNumOperands() == 4 && "Invalid no. of machine operands!"); + // Frame address. Currently handles register +- offset only. 
+ if (MI->getOperand(0).isReg() && MI->getOperand(1).isImm()) + Location.set(MI->getOperand(0).getReg(), MI->getOperand(1).getImm()); + else { + DEBUG(dbgs() << "DBG_VALUE instruction ignored! " << *MI << "\n"); + } + return Location; +} + +void AArch64AsmPrinter::EmitLOHs() { + SmallVector MCArgs; + + for (const auto &D : AArch64FI->getLOHContainer()) { + for (const MachineInstr *MI : D.getArgs()) { + MInstToMCSymbol::iterator LabelIt = LOHInstToLabel.find(MI); + assert(LabelIt != LOHInstToLabel.end() && + "Label hasn't been inserted for LOH related instruction"); + MCArgs.push_back(LabelIt->second); + } + OutStreamer.EmitLOHDirective(D.getKind(), MCArgs); + MCArgs.clear(); + } +} + +void AArch64AsmPrinter::EmitFunctionBodyEnd() { + if (!AArch64FI->getLOHRelated().empty()) + EmitLOHs(); +} + +/// GetCPISymbol - Return the symbol for the specified constant pool entry. +MCSymbol *AArch64AsmPrinter::GetCPISymbol(unsigned CPID) const { + // Darwin uses a linker-private symbol name for constant-pools (to + // avoid addends on the relocation?), ELF has no such concept and + // uses a normal private symbol. + if (getDataLayout().getLinkerPrivateGlobalPrefix()[0]) + return OutContext.GetOrCreateSymbol( + Twine(getDataLayout().getLinkerPrivateGlobalPrefix()) + "CPI" + + Twine(getFunctionNumber()) + "_" + Twine(CPID)); + + return OutContext.GetOrCreateSymbol( + Twine(getDataLayout().getPrivateGlobalPrefix()) + "CPI" + + Twine(getFunctionNumber()) + "_" + Twine(CPID)); +} + +void AArch64AsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNum, + raw_ostream &O) { + const MachineOperand &MO = MI->getOperand(OpNum); + switch (MO.getType()) { + default: + assert(0 && ""); + case MachineOperand::MO_Register: { + unsigned Reg = MO.getReg(); + assert(TargetRegisterInfo::isPhysicalRegister(Reg)); + assert(!MO.getSubReg() && "Subregs should be eliminated!"); + O << AArch64InstPrinter::getRegisterName(Reg); + break; + } + case MachineOperand::MO_Immediate: { + int64_t Imm = MO.getImm(); + O << '#' << Imm; + break; + } + } +} + +bool AArch64AsmPrinter::printAsmMRegister(const MachineOperand &MO, char Mode, + raw_ostream &O) { + unsigned Reg = MO.getReg(); + switch (Mode) { + default: + return true; // Unknown mode. + case 'w': + Reg = getWRegFromXReg(Reg); + break; + case 'x': + Reg = getXRegFromWReg(Reg); + break; + } + + O << AArch64InstPrinter::getRegisterName(Reg); + return false; +} + +// Prints the register in MO using class RC using the offset in the +// new register class. This should not be used for cross class +// printing. +bool AArch64AsmPrinter::printAsmRegInClass(const MachineOperand &MO, + const TargetRegisterClass *RC, + bool isVector, raw_ostream &O) { + assert(MO.isReg() && "Should only get here with a register!"); + const AArch64RegisterInfo *RI = + static_cast(TM.getRegisterInfo()); + unsigned Reg = MO.getReg(); + unsigned RegToPrint = RC->getRegister(RI->getEncodingValue(Reg)); + assert(RI->regsOverlap(RegToPrint, Reg)); + O << AArch64InstPrinter::getRegisterName( + RegToPrint, isVector ? AArch64::vreg : AArch64::NoRegAltName); + return false; +} + +bool AArch64AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, + unsigned AsmVariant, + const char *ExtraCode, raw_ostream &O) { + const MachineOperand &MO = MI->getOperand(OpNum); + // Does this asm operand have a single letter operand modifier? + if (ExtraCode && ExtraCode[0]) { + if (ExtraCode[1] != 0) + return true; // Unknown modifier. + + switch (ExtraCode[0]) { + default: + return true; // Unknown modifier. 
+ case 'w': // Print W register + case 'x': // Print X register + if (MO.isReg()) + return printAsmMRegister(MO, ExtraCode[0], O); + if (MO.isImm() && MO.getImm() == 0) { + unsigned Reg = ExtraCode[0] == 'w' ? AArch64::WZR : AArch64::XZR; + O << AArch64InstPrinter::getRegisterName(Reg); + return false; + } + printOperand(MI, OpNum, O); + return false; + case 'b': // Print B register. + case 'h': // Print H register. + case 's': // Print S register. + case 'd': // Print D register. + case 'q': // Print Q register. + if (MO.isReg()) { + const TargetRegisterClass *RC; + switch (ExtraCode[0]) { + case 'b': + RC = &AArch64::FPR8RegClass; + break; + case 'h': + RC = &AArch64::FPR16RegClass; + break; + case 's': + RC = &AArch64::FPR32RegClass; + break; + case 'd': + RC = &AArch64::FPR64RegClass; + break; + case 'q': + RC = &AArch64::FPR128RegClass; + break; + default: + return true; + } + return printAsmRegInClass(MO, RC, false /* vector */, O); + } + printOperand(MI, OpNum, O); + return false; + } + } + + // According to ARM, we should emit x and v registers unless we have a + // modifier. + if (MO.isReg()) { + unsigned Reg = MO.getReg(); + + // If this is a w or x register, print an x register. + if (AArch64::GPR32allRegClass.contains(Reg) || + AArch64::GPR64allRegClass.contains(Reg)) + return printAsmMRegister(MO, 'x', O); + + // If this is a b, h, s, d, or q register, print it as a v register. + return printAsmRegInClass(MO, &AArch64::FPR128RegClass, true /* vector */, + O); + } + + printOperand(MI, OpNum, O); + return false; +} + +bool AArch64AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, + unsigned OpNum, + unsigned AsmVariant, + const char *ExtraCode, + raw_ostream &O) { + if (ExtraCode && ExtraCode[0]) + return true; // Unknown modifier. + + const MachineOperand &MO = MI->getOperand(OpNum); + assert(MO.isReg() && "unexpected inline asm memory operand"); + O << "[" << AArch64InstPrinter::getRegisterName(MO.getReg()) << "]"; + return false; +} + +void AArch64AsmPrinter::PrintDebugValueComment(const MachineInstr *MI, + raw_ostream &OS) { + unsigned NOps = MI->getNumOperands(); + assert(NOps == 4); + OS << '\t' << MAI->getCommentString() << "DEBUG_VALUE: "; + // cast away const; DIetc do not take const operands for some reason. + DIVariable V(const_cast(MI->getOperand(NOps - 1).getMetadata())); + OS << V.getName(); + OS << " <- "; + // Frame address. Currently handles register +- offset only. + assert(MI->getOperand(0).isReg() && MI->getOperand(1).isImm()); + OS << '['; + printOperand(MI, 0, OS); + OS << '+'; + printOperand(MI, 1, OS); + OS << ']'; + OS << "+"; + printOperand(MI, NOps - 2, OS); +} + +void AArch64AsmPrinter::LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM, + const MachineInstr &MI) { + unsigned NumNOPBytes = MI.getOperand(1).getImm(); + + SM.recordStackMap(MI); + // Emit padding. 
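(For context, an illustrative use, not part of the patch: the 'w' and 'x' cases above implement the operand modifiers exposed by GNU-style inline asm on AArch64, e.g. %w0 for the 32-bit register name.)

// Only meaningful when compiled for an AArch64 target.
extern "C" int add_using_w_regs(int a, int b) {
  int result;
  // '%w0'/'%w1'/'%w2' request the W (32-bit) names of the allocated
  // registers, the path handled by printAsmMRegister via getWRegFromXReg.
  asm("add %w0, %w1, %w2" : "=r"(result) : "r"(a), "r"(b));
  return result;
}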
+ assert(NumNOPBytes % 4 == 0 && "Invalid number of NOP bytes requested!"); + for (unsigned i = 0; i < NumNOPBytes; i += 4) + EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::HINT).addImm(0)); +} + +// Lower a patchpoint of the form: +// [], , , , +void AArch64AsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM, + const MachineInstr &MI) { + SM.recordPatchPoint(MI); + + PatchPointOpers Opers(&MI); + + int64_t CallTarget = Opers.getMetaOper(PatchPointOpers::TargetPos).getImm(); + unsigned EncodedBytes = 0; + if (CallTarget) { + assert((CallTarget & 0xFFFFFFFFFFFF) == CallTarget && + "High 16 bits of call target should be zero."); + unsigned ScratchReg = MI.getOperand(Opers.getNextScratchIdx()).getReg(); + EncodedBytes = 16; + // Materialize the jump address: + EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVZWi) + .addReg(ScratchReg) + .addImm((CallTarget >> 32) & 0xFFFF) + .addImm(32)); + EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVKWi) + .addReg(ScratchReg) + .addReg(ScratchReg) + .addImm((CallTarget >> 16) & 0xFFFF) + .addImm(16)); + EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVKWi) + .addReg(ScratchReg) + .addReg(ScratchReg) + .addImm(CallTarget & 0xFFFF) + .addImm(0)); + EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::BLR).addReg(ScratchReg)); + } + // Emit padding. + unsigned NumBytes = Opers.getMetaOper(PatchPointOpers::NBytesPos).getImm(); + assert(NumBytes >= EncodedBytes && + "Patchpoint can't request size less than the length of a call."); + assert((NumBytes - EncodedBytes) % 4 == 0 && + "Invalid number of NOP bytes requested!"); + for (unsigned i = EncodedBytes; i < NumBytes; i += 4) + EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::HINT).addImm(0)); +} + +// Simple pseudo-instructions have their lowering (with expansion to real +// instructions) auto-generated. +#include "AArch64GenMCPseudoLowering.inc" + +void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) { + // Do any auto-generated pseudo lowerings. + if (emitPseudoExpansionLowering(OutStreamer, MI)) + return; + + if (AArch64FI->getLOHRelated().count(MI)) { + // Generate a label for LOH related instruction + MCSymbol *LOHLabel = GetTempSymbol("loh", LOHLabelCounter++); + // Associate the instruction with the label + LOHInstToLabel[MI] = LOHLabel; + OutStreamer.EmitLabel(LOHLabel); + } + + // Do any manual lowerings. + switch (MI->getOpcode()) { + default: + break; + case AArch64::DBG_VALUE: { + if (isVerbose() && OutStreamer.hasRawTextSupport()) { + SmallString<128> TmpStr; + raw_svector_ostream OS(TmpStr); + PrintDebugValueComment(MI, OS); + OutStreamer.EmitRawText(StringRef(OS.str())); + } + return; + } + + // Tail calls use pseudo instructions so they have the proper code-gen + // attributes (isCall, isReturn, etc.). We lower them to the real + // instruction here. + case AArch64::TCRETURNri: { + MCInst TmpInst; + TmpInst.setOpcode(AArch64::BR); + TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg())); + EmitToStreamer(OutStreamer, TmpInst); + return; + } + case AArch64::TCRETURNdi: { + MCOperand Dest; + MCInstLowering.lowerOperand(MI->getOperand(0), Dest); + MCInst TmpInst; + TmpInst.setOpcode(AArch64::B); + TmpInst.addOperand(Dest); + EmitToStreamer(OutStreamer, TmpInst); + return; + } + case AArch64::TLSDESC_BLR: { + MCOperand Callee, Sym; + MCInstLowering.lowerOperand(MI->getOperand(0), Callee); + MCInstLowering.lowerOperand(MI->getOperand(1), Sym); + + // First emit a relocation-annotation. 
This expands to no code, but requests + // the following instruction gets an R_AARCH64_TLSDESC_CALL. + MCInst TLSDescCall; + TLSDescCall.setOpcode(AArch64::TLSDESCCALL); + TLSDescCall.addOperand(Sym); + EmitToStreamer(OutStreamer, TLSDescCall); + + // Other than that it's just a normal indirect call to the function loaded + // from the descriptor. + MCInst BLR; + BLR.setOpcode(AArch64::BLR); + BLR.addOperand(Callee); + EmitToStreamer(OutStreamer, BLR); + + return; + } + + case TargetOpcode::STACKMAP: + return LowerSTACKMAP(OutStreamer, SM, *MI); + + case TargetOpcode::PATCHPOINT: + return LowerPATCHPOINT(OutStreamer, SM, *MI); + } + + // Finally, do the automated lowerings for everything else. + MCInst TmpInst; + MCInstLowering.Lower(MI, TmpInst); + EmitToStreamer(OutStreamer, TmpInst); +} + +// Force static initialization. +extern "C" void LLVMInitializeAArch64AsmPrinter() { + RegisterAsmPrinter X(TheAArch64leTarget); + RegisterAsmPrinter Y(TheAArch64beTarget); + + RegisterAsmPrinter Z(TheARM64leTarget); + RegisterAsmPrinter W(TheARM64beTarget); +} diff --git a/lib/Target/AArch64/AArch64BranchRelaxation.cpp b/lib/Target/AArch64/AArch64BranchRelaxation.cpp new file mode 100644 index 00000000000..52094526727 --- /dev/null +++ b/lib/Target/AArch64/AArch64BranchRelaxation.cpp @@ -0,0 +1,510 @@ +//===-- AArch64BranchRelaxation.cpp - AArch64 branch relaxation -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include "AArch64.h" +#include "AArch64InstrInfo.h" +#include "AArch64MachineFunctionInfo.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Support/CommandLine.h" +using namespace llvm; + +#define DEBUG_TYPE "aarch64-branch-relax" + +static cl::opt +BranchRelaxation("aarch64-branch-relax", cl::Hidden, cl::init(true), + cl::desc("Relax out of range conditional branches")); + +static cl::opt +TBZDisplacementBits("aarch64-tbz-offset-bits", cl::Hidden, cl::init(14), + cl::desc("Restrict range of TB[N]Z instructions (DEBUG)")); + +static cl::opt +CBZDisplacementBits("aarch64-cbz-offset-bits", cl::Hidden, cl::init(19), + cl::desc("Restrict range of CB[N]Z instructions (DEBUG)")); + +static cl::opt +BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19), + cl::desc("Restrict range of Bcc instructions (DEBUG)")); + +STATISTIC(NumSplit, "Number of basic blocks split"); +STATISTIC(NumRelaxed, "Number of conditional branches relaxed"); + +namespace { +class AArch64BranchRelaxation : public MachineFunctionPass { + /// BasicBlockInfo - Information about the offset and size of a single + /// basic block. + struct BasicBlockInfo { + /// Offset - Distance from the beginning of the function to the beginning + /// of this basic block. + /// + /// The offset is always aligned as required by the basic block. + unsigned Offset; + + /// Size - Size of the basic block in bytes. If the block contains + /// inline assembly, this is a worst case estimate. 
+ /// + /// The size does not include any alignment padding whether from the + /// beginning of the block, or from an aligned jump table at the end. + unsigned Size; + + BasicBlockInfo() : Offset(0), Size(0) {} + + /// Compute the offset immediately following this block. If LogAlign is + /// specified, return the offset the successor block will get if it has + /// this alignment. + unsigned postOffset(unsigned LogAlign = 0) const { + unsigned PO = Offset + Size; + unsigned Align = 1 << LogAlign; + return (PO + Align - 1) / Align * Align; + } + }; + + SmallVector BlockInfo; + + MachineFunction *MF; + const AArch64InstrInfo *TII; + + bool relaxBranchInstructions(); + void scanFunction(); + MachineBasicBlock *splitBlockBeforeInstr(MachineInstr *MI); + void adjustBlockOffsets(MachineBasicBlock &MBB); + bool isBlockInRange(MachineInstr *MI, MachineBasicBlock *BB, unsigned Disp); + bool fixupConditionalBranch(MachineInstr *MI); + void computeBlockSize(const MachineBasicBlock &MBB); + unsigned getInstrOffset(MachineInstr *MI) const; + void dumpBBs(); + void verify(); + +public: + static char ID; + AArch64BranchRelaxation() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { + return "AArch64 branch relaxation pass"; + } +}; +char AArch64BranchRelaxation::ID = 0; +} + +/// verify - check BBOffsets, BBSizes, alignment of islands +void AArch64BranchRelaxation::verify() { +#ifndef NDEBUG + unsigned PrevNum = MF->begin()->getNumber(); + for (MachineBasicBlock &MBB : *MF) { + unsigned Align = MBB.getAlignment(); + unsigned Num = MBB.getNumber(); + assert(BlockInfo[Num].Offset % (1u << Align) == 0); + assert(!Num || BlockInfo[PrevNum].postOffset() <= BlockInfo[Num].Offset); + PrevNum = Num; + } +#endif +} + +/// print block size and offset information - debugging +void AArch64BranchRelaxation::dumpBBs() { + for (auto &MBB : *MF) { + const BasicBlockInfo &BBI = BlockInfo[MBB.getNumber()]; + dbgs() << format("BB#%u\toffset=%08x\t", MBB.getNumber(), BBI.Offset) + << format("size=%#x\n", BBI.Size); + } +} + +/// BBHasFallthrough - Return true if the specified basic block can fallthrough +/// into the block immediately after it. +static bool BBHasFallthrough(MachineBasicBlock *MBB) { + // Get the next machine basic block in the function. + MachineFunction::iterator MBBI = MBB; + // Can't fall off end of function. + MachineBasicBlock *NextBB = std::next(MBBI); + if (NextBB == MBB->getParent()->end()) + return false; + + for (MachineBasicBlock *S : MBB->successors()) + if (S == NextBB) + return true; + + return false; +} + +/// scanFunction - Do the initial scan of the function, building up +/// information about each block. +void AArch64BranchRelaxation::scanFunction() { + BlockInfo.clear(); + BlockInfo.resize(MF->getNumBlockIDs()); + + // First thing, compute the size of all basic blocks, and see if the function + // has any inline assembly in it. If so, we have to be conservative about + // alignment assumptions, as we don't know for sure the size of any + // instructions in the inline assembly. + for (MachineBasicBlock &MBB : *MF) + computeBlockSize(MBB); + + // Compute block offsets and known bits. + adjustBlockOffsets(*MF->begin()); +} + +/// computeBlockSize - Compute the size for MBB. +/// This function updates BlockInfo directly. 
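The postOffset() helper above rounds a block's end offset (Offset + Size) up to the next block's alignment. A minimal standalone sketch of that rounding rule, using a hypothetical nextBlockOffset() name rather than the pass's own API:

#include <cassert>
#include <cstdint>

// Round Offset + Size up to a 2^LogAlign boundary, mirroring the shape of
// BasicBlockInfo::postOffset() above.
static uint64_t nextBlockOffset(uint64_t Offset, uint64_t Size,
                                unsigned LogAlign) {
  uint64_t End = Offset + Size;
  uint64_t Align = uint64_t(1) << LogAlign;
  return (End + Align - 1) / Align * Align;
}

int main() {
  assert(nextBlockOffset(0, 20, 2) == 20); // already 4-byte aligned
  assert(nextBlockOffset(0, 22, 4) == 32); // round 22 up to a 16-byte boundary
  return 0;
}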
+void AArch64BranchRelaxation::computeBlockSize(const MachineBasicBlock &MBB) { + unsigned Size = 0; + for (const MachineInstr &MI : MBB) + Size += TII->GetInstSizeInBytes(&MI); + BlockInfo[MBB.getNumber()].Size = Size; +} + +/// getInstrOffset - Return the current offset of the specified machine +/// instruction from the start of the function. This offset changes as stuff is +/// moved around inside the function. +unsigned AArch64BranchRelaxation::getInstrOffset(MachineInstr *MI) const { + MachineBasicBlock *MBB = MI->getParent(); + + // The offset is composed of two things: the sum of the sizes of all MBB's + // before this instruction's block, and the offset from the start of the block + // it is in. + unsigned Offset = BlockInfo[MBB->getNumber()].Offset; + + // Sum instructions before MI in MBB. + for (MachineBasicBlock::iterator I = MBB->begin(); &*I != MI; ++I) { + assert(I != MBB->end() && "Didn't find MI in its own basic block?"); + Offset += TII->GetInstSizeInBytes(I); + } + return Offset; +} + +void AArch64BranchRelaxation::adjustBlockOffsets(MachineBasicBlock &Start) { + unsigned PrevNum = Start.getNumber(); + for (auto &MBB : make_range(MachineFunction::iterator(Start), MF->end())) { + unsigned Num = MBB.getNumber(); + if (!Num) // block zero is never changed from offset zero. + continue; + // Get the offset and known bits at the end of the layout predecessor. + // Include the alignment of the current block. + unsigned LogAlign = MBB.getAlignment(); + BlockInfo[Num].Offset = BlockInfo[PrevNum].postOffset(LogAlign); + PrevNum = Num; + } +} + +/// Split the basic block containing MI into two blocks, which are joined by +/// an unconditional branch. Update data structures and renumber blocks to +/// account for this change and returns the newly created block. +/// NOTE: Successor list of the original BB is out of date after this function, +/// and must be updated by the caller! Other transforms follow using this +/// utility function, so no point updating now rather than waiting. +MachineBasicBlock * +AArch64BranchRelaxation::splitBlockBeforeInstr(MachineInstr *MI) { + MachineBasicBlock *OrigBB = MI->getParent(); + + // Create a new MBB for the code after the OrigBB. + MachineBasicBlock *NewBB = + MF->CreateMachineBasicBlock(OrigBB->getBasicBlock()); + MachineFunction::iterator MBBI = OrigBB; + ++MBBI; + MF->insert(MBBI, NewBB); + + // Splice the instructions starting with MI over to NewBB. + NewBB->splice(NewBB->end(), OrigBB, MI, OrigBB->end()); + + // Add an unconditional branch from OrigBB to NewBB. + // Note the new unconditional branch is not being recorded. + // There doesn't seem to be meaningful DebugInfo available; this doesn't + // correspond to anything in the source. + BuildMI(OrigBB, DebugLoc(), TII->get(AArch64::B)).addMBB(NewBB); + + // Insert an entry into BlockInfo to align it properly with the block numbers. + BlockInfo.insert(BlockInfo.begin() + NewBB->getNumber(), BasicBlockInfo()); + + // Figure out how large the OrigBB is. As the first half of the original + // block, it cannot contain a tablejump. The size includes + // the new jump we added. (It should be possible to do this without + // recounting everything, but it's very confusing, and this is rarely + // executed.) + computeBlockSize(*OrigBB); + + // Figure out how large the NewMBB is. As the second half of the original + // block, it may contain a tablejump. + computeBlockSize(*NewBB); + + // All BBOffsets following these blocks must be modified. 
+ adjustBlockOffsets(*OrigBB); + + ++NumSplit; + + return NewBB; +} + +/// isBlockInRange - Returns true if the distance between specific MI and +/// specific BB can fit in MI's displacement field. +bool AArch64BranchRelaxation::isBlockInRange(MachineInstr *MI, + MachineBasicBlock *DestBB, + unsigned Bits) { + unsigned MaxOffs = ((1 << (Bits - 1)) - 1) << 2; + unsigned BrOffset = getInstrOffset(MI); + unsigned DestOffset = BlockInfo[DestBB->getNumber()].Offset; + + DEBUG(dbgs() << "Branch of destination BB#" << DestBB->getNumber() + << " from BB#" << MI->getParent()->getNumber() + << " max delta=" << MaxOffs << " from " << getInstrOffset(MI) + << " to " << DestOffset << " offset " + << int(DestOffset - BrOffset) << "\t" << *MI); + + // Branch before the Dest. + if (BrOffset <= DestOffset) + return (DestOffset - BrOffset <= MaxOffs); + return (BrOffset - DestOffset <= MaxOffs); +} + +static bool isConditionalBranch(unsigned Opc) { + switch (Opc) { + default: + return false; + case AArch64::TBZW: + case AArch64::TBNZW: + case AArch64::TBZX: + case AArch64::TBNZX: + case AArch64::CBZW: + case AArch64::CBNZW: + case AArch64::CBZX: + case AArch64::CBNZX: + case AArch64::Bcc: + return true; + } +} + +static MachineBasicBlock *getDestBlock(MachineInstr *MI) { + switch (MI->getOpcode()) { + default: + assert(0 && "unexpected opcode!"); + case AArch64::TBZW: + case AArch64::TBNZW: + case AArch64::TBZX: + case AArch64::TBNZX: + return MI->getOperand(2).getMBB(); + case AArch64::CBZW: + case AArch64::CBNZW: + case AArch64::CBZX: + case AArch64::CBNZX: + case AArch64::Bcc: + return MI->getOperand(1).getMBB(); + } +} + +static unsigned getOppositeConditionOpcode(unsigned Opc) { + switch (Opc) { + default: + assert(0 && "unexpected opcode!"); + case AArch64::TBNZW: return AArch64::TBZW; + case AArch64::TBNZX: return AArch64::TBZX; + case AArch64::TBZW: return AArch64::TBNZW; + case AArch64::TBZX: return AArch64::TBNZX; + case AArch64::CBNZW: return AArch64::CBZW; + case AArch64::CBNZX: return AArch64::CBZX; + case AArch64::CBZW: return AArch64::CBNZW; + case AArch64::CBZX: return AArch64::CBNZX; + case AArch64::Bcc: return AArch64::Bcc; // Condition is an operand for Bcc. + } +} + +static unsigned getBranchDisplacementBits(unsigned Opc) { + switch (Opc) { + default: + assert(0 && "unexpected opcode!"); + case AArch64::TBNZW: + case AArch64::TBZW: + case AArch64::TBNZX: + case AArch64::TBZX: + return TBZDisplacementBits; + case AArch64::CBNZW: + case AArch64::CBZW: + case AArch64::CBNZX: + case AArch64::CBZX: + return CBZDisplacementBits; + case AArch64::Bcc: + return BCCDisplacementBits; + } +} + +static inline void invertBccCondition(MachineInstr *MI) { + assert(MI->getOpcode() == AArch64::Bcc && "Unexpected opcode!"); + AArch64CC::CondCode CC = (AArch64CC::CondCode)MI->getOperand(0).getImm(); + CC = AArch64CC::getInvertedCondCode(CC); + MI->getOperand(0).setImm((int64_t)CC); +} + +/// fixupConditionalBranch - Fix up a conditional branch whose destination is +/// too far away to fit in its displacement field. It is converted to an inverse +/// conditional branch + an unconditional branch to the destination. 
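isBlockInRange() above converts a displacement width into a byte distance: Bits of signed immediate, scaled by four because branch offsets are encoded in words. A self-contained sketch of that range test follows; branchInRange() is an illustrative helper, not part of the pass, and it mirrors the pass's conservative symmetric limit:

#include <cassert>
#include <cstdint>

// A Bits-bit signed, word-scaled displacement reaches at most
// ((2^(Bits-1)) - 1) * 4 bytes forward (and slightly further backward,
// which the check above conservatively ignores).
static bool branchInRange(uint64_t BrOffset, uint64_t DestOffset,
                          unsigned Bits) {
  uint64_t MaxOffs = ((uint64_t(1) << (Bits - 1)) - 1) << 2;
  uint64_t Delta = BrOffset <= DestOffset ? DestOffset - BrOffset
                                          : BrOffset - DestOffset;
  return Delta <= MaxOffs;
}

int main() {
  // TB[N]Z has a 14-bit displacement: roughly +/- 32KiB.
  assert(branchInRange(0, 32764, 14));
  assert(!branchInRange(0, 32768, 14));
  // Bcc and CB[N]Z use 19 bits: roughly +/- 1MiB.
  assert(branchInRange(0, (1u << 20) - 4, 19));
  return 0;
}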
+bool AArch64BranchRelaxation::fixupConditionalBranch(MachineInstr *MI) { + MachineBasicBlock *DestBB = getDestBlock(MI); + + // Add an unconditional branch to the destination and invert the branch + // condition to jump over it: + // tbz L1 + // => + // tbnz L2 + // b L1 + // L2: + + // If the branch is at the end of its MBB and that has a fall-through block, + // direct the updated conditional branch to the fall-through block. Otherwise, + // split the MBB before the next instruction. + MachineBasicBlock *MBB = MI->getParent(); + MachineInstr *BMI = &MBB->back(); + bool NeedSplit = (BMI != MI) || !BBHasFallthrough(MBB); + + if (BMI != MI) { + if (std::next(MachineBasicBlock::iterator(MI)) == + std::prev(MBB->getLastNonDebugInstr()) && + BMI->getOpcode() == AArch64::B) { + // Last MI in the BB is an unconditional branch. Can we simply invert the + // condition and swap destinations: + // beq L1 + // b L2 + // => + // bne L2 + // b L1 + MachineBasicBlock *NewDest = BMI->getOperand(0).getMBB(); + if (isBlockInRange(MI, NewDest, + getBranchDisplacementBits(MI->getOpcode()))) { + DEBUG(dbgs() << " Invert condition and swap its destination with " + << *BMI); + BMI->getOperand(0).setMBB(DestBB); + unsigned OpNum = (MI->getOpcode() == AArch64::TBZW || + MI->getOpcode() == AArch64::TBNZW || + MI->getOpcode() == AArch64::TBZX || + MI->getOpcode() == AArch64::TBNZX) + ? 2 + : 1; + MI->getOperand(OpNum).setMBB(NewDest); + MI->setDesc(TII->get(getOppositeConditionOpcode(MI->getOpcode()))); + if (MI->getOpcode() == AArch64::Bcc) + invertBccCondition(MI); + return true; + } + } + } + + if (NeedSplit) { + // Analyze the branch so we know how to update the successor lists. + MachineBasicBlock *TBB, *FBB; + SmallVector Cond; + TII->AnalyzeBranch(*MBB, TBB, FBB, Cond, false); + + MachineBasicBlock *NewBB = splitBlockBeforeInstr(MI); + // No need for the branch to the next block. We're adding an unconditional + // branch to the destination. + int delta = TII->GetInstSizeInBytes(&MBB->back()); + BlockInfo[MBB->getNumber()].Size -= delta; + MBB->back().eraseFromParent(); + // BlockInfo[SplitBB].Offset is wrong temporarily, fixed below + + // Update the successor lists according to the transformation to follow. + // Do it here since if there's no split, no update is needed. + MBB->replaceSuccessor(FBB, NewBB); + NewBB->addSuccessor(FBB); + } + MachineBasicBlock *NextBB = std::next(MachineFunction::iterator(MBB)); + + DEBUG(dbgs() << " Insert B to BB#" << DestBB->getNumber() + << ", invert condition and change dest. to BB#" + << NextBB->getNumber() << "\n"); + + // Insert a new conditional branch and a new unconditional branch. + MachineInstrBuilder MIB = BuildMI( + MBB, DebugLoc(), TII->get(getOppositeConditionOpcode(MI->getOpcode()))) + .addOperand(MI->getOperand(0)); + if (MI->getOpcode() == AArch64::TBZW || MI->getOpcode() == AArch64::TBNZW || + MI->getOpcode() == AArch64::TBZX || MI->getOpcode() == AArch64::TBNZX) + MIB.addOperand(MI->getOperand(1)); + if (MI->getOpcode() == AArch64::Bcc) + invertBccCondition(MIB); + MIB.addMBB(NextBB); + BlockInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(&MBB->back()); + BuildMI(MBB, DebugLoc(), TII->get(AArch64::B)).addMBB(DestBB); + BlockInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(&MBB->back()); + + // Remove the old conditional branch. It may or may not still be in MBB. + BlockInfo[MI->getParent()->getNumber()].Size -= TII->GetInstSizeInBytes(MI); + MI->eraseFromParent(); + + // Finally, keep the block offsets up to date. 
+ adjustBlockOffsets(*MBB); + return true; +} + +bool AArch64BranchRelaxation::relaxBranchInstructions() { + bool Changed = false; + // Relaxing branches involves creating new basic blocks, so re-eval + // end() for termination. + for (auto &MBB : *MF) { + MachineInstr *MI = MBB.getFirstTerminator(); + if (isConditionalBranch(MI->getOpcode()) && + !isBlockInRange(MI, getDestBlock(MI), + getBranchDisplacementBits(MI->getOpcode()))) { + fixupConditionalBranch(MI); + ++NumRelaxed; + Changed = true; + } + } + return Changed; +} + +bool AArch64BranchRelaxation::runOnMachineFunction(MachineFunction &mf) { + MF = &mf; + + // If the pass is disabled, just bail early. + if (!BranchRelaxation) + return false; + + DEBUG(dbgs() << "***** AArch64BranchRelaxation *****\n"); + + TII = (const AArch64InstrInfo *)MF->getTarget().getInstrInfo(); + + // Renumber all of the machine basic blocks in the function, guaranteeing that + // the numbers agree with the position of the block in the function. + MF->RenumberBlocks(); + + // Do the initial scan of the function, building up information about the + // sizes of each block. + scanFunction(); + + DEBUG(dbgs() << " Basic blocks before relaxation\n"); + DEBUG(dumpBBs()); + + bool MadeChange = false; + while (relaxBranchInstructions()) + MadeChange = true; + + // After a while, this might be made debug-only, but it is not expensive. + verify(); + + DEBUG(dbgs() << " Basic blocks after relaxation\n"); + DEBUG(dbgs() << '\n'; dumpBBs()); + + BlockInfo.clear(); + + return MadeChange; +} + +/// createAArch64BranchRelaxation - returns an instance of the constpool +/// island pass. +FunctionPass *llvm::createAArch64BranchRelaxation() { + return new AArch64BranchRelaxation(); +} diff --git a/lib/Target/AArch64/AArch64CallingConv.h b/lib/Target/AArch64/AArch64CallingConv.h new file mode 100644 index 00000000000..1fe426ed686 --- /dev/null +++ b/lib/Target/AArch64/AArch64CallingConv.h @@ -0,0 +1,94 @@ +//=== AArch64CallingConv.h - Custom Calling Convention Routines -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the custom routines for the AArch64 Calling Convention that +// aren't done by tablegen. +// +//===----------------------------------------------------------------------===// + +#ifndef AArch64CALLINGCONV_H +#define AArch64CALLINGCONV_H + +#include "AArch64InstrInfo.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/Target/TargetInstrInfo.h" + +namespace llvm { + +/// CC_AArch64_Custom_i1i8i16_Reg - customized handling of passing i1/i8/i16 via +/// register. Here, ValVT can be i1/i8/i16 or i32 depending on whether the +/// argument is already promoted and LocVT is i1/i8/i16. We only promote the +/// argument to i32 if we are sure this argument will be passed in register. 
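The promotion mentioned in the comment above means the i1/i8/i16 value is widened to the 32 bits that actually travel in a W register, with the extension kind chosen from the argument's sign/zero-extension flags. A simplified, hypothetical illustration of the three LocInfo choices (SExt, ZExt, AExt), not the CCState machinery itself:

#include <cstdint>
#include <iostream>

enum class Ext { Sign, Zero, Any };

// Widen an 8-bit payload to the 32 bits that will live in Wn, matching the
// SExt/ZExt/AExt decision made by the handler that follows.
static uint32_t promoteToI32(uint8_t V, Ext Kind) {
  switch (Kind) {
  case Ext::Sign:
    return static_cast<uint32_t>(static_cast<int32_t>(static_cast<int8_t>(V)));
  case Ext::Zero:
    return static_cast<uint32_t>(V);
  case Ext::Any:
    return static_cast<uint32_t>(V); // upper bits unspecified; zero here
  }
  return 0;
}

int main() {
  std::cout << std::hex << promoteToI32(0x80, Ext::Sign) << "\n"; // ffffff80
  std::cout << std::hex << promoteToI32(0x80, Ext::Zero) << "\n"; // 80
  return 0;
}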
+static bool CC_AArch64_Custom_i1i8i16_Reg(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, + CCState &State, + bool IsWebKitJS = false) { + static const MCPhysReg RegList1[] = { AArch64::W0, AArch64::W1, AArch64::W2, + AArch64::W3, AArch64::W4, AArch64::W5, + AArch64::W6, AArch64::W7 }; + static const MCPhysReg RegList2[] = { AArch64::X0, AArch64::X1, AArch64::X2, + AArch64::X3, AArch64::X4, AArch64::X5, + AArch64::X6, AArch64::X7 }; + static const MCPhysReg WebKitRegList1[] = { AArch64::W0 }; + static const MCPhysReg WebKitRegList2[] = { AArch64::X0 }; + + const MCPhysReg *List1 = IsWebKitJS ? WebKitRegList1 : RegList1; + const MCPhysReg *List2 = IsWebKitJS ? WebKitRegList2 : RegList2; + + if (unsigned Reg = State.AllocateReg(List1, List2, 8)) { + // Customized extra section for handling i1/i8/i16: + // We need to promote the argument to i32 if it is not done already. + if (ValVT != MVT::i32) { + if (ArgFlags.isSExt()) + LocInfo = CCValAssign::SExt; + else if (ArgFlags.isZExt()) + LocInfo = CCValAssign::ZExt; + else + LocInfo = CCValAssign::AExt; + ValVT = MVT::i32; + } + // Set LocVT to i32 as well if passing via register. + LocVT = MVT::i32; + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return true; + } + return false; +} + +/// CC_AArch64_WebKit_JS_i1i8i16_Reg - customized handling of passing i1/i8/i16 +/// via register. This behaves the same as CC_AArch64_Custom_i1i8i16_Reg, but only +/// uses the first register. +static bool CC_AArch64_WebKit_JS_i1i8i16_Reg(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, + CCState &State) { + return CC_AArch64_Custom_i1i8i16_Reg(ValNo, ValVT, LocVT, LocInfo, ArgFlags, + State, true); +} + +/// CC_AArch64_Custom_i1i8i16_Stack: customized handling of passing i1/i8/i16 on +/// stack. Here, ValVT can be i1/i8/i16 or i32 depending on whether the argument +/// is already promoted and LocVT is i1/i8/i16. If ValVT is already promoted, +/// it will be truncated back to i1/i8/i16. +static bool CC_AArch64_Custom_i1i8i16_Stack(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, + CCState &State) { + unsigned Space = ((LocVT == MVT::i1 || LocVT == MVT::i8) ? 1 : 2); + unsigned Offset12 = State.AllocateStack(Space, Space); + ValVT = LocVT; + State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset12, LocVT, LocInfo)); + return true; +} + +} // End llvm namespace + +#endif diff --git a/lib/Target/AArch64/AArch64CallingConvention.td b/lib/Target/AArch64/AArch64CallingConvention.td new file mode 100644 index 00000000000..c263d14dcc3 --- /dev/null +++ b/lib/Target/AArch64/AArch64CallingConvention.td @@ -0,0 +1,236 @@ +//=- AArch64CallingConv.td - Calling Conventions for AArch64 -*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This describes the calling conventions for AArch64 architecture. +// +//===----------------------------------------------------------------------===// + +/// CCIfAlign - Match of the original alignment of the arg +class CCIfAlign : + CCIf; +/// CCIfBigEndian - Match only if we're in big endian mode. 
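The handler above walks a fixed list of registers and only falls back to the stack rules once the list is exhausted. A generic sketch of that allocate-from-list-or-spill decision, with a toy TinyCCState standing in for CCState::AllocateReg (names and output are illustrative only):

#include <array>
#include <cstdio>
#include <optional>

// Hand out registers from a fixed list in order; once the list is exhausted
// the caller assigns a stack slot instead. This mirrors the shape of the
// custom handler above, not its exact API.
class TinyCCState {
  std::array<const char *, 8> Regs{"w0", "w1", "w2", "w3",
                                   "w4", "w5", "w6", "w7"};
  unsigned Next = 0;

public:
  std::optional<const char *> allocateReg() {
    if (Next < Regs.size())
      return Regs[Next++];
    return std::nullopt; // no register left: spill to the stack
  }
};

int main() {
  TinyCCState State;
  for (int Arg = 0; Arg < 10; ++Arg) {
    if (auto R = State.allocateReg())
      std::printf("arg %d -> %s\n", Arg, *R);
    else
      std::printf("arg %d -> stack\n", Arg);
  }
  return 0;
}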
+class CCIfBigEndian : + CCIf<"State.getTarget().getDataLayout()->isBigEndian()", A>; + +//===----------------------------------------------------------------------===// +// ARM AAPCS64 Calling Convention +//===----------------------------------------------------------------------===// + +def CC_AArch64_AAPCS : CallingConv<[ + CCIfType<[v2f32], CCBitConvertToType>, + CCIfType<[v2f64, v4f32], CCBitConvertToType>, + + // Big endian vectors must be passed as if they were 1-element vectors so that + // their lanes are in a consistent order. + CCIfBigEndian>>, + CCIfBigEndian>>, + + // An SRet is passed in X8, not X0 like a normal pointer parameter. + CCIfSRet>>, + + // Put ByVal arguments directly on the stack. Minimum size and alignment of a + // slot is 64-bit. + CCIfByVal>, + + // Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers, + // up to eight each of GPR and FPR. + CCIfType<[i1, i8, i16], CCCustom<"CC_AArch64_Custom_i1i8i16_Reg">>, + CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7], + [X0, X1, X2, X3, X4, X5, X6, X7]>>, + // i128 is split to two i64s, we can't fit half to register X7. + CCIfType<[i64], CCIfSplit>>, + + // i128 is split to two i64s, and its stack alignment is 16 bytes. + CCIfType<[i64], CCIfSplit>>, + + CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7], + [W0, W1, W2, W3, W4, W5, W6, W7]>>, + CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32], + CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64], + CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + + // If more than will fit in registers, pass them on the stack instead. + CCIfType<[i1, i8, i16], CCAssignToStack<8, 8>>, + CCIfType<[i32, f32], CCAssignToStack<8, 8>>, + CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8], + CCAssignToStack<8, 8>>, + CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64], + CCAssignToStack<16, 16>> +]>; + +def RetCC_AArch64_AAPCS : CallingConv<[ + CCIfType<[v2f32], CCBitConvertToType>, + CCIfType<[v2f64, v4f32], CCBitConvertToType>, + + // Big endian vectors must be passed as if they were 1-element vectors so that + // their lanes are in a consistent order. + CCIfBigEndian>>, + CCIfBigEndian>>, + + CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7], + [X0, X1, X2, X3, X4, X5, X6, X7]>>, + CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7], + [W0, W1, W2, W3, W4, W5, W6, W7]>>, + CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32], + CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64], + CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>> +]>; + + +// Darwin uses a calling convention which differs in only two ways +// from the standard one at this level: +// + i128s (i.e. split i64s) don't need even registers. +// + Stack slots are sized as needed rather than being at least 64-bit. 
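One visible consequence of the second difference: once the eight GPRs are used up, an i8 argument gets a full 8-byte, 8-byte-aligned slot under the AAPCS rules above but only a 1-byte slot under the Darwin variant defined next. A sketch of the two stack-offset computations, with a hypothetical StackBuilder standing in for CCState::AllocateStack:

#include <cstdio>

// Allocate a stack slot of the given size/alignment and return its offset,
// mimicking how CCAssignToStack / AllocateStack advance the frame offset.
struct StackBuilder {
  unsigned NextOffset = 0;
  unsigned allocate(unsigned Size, unsigned Align) {
    NextOffset = (NextOffset + Align - 1) / Align * Align;
    unsigned Offset = NextOffset;
    NextOffset += Size;
    return Offset;
  }
};

int main() {
  // Three i8 arguments that did not fit in registers.
  StackBuilder AAPCS, Darwin;
  for (int I = 0; I < 3; ++I)
    std::printf("arg %d: AAPCS offset %u, Darwin offset %u\n", I,
                AAPCS.allocate(8, 8),   // CCAssignToStack<8, 8>
                Darwin.allocate(1, 1)); // i1/i8: 1-byte slot (custom handler)
  return 0;
}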
+def CC_AArch64_DarwinPCS : CallingConv<[ + CCIfType<[v2f32], CCBitConvertToType>, + CCIfType<[v2f64, v4f32, f128], CCBitConvertToType>, + + // An SRet is passed in X8, not X0 like a normal pointer parameter. + CCIfSRet>>, + + // Put ByVal arguments directly on the stack. Minimum size and alignment of a + // slot is 64-bit. + CCIfByVal>, + + // Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers, + // up to eight each of GPR and FPR. + CCIfType<[i1, i8, i16], CCCustom<"CC_AArch64_Custom_i1i8i16_Reg">>, + CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7], + [X0, X1, X2, X3, X4, X5, X6, X7]>>, + // i128 is split to two i64s, we can't fit half to register X7. + CCIfType<[i64], + CCIfSplit>>, + // i128 is split to two i64s, and its stack alignment is 16 bytes. + CCIfType<[i64], CCIfSplit>>, + + CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7], + [W0, W1, W2, W3, W4, W5, W6, W7]>>, + CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32], + CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64], + CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + + // If more than will fit in registers, pass them on the stack instead. + CCIfType<[i1, i8, i16], CCCustom<"CC_AArch64_Custom_i1i8i16_Stack">>, + CCIfType<[i32, f32], CCAssignToStack<4, 4>>, + CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8], + CCAssignToStack<8, 8>>, + CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64], CCAssignToStack<16, 16>> +]>; + +def CC_AArch64_DarwinPCS_VarArg : CallingConv<[ + CCIfType<[v2f32], CCBitConvertToType>, + CCIfType<[v2f64, v4f32, f128], CCBitConvertToType>, + + // Handle all scalar types as either i64 or f64. + CCIfType<[i8, i16, i32], CCPromoteToType>, + CCIfType<[f32], CCPromoteToType>, + + // Everything is on the stack. + // i128 is split to two i64s, and its stack alignment is 16 bytes. + CCIfType<[i64], CCIfSplit>>, + CCIfType<[i64, f64, v1i64, v2i32, v4i16, v8i8, v1f64, v2f32], CCAssignToStack<8, 8>>, + CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64], CCAssignToStack<16, 16>> +]>; + +// The WebKit_JS calling convention only passes the first argument (the callee) +// in register and the remaining arguments on stack. We allow 32bit stack slots, +// so that WebKit can write partial values in the stack and define the other +// 32bit quantity as undef. +def CC_AArch64_WebKit_JS : CallingConv<[ + // Handle i1, i8, i16, i32, and i64 passing in register X0 (W0). + CCIfType<[i1, i8, i16], CCCustom<"CC_AArch64_WebKit_JS_i1i8i16_Reg">>, + CCIfType<[i32], CCAssignToRegWithShadow<[W0], [X0]>>, + CCIfType<[i64], CCAssignToRegWithShadow<[X0], [W0]>>, + + // Pass the remaining arguments on the stack instead. 
+ CCIfType<[i1, i8, i16], CCAssignToStack<4, 4>>, + CCIfType<[i32, f32], CCAssignToStack<4, 4>>, + CCIfType<[i64, f64], CCAssignToStack<8, 8>> +]>; + +def RetCC_AArch64_WebKit_JS : CallingConv<[ + CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7], + [X0, X1, X2, X3, X4, X5, X6, X7]>>, + CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7], + [W0, W1, W2, W3, W4, W5, W6, W7]>>, + CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>> +]>; + +// FIXME: LR is only callee-saved in the sense that *we* preserve it and are +// presumably a callee to someone. External functions may not do so, but this +// is currently safe since BL has LR as an implicit-def and what happens after a +// tail call doesn't matter. +// +// It would be better to model its preservation semantics properly (create a +// vreg on entry, use it in RET & tail call generation; make that vreg def if we +// end up saving LR as part of a call frame). Watch this space... +def CSR_AArch64_AAPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22, + X23, X24, X25, X26, X27, X28, + D8, D9, D10, D11, + D12, D13, D14, D15)>; + +// Constructors and destructors return 'this' in the iOS 64-bit C++ ABI; since +// 'this' and the pointer return value are both passed in X0 in these cases, +// this can be partially modelled by treating X0 as a callee-saved register; +// only the resulting RegMask is used; the SaveList is ignored +// +// (For generic ARM 64-bit ABI code, clang will not generate constructors or +// destructors with 'this' returns, so this RegMask will not be used in that +// case) +def CSR_AArch64_AAPCS_ThisReturn : CalleeSavedRegs<(add CSR_AArch64_AAPCS, X0)>; + +// The function used by Darwin to obtain the address of a thread-local variable +// guarantees more than a normal AAPCS function. x16 and x17 are used on the +// fast path for calculation, but other registers except X0 (argument/return) +// and LR (it is a call, after all) are preserved. +def CSR_AArch64_TLS_Darwin + : CalleeSavedRegs<(add (sub (sequence "X%u", 1, 28), X16, X17), + FP, + (sequence "Q%u", 0, 31))>; + +// The ELF stub used for TLS-descriptor access saves every feasible +// register. Only X0 and LR are clobbered. +def CSR_AArch64_TLS_ELF + : CalleeSavedRegs<(add (sequence "X%u", 1, 28), FP, + (sequence "Q%u", 0, 31))>; + +def CSR_AArch64_AllRegs + : CalleeSavedRegs<(add (sequence "W%u", 0, 30), WSP, + (sequence "X%u", 0, 28), FP, LR, SP, + (sequence "B%u", 0, 31), (sequence "H%u", 0, 31), + (sequence "S%u", 0, 31), (sequence "D%u", 0, 31), + (sequence "Q%u", 0, 31))>; + diff --git a/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp b/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp new file mode 100644 index 00000000000..4d23dc59d7a --- /dev/null +++ b/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp @@ -0,0 +1,147 @@ +//===-- AArch64CleanupLocalDynamicTLSPass.cpp ---------------------*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Local-dynamic access to thread-local variables proceeds in three stages. +// +// 1. 
The offset of this Module's thread-local area from TPIDR_EL0 is calculated +// in much the same way as a general-dynamic TLS-descriptor access against +// the special symbol _TLS_MODULE_BASE. +// 2. The variable's offset from _TLS_MODULE_BASE_ is calculated using +// instructions with "dtprel" modifiers. +// 3. These two are added, together with TPIDR_EL0, to obtain the variable's +// true address. +// +// This is only better than general-dynamic access to the variable if two or +// more of the first stage TLS-descriptor calculations can be combined. This +// pass looks through a function and performs such combinations. +// +//===----------------------------------------------------------------------===// +#include "AArch64.h" +#include "AArch64InstrInfo.h" +#include "AArch64MachineFunctionInfo.h" +#include "AArch64TargetMachine.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +using namespace llvm; + +namespace { +struct LDTLSCleanup : public MachineFunctionPass { + static char ID; + LDTLSCleanup() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &MF) override { + AArch64FunctionInfo *AFI = MF.getInfo(); + if (AFI->getNumLocalDynamicTLSAccesses() < 2) { + // No point folding accesses if there isn't at least two. + return false; + } + + MachineDominatorTree *DT = &getAnalysis(); + return VisitNode(DT->getRootNode(), 0); + } + + // Visit the dominator subtree rooted at Node in pre-order. + // If TLSBaseAddrReg is non-null, then use that to replace any + // TLS_base_addr instructions. Otherwise, create the register + // when the first such instruction is seen, and then use it + // as we encounter more instructions. + bool VisitNode(MachineDomTreeNode *Node, unsigned TLSBaseAddrReg) { + MachineBasicBlock *BB = Node->getBlock(); + bool Changed = false; + + // Traverse the current block. + for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; + ++I) { + switch (I->getOpcode()) { + case AArch64::TLSDESC_BLR: + // Make sure it's a local dynamic access. + if (!I->getOperand(1).isSymbol() || + strcmp(I->getOperand(1).getSymbolName(), "_TLS_MODULE_BASE_")) + break; + + if (TLSBaseAddrReg) + I = replaceTLSBaseAddrCall(I, TLSBaseAddrReg); + else + I = setRegister(I, &TLSBaseAddrReg); + Changed = true; + break; + default: + break; + } + } + + // Visit the children of this block in the dominator tree. + for (MachineDomTreeNode *N : *Node) { + Changed |= VisitNode(N, TLSBaseAddrReg); + } + + return Changed; + } + + // Replace the TLS_base_addr instruction I with a copy from + // TLSBaseAddrReg, returning the new instruction. + MachineInstr *replaceTLSBaseAddrCall(MachineInstr *I, + unsigned TLSBaseAddrReg) { + MachineFunction *MF = I->getParent()->getParent(); + const AArch64TargetMachine *TM = + static_cast(&MF->getTarget()); + const AArch64InstrInfo *TII = TM->getInstrInfo(); + + // Insert a Copy from TLSBaseAddrReg to x0, which is where the rest of the + // code sequence assumes the address will be. + MachineInstr *Copy = BuildMI(*I->getParent(), I, I->getDebugLoc(), + TII->get(TargetOpcode::COPY), + AArch64::X0).addReg(TLSBaseAddrReg); + + // Erase the TLS_base_addr instruction. + I->eraseFromParent(); + + return Copy; + } + + // Create a virtal register in *TLSBaseAddrReg, and populate it by + // inserting a copy instruction after I. Returns the new instruction. 
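The pass above walks the dominator tree and, once it has seen the first _TLS_MODULE_BASE_ call on a path, turns every dominated repeat into a copy of the cached register. The same walk-and-cache shape on a generic tree, with a toy Node type instead of MachineDomTreeNode (counts are printed only to show the effect):

#include <iostream>
#include <vector>

// A node either computes the TLS base (cost: one call) or reuses a value
// cached by a dominating ancestor (cost: one copy).
struct Node {
  bool NeedsTLSBase = false;
  std::vector<Node *> Children;
};

static void visit(Node *N, bool HaveCachedBase, unsigned &Calls,
                  unsigned &Copies) {
  if (N->NeedsTLSBase) {
    if (HaveCachedBase) {
      ++Copies; // analogue of replaceTLSBaseAddrCall()
    } else {
      ++Calls;  // first access: keep the call and cache its result
      HaveCachedBase = true;
    }
  }
  for (Node *C : N->Children)
    visit(C, HaveCachedBase, Calls, Copies);
}

int main() {
  Node Leaf1, Leaf2, Root;
  Leaf1.NeedsTLSBase = Leaf2.NeedsTLSBase = Root.NeedsTLSBase = true;
  Root.Children = {&Leaf1, &Leaf2};
  unsigned Calls = 0, Copies = 0;
  visit(&Root, false, Calls, Copies);
  std::cout << Calls << " call(s), " << Copies << " copy(ies)\n"; // 1, 2
  return 0;
}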
+ MachineInstr *setRegister(MachineInstr *I, unsigned *TLSBaseAddrReg) { + MachineFunction *MF = I->getParent()->getParent(); + const AArch64TargetMachine *TM = + static_cast(&MF->getTarget()); + const AArch64InstrInfo *TII = TM->getInstrInfo(); + + // Create a virtual register for the TLS base address. + MachineRegisterInfo &RegInfo = MF->getRegInfo(); + *TLSBaseAddrReg = RegInfo.createVirtualRegister(&AArch64::GPR64RegClass); + + // Insert a copy from X0 to TLSBaseAddrReg for later. + MachineInstr *Next = I->getNextNode(); + MachineInstr *Copy = BuildMI(*I->getParent(), Next, I->getDebugLoc(), + TII->get(TargetOpcode::COPY), + *TLSBaseAddrReg).addReg(AArch64::X0); + + return Copy; + } + + const char *getPassName() const override { + return "Local Dynamic TLS Access Clean-up"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; +} + +char LDTLSCleanup::ID = 0; +FunctionPass *llvm::createAArch64CleanupLocalDynamicTLSPass() { + return new LDTLSCleanup(); +} diff --git a/lib/Target/AArch64/AArch64CollectLOH.cpp b/lib/Target/AArch64/AArch64CollectLOH.cpp new file mode 100644 index 00000000000..6b1f09678e9 --- /dev/null +++ b/lib/Target/AArch64/AArch64CollectLOH.cpp @@ -0,0 +1,1117 @@ +//===---------- AArch64CollectLOH.cpp - AArch64 collect LOH pass --*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains a pass that collect the Linker Optimization Hint (LOH). +// This pass should be run at the very end of the compilation flow, just before +// assembly printer. +// To be useful for the linker, the LOH must be printed into the assembly file. +// +// A LOH describes a sequence of instructions that may be optimized by the +// linker. +// This same sequence cannot be optimized by the compiler because some of +// the information will be known at link time. +// For instance, consider the following sequence: +// L1: adrp xA, sym@PAGE +// L2: add xB, xA, sym@PAGEOFF +// L3: ldr xC, [xB, #imm] +// This sequence can be turned into: +// A literal load if sym@PAGE + sym@PAGEOFF + #imm - address(L3) is < 1MB: +// L3: ldr xC, sym+#imm +// It may also be turned into either the following more efficient +// code sequences: +// - If sym@PAGEOFF + #imm fits the encoding space of L3. +// L1: adrp xA, sym@PAGE +// L3: ldr xC, [xB, sym@PAGEOFF + #imm] +// - If sym@PAGE + sym@PAGEOFF - address(L1) < 1MB: +// L1: adr xA, sym +// L3: ldr xC, [xB, #imm] +// +// To be valid a LOH must meet all the requirements needed by all the related +// possible linker transformations. +// For instance, using the running example, the constraints to emit +// ".loh AdrpAddLdr" are: +// - L1, L2, and L3 instructions are of the expected type, i.e., +// respectively ADRP, ADD (immediate), and LD. +// - The result of L1 is used only by L2. +// - The register argument (xA) used in the ADD instruction is defined +// only by L1. +// - The result of L2 is used only by L3. +// - The base address (xB) in L3 is defined only L2. 
+// - The ADRP in L1 and the ADD in L2 must reference the same symbol using
+//   @PAGE/@PAGEOFF with no additional constants.
+//
+// Currently supported LOHs are:
+// * So-called non-ADRP-related:
+//   - .loh AdrpAddLdr L1, L2, L3:
+//     L1: adrp xA, sym@PAGE
+//     L2: add xB, xA, sym@PAGEOFF
+//     L3: ldr xC, [xB, #imm]
+//   - .loh AdrpLdrGotLdr L1, L2, L3:
+//     L1: adrp xA, sym@GOTPAGE
+//     L2: ldr xB, [xA, sym@GOTPAGEOFF]
+//     L3: ldr xC, [xB, #imm]
+//   - .loh AdrpLdr L1, L3:
+//     L1: adrp xA, sym@PAGE
+//     L3: ldr xC, [xA, sym@PAGEOFF]
+//   - .loh AdrpAddStr L1, L2, L3:
+//     L1: adrp xA, sym@PAGE
+//     L2: add xB, xA, sym@PAGEOFF
+//     L3: str xC, [xB, #imm]
+//   - .loh AdrpLdrGotStr L1, L2, L3:
+//     L1: adrp xA, sym@GOTPAGE
+//     L2: ldr xB, [xA, sym@GOTPAGEOFF]
+//     L3: str xC, [xB, #imm]
+//   - .loh AdrpAdd L1, L2:
+//     L1: adrp xA, sym@PAGE
+//     L2: add xB, xA, sym@PAGEOFF
+//   For all these LOHs, L1, L2, L3 form a simple chain:
+//   L1's result is used only by L2, and L2's result only by L3.
+//   L3's LOH-related argument is defined only by L2, and L2's LOH-related
+//   argument only by L1.
+//   All these LOHs aim at using more efficient load/store patterns by folding
+//   some instructions used to compute the address directly into the
+//   load/store.
+//
+// * So-called ADRP-related:
+//   - .loh AdrpAdrp L2, L1:
+//     L2: ADRP xA, sym1@PAGE
+//     L1: ADRP xA, sym2@PAGE
+//     L2 dominates L1 and xA is not redefined between L2 and L1.
+//   This LOH aims at getting rid of redundant ADRP instructions.
+//
+// The overall design for emitting the LOHs is:
+// 1. AArch64CollectLOH (this pass) records the LOHs in the
+//    AArch64FunctionInfo.
+// 2. AArch64AsmPrinter reads the LOHs from AArch64FunctionInfo and then:
+//    1. Associates a label with each of them.
+//    2. Emits them in a MCStreamer (EmitLOHDirective).
+//       - The MCMachOStreamer records them into the MCAssembler.
+//       - The MCAsmStreamer prints them.
+//       - Other MCStreamers ignore them.
+//    3. Closes the MCStreamer:
+//       - The MachObjectWriter gets them from the MCAssembler and writes
+//         them in the object file.
+//       - Other ObjectWriters ignore them.
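The constraints listed above reduce to a simple-chain property: each intermediate value has exactly one user and every definition dominates its user. A toy version of that check, where plain strings stand in for instructions and dominance is stubbed out:

#include <iostream>
#include <map>
#include <set>
#include <string>
#include <vector>

using Inst = std::string;

static bool dominates(const Inst &, const Inst &) { return true; } // stub

// Every link's value must have exactly one user, namely the next link,
// and each definition must dominate its user.
static bool isSimpleChain(const std::vector<Inst> &Chain,
                          const std::map<Inst, std::set<Inst>> &UsersOf) {
  for (size_t I = 0; I + 1 < Chain.size(); ++I) {
    auto It = UsersOf.find(Chain[I]);
    if (It == UsersOf.end() || It->second.size() != 1 ||
        *It->second.begin() != Chain[I + 1])
      return false; // value escapes the chain
    if (!dominates(Chain[I], Chain[I + 1]))
      return false;
  }
  return true;
}

int main() {
  std::map<Inst, std::set<Inst>> UsersOf = {{"adrp", {"add"}},
                                            {"add", {"ldr"}}};
  std::cout << isSimpleChain({"adrp", "add", "ldr"}, UsersOf) << "\n"; // 1
  return 0;
}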
+//===----------------------------------------------------------------------===// + +#include "AArch64.h" +#include "AArch64InstrInfo.h" +#include "AArch64MachineFunctionInfo.h" +#include "MCTargetDesc/AArch64AddressingModes.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/MapVector.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/ADT/Statistic.h" +using namespace llvm; + +#define DEBUG_TYPE "aarch64-collect-loh" + +static cl::opt +PreCollectRegister("aarch64-collect-loh-pre-collect-register", cl::Hidden, + cl::desc("Restrict analysis to registers invovled" + " in LOHs"), + cl::init(true)); + +static cl::opt +BasicBlockScopeOnly("aarch64-collect-loh-bb-only", cl::Hidden, + cl::desc("Restrict analysis at basic block scope"), + cl::init(true)); + +STATISTIC(NumADRPSimpleCandidate, + "Number of simplifiable ADRP dominate by another"); +STATISTIC(NumADRPComplexCandidate2, + "Number of simplifiable ADRP reachable by 2 defs"); +STATISTIC(NumADRPComplexCandidate3, + "Number of simplifiable ADRP reachable by 3 defs"); +STATISTIC(NumADRPComplexCandidateOther, + "Number of simplifiable ADRP reachable by 4 or more defs"); +STATISTIC(NumADDToSTRWithImm, + "Number of simplifiable STR with imm reachable by ADD"); +STATISTIC(NumLDRToSTRWithImm, + "Number of simplifiable STR with imm reachable by LDR"); +STATISTIC(NumADDToSTR, "Number of simplifiable STR reachable by ADD"); +STATISTIC(NumLDRToSTR, "Number of simplifiable STR reachable by LDR"); +STATISTIC(NumADDToLDRWithImm, + "Number of simplifiable LDR with imm reachable by ADD"); +STATISTIC(NumLDRToLDRWithImm, + "Number of simplifiable LDR with imm reachable by LDR"); +STATISTIC(NumADDToLDR, "Number of simplifiable LDR reachable by ADD"); +STATISTIC(NumLDRToLDR, "Number of simplifiable LDR reachable by LDR"); +STATISTIC(NumADRPToLDR, "Number of simplifiable LDR reachable by ADRP"); +STATISTIC(NumCplxLvl1, "Number of complex case of level 1"); +STATISTIC(NumTooCplxLvl1, "Number of too complex case of level 1"); +STATISTIC(NumCplxLvl2, "Number of complex case of level 2"); +STATISTIC(NumTooCplxLvl2, "Number of too complex case of level 2"); +STATISTIC(NumADRSimpleCandidate, "Number of simplifiable ADRP + ADD"); +STATISTIC(NumADRComplexCandidate, "Number of too complex ADRP + ADD"); + +namespace llvm { +void initializeAArch64CollectLOHPass(PassRegistry &); +} + +namespace { +struct AArch64CollectLOH : public MachineFunctionPass { + static char ID; + AArch64CollectLOH() : MachineFunctionPass(ID) { + initializeAArch64CollectLOHPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { + return "AArch64 Collect Linker Optimization Hint (LOH)"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); + AU.addRequired(); + } + +private: +}; + +/// A set of MachineInstruction. 
+typedef SetVector SetOfMachineInstr; +/// Map a basic block to a set of instructions per register. +/// This is used to represent the exposed uses of a basic block +/// per register. +typedef MapVector +BlockToSetOfInstrsPerColor; +/// Map a basic block to an instruction per register. +/// This is used to represent the live-out definitions of a basic block +/// per register. +typedef MapVector +BlockToInstrPerColor; +/// Map an instruction to a set of instructions. Used to represent the +/// mapping def to reachable uses or use to definitions. +typedef MapVector InstrToInstrs; +/// Map a basic block to a BitVector. +/// This is used to record the kill registers per basic block. +typedef MapVector BlockToRegSet; + +/// Map a register to a dense id. +typedef DenseMap MapRegToId; +/// Map a dense id to a register. Used for debug purposes. +typedef SmallVector MapIdToReg; +} // end anonymous namespace. + +char AArch64CollectLOH::ID = 0; + +INITIALIZE_PASS_BEGIN(AArch64CollectLOH, "aarch64-collect-loh", + "AArch64 Collect Linker Optimization Hint (LOH)", false, + false) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_END(AArch64CollectLOH, "aarch64-collect-loh", + "AArch64 Collect Linker Optimization Hint (LOH)", false, + false) + +/// Given a couple (MBB, reg) get the corresponding set of instruction from +/// the given "sets". +/// If this couple does not reference any set, an empty set is added to "sets" +/// for this couple and returned. +/// \param nbRegs is used internally allocate some memory. It must be consistent +/// with the way sets is used. +static SetOfMachineInstr &getSet(BlockToSetOfInstrsPerColor &sets, + const MachineBasicBlock &MBB, unsigned reg, + unsigned nbRegs) { + SetOfMachineInstr *result; + BlockToSetOfInstrsPerColor::iterator it = sets.find(&MBB); + if (it != sets.end()) + result = it->second; + else + result = sets[&MBB] = new SetOfMachineInstr[nbRegs]; + + return result[reg]; +} + +/// Given a couple (reg, MI) get the corresponding set of instructions from the +/// the given "sets". +/// This is used to get the uses record in sets of a definition identified by +/// MI and reg, i.e., MI defines reg. +/// If the couple does not reference anything, an empty set is added to +/// "sets[reg]". +/// \pre set[reg] is valid. +static SetOfMachineInstr &getUses(InstrToInstrs *sets, unsigned reg, + const MachineInstr &MI) { + return sets[reg][&MI]; +} + +/// Same as getUses but does not modify the input map: sets. +/// \return NULL if the couple (reg, MI) is not in sets. +static const SetOfMachineInstr *getUses(const InstrToInstrs *sets, unsigned reg, + const MachineInstr &MI) { + InstrToInstrs::const_iterator Res = sets[reg].find(&MI); + if (Res != sets[reg].end()) + return &(Res->second); + return nullptr; +} + +/// Initialize the reaching definition algorithm: +/// For each basic block BB in MF, record: +/// - its kill set. +/// - its reachable uses (uses that are exposed to BB's predecessors). +/// - its the generated definitions. +/// \param DummyOp if not NULL, specifies a Dummy Operation to be added to +/// the list of uses of exposed defintions. +/// \param ADRPMode specifies to only consider ADRP instructions for generated +/// definition. It also consider definitions of ADRP instructions as uses and +/// ignore other uses. The ADRPMode is used to collect the information for LHO +/// that involve ADRP operation only. 
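initReachingDef(), defined next, seeds the Gen, Kill and ReachableUses sets; reachingDefAlgorithm() later iterates the usual forward dataflow equations to a fixed point. A self-contained sketch of those equations on plain bit-sets (the per-register, per-block structure of the real pass is collapsed into one bitset per block here):

#include <bitset>
#include <iostream>
#include <vector>

// Classic forward dataflow: In = union of predecessors' Out,
// Out = Gen | (In & ~Kill), iterated until nothing changes.
struct Block {
  std::vector<int> Preds;
  std::bitset<8> Gen, Kill, In, Out;
};

static void solve(std::vector<Block> &Blocks) {
  bool Changed = true;
  while (Changed) {
    Changed = false;
    for (Block &B : Blocks) {
      std::bitset<8> In;
      for (int P : B.Preds)
        In |= Blocks[P].Out;
      std::bitset<8> Out = B.Gen | (In & ~B.Kill);
      if (In != B.In || Out != B.Out) {
        B.In = In;
        B.Out = Out;
        Changed = true;
      }
    }
  }
}

int main() {
  // Block 0 defines value 0; block 1 kills it and defines value 1.
  std::vector<Block> Blocks(2);
  Blocks[0].Gen.set(0);
  Blocks[1].Preds = {0};
  Blocks[1].Kill.set(0);
  Blocks[1].Gen.set(1);
  solve(Blocks);
  std::cout << Blocks[1].In << " -> " << Blocks[1].Out << "\n"; // 00000001 -> 00000010
  return 0;
}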
+static void initReachingDef(MachineFunction &MF, + InstrToInstrs *ColorOpToReachedUses, + BlockToInstrPerColor &Gen, BlockToRegSet &Kill, + BlockToSetOfInstrsPerColor &ReachableUses, + const MapRegToId &RegToId, + const MachineInstr *DummyOp, bool ADRPMode) { + const TargetMachine &TM = MF.getTarget(); + const TargetRegisterInfo *TRI = TM.getRegisterInfo(); + + unsigned NbReg = RegToId.size(); + + for (MachineBasicBlock &MBB : MF) { + const MachineInstr **&BBGen = Gen[&MBB]; + BBGen = new const MachineInstr *[NbReg]; + memset(BBGen, 0, sizeof(const MachineInstr *) * NbReg); + + BitVector &BBKillSet = Kill[&MBB]; + BBKillSet.resize(NbReg); + for (const MachineInstr &MI : MBB) { + bool IsADRP = MI.getOpcode() == AArch64::ADRP; + + // Process uses first. + if (IsADRP || !ADRPMode) + for (const MachineOperand &MO : MI.operands()) { + // Treat ADRP def as use, as the goal of the analysis is to find + // ADRP defs reached by other ADRP defs. + if (!MO.isReg() || (!ADRPMode && !MO.isUse()) || + (ADRPMode && (!IsADRP || !MO.isDef()))) + continue; + unsigned CurReg = MO.getReg(); + MapRegToId::const_iterator ItCurRegId = RegToId.find(CurReg); + if (ItCurRegId == RegToId.end()) + continue; + CurReg = ItCurRegId->second; + + // if CurReg has not been defined, this use is reachable. + if (!BBGen[CurReg] && !BBKillSet.test(CurReg)) + getSet(ReachableUses, MBB, CurReg, NbReg).insert(&MI); + // current basic block definition for this color, if any, is in Gen. + if (BBGen[CurReg]) + getUses(ColorOpToReachedUses, CurReg, *BBGen[CurReg]).insert(&MI); + } + + // Process clobbers. + for (const MachineOperand &MO : MI.operands()) { + if (!MO.isRegMask()) + continue; + // Clobbers kill the related colors. + const uint32_t *PreservedRegs = MO.getRegMask(); + + // Set generated regs. + for (const auto Entry : RegToId) { + unsigned Reg = Entry.second; + // Use the global register ID when querying APIs external to this + // pass. + if (MachineOperand::clobbersPhysReg(PreservedRegs, Entry.first)) { + // Do not register clobbered definition for no ADRP. + // This definition is not used anyway (otherwise register + // allocation is wrong). + BBGen[Reg] = ADRPMode ? &MI : nullptr; + BBKillSet.set(Reg); + } + } + } + + // Process register defs. + for (const MachineOperand &MO : MI.operands()) { + if (!MO.isReg() || !MO.isDef()) + continue; + unsigned CurReg = MO.getReg(); + MapRegToId::const_iterator ItCurRegId = RegToId.find(CurReg); + if (ItCurRegId == RegToId.end()) + continue; + + for (MCRegAliasIterator AI(CurReg, TRI, true); AI.isValid(); ++AI) { + MapRegToId::const_iterator ItRegId = RegToId.find(*AI); + assert(ItRegId != RegToId.end() && + "Sub-register of an " + "involved register, not recorded as involved!"); + BBKillSet.set(ItRegId->second); + BBGen[ItRegId->second] = &MI; + } + BBGen[ItCurRegId->second] = &MI; + } + } + + // If we restrict our analysis to basic block scope, conservatively add a + // dummy + // use for each generated value. 
+ if (!ADRPMode && DummyOp && !MBB.succ_empty()) + for (unsigned CurReg = 0; CurReg < NbReg; ++CurReg) + if (BBGen[CurReg]) + getUses(ColorOpToReachedUses, CurReg, *BBGen[CurReg]).insert(DummyOp); + } +} + +/// Reaching def core algorithm: +/// while an Out has changed +/// for each bb +/// for each color +/// In[bb][color] = U Out[bb.predecessors][color] +/// insert reachableUses[bb][color] in each in[bb][color] +/// op.reachedUses +/// +/// Out[bb] = Gen[bb] U (In[bb] - Kill[bb]) +static void reachingDefAlgorithm(MachineFunction &MF, + InstrToInstrs *ColorOpToReachedUses, + BlockToSetOfInstrsPerColor &In, + BlockToSetOfInstrsPerColor &Out, + BlockToInstrPerColor &Gen, BlockToRegSet &Kill, + BlockToSetOfInstrsPerColor &ReachableUses, + unsigned NbReg) { + bool HasChanged; + do { + HasChanged = false; + for (MachineBasicBlock &MBB : MF) { + unsigned CurReg; + for (CurReg = 0; CurReg < NbReg; ++CurReg) { + SetOfMachineInstr &BBInSet = getSet(In, MBB, CurReg, NbReg); + SetOfMachineInstr &BBReachableUses = + getSet(ReachableUses, MBB, CurReg, NbReg); + SetOfMachineInstr &BBOutSet = getSet(Out, MBB, CurReg, NbReg); + unsigned Size = BBOutSet.size(); + // In[bb][color] = U Out[bb.predecessors][color] + for (MachineBasicBlock *PredMBB : MBB.predecessors()) { + SetOfMachineInstr &PredOutSet = getSet(Out, *PredMBB, CurReg, NbReg); + BBInSet.insert(PredOutSet.begin(), PredOutSet.end()); + } + // insert reachableUses[bb][color] in each in[bb][color] op.reachedses + for (const MachineInstr *MI : BBInSet) { + SetOfMachineInstr &OpReachedUses = + getUses(ColorOpToReachedUses, CurReg, *MI); + OpReachedUses.insert(BBReachableUses.begin(), BBReachableUses.end()); + } + // Out[bb] = Gen[bb] U (In[bb] - Kill[bb]) + if (!Kill[&MBB].test(CurReg)) + BBOutSet.insert(BBInSet.begin(), BBInSet.end()); + if (Gen[&MBB][CurReg]) + BBOutSet.insert(Gen[&MBB][CurReg]); + HasChanged |= BBOutSet.size() != Size; + } + } + } while (HasChanged); +} + +/// Release all memory dynamically allocated during the reaching +/// definition algorithm. +static void finitReachingDef(BlockToSetOfInstrsPerColor &In, + BlockToSetOfInstrsPerColor &Out, + BlockToInstrPerColor &Gen, + BlockToSetOfInstrsPerColor &ReachableUses) { + for (auto &IT : Out) + delete[] IT.second; + for (auto &IT : In) + delete[] IT.second; + for (auto &IT : ReachableUses) + delete[] IT.second; + for (auto &IT : Gen) + delete[] IT.second; +} + +/// Reaching definition algorithm. +/// \param MF function on which the algorithm will operate. +/// \param[out] ColorOpToReachedUses will contain the result of the reaching +/// def algorithm. +/// \param ADRPMode specify whether the reaching def algorithm should be tuned +/// for ADRP optimization. \see initReachingDef for more details. +/// \param DummyOp if not NULL, the algorithm will work at +/// basic block scope and will set for every exposed definition a use to +/// @p DummyOp. +/// \pre ColorOpToReachedUses is an array of at least number of registers of +/// InstrToInstrs. +static void reachingDef(MachineFunction &MF, + InstrToInstrs *ColorOpToReachedUses, + const MapRegToId &RegToId, bool ADRPMode = false, + const MachineInstr *DummyOp = nullptr) { + // structures: + // For each basic block. + // Out: a set per color of definitions that reach the + // out boundary of this block. + // In: Same as Out but for in boundary. + // Gen: generated color in this block (one operation per color). + // Kill: register set of killed color in this block. 
+ // ReachableUses: a set per color of uses (operation) reachable + // for "In" definitions. + BlockToSetOfInstrsPerColor Out, In, ReachableUses; + BlockToInstrPerColor Gen; + BlockToRegSet Kill; + + // Initialize Gen, kill and reachableUses. + initReachingDef(MF, ColorOpToReachedUses, Gen, Kill, ReachableUses, RegToId, + DummyOp, ADRPMode); + + // Algo. + if (!DummyOp) + reachingDefAlgorithm(MF, ColorOpToReachedUses, In, Out, Gen, Kill, + ReachableUses, RegToId.size()); + + // finit. + finitReachingDef(In, Out, Gen, ReachableUses); +} + +#ifndef NDEBUG +/// print the result of the reaching definition algorithm. +static void printReachingDef(const InstrToInstrs *ColorOpToReachedUses, + unsigned NbReg, const TargetRegisterInfo *TRI, + const MapIdToReg &IdToReg) { + unsigned CurReg; + for (CurReg = 0; CurReg < NbReg; ++CurReg) { + if (ColorOpToReachedUses[CurReg].empty()) + continue; + DEBUG(dbgs() << "*** Reg " << PrintReg(IdToReg[CurReg], TRI) << " ***\n"); + + for (const auto &DefsIt : ColorOpToReachedUses[CurReg]) { + DEBUG(dbgs() << "Def:\n"); + DEBUG(DefsIt.first->print(dbgs())); + DEBUG(dbgs() << "Reachable uses:\n"); + for (const MachineInstr *MI : DefsIt.second) { + DEBUG(MI->print(dbgs())); + } + } + } +} +#endif // NDEBUG + +/// Answer the following question: Can Def be one of the definition +/// involved in a part of a LOH? +static bool canDefBePartOfLOH(const MachineInstr *Def) { + unsigned Opc = Def->getOpcode(); + // Accept ADRP, ADDLow and LOADGot. + switch (Opc) { + default: + return false; + case AArch64::ADRP: + return true; + case AArch64::ADDXri: + // Check immediate to see if the immediate is an address. + switch (Def->getOperand(2).getType()) { + default: + return false; + case MachineOperand::MO_GlobalAddress: + case MachineOperand::MO_JumpTableIndex: + case MachineOperand::MO_ConstantPoolIndex: + case MachineOperand::MO_BlockAddress: + return true; + } + case AArch64::LDRXui: + // Check immediate to see if the immediate is an address. + switch (Def->getOperand(2).getType()) { + default: + return false; + case MachineOperand::MO_GlobalAddress: + return true; + } + } + // Unreachable. + return false; +} + +/// Check whether the given instruction can the end of a LOH chain involving a +/// store. +static bool isCandidateStore(const MachineInstr *Instr) { + switch (Instr->getOpcode()) { + default: + return false; + case AArch64::STRBui: + case AArch64::STRHui: + case AArch64::STRWui: + case AArch64::STRXui: + case AArch64::STRSui: + case AArch64::STRDui: + case AArch64::STRQui: + // In case we have str xA, [xA, #imm], this is two different uses + // of xA and we cannot fold, otherwise the xA stored may be wrong, + // even if #imm == 0. + if (Instr->getOperand(0).getReg() != Instr->getOperand(1).getReg()) + return true; + } + return false; +} + +/// Given the result of a reaching definition algorithm in ColorOpToReachedUses, +/// Build the Use to Defs information and filter out obvious non-LOH candidates. +/// In ADRPMode, non-LOH candidates are "uses" with non-ADRP definitions. +/// In non-ADRPMode, non-LOH candidates are "uses" with several definition, +/// i.e., no simple chain. +/// \param ADRPMode -- \see initReachingDef. 
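As its comment says, the function defined next inverts the def-to-reached-uses results into a use-to-defs map and discards any use reached by more than one definition. The same inversion on a toy map, with strings in place of MachineInstr pointers:

#include <iostream>
#include <map>
#include <set>
#include <string>

using DefToUses = std::map<std::string, std::set<std::string>>;
using UseToDefs = std::map<std::string, std::set<std::string>>;

// Invert def -> reached-uses into use -> defs, then drop any use reached by
// more than one definition, since it cannot be part of a simple LOH chain.
static UseToDefs invertAndFilter(const DefToUses &Defs) {
  UseToDefs Uses;
  for (const auto &Entry : Defs)
    for (const std::string &U : Entry.second)
      Uses[U].insert(Entry.first);
  for (auto &Entry : Uses)
    if (Entry.second.size() > 1)
      Entry.second.clear(); // like the pass: keep the key, drop the defs
  return Uses;
}

int main() {
  DefToUses Defs = {{"adrp1", {"ldr"}}, {"adrp2", {"ldr"}}, {"add", {"str"}}};
  UseToDefs Uses = invertAndFilter(Defs);
  std::cout << "ldr has " << Uses["ldr"].size() << " candidate def(s)\n"; // 0
  std::cout << "str has " << Uses["str"].size() << " candidate def(s)\n"; // 1
  return 0;
}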
+static void reachedUsesToDefs(InstrToInstrs &UseToReachingDefs, + const InstrToInstrs *ColorOpToReachedUses, + const MapRegToId &RegToId, + bool ADRPMode = false) { + + SetOfMachineInstr NotCandidate; + unsigned NbReg = RegToId.size(); + MapRegToId::const_iterator EndIt = RegToId.end(); + for (unsigned CurReg = 0; CurReg < NbReg; ++CurReg) { + // If this color is never defined, continue. + if (ColorOpToReachedUses[CurReg].empty()) + continue; + + for (const auto &DefsIt : ColorOpToReachedUses[CurReg]) { + for (const MachineInstr *MI : DefsIt.second) { + const MachineInstr *Def = DefsIt.first; + MapRegToId::const_iterator It; + // if all the reaching defs are not adrp, this use will not be + // simplifiable. + if ((ADRPMode && Def->getOpcode() != AArch64::ADRP) || + (!ADRPMode && !canDefBePartOfLOH(Def)) || + (!ADRPMode && isCandidateStore(MI) && + // store are LOH candidate iff the end of the chain is used as + // base. + ((It = RegToId.find((MI)->getOperand(1).getReg())) == EndIt || + It->second != CurReg))) { + NotCandidate.insert(MI); + continue; + } + // Do not consider self reaching as a simplifiable case for ADRP. + if (!ADRPMode || MI != DefsIt.first) { + UseToReachingDefs[MI].insert(DefsIt.first); + // If UsesIt has several reaching definitions, it is not + // candidate for simplificaton in non-ADRPMode. + if (!ADRPMode && UseToReachingDefs[MI].size() > 1) + NotCandidate.insert(MI); + } + } + } + } + for (const MachineInstr *Elem : NotCandidate) { + DEBUG(dbgs() << "Too many reaching defs: " << *Elem << "\n"); + // It would have been better if we could just remove the entry + // from the map. Because of that, we have to filter the garbage + // (second.empty) in the subsequence analysis. + UseToReachingDefs[Elem].clear(); + } +} + +/// Based on the use to defs information (in ADRPMode), compute the +/// opportunities of LOH ADRP-related. +static void computeADRP(const InstrToInstrs &UseToDefs, + AArch64FunctionInfo &AArch64FI, + const MachineDominatorTree *MDT) { + DEBUG(dbgs() << "*** Compute LOH for ADRP\n"); + for (const auto &Entry : UseToDefs) { + unsigned Size = Entry.second.size(); + if (Size == 0) + continue; + if (Size == 1) { + const MachineInstr *L2 = *Entry.second.begin(); + const MachineInstr *L1 = Entry.first; + if (!MDT->dominates(L2, L1)) { + DEBUG(dbgs() << "Dominance check failed:\n" << *L2 << '\n' << *L1 + << '\n'); + continue; + } + DEBUG(dbgs() << "Record AdrpAdrp:\n" << *L2 << '\n' << *L1 << '\n'); + SmallVector Args; + Args.push_back(L2); + Args.push_back(L1); + AArch64FI.addLOHDirective(MCLOH_AdrpAdrp, Args); + ++NumADRPSimpleCandidate; + } +#ifdef DEBUG + else if (Size == 2) + ++NumADRPComplexCandidate2; + else if (Size == 3) + ++NumADRPComplexCandidate3; + else + ++NumADRPComplexCandidateOther; +#endif + // if Size < 1, the use should have been removed from the candidates + assert(Size >= 1 && "No reaching defs for that use!"); + } +} + +/// Check whether the given instruction can be the end of a LOH chain +/// involving a load. +static bool isCandidateLoad(const MachineInstr *Instr) { + switch (Instr->getOpcode()) { + default: + return false; + case AArch64::LDRSBWui: + case AArch64::LDRSBXui: + case AArch64::LDRSHWui: + case AArch64::LDRSHXui: + case AArch64::LDRSWui: + case AArch64::LDRBui: + case AArch64::LDRHui: + case AArch64::LDRWui: + case AArch64::LDRXui: + case AArch64::LDRSui: + case AArch64::LDRDui: + case AArch64::LDRQui: + if (Instr->getOperand(2).getTargetFlags() & AArch64II::MO_GOT) + return false; + return true; + } + // Unreachable. 
+ return false; +} + +/// Check whether the given instruction can load a litteral. +static bool supportLoadFromLiteral(const MachineInstr *Instr) { + switch (Instr->getOpcode()) { + default: + return false; + case AArch64::LDRSWui: + case AArch64::LDRWui: + case AArch64::LDRXui: + case AArch64::LDRSui: + case AArch64::LDRDui: + case AArch64::LDRQui: + return true; + } + // Unreachable. + return false; +} + +/// Check whether the given instruction is a LOH candidate. +/// \param UseToDefs is used to check that Instr is at the end of LOH supported +/// chain. +/// \pre UseToDefs contains only on def per use, i.e., obvious non candidate are +/// already been filtered out. +static bool isCandidate(const MachineInstr *Instr, + const InstrToInstrs &UseToDefs, + const MachineDominatorTree *MDT) { + if (!isCandidateLoad(Instr) && !isCandidateStore(Instr)) + return false; + + const MachineInstr *Def = *UseToDefs.find(Instr)->second.begin(); + if (Def->getOpcode() != AArch64::ADRP) { + // At this point, Def is ADDXri or LDRXui of the right type of + // symbol, because we filtered out the uses that were not defined + // by these kind of instructions (+ ADRP). + + // Check if this forms a simple chain: each intermediate node must + // dominates the next one. + if (!MDT->dominates(Def, Instr)) + return false; + // Move one node up in the simple chain. + if (UseToDefs.find(Def) == + UseToDefs.end() + // The map may contain garbage we have to ignore. + || + UseToDefs.find(Def)->second.empty()) + return false; + Instr = Def; + Def = *UseToDefs.find(Def)->second.begin(); + } + // Check if we reached the top of the simple chain: + // - top is ADRP. + // - check the simple chain property: each intermediate node must + // dominates the next one. + if (Def->getOpcode() == AArch64::ADRP) + return MDT->dominates(Def, Instr); + return false; +} + +static bool registerADRCandidate(const MachineInstr &Use, + const InstrToInstrs &UseToDefs, + const InstrToInstrs *DefsPerColorToUses, + AArch64FunctionInfo &AArch64FI, + SetOfMachineInstr *InvolvedInLOHs, + const MapRegToId &RegToId) { + // Look for opportunities to turn ADRP -> ADD or + // ADRP -> LDR GOTPAGEOFF into ADR. + // If ADRP has more than one use. Give up. + if (Use.getOpcode() != AArch64::ADDXri && + (Use.getOpcode() != AArch64::LDRXui || + !(Use.getOperand(2).getTargetFlags() & AArch64II::MO_GOT))) + return false; + InstrToInstrs::const_iterator It = UseToDefs.find(&Use); + // The map may contain garbage that we need to ignore. + if (It == UseToDefs.end() || It->second.empty()) + return false; + const MachineInstr &Def = **It->second.begin(); + if (Def.getOpcode() != AArch64::ADRP) + return false; + // Check the number of users of ADRP. + const SetOfMachineInstr *Users = + getUses(DefsPerColorToUses, + RegToId.find(Def.getOperand(0).getReg())->second, Def); + if (Users->size() > 1) { + ++NumADRComplexCandidate; + return false; + } + ++NumADRSimpleCandidate; + assert((!InvolvedInLOHs || InvolvedInLOHs->insert(&Def)) && + "ADRP already involved in LOH."); + assert((!InvolvedInLOHs || InvolvedInLOHs->insert(&Use)) && + "ADD already involved in LOH."); + DEBUG(dbgs() << "Record AdrpAdd\n" << Def << '\n' << Use << '\n'); + + SmallVector Args; + Args.push_back(&Def); + Args.push_back(&Use); + + AArch64FI.addLOHDirective(Use.getOpcode() == AArch64::ADDXri ? 
MCLOH_AdrpAdd + : MCLOH_AdrpLdrGot, + Args); + return true; +} + +/// Based on the use to defs information (in non-ADRPMode), compute the +/// opportunities of LOH non-ADRP-related +static void computeOthers(const InstrToInstrs &UseToDefs, + const InstrToInstrs *DefsPerColorToUses, + AArch64FunctionInfo &AArch64FI, const MapRegToId &RegToId, + const MachineDominatorTree *MDT) { + SetOfMachineInstr *InvolvedInLOHs = nullptr; +#ifdef DEBUG + SetOfMachineInstr InvolvedInLOHsStorage; + InvolvedInLOHs = &InvolvedInLOHsStorage; +#endif // DEBUG + DEBUG(dbgs() << "*** Compute LOH for Others\n"); + // ADRP -> ADD/LDR -> LDR/STR pattern. + // Fall back to ADRP -> ADD pattern if we fail to catch the bigger pattern. + + // FIXME: When the statistics are not important, + // This initial filtering loop can be merged into the next loop. + // Currently, we didn't do it to have the same code for both DEBUG and + // NDEBUG builds. Indeed, the iterator of the second loop would need + // to be changed. + SetOfMachineInstr PotentialCandidates; + SetOfMachineInstr PotentialADROpportunities; + for (auto &Use : UseToDefs) { + // If no definition is available, this is a non candidate. + if (Use.second.empty()) + continue; + // Keep only instructions that are load or store and at the end of + // a ADRP -> ADD/LDR/Nothing chain. + // We already filtered out the no-chain cases. + if (!isCandidate(Use.first, UseToDefs, MDT)) { + PotentialADROpportunities.insert(Use.first); + continue; + } + PotentialCandidates.insert(Use.first); + } + + // Make the following distinctions for statistics as the linker does + // know how to decode instructions: + // - ADD/LDR/Nothing make there different patterns. + // - LDR/STR make two different patterns. + // Hence, 6 - 1 base patterns. + // (because ADRP-> Nothing -> STR is not simplifiable) + + // The linker is only able to have a simple semantic, i.e., if pattern A + // do B. + // However, we want to see the opportunity we may miss if we were able to + // catch more complex cases. + + // PotentialCandidates are result of a chain ADRP -> ADD/LDR -> + // A potential candidate becomes a candidate, if its current immediate + // operand is zero and all nodes of the chain have respectively only one user +#ifdef DEBUG + SetOfMachineInstr DefsOfPotentialCandidates; +#endif + for (const MachineInstr *Candidate : PotentialCandidates) { + // Get the definition of the candidate i.e., ADD or LDR. + const MachineInstr *Def = *UseToDefs.find(Candidate)->second.begin(); + // Record the elements of the chain. + const MachineInstr *L1 = Def; + const MachineInstr *L2 = nullptr; + unsigned ImmediateDefOpc = Def->getOpcode(); + if (Def->getOpcode() != AArch64::ADRP) { + // Check the number of users of this node. + const SetOfMachineInstr *Users = + getUses(DefsPerColorToUses, + RegToId.find(Def->getOperand(0).getReg())->second, *Def); + if (Users->size() > 1) { +#ifdef DEBUG + // if all the uses of this def are in potential candidate, this is + // a complex candidate of level 2. + bool IsLevel2 = true; + for (const MachineInstr *MI : *Users) { + if (!PotentialCandidates.count(MI)) { + ++NumTooCplxLvl2; + IsLevel2 = false; + break; + } + } + if (IsLevel2) + ++NumCplxLvl2; +#endif // DEBUG + PotentialADROpportunities.insert(Def); + continue; + } + L2 = Def; + Def = *UseToDefs.find(Def)->second.begin(); + L1 = Def; + } // else the element in the middle of the chain is nothing, thus + // Def already contains the first element of the chain. 
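For orientation, the directive-kind selection that follows boils down to the shape of the chain: what sits between the ADRP and the final memory access, and whether that access is a load or a store. A small standalone sketch of that mapping (the Mid/Access enums and pickLOH are invented for illustration; the real code uses MCLOHType values and also enforces the literal-load and GOT-flag restrictions shown below):

    // Illustrative sketch (not part of this patch): chain shape -> LOH kind.
    #include <cassert>
    #include <cstdio>

    enum class Mid { None, Add, LdrGot };  // what sits between ADRP and the access
    enum class Access { Load, Store };

    const char *pickLOH(Mid M, Access A) {
      if (A == Access::Load) {
        if (M == Mid::None) return "AdrpLdr";     // ADRP -> LDR
        if (M == Mid::Add)  return "AdrpAddLdr";  // ADRP -> ADD -> LDR
        return "AdrpLdrGotLdr";                   // ADRP -> LDR(GOT) -> LDR
      }
      // A bare ADRP -> STR chain is not simplifiable, so stores need a middle link.
      assert(M != Mid::None && "no LOH for ADRP -> STR");
      return M == Mid::Add ? "AdrpAddStr" : "AdrpLdrGotStr";
    }

    int main() {
      std::printf("%s\n", pickLOH(Mid::Add, Access::Load));     // AdrpAddLdr
      std::printf("%s\n", pickLOH(Mid::LdrGot, Access::Store)); // AdrpLdrGotStr
    }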
+ + // Check the number of users of the first node in the chain, i.e., ADRP + const SetOfMachineInstr *Users = + getUses(DefsPerColorToUses, + RegToId.find(Def->getOperand(0).getReg())->second, *Def); + if (Users->size() > 1) { +#ifdef DEBUG + // if all the uses of this def are in the defs of the potential candidate, + // this is a complex candidate of level 1 + if (DefsOfPotentialCandidates.empty()) { + // lazy init + DefsOfPotentialCandidates = PotentialCandidates; + for (const MachineInstr *Candidate : PotentialCandidates) { + if (!UseToDefs.find(Candidate)->second.empty()) + DefsOfPotentialCandidates.insert( + *UseToDefs.find(Candidate)->second.begin()); + } + } + bool Found = false; + for (auto &Use : *Users) { + if (!DefsOfPotentialCandidates.count(Use)) { + ++NumTooCplxLvl1; + Found = true; + break; + } + } + if (!Found) + ++NumCplxLvl1; +#endif // DEBUG + continue; + } + + bool IsL2Add = (ImmediateDefOpc == AArch64::ADDXri); + // If the chain is three instructions long and ldr is the second element, + // then this ldr must load form GOT, otherwise this is not a correct chain. + if (L2 && !IsL2Add && L2->getOperand(2).getTargetFlags() != AArch64II::MO_GOT) + continue; + SmallVector Args; + MCLOHType Kind; + if (isCandidateLoad(Candidate)) { + if (!L2) { + // At this point, the candidate LOH indicates that the ldr instruction + // may use a direct access to the symbol. There is not such encoding + // for loads of byte and half. + if (!supportLoadFromLiteral(Candidate)) + continue; + + DEBUG(dbgs() << "Record AdrpLdr:\n" << *L1 << '\n' << *Candidate + << '\n'); + Kind = MCLOH_AdrpLdr; + Args.push_back(L1); + Args.push_back(Candidate); + assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L1)) && + "L1 already involved in LOH."); + assert((!InvolvedInLOHs || InvolvedInLOHs->insert(Candidate)) && + "Candidate already involved in LOH."); + ++NumADRPToLDR; + } else { + DEBUG(dbgs() << "Record Adrp" << (IsL2Add ? "Add" : "LdrGot") + << "Ldr:\n" << *L1 << '\n' << *L2 << '\n' << *Candidate + << '\n'); + + Kind = IsL2Add ? MCLOH_AdrpAddLdr : MCLOH_AdrpLdrGotLdr; + Args.push_back(L1); + Args.push_back(L2); + Args.push_back(Candidate); + + PotentialADROpportunities.remove(L2); + assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L1)) && + "L1 already involved in LOH."); + assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L2)) && + "L2 already involved in LOH."); + assert((!InvolvedInLOHs || InvolvedInLOHs->insert(Candidate)) && + "Candidate already involved in LOH."); +#ifdef DEBUG + // get the immediate of the load + if (Candidate->getOperand(2).getImm() == 0) + if (ImmediateDefOpc == AArch64::ADDXri) + ++NumADDToLDR; + else + ++NumLDRToLDR; + else if (ImmediateDefOpc == AArch64::ADDXri) + ++NumADDToLDRWithImm; + else + ++NumLDRToLDRWithImm; +#endif // DEBUG + } + } else { + if (ImmediateDefOpc == AArch64::ADRP) + continue; + else { + + DEBUG(dbgs() << "Record Adrp" << (IsL2Add ? "Add" : "LdrGot") + << "Str:\n" << *L1 << '\n' << *L2 << '\n' << *Candidate + << '\n'); + + Kind = IsL2Add ? 
MCLOH_AdrpAddStr : MCLOH_AdrpLdrGotStr; + Args.push_back(L1); + Args.push_back(L2); + Args.push_back(Candidate); + + PotentialADROpportunities.remove(L2); + assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L1)) && + "L1 already involved in LOH."); + assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L2)) && + "L2 already involved in LOH."); + assert((!InvolvedInLOHs || InvolvedInLOHs->insert(Candidate)) && + "Candidate already involved in LOH."); +#ifdef DEBUG + // get the immediate of the store + if (Candidate->getOperand(2).getImm() == 0) + if (ImmediateDefOpc == AArch64::ADDXri) + ++NumADDToSTR; + else + ++NumLDRToSTR; + else if (ImmediateDefOpc == AArch64::ADDXri) + ++NumADDToSTRWithImm; + else + ++NumLDRToSTRWithImm; +#endif // DEBUG + } + } + AArch64FI.addLOHDirective(Kind, Args); + } + + // Now, we grabbed all the big patterns, check ADR opportunities. + for (const MachineInstr *Candidate : PotentialADROpportunities) + registerADRCandidate(*Candidate, UseToDefs, DefsPerColorToUses, AArch64FI, + InvolvedInLOHs, RegToId); +} + +/// Look for every register defined by potential LOHs candidates. +/// Map these registers with dense id in @p RegToId and vice-versa in +/// @p IdToReg. @p IdToReg is populated only in DEBUG mode. +static void collectInvolvedReg(MachineFunction &MF, MapRegToId &RegToId, + MapIdToReg &IdToReg, + const TargetRegisterInfo *TRI) { + unsigned CurRegId = 0; + if (!PreCollectRegister) { + unsigned NbReg = TRI->getNumRegs(); + for (; CurRegId < NbReg; ++CurRegId) { + RegToId[CurRegId] = CurRegId; + DEBUG(IdToReg.push_back(CurRegId)); + DEBUG(assert(IdToReg[CurRegId] == CurRegId && "Reg index mismatches")); + } + return; + } + + DEBUG(dbgs() << "** Collect Involved Register\n"); + for (const auto &MBB : MF) { + for (const MachineInstr &MI : MBB) { + if (!canDefBePartOfLOH(&MI)) + continue; + + // Process defs + for (MachineInstr::const_mop_iterator IO = MI.operands_begin(), + IOEnd = MI.operands_end(); + IO != IOEnd; ++IO) { + if (!IO->isReg() || !IO->isDef()) + continue; + unsigned CurReg = IO->getReg(); + for (MCRegAliasIterator AI(CurReg, TRI, true); AI.isValid(); ++AI) + if (RegToId.find(*AI) == RegToId.end()) { + DEBUG(IdToReg.push_back(*AI); + assert(IdToReg[CurRegId] == *AI && + "Reg index mismatches insertion index.")); + RegToId[*AI] = CurRegId++; + DEBUG(dbgs() << "Register: " << PrintReg(*AI, TRI) << '\n'); + } + } + } + } +} + +bool AArch64CollectLOH::runOnMachineFunction(MachineFunction &MF) { + const TargetMachine &TM = MF.getTarget(); + const TargetRegisterInfo *TRI = TM.getRegisterInfo(); + const MachineDominatorTree *MDT = &getAnalysis(); + + MapRegToId RegToId; + MapIdToReg IdToReg; + AArch64FunctionInfo *AArch64FI = MF.getInfo(); + assert(AArch64FI && "No MachineFunctionInfo for this function!"); + + DEBUG(dbgs() << "Looking for LOH in " << MF.getName() << '\n'); + + collectInvolvedReg(MF, RegToId, IdToReg, TRI); + if (RegToId.empty()) + return false; + + MachineInstr *DummyOp = nullptr; + if (BasicBlockScopeOnly) { + const AArch64InstrInfo *TII = + static_cast(TM.getInstrInfo()); + // For local analysis, create a dummy operation to record uses that are not + // local. + DummyOp = MF.CreateMachineInstr(TII->get(AArch64::COPY), DebugLoc()); + } + + unsigned NbReg = RegToId.size(); + bool Modified = false; + + // Start with ADRP. + InstrToInstrs *ColorOpToReachedUses = new InstrToInstrs[NbReg]; + + // Compute the reaching def in ADRP mode, meaning ADRP definitions + // are first considered as uses. 
+ reachingDef(MF, ColorOpToReachedUses, RegToId, true, DummyOp); + DEBUG(dbgs() << "ADRP reaching defs\n"); + DEBUG(printReachingDef(ColorOpToReachedUses, NbReg, TRI, IdToReg)); + + // Translate the definition to uses map into a use to definitions map to ease + // statistic computation. + InstrToInstrs ADRPToReachingDefs; + reachedUsesToDefs(ADRPToReachingDefs, ColorOpToReachedUses, RegToId, true); + + // Compute LOH for ADRP. + computeADRP(ADRPToReachingDefs, *AArch64FI, MDT); + delete[] ColorOpToReachedUses; + + // Continue with general ADRP -> ADD/LDR -> LDR/STR pattern. + ColorOpToReachedUses = new InstrToInstrs[NbReg]; + + // first perform a regular reaching def analysis. + reachingDef(MF, ColorOpToReachedUses, RegToId, false, DummyOp); + DEBUG(dbgs() << "All reaching defs\n"); + DEBUG(printReachingDef(ColorOpToReachedUses, NbReg, TRI, IdToReg)); + + // Turn that into a use to defs to ease statistic computation. + InstrToInstrs UsesToReachingDefs; + reachedUsesToDefs(UsesToReachingDefs, ColorOpToReachedUses, RegToId, false); + + // Compute other than AdrpAdrp LOH. + computeOthers(UsesToReachingDefs, ColorOpToReachedUses, *AArch64FI, RegToId, + MDT); + delete[] ColorOpToReachedUses; + + if (BasicBlockScopeOnly) + MF.DeleteMachineInstr(DummyOp); + + return Modified; +} + +/// createAArch64CollectLOHPass - returns an instance of the Statistic for +/// linker optimization pass. +FunctionPass *llvm::createAArch64CollectLOHPass() { + return new AArch64CollectLOH(); +} diff --git a/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/lib/Target/AArch64/AArch64ConditionalCompares.cpp new file mode 100644 index 00000000000..452cdecf8a0 --- /dev/null +++ b/lib/Target/AArch64/AArch64ConditionalCompares.cpp @@ -0,0 +1,919 @@ +//===-- AArch64ConditionalCompares.cpp --- CCMP formation for AArch64 -----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the AArch64ConditionalCompares pass which reduces +// branching and code size by using the conditional compare instructions CCMP, +// CCMN, and FCMP. +// +// The CFG transformations for forming conditional compares are very similar to +// if-conversion, and this pass should run immediately before the early +// if-conversion pass. +// +//===----------------------------------------------------------------------===// + +#include "AArch64.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SparseSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineBranchProbabilityInfo.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineTraceMetrics.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-ccmp" + +// Absolute maximum number of instructions allowed per speculated block. 
+// This bypasses all other heuristics, so it should be set fairly high. +static cl::opt BlockInstrLimit( + "aarch64-ccmp-limit", cl::init(30), cl::Hidden, + cl::desc("Maximum number of instructions per speculated block.")); + +// Stress testing mode - disable heuristics. +static cl::opt Stress("aarch64-stress-ccmp", cl::Hidden, + cl::desc("Turn all knobs to 11")); + +STATISTIC(NumConsidered, "Number of ccmps considered"); +STATISTIC(NumPhiRejs, "Number of ccmps rejected (PHI)"); +STATISTIC(NumPhysRejs, "Number of ccmps rejected (Physregs)"); +STATISTIC(NumPhi2Rejs, "Number of ccmps rejected (PHI2)"); +STATISTIC(NumHeadBranchRejs, "Number of ccmps rejected (Head branch)"); +STATISTIC(NumCmpBranchRejs, "Number of ccmps rejected (CmpBB branch)"); +STATISTIC(NumCmpTermRejs, "Number of ccmps rejected (CmpBB is cbz...)"); +STATISTIC(NumImmRangeRejs, "Number of ccmps rejected (Imm out of range)"); +STATISTIC(NumLiveDstRejs, "Number of ccmps rejected (Cmp dest live)"); +STATISTIC(NumMultNZCVUses, "Number of ccmps rejected (NZCV used)"); +STATISTIC(NumUnknNZCVDefs, "Number of ccmps rejected (NZCV def unknown)"); + +STATISTIC(NumSpeculateRejs, "Number of ccmps rejected (Can't speculate)"); + +STATISTIC(NumConverted, "Number of ccmp instructions created"); +STATISTIC(NumCompBranches, "Number of cbz/cbnz branches converted"); + +//===----------------------------------------------------------------------===// +// SSACCmpConv +//===----------------------------------------------------------------------===// +// +// The SSACCmpConv class performs ccmp-conversion on SSA form machine code +// after determining if it is possible. The class contains no heuristics; +// external code should be used to determine when ccmp-conversion is a good +// idea. +// +// CCmp-formation works on a CFG representing chained conditions, typically +// from C's short-circuit || and && operators: +// +// From: Head To: Head +// / | CmpBB +// / | / | +// | CmpBB / | +// | / | Tail | +// | / | | | +// Tail | | | +// | | | | +// ... ... ... ... +// +// The Head block is terminated by a br.cond instruction, and the CmpBB block +// contains compare + br.cond. Tail must be a successor of both. +// +// The cmp-conversion turns the compare instruction in CmpBB into a conditional +// compare, and merges CmpBB into Head, speculatively executing its +// instructions. The AArch64 conditional compare instructions have an immediate +// operand that specifies the NZCV flag values when the condition is false and +// the compare isn't executed. This makes it possible to chain compares with +// different condition codes. +// +// Example: +// +// if (a == 5 || b == 17) +// foo(); +// +// Head: +// cmp w0, #5 +// b.eq Tail +// CmpBB: +// cmp w1, #17 +// b.eq Tail +// ... +// Tail: +// bl _foo +// +// Becomes: +// +// Head: +// cmp w0, #5 +// ccmp w1, #17, 4, ne ; 4 = nZcv +// b.eq Tail +// ... +// Tail: +// bl _foo +// +// The ccmp condition code is the one that would cause the Head terminator to +// branch to CmpBB. +// +// FIXME: It should also be possible to speculate a block on the critical edge +// between Head and Tail, just like if-converting a diamond. +// +// FIXME: Handle PHIs in Tail by turning them into selects (if-conversion). + +namespace { +class SSACCmpConv { + MachineFunction *MF; + const TargetInstrInfo *TII; + const TargetRegisterInfo *TRI; + MachineRegisterInfo *MRI; + +public: + /// The first block containing a conditional branch, dominating everything + /// else. 
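To make the worked example above concrete: ccmp performs its comparison only when the condition attached to it held, and otherwise installs the immediate NZCV value (#4 sets only Z, hence "nZcv"). A tiny standalone model of just that behaviour, reduced to the Z flag (Flags, cmp and ccmp here are illustrative names, not LLVM or ISA definitions):

    // Illustrative sketch (not part of this patch): semantics of
    // "cmp w0, #5; ccmp w1, #17, 4, ne; b.eq" for "if (a == 5 || b == 17)".
    #include <cassert>

    struct Flags { bool Z; };                        // only Z matters for b.eq

    Flags cmp(long LHS, long RHS) { return {LHS == RHS}; }

    // ccmp: perform the compare only if the attached condition held, otherwise
    // install the immediate NZCV value (here #4, i.e. Z set).
    Flags ccmp(Flags Prev, bool CondIsNE, long LHS, long RHS, Flags ImmFlags) {
      bool CondHeld = CondIsNE ? !Prev.Z : Prev.Z;
      return CondHeld ? cmp(LHS, RHS) : ImmFlags;
    }

    bool takesBranchEQ(long a, long b) {
      Flags F = cmp(a, 5);                                    // cmp  w0, #5
      F = ccmp(F, /*ne*/ true, b, 17, /*#4 = nZcv*/ {true});  // ccmp w1, #17, 4, ne
      return F.Z;                                             // b.eq Tail
    }

    int main() {
      assert(takesBranchEQ(5, 0));    // a == 5: second compare skipped, Z forced
      assert(takesBranchEQ(1, 17));   // a != 5: second compare decides
      assert(!takesBranchEQ(1, 0));   // neither condition holds
    }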
+ MachineBasicBlock *Head; + + /// The block containing cmp+br.cond with a successor shared with Head. + MachineBasicBlock *CmpBB; + + /// The common successor for Head and CmpBB. + MachineBasicBlock *Tail; + + /// The compare instruction in CmpBB that can be converted to a ccmp. + MachineInstr *CmpMI; + +private: + /// The branch condition in Head as determined by AnalyzeBranch. + SmallVector HeadCond; + + /// The condition code that makes Head branch to CmpBB. + AArch64CC::CondCode HeadCmpBBCC; + + /// The branch condition in CmpBB. + SmallVector CmpBBCond; + + /// The condition code that makes CmpBB branch to Tail. + AArch64CC::CondCode CmpBBTailCC; + + /// Check if the Tail PHIs are trivially convertible. + bool trivialTailPHIs(); + + /// Remove CmpBB from the Tail PHIs. + void updateTailPHIs(); + + /// Check if an operand defining DstReg is dead. + bool isDeadDef(unsigned DstReg); + + /// Find the compare instruction in MBB that controls the conditional branch. + /// Return NULL if a convertible instruction can't be found. + MachineInstr *findConvertibleCompare(MachineBasicBlock *MBB); + + /// Return true if all non-terminator instructions in MBB can be safely + /// speculated. + bool canSpeculateInstrs(MachineBasicBlock *MBB, const MachineInstr *CmpMI); + +public: + /// runOnMachineFunction - Initialize per-function data structures. + void runOnMachineFunction(MachineFunction &MF) { + this->MF = &MF; + TII = MF.getTarget().getInstrInfo(); + TRI = MF.getTarget().getRegisterInfo(); + MRI = &MF.getRegInfo(); + } + + /// If the sub-CFG headed by MBB can be cmp-converted, initialize the + /// internal state, and return true. + bool canConvert(MachineBasicBlock *MBB); + + /// Cmo-convert the last block passed to canConvertCmp(), assuming + /// it is possible. Add any erased blocks to RemovedBlocks. + void convert(SmallVectorImpl &RemovedBlocks); + + /// Return the expected code size delta if the conversion into a + /// conditional compare is performed. + int expectedCodeSizeDelta() const; +}; +} // end anonymous namespace + +// Check that all PHIs in Tail are selecting the same value from Head and CmpBB. +// This means that no if-conversion is required when merging CmpBB into Head. +bool SSACCmpConv::trivialTailPHIs() { + for (auto &I : *Tail) { + if (!I.isPHI()) + break; + unsigned HeadReg = 0, CmpBBReg = 0; + // PHI operands come in (VReg, MBB) pairs. + for (unsigned oi = 1, oe = I.getNumOperands(); oi != oe; oi += 2) { + MachineBasicBlock *MBB = I.getOperand(oi + 1).getMBB(); + unsigned Reg = I.getOperand(oi).getReg(); + if (MBB == Head) { + assert((!HeadReg || HeadReg == Reg) && "Inconsistent PHI operands"); + HeadReg = Reg; + } + if (MBB == CmpBB) { + assert((!CmpBBReg || CmpBBReg == Reg) && "Inconsistent PHI operands"); + CmpBBReg = Reg; + } + } + if (HeadReg != CmpBBReg) + return false; + } + return true; +} + +// Assuming that trivialTailPHIs() is true, update the Tail PHIs by simply +// removing the CmpBB operands. The Head operands will be identical. +void SSACCmpConv::updateTailPHIs() { + for (auto &I : *Tail) { + if (!I.isPHI()) + break; + // I is a PHI. It can have multiple entries for CmpBB. + for (unsigned oi = I.getNumOperands(); oi > 2; oi -= 2) { + // PHI operands are (Reg, MBB) at (oi-2, oi-1). + if (I.getOperand(oi - 1).getMBB() == CmpBB) { + I.RemoveOperand(oi - 1); + I.RemoveOperand(oi - 2); + } + } + } +} + +// This pass runs before the AArch64DeadRegisterDefinitions pass, so compares +// are still writing virtual registers without any uses. 
+bool SSACCmpConv::isDeadDef(unsigned DstReg) { + // Writes to the zero register are dead. + if (DstReg == AArch64::WZR || DstReg == AArch64::XZR) + return true; + if (!TargetRegisterInfo::isVirtualRegister(DstReg)) + return false; + // A virtual register def without any uses will be marked dead later, and + // eventually replaced by the zero register. + return MRI->use_nodbg_empty(DstReg); +} + +// Parse a condition code returned by AnalyzeBranch, and compute the CondCode +// corresponding to TBB. +// Return +static bool parseCond(ArrayRef Cond, AArch64CC::CondCode &CC) { + // A normal br.cond simply has the condition code. + if (Cond[0].getImm() != -1) { + assert(Cond.size() == 1 && "Unknown Cond array format"); + CC = (AArch64CC::CondCode)(int)Cond[0].getImm(); + return true; + } + // For tbz and cbz instruction, the opcode is next. + switch (Cond[1].getImm()) { + default: + // This includes tbz / tbnz branches which can't be converted to + // ccmp + br.cond. + return false; + case AArch64::CBZW: + case AArch64::CBZX: + assert(Cond.size() == 3 && "Unknown Cond array format"); + CC = AArch64CC::EQ; + return true; + case AArch64::CBNZW: + case AArch64::CBNZX: + assert(Cond.size() == 3 && "Unknown Cond array format"); + CC = AArch64CC::NE; + return true; + } +} + +MachineInstr *SSACCmpConv::findConvertibleCompare(MachineBasicBlock *MBB) { + MachineBasicBlock::iterator I = MBB->getFirstTerminator(); + if (I == MBB->end()) + return nullptr; + // The terminator must be controlled by the flags. + if (!I->readsRegister(AArch64::NZCV)) { + switch (I->getOpcode()) { + case AArch64::CBZW: + case AArch64::CBZX: + case AArch64::CBNZW: + case AArch64::CBNZX: + // These can be converted into a ccmp against #0. + return I; + } + ++NumCmpTermRejs; + DEBUG(dbgs() << "Flags not used by terminator: " << *I); + return nullptr; + } + + // Now find the instruction controlling the terminator. + for (MachineBasicBlock::iterator B = MBB->begin(); I != B;) { + --I; + assert(!I->isTerminator() && "Spurious terminator"); + switch (I->getOpcode()) { + // cmp is an alias for subs with a dead destination register. + case AArch64::SUBSWri: + case AArch64::SUBSXri: + // cmn is an alias for adds with a dead destination register. + case AArch64::ADDSWri: + case AArch64::ADDSXri: + // Check that the immediate operand is within range, ccmp wants a uimm5. + // Rd = SUBSri Rn, imm, shift + if (I->getOperand(3).getImm() || !isUInt<5>(I->getOperand(2).getImm())) { + DEBUG(dbgs() << "Immediate out of range for ccmp: " << *I); + ++NumImmRangeRejs; + return nullptr; + } + // Fall through. + case AArch64::SUBSWrr: + case AArch64::SUBSXrr: + case AArch64::ADDSWrr: + case AArch64::ADDSXrr: + if (isDeadDef(I->getOperand(0).getReg())) + return I; + DEBUG(dbgs() << "Can't convert compare with live destination: " << *I); + ++NumLiveDstRejs; + return nullptr; + case AArch64::FCMPSrr: + case AArch64::FCMPDrr: + case AArch64::FCMPESrr: + case AArch64::FCMPEDrr: + return I; + } + + // Check for flag reads and clobbers. + MIOperands::PhysRegInfo PRI = + MIOperands(I).analyzePhysReg(AArch64::NZCV, TRI); + + if (PRI.Reads) { + // The ccmp doesn't produce exactly the same flags as the original + // compare, so reject the transform if there are uses of the flags + // besides the terminators. 
+ DEBUG(dbgs() << "Can't create ccmp with multiple uses: " << *I); + ++NumMultNZCVUses; + return nullptr; + } + + if (PRI.Clobbers) { + DEBUG(dbgs() << "Not convertible compare: " << *I); + ++NumUnknNZCVDefs; + return nullptr; + } + } + DEBUG(dbgs() << "Flags not defined in BB#" << MBB->getNumber() << '\n'); + return nullptr; +} + +/// Determine if all the instructions in MBB can safely +/// be speculated. The terminators are not considered. +/// +/// Only CmpMI is allowed to clobber the flags. +/// +bool SSACCmpConv::canSpeculateInstrs(MachineBasicBlock *MBB, + const MachineInstr *CmpMI) { + // Reject any live-in physregs. It's probably NZCV/EFLAGS, and very hard to + // get right. + if (!MBB->livein_empty()) { + DEBUG(dbgs() << "BB#" << MBB->getNumber() << " has live-ins.\n"); + return false; + } + + unsigned InstrCount = 0; + + // Check all instructions, except the terminators. It is assumed that + // terminators never have side effects or define any used register values. + for (auto &I : make_range(MBB->begin(), MBB->getFirstTerminator())) { + if (I.isDebugValue()) + continue; + + if (++InstrCount > BlockInstrLimit && !Stress) { + DEBUG(dbgs() << "BB#" << MBB->getNumber() << " has more than " + << BlockInstrLimit << " instructions.\n"); + return false; + } + + // There shouldn't normally be any phis in a single-predecessor block. + if (I.isPHI()) { + DEBUG(dbgs() << "Can't hoist: " << I); + return false; + } + + // Don't speculate loads. Note that it may be possible and desirable to + // speculate GOT or constant pool loads that are guaranteed not to trap, + // but we don't support that for now. + if (I.mayLoad()) { + DEBUG(dbgs() << "Won't speculate load: " << I); + return false; + } + + // We never speculate stores, so an AA pointer isn't necessary. + bool DontMoveAcrossStore = true; + if (!I.isSafeToMove(TII, nullptr, DontMoveAcrossStore)) { + DEBUG(dbgs() << "Can't speculate: " << I); + return false; + } + + // Only CmpMI is allowed to clobber the flags. + if (&I != CmpMI && I.modifiesRegister(AArch64::NZCV, TRI)) { + DEBUG(dbgs() << "Clobbers flags: " << I); + return false; + } + } + return true; +} + +/// Analyze the sub-cfg rooted in MBB, and return true if it is a potential +/// candidate for cmp-conversion. Fill out the internal state. +/// +bool SSACCmpConv::canConvert(MachineBasicBlock *MBB) { + Head = MBB; + Tail = CmpBB = nullptr; + + if (Head->succ_size() != 2) + return false; + MachineBasicBlock *Succ0 = Head->succ_begin()[0]; + MachineBasicBlock *Succ1 = Head->succ_begin()[1]; + + // CmpBB can only have a single predecessor. Tail is allowed many. + if (Succ0->pred_size() != 1) + std::swap(Succ0, Succ1); + + // Succ0 is our candidate for CmpBB. + if (Succ0->pred_size() != 1 || Succ0->succ_size() != 2) + return false; + + CmpBB = Succ0; + Tail = Succ1; + + if (!CmpBB->isSuccessor(Tail)) + return false; + + // The CFG topology checks out. + DEBUG(dbgs() << "\nTriangle: BB#" << Head->getNumber() << " -> BB#" + << CmpBB->getNumber() << " -> BB#" << Tail->getNumber() << '\n'); + ++NumConsidered; + + // Tail is allowed to have many predecessors, but we can't handle PHIs yet. + // + // FIXME: Real PHIs could be if-converted as long as the CmpBB values are + // defined before The CmpBB cmp clobbers the flags. Alternatively, it should + // always be safe to sink the ccmp down to immediately before the CmpBB + // terminators. 
+ if (!trivialTailPHIs()) { + DEBUG(dbgs() << "Can't handle phis in Tail.\n"); + ++NumPhiRejs; + return false; + } + + if (!Tail->livein_empty()) { + DEBUG(dbgs() << "Can't handle live-in physregs in Tail.\n"); + ++NumPhysRejs; + return false; + } + + // CmpBB should never have PHIs since Head is its only predecessor. + // FIXME: Clean them up if it happens. + if (!CmpBB->empty() && CmpBB->front().isPHI()) { + DEBUG(dbgs() << "Can't handle phis in CmpBB.\n"); + ++NumPhi2Rejs; + return false; + } + + if (!CmpBB->livein_empty()) { + DEBUG(dbgs() << "Can't handle live-in physregs in CmpBB.\n"); + ++NumPhysRejs; + return false; + } + + // The branch we're looking to eliminate must be analyzable. + HeadCond.clear(); + MachineBasicBlock *TBB = nullptr, *FBB = nullptr; + if (TII->AnalyzeBranch(*Head, TBB, FBB, HeadCond)) { + DEBUG(dbgs() << "Head branch not analyzable.\n"); + ++NumHeadBranchRejs; + return false; + } + + // This is weird, probably some sort of degenerate CFG, or an edge to a + // landing pad. + if (!TBB || HeadCond.empty()) { + DEBUG(dbgs() << "AnalyzeBranch didn't find conditional branch in Head.\n"); + ++NumHeadBranchRejs; + return false; + } + + if (!parseCond(HeadCond, HeadCmpBBCC)) { + DEBUG(dbgs() << "Unsupported branch type on Head\n"); + ++NumHeadBranchRejs; + return false; + } + + // Make sure the branch direction is right. + if (TBB != CmpBB) { + assert(TBB == Tail && "Unexpected TBB"); + HeadCmpBBCC = AArch64CC::getInvertedCondCode(HeadCmpBBCC); + } + + CmpBBCond.clear(); + TBB = FBB = nullptr; + if (TII->AnalyzeBranch(*CmpBB, TBB, FBB, CmpBBCond)) { + DEBUG(dbgs() << "CmpBB branch not analyzable.\n"); + ++NumCmpBranchRejs; + return false; + } + + if (!TBB || CmpBBCond.empty()) { + DEBUG(dbgs() << "AnalyzeBranch didn't find conditional branch in CmpBB.\n"); + ++NumCmpBranchRejs; + return false; + } + + if (!parseCond(CmpBBCond, CmpBBTailCC)) { + DEBUG(dbgs() << "Unsupported branch type on CmpBB\n"); + ++NumCmpBranchRejs; + return false; + } + + if (TBB != Tail) + CmpBBTailCC = AArch64CC::getInvertedCondCode(CmpBBTailCC); + + DEBUG(dbgs() << "Head->CmpBB on " << AArch64CC::getCondCodeName(HeadCmpBBCC) + << ", CmpBB->Tail on " << AArch64CC::getCondCodeName(CmpBBTailCC) + << '\n'); + + CmpMI = findConvertibleCompare(CmpBB); + if (!CmpMI) + return false; + + if (!canSpeculateInstrs(CmpBB, CmpMI)) { + ++NumSpeculateRejs; + return false; + } + return true; +} + +void SSACCmpConv::convert(SmallVectorImpl &RemovedBlocks) { + DEBUG(dbgs() << "Merging BB#" << CmpBB->getNumber() << " into BB#" + << Head->getNumber() << ":\n" << *CmpBB); + + // All CmpBB instructions are moved into Head, and CmpBB is deleted. + // Update the CFG first. + updateTailPHIs(); + Head->removeSuccessor(CmpBB); + CmpBB->removeSuccessor(Tail); + Head->transferSuccessorsAndUpdatePHIs(CmpBB); + DebugLoc TermDL = Head->getFirstTerminator()->getDebugLoc(); + TII->RemoveBranch(*Head); + + // If the Head terminator was one of the cbz / tbz branches with built-in + // compare, we need to insert an explicit compare instruction in its place. + if (HeadCond[0].getImm() == -1) { + ++NumCompBranches; + unsigned Opc = 0; + switch (HeadCond[1].getImm()) { + case AArch64::CBZW: + case AArch64::CBNZW: + Opc = AArch64::SUBSWri; + break; + case AArch64::CBZX: + case AArch64::CBNZX: + Opc = AArch64::SUBSXri; + break; + default: + llvm_unreachable("Cannot convert Head branch"); + } + const MCInstrDesc &MCID = TII->get(Opc); + // Create a dummy virtual register for the SUBS def. 
+ unsigned DestReg = + MRI->createVirtualRegister(TII->getRegClass(MCID, 0, TRI, *MF)); + // Insert a SUBS Rn, #0 instruction instead of the cbz / cbnz. + BuildMI(*Head, Head->end(), TermDL, MCID) + .addReg(DestReg, RegState::Define | RegState::Dead) + .addOperand(HeadCond[2]) + .addImm(0) + .addImm(0); + // SUBS uses the GPR*sp register classes. + MRI->constrainRegClass(HeadCond[2].getReg(), + TII->getRegClass(MCID, 1, TRI, *MF)); + } + + Head->splice(Head->end(), CmpBB, CmpBB->begin(), CmpBB->end()); + + // Now replace CmpMI with a ccmp instruction that also considers the incoming + // flags. + unsigned Opc = 0; + unsigned FirstOp = 1; // First CmpMI operand to copy. + bool isZBranch = false; // CmpMI is a cbz/cbnz instruction. + switch (CmpMI->getOpcode()) { + default: + llvm_unreachable("Unknown compare opcode"); + case AArch64::SUBSWri: Opc = AArch64::CCMPWi; break; + case AArch64::SUBSWrr: Opc = AArch64::CCMPWr; break; + case AArch64::SUBSXri: Opc = AArch64::CCMPXi; break; + case AArch64::SUBSXrr: Opc = AArch64::CCMPXr; break; + case AArch64::ADDSWri: Opc = AArch64::CCMNWi; break; + case AArch64::ADDSWrr: Opc = AArch64::CCMNWr; break; + case AArch64::ADDSXri: Opc = AArch64::CCMNXi; break; + case AArch64::ADDSXrr: Opc = AArch64::CCMNXr; break; + case AArch64::FCMPSrr: Opc = AArch64::FCCMPSrr; FirstOp = 0; break; + case AArch64::FCMPDrr: Opc = AArch64::FCCMPDrr; FirstOp = 0; break; + case AArch64::FCMPESrr: Opc = AArch64::FCCMPESrr; FirstOp = 0; break; + case AArch64::FCMPEDrr: Opc = AArch64::FCCMPEDrr; FirstOp = 0; break; + case AArch64::CBZW: + case AArch64::CBNZW: + Opc = AArch64::CCMPWi; + FirstOp = 0; + isZBranch = true; + break; + case AArch64::CBZX: + case AArch64::CBNZX: + Opc = AArch64::CCMPXi; + FirstOp = 0; + isZBranch = true; + break; + } + + // The ccmp instruction should set the flags according to the comparison when + // Head would have branched to CmpBB. + // The NZCV immediate operand should provide flags for the case where Head + // would have branched to Tail. These flags should cause the new Head + // terminator to branch to tail. + unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(CmpBBTailCC); + const MCInstrDesc &MCID = TII->get(Opc); + MRI->constrainRegClass(CmpMI->getOperand(FirstOp).getReg(), + TII->getRegClass(MCID, 0, TRI, *MF)); + if (CmpMI->getOperand(FirstOp + 1).isReg()) + MRI->constrainRegClass(CmpMI->getOperand(FirstOp + 1).getReg(), + TII->getRegClass(MCID, 1, TRI, *MF)); + MachineInstrBuilder MIB = + BuildMI(*Head, CmpMI, CmpMI->getDebugLoc(), MCID) + .addOperand(CmpMI->getOperand(FirstOp)); // Register Rn + if (isZBranch) + MIB.addImm(0); // cbz/cbnz Rn -> ccmp Rn, #0 + else + MIB.addOperand(CmpMI->getOperand(FirstOp + 1)); // Register Rm / Immediate + MIB.addImm(NZCV).addImm(HeadCmpBBCC); + + // If CmpMI was a terminator, we need a new conditional branch to replace it. + // This now becomes a Head terminator. + if (isZBranch) { + bool isNZ = CmpMI->getOpcode() == AArch64::CBNZW || + CmpMI->getOpcode() == AArch64::CBNZX; + BuildMI(*Head, CmpMI, CmpMI->getDebugLoc(), TII->get(AArch64::Bcc)) + .addImm(isNZ ? AArch64CC::NE : AArch64CC::EQ) + .addOperand(CmpMI->getOperand(1)); // Branch target. 
+ } + CmpMI->eraseFromParent(); + Head->updateTerminator(); + + RemovedBlocks.push_back(CmpBB); + CmpBB->eraseFromParent(); + DEBUG(dbgs() << "Result:\n" << *Head); + ++NumConverted; +} + +int SSACCmpConv::expectedCodeSizeDelta() const { + int delta = 0; + // If the Head terminator was one of the cbz / tbz branches with built-in + // compare, we need to insert an explicit compare instruction in its place + // plus a branch instruction. + if (HeadCond[0].getImm() == -1) { + switch (HeadCond[1].getImm()) { + case AArch64::CBZW: + case AArch64::CBNZW: + case AArch64::CBZX: + case AArch64::CBNZX: + // Therefore delta += 1 + delta = 1; + break; + default: + llvm_unreachable("Cannot convert Head branch"); + } + } + // If the Cmp terminator was one of the cbz / tbz branches with + // built-in compare, it will be turned into a compare instruction + // into Head, but we do not save any instruction. + // Otherwise, we save the branch instruction. + switch (CmpMI->getOpcode()) { + default: + --delta; + break; + case AArch64::CBZW: + case AArch64::CBNZW: + case AArch64::CBZX: + case AArch64::CBNZX: + break; + } + return delta; +} + +//===----------------------------------------------------------------------===// +// AArch64ConditionalCompares Pass +//===----------------------------------------------------------------------===// + +namespace { +class AArch64ConditionalCompares : public MachineFunctionPass { + const TargetInstrInfo *TII; + const TargetRegisterInfo *TRI; + const MCSchedModel *SchedModel; + // Does the proceeded function has Oz attribute. + bool MinSize; + MachineRegisterInfo *MRI; + MachineDominatorTree *DomTree; + MachineLoopInfo *Loops; + MachineTraceMetrics *Traces; + MachineTraceMetrics::Ensemble *MinInstr; + SSACCmpConv CmpConv; + +public: + static char ID; + AArch64ConditionalCompares() : MachineFunctionPass(ID) {} + void getAnalysisUsage(AnalysisUsage &AU) const override; + bool runOnMachineFunction(MachineFunction &MF) override; + const char *getPassName() const override { + return "AArch64 Conditional Compares"; + } + +private: + bool tryConvert(MachineBasicBlock *); + void updateDomTree(ArrayRef Removed); + void updateLoops(ArrayRef Removed); + void invalidateTraces(); + bool shouldConvert(); +}; +} // end anonymous namespace + +char AArch64ConditionalCompares::ID = 0; + +namespace llvm { +void initializeAArch64ConditionalComparesPass(PassRegistry &); +} + +INITIALIZE_PASS_BEGIN(AArch64ConditionalCompares, "aarch64-ccmp", + "AArch64 CCMP Pass", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics) +INITIALIZE_PASS_END(AArch64ConditionalCompares, "aarch64-ccmp", + "AArch64 CCMP Pass", false, false) + +FunctionPass *llvm::createAArch64ConditionalCompares() { + return new AArch64ConditionalCompares(); +} + +void AArch64ConditionalCompares::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired(); + AU.addRequired(); + AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); + MachineFunctionPass::getAnalysisUsage(AU); +} + +/// Update the dominator tree after if-conversion erased some blocks. +void AArch64ConditionalCompares::updateDomTree( + ArrayRef Removed) { + // convert() removes CmpBB which was previously dominated by Head. + // CmpBB children should be transferred to Head. 
+ MachineDomTreeNode *HeadNode = DomTree->getNode(CmpConv.Head); + for (unsigned i = 0, e = Removed.size(); i != e; ++i) { + MachineDomTreeNode *Node = DomTree->getNode(Removed[i]); + assert(Node != HeadNode && "Cannot erase the head node"); + assert(Node->getIDom() == HeadNode && "CmpBB should be dominated by Head"); + while (Node->getNumChildren()) + DomTree->changeImmediateDominator(Node->getChildren().back(), HeadNode); + DomTree->eraseNode(Removed[i]); + } +} + +/// Update LoopInfo after if-conversion. +void +AArch64ConditionalCompares::updateLoops(ArrayRef Removed) { + if (!Loops) + return; + for (unsigned i = 0, e = Removed.size(); i != e; ++i) + Loops->removeBlock(Removed[i]); +} + +/// Invalidate MachineTraceMetrics before if-conversion. +void AArch64ConditionalCompares::invalidateTraces() { + Traces->invalidate(CmpConv.Head); + Traces->invalidate(CmpConv.CmpBB); +} + +/// Apply cost model and heuristics to the if-conversion in IfConv. +/// Return true if the conversion is a good idea. +/// +bool AArch64ConditionalCompares::shouldConvert() { + // Stress testing mode disables all cost considerations. + if (Stress) + return true; + if (!MinInstr) + MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount); + + // Head dominates CmpBB, so it is always included in its trace. + MachineTraceMetrics::Trace Trace = MinInstr->getTrace(CmpConv.CmpBB); + + // If code size is the main concern + if (MinSize) { + int CodeSizeDelta = CmpConv.expectedCodeSizeDelta(); + DEBUG(dbgs() << "Code size delta: " << CodeSizeDelta << '\n'); + // If we are minimizing the code size, do the conversion whatever + // the cost is. + if (CodeSizeDelta < 0) + return true; + if (CodeSizeDelta > 0) { + DEBUG(dbgs() << "Code size is increasing, give up on this one.\n"); + return false; + } + // CodeSizeDelta == 0, continue with the regular heuristics + } + + // Heuristic: The compare conversion delays the execution of the branch + // instruction because we must wait for the inputs to the second compare as + // well. The branch has no dependent instructions, but delaying it increases + // the cost of a misprediction. + // + // Set a limit on the delay we will accept. + unsigned DelayLimit = SchedModel->MispredictPenalty * 3 / 4; + + // Instruction depths can be computed for all trace instructions above CmpBB. + unsigned HeadDepth = + Trace.getInstrCycles(CmpConv.Head->getFirstTerminator()).Depth; + unsigned CmpBBDepth = + Trace.getInstrCycles(CmpConv.CmpBB->getFirstTerminator()).Depth; + DEBUG(dbgs() << "Head depth: " << HeadDepth + << "\nCmpBB depth: " << CmpBBDepth << '\n'); + if (CmpBBDepth > HeadDepth + DelayLimit) { + DEBUG(dbgs() << "Branch delay would be larger than " << DelayLimit + << " cycles.\n"); + return false; + } + + // Check the resource depth at the bottom of CmpBB - these instructions will + // be speculated. + unsigned ResDepth = Trace.getResourceDepth(true); + DEBUG(dbgs() << "Resources: " << ResDepth << '\n'); + + // Heuristic: The speculatively executed instructions must all be able to + // merge into the Head block. The Head critical path should dominate the + // resource cost of the speculated instructions. 
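The cost model around this point reduces to two comparisons on trace metrics: the second compare must not delay the branch by more than roughly three quarters of a misprediction penalty, and the speculated work must fit under Head's critical path. A pure-function sketch of that decision, with made-up cycle counts rather than LLVM's trace-metrics API:

    // Illustrative sketch (not part of this patch): the two throughput heuristics.
    #include <cassert>

    bool profitableCCmp(unsigned HeadDepth, unsigned CmpBBDepth,
                        unsigned ResourceDepth, unsigned MispredictPenalty) {
      // Tolerate delaying the branch by at most ~3/4 of a misprediction penalty.
      unsigned DelayLimit = MispredictPenalty * 3 / 4;
      if (CmpBBDepth > HeadDepth + DelayLimit)
        return false;                        // second compare arrives too late
      // The speculated instructions must fit under Head's critical path.
      return ResourceDepth <= HeadDepth;
    }

    int main() {
      assert(profitableCCmp(/*Head*/ 10, /*CmpBB*/ 12, /*Res*/ 8, /*Penalty*/ 8));
      assert(!profitableCCmp(10, 20, 8, 8));   // branch delayed by more than 6 cycles
      assert(!profitableCCmp(10, 12, 14, 8));  // too much speculated work
    }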
+ if (ResDepth > HeadDepth) { + DEBUG(dbgs() << "Too many instructions to speculate.\n"); + return false; + } + return true; +} + +bool AArch64ConditionalCompares::tryConvert(MachineBasicBlock *MBB) { + bool Changed = false; + while (CmpConv.canConvert(MBB) && shouldConvert()) { + invalidateTraces(); + SmallVector RemovedBlocks; + CmpConv.convert(RemovedBlocks); + Changed = true; + updateDomTree(RemovedBlocks); + updateLoops(RemovedBlocks); + } + return Changed; +} + +bool AArch64ConditionalCompares::runOnMachineFunction(MachineFunction &MF) { + DEBUG(dbgs() << "********** AArch64 Conditional Compares **********\n" + << "********** Function: " << MF.getName() << '\n'); + TII = MF.getTarget().getInstrInfo(); + TRI = MF.getTarget().getRegisterInfo(); + SchedModel = + MF.getTarget().getSubtarget().getSchedModel(); + MRI = &MF.getRegInfo(); + DomTree = &getAnalysis(); + Loops = getAnalysisIfAvailable(); + Traces = &getAnalysis(); + MinInstr = nullptr; + MinSize = MF.getFunction()->getAttributes().hasAttribute( + AttributeSet::FunctionIndex, Attribute::MinSize); + + bool Changed = false; + CmpConv.runOnMachineFunction(MF); + + // Visit blocks in dominator tree pre-order. The pre-order enables multiple + // cmp-conversions from the same head block. + // Note that updateDomTree() modifies the children of the DomTree node + // currently being visited. The df_iterator supports that; it doesn't look at + // child_begin() / child_end() until after a node has been visited. + for (auto *I : depth_first(DomTree)) + if (tryConvert(I->getBlock())) + Changed = true; + + return Changed; +} diff --git a/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp b/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp new file mode 100644 index 00000000000..a2d853c85fe --- /dev/null +++ b/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp @@ -0,0 +1,134 @@ +//==-- AArch64DeadRegisterDefinitions.cpp - Replace dead defs w/ zero reg --==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// When allowed by the instruction, replace a dead definition of a GPR with +// the zero register. This makes the code a bit friendlier towards the +// hardware's register renamer. +//===----------------------------------------------------------------------===// + +#include "AArch64.h" +#include "AArch64RegisterInfo.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +#define DEBUG_TYPE "aarch64-dead-defs" + +STATISTIC(NumDeadDefsReplaced, "Number of dead definitions replaced"); + +namespace { +class AArch64DeadRegisterDefinitions : public MachineFunctionPass { +private: + const TargetRegisterInfo *TRI; + bool implicitlyDefinesOverlappingReg(unsigned Reg, const MachineInstr &MI); + bool processMachineBasicBlock(MachineBasicBlock &MBB); + bool usesFrameIndex(const MachineInstr &MI); +public: + static char ID; // Pass identification, replacement for typeid. 
+ explicit AArch64DeadRegisterDefinitions() : MachineFunctionPass(ID) {} + + virtual bool runOnMachineFunction(MachineFunction &F) override; + + const char *getPassName() const override { return "Dead register definitions"; } + + virtual void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; +char AArch64DeadRegisterDefinitions::ID = 0; +} // end anonymous namespace + +bool AArch64DeadRegisterDefinitions::implicitlyDefinesOverlappingReg( + unsigned Reg, const MachineInstr &MI) { + for (const MachineOperand &MO : MI.implicit_operands()) + if (MO.isReg() && MO.isDef()) + if (TRI->regsOverlap(Reg, MO.getReg())) + return true; + return false; +} + +bool AArch64DeadRegisterDefinitions::usesFrameIndex(const MachineInstr &MI) { + for (const MachineOperand &Op : MI.uses()) + if (Op.isFI()) + return true; + return false; +} + +bool AArch64DeadRegisterDefinitions::processMachineBasicBlock( + MachineBasicBlock &MBB) { + bool Changed = false; + for (MachineInstr &MI : MBB) { + if (usesFrameIndex(MI)) { + // We need to skip this instruction because while it appears to have a + // dead def it uses a frame index which might expand into a multi + // instruction sequence during EPI. + DEBUG(dbgs() << " Ignoring, operand is frame index\n"); + continue; + } + for (int i = 0, e = MI.getDesc().getNumDefs(); i != e; ++i) { + MachineOperand &MO = MI.getOperand(i); + if (MO.isReg() && MO.isDead() && MO.isDef()) { + assert(!MO.isImplicit() && "Unexpected implicit def!"); + DEBUG(dbgs() << " Dead def operand #" << i << " in:\n "; + MI.print(dbgs())); + // Be careful not to change the register if it's a tied operand. + if (MI.isRegTiedToUseOperand(i)) { + DEBUG(dbgs() << " Ignoring, def is tied operand.\n"); + continue; + } + // Don't change the register if there's an implicit def of a subreg or + // supperreg. + if (implicitlyDefinesOverlappingReg(MO.getReg(), MI)) { + DEBUG(dbgs() << " Ignoring, implicitly defines overlap reg.\n"); + continue; + } + // Make sure the instruction take a register class that contains + // the zero register and replace it if so. + unsigned NewReg; + switch (MI.getDesc().OpInfo[i].RegClass) { + default: + DEBUG(dbgs() << " Ignoring, register is not a GPR.\n"); + continue; + case AArch64::GPR32RegClassID: + NewReg = AArch64::WZR; + break; + case AArch64::GPR64RegClassID: + NewReg = AArch64::XZR; + break; + } + DEBUG(dbgs() << " Replacing with zero register. New:\n "); + MO.setReg(NewReg); + DEBUG(MI.print(dbgs())); + ++NumDeadDefsReplaced; + } + } + } + return Changed; +} + +// Scan the function for instructions that have a dead definition of a +// register. Replace that register with the zero register when possible. 
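The rewrite performed above is small: a dead, untied GPR32/GPR64 definition becomes WZR or XZR and everything else is left alone. A toy model of that decision (Op and RC are invented stand-ins for MachineOperand and the register-class IDs; the frame-index and implicit-def-overlap checks performed by the pass are omitted here):

    // Illustrative sketch (not part of this patch): dead-def rewrite on toy operands.
    #include <cassert>
    #include <string>
    #include <vector>

    enum RC { RC32, RC64, RCOther };
    struct Op {
      std::string Reg;
      RC Cls;
      bool IsDef, IsDead, IsTied;
    };

    // Returns how many dead GPR defs were redirected to the zero register.
    unsigned replaceDeadDefs(std::vector<Op> &Ops) {
      unsigned N = 0;
      for (Op &O : Ops) {
        if (!O.IsDef || !O.IsDead || O.IsTied)
          continue;                          // only free-standing dead defs
        if (O.Cls == RC32)
          O.Reg = "WZR";
        else if (O.Cls == RC64)
          O.Reg = "XZR";
        else
          continue;                          // not a GPR: leave it alone
        ++N;
      }
      return N;
    }

    int main() {
      std::vector<Op> Ops = {{"w8", RC32, true, true, false},     // rewritten
                             {"x1", RC64, true, false, false},    // still live: kept
                             {"q0", RCOther, true, true, false}}; // not a GPR: kept
      assert(replaceDeadDefs(Ops) == 1 && Ops[0].Reg == "WZR");
    }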
+bool AArch64DeadRegisterDefinitions::runOnMachineFunction(MachineFunction &MF) { + TRI = MF.getTarget().getRegisterInfo(); + bool Changed = false; + DEBUG(dbgs() << "***** AArch64DeadRegisterDefinitions *****\n"); + + for (auto &MBB : MF) + if (processMachineBasicBlock(MBB)) + Changed = true; + return Changed; +} + +FunctionPass *llvm::createAArch64DeadRegisterDefinitions() { + return new AArch64DeadRegisterDefinitions(); +} diff --git a/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp new file mode 100644 index 00000000000..a76fd76e5ed --- /dev/null +++ b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -0,0 +1,749 @@ +//==-- AArch64ExpandPseudoInsts.cpp - Expand pseudo instructions --*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains a pass that expands pseudo instructions into target +// instructions to allow proper scheduling and other late optimizations. This +// pass should be run after register allocation but before the post-regalloc +// scheduling pass. +// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/AArch64AddressingModes.h" +#include "AArch64InstrInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/Support/MathExtras.h" +using namespace llvm; + +namespace { +class AArch64ExpandPseudo : public MachineFunctionPass { +public: + static char ID; + AArch64ExpandPseudo() : MachineFunctionPass(ID) {} + + const AArch64InstrInfo *TII; + + bool runOnMachineFunction(MachineFunction &Fn) override; + + const char *getPassName() const override { + return "AArch64 pseudo instruction expansion pass"; + } + +private: + bool expandMBB(MachineBasicBlock &MBB); + bool expandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI); + bool expandMOVImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + unsigned BitSize); +}; +char AArch64ExpandPseudo::ID = 0; +} + +/// \brief Transfer implicit operands on the pseudo instruction to the +/// instructions created from the expansion. +static void transferImpOps(MachineInstr &OldMI, MachineInstrBuilder &UseMI, + MachineInstrBuilder &DefMI) { + const MCInstrDesc &Desc = OldMI.getDesc(); + for (unsigned i = Desc.getNumOperands(), e = OldMI.getNumOperands(); i != e; + ++i) { + const MachineOperand &MO = OldMI.getOperand(i); + assert(MO.isReg() && MO.getReg()); + if (MO.isUse()) + UseMI.addOperand(MO); + else + DefMI.addOperand(MO); + } +} + +/// \brief Helper function which extracts the specified 16-bit chunk from a +/// 64-bit value. +static uint64_t getChunk(uint64_t Imm, unsigned ChunkIdx) { + assert(ChunkIdx < 4 && "Out of range chunk index specified!"); + + return (Imm >> (ChunkIdx * 16)) & 0xFFFF; +} + +/// \brief Helper function which replicates a 16-bit chunk within a 64-bit +/// value. Indices correspond to element numbers in a v4i16. +static uint64_t replicateChunk(uint64_t Imm, unsigned FromIdx, unsigned ToIdx) { + assert((FromIdx < 4) && (ToIdx < 4) && "Out of range chunk index specified!"); + const unsigned ShiftAmt = ToIdx * 16; + + // Replicate the source chunk to the destination position. + const uint64_t Chunk = getChunk(Imm, FromIdx) << ShiftAmt; + // Clear the destination chunk. 
+ Imm &= ~(0xFFFFLL << ShiftAmt); + // Insert the replicated chunk. + return Imm | Chunk; +} + +/// \brief Helper function which tries to materialize a 64-bit value with an +/// ORR + MOVK instruction sequence. +static bool tryOrrMovk(uint64_t UImm, uint64_t OrrImm, MachineInstr &MI, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + const AArch64InstrInfo *TII, unsigned ChunkIdx) { + assert(ChunkIdx < 4 && "Out of range chunk index specified!"); + const unsigned ShiftAmt = ChunkIdx * 16; + + uint64_t Encoding; + if (AArch64_AM::processLogicalImmediate(OrrImm, 64, Encoding)) { + // Create the ORR-immediate instruction. + MachineInstrBuilder MIB = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXri)) + .addOperand(MI.getOperand(0)) + .addReg(AArch64::XZR) + .addImm(Encoding); + + // Create the MOVK instruction. + const unsigned Imm16 = getChunk(UImm, ChunkIdx); + const unsigned DstReg = MI.getOperand(0).getReg(); + const bool DstIsDead = MI.getOperand(0).isDead(); + MachineInstrBuilder MIB1 = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi)) + .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) + .addReg(DstReg) + .addImm(Imm16) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt)); + + transferImpOps(MI, MIB, MIB1); + MI.eraseFromParent(); + return true; + } + + return false; +} + +/// \brief Check whether the given 16-bit chunk replicated to full 64-bit width +/// can be materialized with an ORR instruction. +static bool canUseOrr(uint64_t Chunk, uint64_t &Encoding) { + Chunk = (Chunk << 48) | (Chunk << 32) | (Chunk << 16) | Chunk; + + return AArch64_AM::processLogicalImmediate(Chunk, 64, Encoding); +} + +/// \brief Check for identical 16-bit chunks within the constant and if so +/// materialize them with a single ORR instruction. The remaining one or two +/// 16-bit chunks will be materialized with MOVK instructions. +/// +/// This allows us to materialize constants like |A|B|A|A| or |A|B|C|A| (order +/// of the chunks doesn't matter), assuming |A|A|A|A| can be materialized with +/// an ORR instruction. +/// +static bool tryToreplicateChunks(uint64_t UImm, MachineInstr &MI, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + const AArch64InstrInfo *TII) { + typedef DenseMap CountMap; + CountMap Counts; + + // Scan the constant and count how often every chunk occurs. + for (unsigned Idx = 0; Idx < 4; ++Idx) + ++Counts[getChunk(UImm, Idx)]; + + // Traverse the chunks to find one which occurs more than once. + for (CountMap::const_iterator Chunk = Counts.begin(), End = Counts.end(); + Chunk != End; ++Chunk) { + const uint64_t ChunkVal = Chunk->first; + const unsigned Count = Chunk->second; + + uint64_t Encoding = 0; + + // We are looking for chunks which have two or three instances and can be + // materialized with an ORR instruction. + if ((Count != 2 && Count != 3) || !canUseOrr(ChunkVal, Encoding)) + continue; + + const bool CountThree = Count == 3; + // Create the ORR-immediate instruction. + MachineInstrBuilder MIB = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXri)) + .addOperand(MI.getOperand(0)) + .addReg(AArch64::XZR) + .addImm(Encoding); + + const unsigned DstReg = MI.getOperand(0).getReg(); + const bool DstIsDead = MI.getOperand(0).isDead(); + + unsigned ShiftAmt = 0; + uint64_t Imm16 = 0; + // Find the first chunk not materialized with the ORR instruction. 
+ for (; ShiftAmt < 64; ShiftAmt += 16) { + Imm16 = (UImm >> ShiftAmt) & 0xFFFF; + + if (Imm16 != ChunkVal) + break; + } + + // Create the first MOVK instruction. + MachineInstrBuilder MIB1 = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi)) + .addReg(DstReg, + RegState::Define | getDeadRegState(DstIsDead && CountThree)) + .addReg(DstReg) + .addImm(Imm16) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt)); + + // In case we have three instances the whole constant is now materialized + // and we can exit. + if (CountThree) { + transferImpOps(MI, MIB, MIB1); + MI.eraseFromParent(); + return true; + } + + // Find the remaining chunk which needs to be materialized. + for (ShiftAmt += 16; ShiftAmt < 64; ShiftAmt += 16) { + Imm16 = (UImm >> ShiftAmt) & 0xFFFF; + + if (Imm16 != ChunkVal) + break; + } + + // Create the second MOVK instruction. + MachineInstrBuilder MIB2 = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi)) + .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) + .addReg(DstReg) + .addImm(Imm16) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt)); + + transferImpOps(MI, MIB, MIB2); + MI.eraseFromParent(); + return true; + } + + return false; +} + +/// \brief Check whether this chunk matches the pattern '1...0...'. This pattern +/// starts a contiguous sequence of ones if we look at the bits from the LSB +/// towards the MSB. +static bool isStartChunk(uint64_t Chunk) { + if (Chunk == 0 || Chunk == UINT64_MAX) + return false; + + return (CountLeadingOnes_64(Chunk) + countTrailingZeros(Chunk)) == 64; +} + +/// \brief Check whether this chunk matches the pattern '0...1...' This pattern +/// ends a contiguous sequence of ones if we look at the bits from the LSB +/// towards the MSB. +static bool isEndChunk(uint64_t Chunk) { + if (Chunk == 0 || Chunk == UINT64_MAX) + return false; + + return (countLeadingZeros(Chunk) + CountTrailingOnes_64(Chunk)) == 64; +} + +/// \brief Clear or set all bits in the chunk at the given index. +static uint64_t updateImm(uint64_t Imm, unsigned Idx, bool Clear) { + const uint64_t Mask = 0xFFFF; + + if (Clear) + // Clear chunk in the immediate. + Imm &= ~(Mask << (Idx * 16)); + else + // Set all bits in the immediate for the particular chunk. + Imm |= Mask << (Idx * 16); + + return Imm; +} + +/// \brief Check whether the constant contains a sequence of contiguous ones, +/// which might be interrupted by one or two chunks. If so, materialize the +/// sequence of contiguous ones with an ORR instruction. +/// Materialize the chunks which are either interrupting the sequence or outside +/// of the sequence with a MOVK instruction. +/// +/// Assuming S is a chunk which starts the sequence (1...0...), E is a chunk +/// which ends the sequence (0...1...). Then we are looking for constants which +/// contain at least one S and E chunk. +/// E.g. |E|A|B|S|, |A|E|B|S| or |A|B|E|S|. +/// +/// We are also looking for constants like |S|A|B|E| where the contiguous +/// sequence of ones wraps around the MSB into the LSB. +/// +static bool trySequenceOfOnes(uint64_t UImm, MachineInstr &MI, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + const AArch64InstrInfo *TII) { + const int NotSet = -1; + const uint64_t Mask = 0xFFFF; + + int StartIdx = NotSet; + int EndIdx = NotSet; + // Try to find the chunks which start/end a contiguous sequence of ones. + for (int Idx = 0; Idx < 4; ++Idx) { + int64_t Chunk = getChunk(UImm, Idx); + // Sign extend the 16-bit chunk to 64-bit. 
+ Chunk = (Chunk << 48) >> 48; + + if (isStartChunk(Chunk)) + StartIdx = Idx; + else if (isEndChunk(Chunk)) + EndIdx = Idx; + } + + // Early exit in case we can't find a start/end chunk. + if (StartIdx == NotSet || EndIdx == NotSet) + return false; + + // Outside of the contiguous sequence of ones everything needs to be zero. + uint64_t Outside = 0; + // Chunks between the start and end chunk need to have all their bits set. + uint64_t Inside = Mask; + + // If our contiguous sequence of ones wraps around from the MSB into the LSB, + // just swap indices and pretend we are materializing a contiguous sequence + // of zeros surrounded by a contiguous sequence of ones. + if (StartIdx > EndIdx) { + std::swap(StartIdx, EndIdx); + std::swap(Outside, Inside); + } + + uint64_t OrrImm = UImm; + int FirstMovkIdx = NotSet; + int SecondMovkIdx = NotSet; + + // Find out which chunks we need to patch up to obtain a contiguous sequence + // of ones. + for (int Idx = 0; Idx < 4; ++Idx) { + const uint64_t Chunk = getChunk(UImm, Idx); + + // Check whether we are looking at a chunk which is not part of the + // contiguous sequence of ones. + if ((Idx < StartIdx || EndIdx < Idx) && Chunk != Outside) { + OrrImm = updateImm(OrrImm, Idx, Outside == 0); + + // Remember the index we need to patch. + if (FirstMovkIdx == NotSet) + FirstMovkIdx = Idx; + else + SecondMovkIdx = Idx; + + // Check whether we are looking a chunk which is part of the contiguous + // sequence of ones. + } else if (Idx > StartIdx && Idx < EndIdx && Chunk != Inside) { + OrrImm = updateImm(OrrImm, Idx, Inside != Mask); + + // Remember the index we need to patch. + if (FirstMovkIdx == NotSet) + FirstMovkIdx = Idx; + else + SecondMovkIdx = Idx; + } + } + assert(FirstMovkIdx != NotSet && "Constant materializable with single ORR!"); + + // Create the ORR-immediate instruction. + uint64_t Encoding = 0; + AArch64_AM::processLogicalImmediate(OrrImm, 64, Encoding); + MachineInstrBuilder MIB = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXri)) + .addOperand(MI.getOperand(0)) + .addReg(AArch64::XZR) + .addImm(Encoding); + + const unsigned DstReg = MI.getOperand(0).getReg(); + const bool DstIsDead = MI.getOperand(0).isDead(); + + const bool SingleMovk = SecondMovkIdx == NotSet; + // Create the first MOVK instruction. + MachineInstrBuilder MIB1 = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi)) + .addReg(DstReg, + RegState::Define | getDeadRegState(DstIsDead && SingleMovk)) + .addReg(DstReg) + .addImm(getChunk(UImm, FirstMovkIdx)) + .addImm( + AArch64_AM::getShifterImm(AArch64_AM::LSL, FirstMovkIdx * 16)); + + // Early exit in case we only need to emit a single MOVK instruction. + if (SingleMovk) { + transferImpOps(MI, MIB, MIB1); + MI.eraseFromParent(); + return true; + } + + // Create the second MOVK instruction. + MachineInstrBuilder MIB2 = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi)) + .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) + .addReg(DstReg) + .addImm(getChunk(UImm, SecondMovkIdx)) + .addImm( + AArch64_AM::getShifterImm(AArch64_AM::LSL, SecondMovkIdx * 16)); + + transferImpOps(MI, MIB, MIB2); + MI.eraseFromParent(); + return true; +} + +/// \brief Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more +/// real move-immediate instructions to synthesize the immediate. 
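Before the expansion routine itself, the chunk bookkeeping above can be modelled outside of LLVM. The sketch below re-implements the getChunk/replicateChunk helpers as free functions and walks one |A|B|A|A|-shaped constant; the sample constant and main() are illustrative only, and checking that the replicated value really encodes as a logical immediate is left to processLogicalImmediate in the real code.

    // Standalone model of the 16-bit "chunk" view of a 64-bit immediate.
    // Chunk 0 is the least significant 16 bits, chunk 3 the most significant.
    #include <cstdint>
    #include <cstdio>

    static uint64_t getChunk(uint64_t Imm, unsigned Idx) {
      return (Imm >> (Idx * 16)) & 0xFFFF;
    }

    static uint64_t replicateChunk(uint64_t Imm, unsigned From, unsigned To) {
      uint64_t Chunk = getChunk(Imm, From) << (To * 16);
      Imm &= ~(0xFFFFULL << (To * 16)); // clear the destination chunk
      return Imm | Chunk;               // insert the replicated chunk
    }

    int main() {
      // An |A|B|A|A| constant with A = 0x00ff and B = 0x1234. Chunks 0, 2 and 3
      // are identical, so replicating chunk 3 into chunk 1 yields
      // 0x00ff00ff00ff00ff, a run of eight contiguous ones repeated in every
      // 16-bit element and therefore a valid logical (ORR) immediate.
      uint64_t Imm = 0x00ff00ff123400ffULL;
      uint64_t OrrImm = replicateChunk(Imm, 3, 1);
      std::printf("ORR-able base: %016llx\n", (unsigned long long)OrrImm);
      // The expansion would then be roughly:
      //   orr  x0, xzr, #0x00ff00ff00ff00ff
      //   movk x0, #0x1234, lsl #16   ; patch chunk 1 back to B
      std::printf("movk chunk 1:  %04llx\n",
                  (unsigned long long)getChunk(Imm, 1));
      return 0;
    }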
+bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + unsigned BitSize) { + MachineInstr &MI = *MBBI; + uint64_t Imm = MI.getOperand(1).getImm(); + const unsigned Mask = 0xFFFF; + + // Try a MOVI instruction (aka ORR-immediate with the zero register). + uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize); + uint64_t Encoding; + if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) { + unsigned Opc = (BitSize == 32 ? AArch64::ORRWri : AArch64::ORRXri); + MachineInstrBuilder MIB = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc)) + .addOperand(MI.getOperand(0)) + .addReg(BitSize == 32 ? AArch64::WZR : AArch64::XZR) + .addImm(Encoding); + transferImpOps(MI, MIB, MIB); + MI.eraseFromParent(); + return true; + } + + // Scan the immediate and count the number of 16-bit chunks which are either + // all ones or all zeros. + unsigned OneChunks = 0; + unsigned ZeroChunks = 0; + for (unsigned Shift = 0; Shift < BitSize; Shift += 16) { + const unsigned Chunk = (Imm >> Shift) & Mask; + if (Chunk == Mask) + OneChunks++; + else if (Chunk == 0) + ZeroChunks++; + } + + // Since we can't materialize the constant with a single ORR instruction, + // let's see whether we can materialize 3/4 of the constant with an ORR + // instruction and use an additional MOVK instruction to materialize the + // remaining 1/4. + // + // We are looking for constants with a pattern like: |A|X|B|X| or |X|A|X|B|. + // + // E.g. assuming |A|X|A|X| is a pattern which can be materialized with ORR, + // we would create the following instruction sequence: + // + // ORR x0, xzr, |A|X|A|X| + // MOVK x0, |B|, LSL #16 + // + // Only look at 64-bit constants which can't be materialized with a single + // instruction e.g. which have less than either three all zero or all one + // chunks. + // + // Ignore 32-bit constants here, they always can be materialized with a + // MOVZ/MOVN + MOVK pair. Since the 32-bit constant can't be materialized + // with a single ORR, the best sequence we can achieve is a ORR + MOVK pair. + // Thus we fall back to the default code below which in the best case creates + // a single MOVZ/MOVN instruction (in case one chunk is all zero or all one). + // + if (BitSize == 64 && OneChunks < 3 && ZeroChunks < 3) { + // If we interpret the 64-bit constant as a v4i16, are elements 0 and 2 + // identical? + if (getChunk(UImm, 0) == getChunk(UImm, 2)) { + // See if we can come up with a constant which can be materialized with + // ORR-immediate by replicating element 3 into element 1. + uint64_t OrrImm = replicateChunk(UImm, 3, 1); + if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 1)) + return true; + + // See if we can come up with a constant which can be materialized with + // ORR-immediate by replicating element 1 into element 3. + OrrImm = replicateChunk(UImm, 1, 3); + if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 3)) + return true; + + // If we interpret the 64-bit constant as a v4i16, are elements 1 and 3 + // identical? + } else if (getChunk(UImm, 1) == getChunk(UImm, 3)) { + // See if we can come up with a constant which can be materialized with + // ORR-immediate by replicating element 2 into element 0. + uint64_t OrrImm = replicateChunk(UImm, 2, 0); + if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 0)) + return true; + + // See if we can come up with a constant which can be materialized with + // ORR-immediate by replicating element 1 into element 3. 
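+      // (This second attempt actually replicates element 0 into element 2,
+      // mirroring the 2-into-0 case above; the MOVK then patches chunk 2.)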
+ OrrImm = replicateChunk(UImm, 0, 2); + if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 2)) + return true; + } + } + + // Check for identical 16-bit chunks within the constant and if so materialize + // them with a single ORR instruction. The remaining one or two 16-bit chunks + // will be materialized with MOVK instructions. + if (BitSize == 64 && tryToreplicateChunks(UImm, MI, MBB, MBBI, TII)) + return true; + + // Check whether the constant contains a sequence of contiguous ones, which + // might be interrupted by one or two chunks. If so, materialize the sequence + // of contiguous ones with an ORR instruction. Materialize the chunks which + // are either interrupting the sequence or outside of the sequence with a + // MOVK instruction. + if (BitSize == 64 && trySequenceOfOnes(UImm, MI, MBB, MBBI, TII)) + return true; + + // Use a MOVZ or MOVN instruction to set the high bits, followed by one or + // more MOVK instructions to insert additional 16-bit portions into the + // lower bits. + bool isNeg = false; + + // Use MOVN to materialize the high bits if we have more all one chunks + // than all zero chunks. + if (OneChunks > ZeroChunks) { + isNeg = true; + Imm = ~Imm; + } + + unsigned FirstOpc; + if (BitSize == 32) { + Imm &= (1LL << 32) - 1; + FirstOpc = (isNeg ? AArch64::MOVNWi : AArch64::MOVZWi); + } else { + FirstOpc = (isNeg ? AArch64::MOVNXi : AArch64::MOVZXi); + } + unsigned Shift = 0; // LSL amount for high bits with MOVZ/MOVN + unsigned LastShift = 0; // LSL amount for last MOVK + if (Imm != 0) { + unsigned LZ = countLeadingZeros(Imm); + unsigned TZ = countTrailingZeros(Imm); + Shift = ((63 - LZ) / 16) * 16; + LastShift = (TZ / 16) * 16; + } + unsigned Imm16 = (Imm >> Shift) & Mask; + unsigned DstReg = MI.getOperand(0).getReg(); + bool DstIsDead = MI.getOperand(0).isDead(); + MachineInstrBuilder MIB1 = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(FirstOpc)) + .addReg(DstReg, RegState::Define | + getDeadRegState(DstIsDead && Shift == LastShift)) + .addImm(Imm16) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift)); + + // If a MOVN was used for the high bits of a negative value, flip the rest + // of the bits back for use with MOVK. + if (isNeg) + Imm = ~Imm; + + if (Shift == LastShift) { + transferImpOps(MI, MIB1, MIB1); + MI.eraseFromParent(); + return true; + } + + MachineInstrBuilder MIB2; + unsigned Opc = (BitSize == 32 ? AArch64::MOVKWi : AArch64::MOVKXi); + while (Shift != LastShift) { + Shift -= 16; + Imm16 = (Imm >> Shift) & Mask; + if (Imm16 == (isNeg ? Mask : 0)) + continue; // This 16-bit portion is already set correctly. + MIB2 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc)) + .addReg(DstReg, + RegState::Define | + getDeadRegState(DstIsDead && Shift == LastShift)) + .addReg(DstReg) + .addImm(Imm16) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift)); + } + + transferImpOps(MI, MIB1, MIB2); + MI.eraseFromParent(); + return true; +} + +/// \brief If MBBI references a pseudo instruction that should be expanded here, +/// do the expansion and return true. Otherwise return false. 
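Before the per-opcode expansion below, the generic MOVZ/MOVN + MOVK fallback of expandMOVImm is worth seeing in isolation. The sketch that follows is a deliberately simplified model in plain C++: it only handles 64-bit immediates, skips the ORR special cases, and always seeds the low 16 bits first, whereas the pass above starts from the highest interesting chunk; the function name and the sample constants are made up.

    // Simplified model of the MOVZ/MOVN + MOVK fallback for a 64-bit immediate.
    #include <cstdint>
    #include <cstdio>

    static void expandMovImm64(uint64_t Imm) {
      unsigned OneChunks = 0, ZeroChunks = 0;
      for (unsigned Shift = 0; Shift < 64; Shift += 16) {
        uint64_t Chunk = (Imm >> Shift) & 0xFFFF;
        OneChunks  += (Chunk == 0xFFFF);
        ZeroChunks += (Chunk == 0);
      }

      // Prefer MOVN when more chunks are all-ones: the first instruction then
      // leaves every untouched chunk as 0xFFFF instead of 0.
      bool UseMovn = OneChunks > ZeroChunks;
      uint64_t Seed = UseMovn ? ~Imm : Imm;
      uint64_t Skip = UseMovn ? 0xFFFF : 0; // chunks already correct afterwards

      std::printf("%s x0, #0x%llx\n", UseMovn ? "movn" : "movz",
                  (unsigned long long)(Seed & 0xFFFF));
      for (unsigned Shift = 16; Shift < 64; Shift += 16) {
        uint64_t Chunk = (Imm >> Shift) & 0xFFFF;
        if (Chunk == Skip)
          continue; // this 16-bit portion is already correct
        std::printf("movk x0, #0x%llx, lsl #%u\n", (unsigned long long)Chunk,
                    Shift);
      }
    }

    int main() {
      expandMovImm64(0x000000001234abcdULL); // movz + one movk
      expandMovImm64(0xffffffffffff0123ULL); // a single movn suffices
      return 0;
    }

The OneChunks/ZeroChunks counts play the same role as in the pass: they decide whether seeding with all-ones (MOVN) or all-zeros (MOVZ) leaves fewer chunks for MOVK to patch.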
+bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) { + MachineInstr &MI = *MBBI; + unsigned Opcode = MI.getOpcode(); + switch (Opcode) { + default: + break; + + case AArch64::ADDWrr: + case AArch64::SUBWrr: + case AArch64::ADDXrr: + case AArch64::SUBXrr: + case AArch64::ADDSWrr: + case AArch64::SUBSWrr: + case AArch64::ADDSXrr: + case AArch64::SUBSXrr: + case AArch64::ANDWrr: + case AArch64::ANDXrr: + case AArch64::BICWrr: + case AArch64::BICXrr: + case AArch64::ANDSWrr: + case AArch64::ANDSXrr: + case AArch64::BICSWrr: + case AArch64::BICSXrr: + case AArch64::EONWrr: + case AArch64::EONXrr: + case AArch64::EORWrr: + case AArch64::EORXrr: + case AArch64::ORNWrr: + case AArch64::ORNXrr: + case AArch64::ORRWrr: + case AArch64::ORRXrr: { + unsigned Opcode; + switch (MI.getOpcode()) { + default: + return false; + case AArch64::ADDWrr: Opcode = AArch64::ADDWrs; break; + case AArch64::SUBWrr: Opcode = AArch64::SUBWrs; break; + case AArch64::ADDXrr: Opcode = AArch64::ADDXrs; break; + case AArch64::SUBXrr: Opcode = AArch64::SUBXrs; break; + case AArch64::ADDSWrr: Opcode = AArch64::ADDSWrs; break; + case AArch64::SUBSWrr: Opcode = AArch64::SUBSWrs; break; + case AArch64::ADDSXrr: Opcode = AArch64::ADDSXrs; break; + case AArch64::SUBSXrr: Opcode = AArch64::SUBSXrs; break; + case AArch64::ANDWrr: Opcode = AArch64::ANDWrs; break; + case AArch64::ANDXrr: Opcode = AArch64::ANDXrs; break; + case AArch64::BICWrr: Opcode = AArch64::BICWrs; break; + case AArch64::BICXrr: Opcode = AArch64::BICXrs; break; + case AArch64::ANDSWrr: Opcode = AArch64::ANDSWrs; break; + case AArch64::ANDSXrr: Opcode = AArch64::ANDSXrs; break; + case AArch64::BICSWrr: Opcode = AArch64::BICSWrs; break; + case AArch64::BICSXrr: Opcode = AArch64::BICSXrs; break; + case AArch64::EONWrr: Opcode = AArch64::EONWrs; break; + case AArch64::EONXrr: Opcode = AArch64::EONXrs; break; + case AArch64::EORWrr: Opcode = AArch64::EORWrs; break; + case AArch64::EORXrr: Opcode = AArch64::EORXrs; break; + case AArch64::ORNWrr: Opcode = AArch64::ORNWrs; break; + case AArch64::ORNXrr: Opcode = AArch64::ORNXrs; break; + case AArch64::ORRWrr: Opcode = AArch64::ORRWrs; break; + case AArch64::ORRXrr: Opcode = AArch64::ORRXrs; break; + } + MachineInstrBuilder MIB1 = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opcode), + MI.getOperand(0).getReg()) + .addOperand(MI.getOperand(1)) + .addOperand(MI.getOperand(2)) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); + transferImpOps(MI, MIB1, MIB1); + MI.eraseFromParent(); + return true; + } + + case AArch64::FCVTSHpseudo: { + MachineOperand Src = MI.getOperand(1); + Src.setImplicit(); + unsigned SrcH = + TII->getRegisterInfo().getSubReg(Src.getReg(), AArch64::hsub); + auto MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::FCVTSHr)) + .addOperand(MI.getOperand(0)) + .addReg(SrcH, RegState::Undef) + .addOperand(Src); + transferImpOps(MI, MIB, MIB); + MI.eraseFromParent(); + return true; + } + case AArch64::LOADgot: { + // Expand into ADRP + LDR. 
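+    // Roughly, the pair built below is
+    //   adrp x0, :got:sym              ; x0 = 4 KiB page of sym's GOT slot
+    //   ldr  x0, [x0, :got_lo12:sym]   ; load the pointer from that slot
+    // MO_PAGE selects the page-relative high part of the address and
+    // MO_PAGEOFF | MO_NC the low 12 bits, with no overflow check on them.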
+ unsigned DstReg = MI.getOperand(0).getReg(); + const MachineOperand &MO1 = MI.getOperand(1); + unsigned Flags = MO1.getTargetFlags(); + MachineInstrBuilder MIB1 = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADRP), DstReg); + MachineInstrBuilder MIB2 = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::LDRXui)) + .addOperand(MI.getOperand(0)) + .addReg(DstReg); + + if (MO1.isGlobal()) { + MIB1.addGlobalAddress(MO1.getGlobal(), 0, Flags | AArch64II::MO_PAGE); + MIB2.addGlobalAddress(MO1.getGlobal(), 0, + Flags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); + } else if (MO1.isSymbol()) { + MIB1.addExternalSymbol(MO1.getSymbolName(), Flags | AArch64II::MO_PAGE); + MIB2.addExternalSymbol(MO1.getSymbolName(), + Flags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); + } else { + assert(MO1.isCPI() && + "Only expect globals, externalsymbols, or constant pools"); + MIB1.addConstantPoolIndex(MO1.getIndex(), MO1.getOffset(), + Flags | AArch64II::MO_PAGE); + MIB2.addConstantPoolIndex(MO1.getIndex(), MO1.getOffset(), + Flags | AArch64II::MO_PAGEOFF | + AArch64II::MO_NC); + } + + transferImpOps(MI, MIB1, MIB2); + MI.eraseFromParent(); + return true; + } + + case AArch64::MOVaddr: + case AArch64::MOVaddrJT: + case AArch64::MOVaddrCP: + case AArch64::MOVaddrBA: + case AArch64::MOVaddrTLS: + case AArch64::MOVaddrEXT: { + // Expand into ADRP + ADD. + unsigned DstReg = MI.getOperand(0).getReg(); + MachineInstrBuilder MIB1 = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADRP), DstReg) + .addOperand(MI.getOperand(1)); + + MachineInstrBuilder MIB2 = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADDXri)) + .addOperand(MI.getOperand(0)) + .addReg(DstReg) + .addOperand(MI.getOperand(2)) + .addImm(0); + + transferImpOps(MI, MIB1, MIB2); + MI.eraseFromParent(); + return true; + } + + case AArch64::MOVi32imm: + return expandMOVImm(MBB, MBBI, 32); + case AArch64::MOVi64imm: + return expandMOVImm(MBB, MBBI, 64); + case AArch64::RET_ReallyLR: + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::RET)) + .addReg(AArch64::LR); + MI.eraseFromParent(); + return true; + } + return false; +} + +/// \brief Iterate over the instructions in basic block MBB and expand any +/// pseudo instructions. Return true if anything was modified. +bool AArch64ExpandPseudo::expandMBB(MachineBasicBlock &MBB) { + bool Modified = false; + + MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); + while (MBBI != E) { + MachineBasicBlock::iterator NMBBI = std::next(MBBI); + Modified |= expandMI(MBB, MBBI); + MBBI = NMBBI; + } + + return Modified; +} + +bool AArch64ExpandPseudo::runOnMachineFunction(MachineFunction &MF) { + TII = static_cast(MF.getTarget().getInstrInfo()); + + bool Modified = false; + for (auto &MBB : MF) + Modified |= expandMBB(MBB); + return Modified; +} + +/// \brief Returns an instance of the pseudo instruction expansion pass. +FunctionPass *llvm::createAArch64ExpandPseudoPass() { + return new AArch64ExpandPseudo(); +} diff --git a/lib/Target/AArch64/AArch64FastISel.cpp b/lib/Target/AArch64/AArch64FastISel.cpp new file mode 100644 index 00000000000..58178b1a48b --- /dev/null +++ b/lib/Target/AArch64/AArch64FastISel.cpp @@ -0,0 +1,1977 @@ +//===-- AArch6464FastISel.cpp - AArch64 FastISel implementation -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file defines the AArch64-specific support for the FastISel class. Some +// of the target-specific code is generated by tablegen in the file +// AArch64GenFastISel.inc, which is #included here. +// +//===----------------------------------------------------------------------===// + +#include "AArch64.h" +#include "AArch64TargetMachine.h" +#include "AArch64Subtarget.h" +#include "AArch64CallingConv.h" +#include "MCTargetDesc/AArch64AddressingModes.h" +#include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/FastISel.h" +#include "llvm/CodeGen/FunctionLoweringInfo.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GetElementPtrTypeIterator.h" +#include "llvm/IR/GlobalAlias.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Operator.h" +#include "llvm/Support/CommandLine.h" +using namespace llvm; + +namespace { + +class AArch64FastISel : public FastISel { + + class Address { + public: + typedef enum { + RegBase, + FrameIndexBase + } BaseKind; + + private: + BaseKind Kind; + union { + unsigned Reg; + int FI; + } Base; + int64_t Offset; + + public: + Address() : Kind(RegBase), Offset(0) { Base.Reg = 0; } + void setKind(BaseKind K) { Kind = K; } + BaseKind getKind() const { return Kind; } + bool isRegBase() const { return Kind == RegBase; } + bool isFIBase() const { return Kind == FrameIndexBase; } + void setReg(unsigned Reg) { + assert(isRegBase() && "Invalid base register access!"); + Base.Reg = Reg; + } + unsigned getReg() const { + assert(isRegBase() && "Invalid base register access!"); + return Base.Reg; + } + void setFI(unsigned FI) { + assert(isFIBase() && "Invalid base frame index access!"); + Base.FI = FI; + } + unsigned getFI() const { + assert(isFIBase() && "Invalid base frame index access!"); + return Base.FI; + } + void setOffset(int64_t O) { Offset = O; } + int64_t getOffset() { return Offset; } + + bool isValid() { return isFIBase() || (isRegBase() && getReg() != 0); } + }; + + /// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can + /// make the right decision when generating code for different targets. + const AArch64Subtarget *Subtarget; + LLVMContext *Context; + +private: + // Selection routines. + bool SelectLoad(const Instruction *I); + bool SelectStore(const Instruction *I); + bool SelectBranch(const Instruction *I); + bool SelectIndirectBr(const Instruction *I); + bool SelectCmp(const Instruction *I); + bool SelectSelect(const Instruction *I); + bool SelectFPExt(const Instruction *I); + bool SelectFPTrunc(const Instruction *I); + bool SelectFPToInt(const Instruction *I, bool Signed); + bool SelectIntToFP(const Instruction *I, bool Signed); + bool SelectRem(const Instruction *I, unsigned ISDOpcode); + bool SelectCall(const Instruction *I, const char *IntrMemName); + bool SelectIntrinsicCall(const IntrinsicInst &I); + bool SelectRet(const Instruction *I); + bool SelectTrunc(const Instruction *I); + bool SelectIntExt(const Instruction *I); + bool SelectMul(const Instruction *I); + + // Utility helper routines. 
+ bool isTypeLegal(Type *Ty, MVT &VT); + bool isLoadStoreTypeLegal(Type *Ty, MVT &VT); + bool ComputeAddress(const Value *Obj, Address &Addr); + bool SimplifyAddress(Address &Addr, MVT VT, int64_t ScaleFactor, + bool UseUnscaled); + void AddLoadStoreOperands(Address &Addr, const MachineInstrBuilder &MIB, + unsigned Flags, bool UseUnscaled); + bool IsMemCpySmall(uint64_t Len, unsigned Alignment); + bool TryEmitSmallMemCpy(Address Dest, Address Src, uint64_t Len, + unsigned Alignment); + // Emit functions. + bool EmitCmp(Value *Src1Value, Value *Src2Value, bool isZExt); + bool EmitLoad(MVT VT, unsigned &ResultReg, Address Addr, + bool UseUnscaled = false); + bool EmitStore(MVT VT, unsigned SrcReg, Address Addr, + bool UseUnscaled = false); + unsigned EmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, bool isZExt); + unsigned Emiti1Ext(unsigned SrcReg, MVT DestVT, bool isZExt); + + unsigned AArch64MaterializeFP(const ConstantFP *CFP, MVT VT); + unsigned AArch64MaterializeGV(const GlobalValue *GV); + + // Call handling routines. +private: + CCAssignFn *CCAssignFnForCall(CallingConv::ID CC) const; + bool ProcessCallArgs(SmallVectorImpl &Args, + SmallVectorImpl &ArgRegs, + SmallVectorImpl &ArgVTs, + SmallVectorImpl &ArgFlags, + SmallVectorImpl &RegArgs, CallingConv::ID CC, + unsigned &NumBytes); + bool FinishCall(MVT RetVT, SmallVectorImpl &UsedRegs, + const Instruction *I, CallingConv::ID CC, unsigned &NumBytes); + +public: + // Backend specific FastISel code. + unsigned TargetMaterializeAlloca(const AllocaInst *AI) override; + unsigned TargetMaterializeConstant(const Constant *C) override; + + explicit AArch64FastISel(FunctionLoweringInfo &funcInfo, + const TargetLibraryInfo *libInfo) + : FastISel(funcInfo, libInfo) { + Subtarget = &TM.getSubtarget(); + Context = &funcInfo.Fn->getContext(); + } + + bool TargetSelectInstruction(const Instruction *I) override; + +#include "AArch64GenFastISel.inc" +}; + +} // end anonymous namespace + +#include "AArch64GenCallingConv.inc" + +CCAssignFn *AArch64FastISel::CCAssignFnForCall(CallingConv::ID CC) const { + if (CC == CallingConv::WebKit_JS) + return CC_AArch64_WebKit_JS; + return Subtarget->isTargetDarwin() ? CC_AArch64_DarwinPCS : CC_AArch64_AAPCS; +} + +unsigned AArch64FastISel::TargetMaterializeAlloca(const AllocaInst *AI) { + assert(TLI.getValueType(AI->getType(), true) == MVT::i64 && + "Alloca should always return a pointer."); + + // Don't handle dynamic allocas. + if (!FuncInfo.StaticAllocaMap.count(AI)) + return 0; + + DenseMap::iterator SI = + FuncInfo.StaticAllocaMap.find(AI); + + if (SI != FuncInfo.StaticAllocaMap.end()) { + unsigned ResultReg = createResultReg(&AArch64::GPR64RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADDXri), + ResultReg) + .addFrameIndex(SI->second) + .addImm(0) + .addImm(0); + return ResultReg; + } + + return 0; +} + +unsigned AArch64FastISel::AArch64MaterializeFP(const ConstantFP *CFP, MVT VT) { + if (VT != MVT::f32 && VT != MVT::f64) + return 0; + + const APFloat Val = CFP->getValueAPF(); + bool is64bit = (VT == MVT::f64); + + // This checks to see if we can use FMOV instructions to materialize + // a constant, otherwise we have to materialize via the constant pool. 
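+  // FMOV (immediate) can only encode an 8-bit floating-point immediate: a sign
+  // bit, a 3-bit exponent and a 4-bit fraction, i.e. values of the form
+  // +/-(16 + fraction)/16 * 2^exp with exp in [-3, 4]. Constants like 1.0, 0.5
+  // or -2.0 qualify; everything else takes the ADRP + load path below.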
+ if (TLI.isFPImmLegal(Val, VT)) { + int Imm; + unsigned Opc; + if (is64bit) { + Imm = AArch64_AM::getFP64Imm(Val); + Opc = AArch64::FMOVDi; + } else { + Imm = AArch64_AM::getFP32Imm(Val); + Opc = AArch64::FMOVSi; + } + unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT)); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) + .addImm(Imm); + return ResultReg; + } + + // Materialize via constant pool. MachineConstantPool wants an explicit + // alignment. + unsigned Align = DL.getPrefTypeAlignment(CFP->getType()); + if (Align == 0) + Align = DL.getTypeAllocSize(CFP->getType()); + + unsigned Idx = MCP.getConstantPoolIndex(cast(CFP), Align); + unsigned ADRPReg = createResultReg(&AArch64::GPR64commonRegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP), + ADRPReg).addConstantPoolIndex(Idx, 0, AArch64II::MO_PAGE); + + unsigned Opc = is64bit ? AArch64::LDRDui : AArch64::LDRSui; + unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT)); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) + .addReg(ADRPReg) + .addConstantPoolIndex(Idx, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC); + return ResultReg; +} + +unsigned AArch64FastISel::AArch64MaterializeGV(const GlobalValue *GV) { + // We can't handle thread-local variables quickly yet. Unfortunately we have + // to peer through any aliases to find out if that rule applies. + const GlobalValue *TLSGV = GV; + if (const GlobalAlias *GA = dyn_cast(GV)) + TLSGV = GA->getAliasee(); + + if (const GlobalVariable *GVar = dyn_cast(TLSGV)) + if (GVar->isThreadLocal()) + return 0; + + unsigned char OpFlags = Subtarget->ClassifyGlobalReference(GV, TM); + + EVT DestEVT = TLI.getValueType(GV->getType(), true); + if (!DestEVT.isSimple()) + return 0; + + unsigned ADRPReg = createResultReg(&AArch64::GPR64commonRegClass); + unsigned ResultReg; + + if (OpFlags & AArch64II::MO_GOT) { + // ADRP + LDRX + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP), + ADRPReg) + .addGlobalAddress(GV, 0, AArch64II::MO_GOT | AArch64II::MO_PAGE); + + ResultReg = createResultReg(&AArch64::GPR64RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::LDRXui), + ResultReg) + .addReg(ADRPReg) + .addGlobalAddress(GV, 0, AArch64II::MO_GOT | AArch64II::MO_PAGEOFF | + AArch64II::MO_NC); + } else { + // ADRP + ADDX + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP), + ADRPReg).addGlobalAddress(GV, 0, AArch64II::MO_PAGE); + + ResultReg = createResultReg(&AArch64::GPR64spRegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADDXri), + ResultReg) + .addReg(ADRPReg) + .addGlobalAddress(GV, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC) + .addImm(0); + } + return ResultReg; +} + +unsigned AArch64FastISel::TargetMaterializeConstant(const Constant *C) { + EVT CEVT = TLI.getValueType(C->getType(), true); + + // Only handle simple types. + if (!CEVT.isSimple()) + return 0; + MVT VT = CEVT.getSimpleVT(); + + // FIXME: Handle ConstantInt. + if (const ConstantFP *CFP = dyn_cast(C)) + return AArch64MaterializeFP(CFP, VT); + else if (const GlobalValue *GV = dyn_cast(C)) + return AArch64MaterializeGV(GV); + + return 0; +} + +// Computes the address to get to an object. 
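The next routine folds as much of an address computation as it can into a base plus a constant byte offset. A toy model of just the constant-folding part, using a made-up index/size description instead of LLVM's DataLayout and StructLayout, is sketched here; the real code below also walks bitcasts, no-op int/ptr conversions and allocas.

    // Toy model of folding constant GEP-style indices into a byte offset.
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    struct IndexStep {
      uint64_t ElementSize; // size in bytes of the indexed element
      int64_t  Index;       // constant index value
    };

    static int64_t foldConstantOffset(const std::vector<IndexStep> &Steps) {
      int64_t Offset = 0;
      for (const IndexStep &S : Steps)
        Offset += S.Index * (int64_t)S.ElementSize;
      return Offset;
    }

    int main() {
      // Roughly "&base[2].field" for a 24-byte element with the field at +8.
      std::vector<IndexStep> Steps = {{24, 2}, {1, 8}};
      std::printf("folded offset = %lld bytes\n",
                  (long long)foldConstantOffset(Steps));
      return 0;
    }

When an index cannot be reduced to a constant, ComputeAddress gives up on folding (the unsupported_gep path) and instead leaves the address to be materialized in a register.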
+bool AArch64FastISel::ComputeAddress(const Value *Obj, Address &Addr) { + const User *U = nullptr; + unsigned Opcode = Instruction::UserOp1; + if (const Instruction *I = dyn_cast(Obj)) { + // Don't walk into other basic blocks unless the object is an alloca from + // another block, otherwise it may not have a virtual register assigned. + if (FuncInfo.StaticAllocaMap.count(static_cast(Obj)) || + FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) { + Opcode = I->getOpcode(); + U = I; + } + } else if (const ConstantExpr *C = dyn_cast(Obj)) { + Opcode = C->getOpcode(); + U = C; + } + + if (const PointerType *Ty = dyn_cast(Obj->getType())) + if (Ty->getAddressSpace() > 255) + // Fast instruction selection doesn't support the special + // address spaces. + return false; + + switch (Opcode) { + default: + break; + case Instruction::BitCast: { + // Look through bitcasts. + return ComputeAddress(U->getOperand(0), Addr); + } + case Instruction::IntToPtr: { + // Look past no-op inttoptrs. + if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy()) + return ComputeAddress(U->getOperand(0), Addr); + break; + } + case Instruction::PtrToInt: { + // Look past no-op ptrtoints. + if (TLI.getValueType(U->getType()) == TLI.getPointerTy()) + return ComputeAddress(U->getOperand(0), Addr); + break; + } + case Instruction::GetElementPtr: { + Address SavedAddr = Addr; + uint64_t TmpOffset = Addr.getOffset(); + + // Iterate through the GEP folding the constants into offsets where + // we can. + gep_type_iterator GTI = gep_type_begin(U); + for (User::const_op_iterator i = U->op_begin() + 1, e = U->op_end(); i != e; + ++i, ++GTI) { + const Value *Op = *i; + if (StructType *STy = dyn_cast(*GTI)) { + const StructLayout *SL = DL.getStructLayout(STy); + unsigned Idx = cast(Op)->getZExtValue(); + TmpOffset += SL->getElementOffset(Idx); + } else { + uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType()); + for (;;) { + if (const ConstantInt *CI = dyn_cast(Op)) { + // Constant-offset addressing. + TmpOffset += CI->getSExtValue() * S; + break; + } + if (canFoldAddIntoGEP(U, Op)) { + // A compatible add with a constant operand. Fold the constant. + ConstantInt *CI = + cast(cast(Op)->getOperand(1)); + TmpOffset += CI->getSExtValue() * S; + // Iterate on the other operand. + Op = cast(Op)->getOperand(0); + continue; + } + // Unsupported + goto unsupported_gep; + } + } + } + + // Try to grab the base operand now. + Addr.setOffset(TmpOffset); + if (ComputeAddress(U->getOperand(0), Addr)) + return true; + + // We failed, restore everything and try the other options. + Addr = SavedAddr; + + unsupported_gep: + break; + } + case Instruction::Alloca: { + const AllocaInst *AI = cast(Obj); + DenseMap::iterator SI = + FuncInfo.StaticAllocaMap.find(AI); + if (SI != FuncInfo.StaticAllocaMap.end()) { + Addr.setKind(Address::FrameIndexBase); + Addr.setFI(SI->second); + return true; + } + break; + } + } + + // Try to get this in a register if nothing else has worked. + if (!Addr.isValid()) + Addr.setReg(getRegForValue(Obj)); + return Addr.isValid(); +} + +bool AArch64FastISel::isTypeLegal(Type *Ty, MVT &VT) { + EVT evt = TLI.getValueType(Ty, true); + + // Only handle simple types. + if (evt == MVT::Other || !evt.isSimple()) + return false; + VT = evt.getSimpleVT(); + + // This is a legal type, but it's not something we handle in fast-isel. + if (VT == MVT::f128) + return false; + + // Handle all other legal types, i.e. a register that will directly hold this + // value. 
+ return TLI.isTypeLegal(VT); +} + +bool AArch64FastISel::isLoadStoreTypeLegal(Type *Ty, MVT &VT) { + if (isTypeLegal(Ty, VT)) + return true; + + // If this is a type than can be sign or zero-extended to a basic operation + // go ahead and accept it now. For stores, this reflects truncation. + if (VT == MVT::i1 || VT == MVT::i8 || VT == MVT::i16) + return true; + + return false; +} + +bool AArch64FastISel::SimplifyAddress(Address &Addr, MVT VT, + int64_t ScaleFactor, bool UseUnscaled) { + bool needsLowering = false; + int64_t Offset = Addr.getOffset(); + switch (VT.SimpleTy) { + default: + return false; + case MVT::i1: + case MVT::i8: + case MVT::i16: + case MVT::i32: + case MVT::i64: + case MVT::f32: + case MVT::f64: + if (!UseUnscaled) + // Using scaled, 12-bit, unsigned immediate offsets. + needsLowering = ((Offset & 0xfff) != Offset); + else + // Using unscaled, 9-bit, signed immediate offsets. + needsLowering = (Offset > 256 || Offset < -256); + break; + } + + // FIXME: If this is a stack pointer and the offset needs to be simplified + // then put the alloca address into a register, set the base type back to + // register and continue. This should almost never happen. + if (needsLowering && Addr.getKind() == Address::FrameIndexBase) { + return false; + } + + // Since the offset is too large for the load/store instruction get the + // reg+offset into a register. + if (needsLowering) { + uint64_t UnscaledOffset = Addr.getOffset() * ScaleFactor; + unsigned ResultReg = FastEmit_ri_(MVT::i64, ISD::ADD, Addr.getReg(), false, + UnscaledOffset, MVT::i64); + if (ResultReg == 0) + return false; + Addr.setReg(ResultReg); + Addr.setOffset(0); + } + return true; +} + +void AArch64FastISel::AddLoadStoreOperands(Address &Addr, + const MachineInstrBuilder &MIB, + unsigned Flags, bool UseUnscaled) { + int64_t Offset = Addr.getOffset(); + // Frame base works a bit differently. Handle it separately. + if (Addr.getKind() == Address::FrameIndexBase) { + int FI = Addr.getFI(); + // FIXME: We shouldn't be using getObjectSize/getObjectAlignment. The size + // and alignment should be based on the VT. + MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand( + MachinePointerInfo::getFixedStack(FI, Offset), Flags, + MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); + // Now add the rest of the operands. + MIB.addFrameIndex(FI).addImm(Offset).addMemOperand(MMO); + } else { + // Now add the rest of the operands. + MIB.addReg(Addr.getReg()); + MIB.addImm(Offset); + } +} + +bool AArch64FastISel::EmitLoad(MVT VT, unsigned &ResultReg, Address Addr, + bool UseUnscaled) { + // Negative offsets require unscaled, 9-bit, signed immediate offsets. + // Otherwise, we try using scaled, 12-bit, unsigned immediate offsets. + if (!UseUnscaled && Addr.getOffset() < 0) + UseUnscaled = true; + + unsigned Opc; + const TargetRegisterClass *RC; + bool VTIsi1 = false; + int64_t ScaleFactor = 0; + switch (VT.SimpleTy) { + default: + return false; + case MVT::i1: + VTIsi1 = true; + // Intentional fall-through. + case MVT::i8: + Opc = UseUnscaled ? AArch64::LDURBBi : AArch64::LDRBBui; + RC = &AArch64::GPR32RegClass; + ScaleFactor = 1; + break; + case MVT::i16: + Opc = UseUnscaled ? AArch64::LDURHHi : AArch64::LDRHHui; + RC = &AArch64::GPR32RegClass; + ScaleFactor = 2; + break; + case MVT::i32: + Opc = UseUnscaled ? AArch64::LDURWi : AArch64::LDRWui; + RC = &AArch64::GPR32RegClass; + ScaleFactor = 4; + break; + case MVT::i64: + Opc = UseUnscaled ? 
AArch64::LDURXi : AArch64::LDRXui; + RC = &AArch64::GPR64RegClass; + ScaleFactor = 8; + break; + case MVT::f32: + Opc = UseUnscaled ? AArch64::LDURSi : AArch64::LDRSui; + RC = TLI.getRegClassFor(VT); + ScaleFactor = 4; + break; + case MVT::f64: + Opc = UseUnscaled ? AArch64::LDURDi : AArch64::LDRDui; + RC = TLI.getRegClassFor(VT); + ScaleFactor = 8; + break; + } + // Scale the offset. + if (!UseUnscaled) { + int64_t Offset = Addr.getOffset(); + if (Offset & (ScaleFactor - 1)) + // Retry using an unscaled, 9-bit, signed immediate offset. + return EmitLoad(VT, ResultReg, Addr, /*UseUnscaled*/ true); + + Addr.setOffset(Offset / ScaleFactor); + } + + // Simplify this down to something we can handle. + if (!SimplifyAddress(Addr, VT, UseUnscaled ? 1 : ScaleFactor, UseUnscaled)) + return false; + + // Create the base instruction, then add the operands. + ResultReg = createResultReg(RC); + MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(Opc), ResultReg); + AddLoadStoreOperands(Addr, MIB, MachineMemOperand::MOLoad, UseUnscaled); + + // Loading an i1 requires special handling. + if (VTIsi1) { + MRI.constrainRegClass(ResultReg, &AArch64::GPR32RegClass); + unsigned ANDReg = createResultReg(&AArch64::GPR32spRegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ANDWri), + ANDReg) + .addReg(ResultReg) + .addImm(AArch64_AM::encodeLogicalImmediate(1, 32)); + ResultReg = ANDReg; + } + return true; +} + +bool AArch64FastISel::SelectLoad(const Instruction *I) { + MVT VT; + // Verify we have a legal type before going any further. Currently, we handle + // simple types that will directly fit in a register (i32/f32/i64/f64) or + // those that can be sign or zero-extended to a basic operation (i1/i8/i16). + if (!isLoadStoreTypeLegal(I->getType(), VT) || cast(I)->isAtomic()) + return false; + + // See if we can handle this address. + Address Addr; + if (!ComputeAddress(I->getOperand(0), Addr)) + return false; + + unsigned ResultReg; + if (!EmitLoad(VT, ResultReg, Addr)) + return false; + + UpdateValueMap(I, ResultReg); + return true; +} + +bool AArch64FastISel::EmitStore(MVT VT, unsigned SrcReg, Address Addr, + bool UseUnscaled) { + // Negative offsets require unscaled, 9-bit, signed immediate offsets. + // Otherwise, we try using scaled, 12-bit, unsigned immediate offsets. + if (!UseUnscaled && Addr.getOffset() < 0) + UseUnscaled = true; + + unsigned StrOpc; + bool VTIsi1 = false; + int64_t ScaleFactor = 0; + // Using scaled, 12-bit, unsigned immediate offsets. + switch (VT.SimpleTy) { + default: + return false; + case MVT::i1: + VTIsi1 = true; + case MVT::i8: + StrOpc = UseUnscaled ? AArch64::STURBBi : AArch64::STRBBui; + ScaleFactor = 1; + break; + case MVT::i16: + StrOpc = UseUnscaled ? AArch64::STURHHi : AArch64::STRHHui; + ScaleFactor = 2; + break; + case MVT::i32: + StrOpc = UseUnscaled ? AArch64::STURWi : AArch64::STRWui; + ScaleFactor = 4; + break; + case MVT::i64: + StrOpc = UseUnscaled ? AArch64::STURXi : AArch64::STRXui; + ScaleFactor = 8; + break; + case MVT::f32: + StrOpc = UseUnscaled ? AArch64::STURSi : AArch64::STRSui; + ScaleFactor = 4; + break; + case MVT::f64: + StrOpc = UseUnscaled ? AArch64::STURDi : AArch64::STRDui; + ScaleFactor = 8; + break; + } + // Scale the offset. + if (!UseUnscaled) { + int64_t Offset = Addr.getOffset(); + if (Offset & (ScaleFactor - 1)) + // Retry using an unscaled, 9-bit, signed immediate offset. 
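+      // (The scaled STR forms encode an unsigned 12-bit offset in units of the
+      // access size, so an offset that is negative or not a multiple of the
+      // size can only use the unscaled STUR forms, whose immediate is a signed
+      // 9-bit byte offset in the range -256 to +255.)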
+ return EmitStore(VT, SrcReg, Addr, /*UseUnscaled*/ true); + + Addr.setOffset(Offset / ScaleFactor); + } + + // Simplify this down to something we can handle. + if (!SimplifyAddress(Addr, VT, UseUnscaled ? 1 : ScaleFactor, UseUnscaled)) + return false; + + // Storing an i1 requires special handling. + if (VTIsi1) { + MRI.constrainRegClass(SrcReg, &AArch64::GPR32RegClass); + unsigned ANDReg = createResultReg(&AArch64::GPR32spRegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ANDWri), + ANDReg) + .addReg(SrcReg) + .addImm(AArch64_AM::encodeLogicalImmediate(1, 32)); + SrcReg = ANDReg; + } + // Create the base instruction, then add the operands. + MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(StrOpc)).addReg(SrcReg); + AddLoadStoreOperands(Addr, MIB, MachineMemOperand::MOStore, UseUnscaled); + return true; +} + +bool AArch64FastISel::SelectStore(const Instruction *I) { + MVT VT; + Value *Op0 = I->getOperand(0); + // Verify we have a legal type before going any further. Currently, we handle + // simple types that will directly fit in a register (i32/f32/i64/f64) or + // those that can be sign or zero-extended to a basic operation (i1/i8/i16). + if (!isLoadStoreTypeLegal(Op0->getType(), VT) || + cast(I)->isAtomic()) + return false; + + // Get the value to be stored into a register. + unsigned SrcReg = getRegForValue(Op0); + if (SrcReg == 0) + return false; + + // See if we can handle this address. + Address Addr; + if (!ComputeAddress(I->getOperand(1), Addr)) + return false; + + if (!EmitStore(VT, SrcReg, Addr)) + return false; + return true; +} + +static AArch64CC::CondCode getCompareCC(CmpInst::Predicate Pred) { + switch (Pred) { + case CmpInst::FCMP_ONE: + case CmpInst::FCMP_UEQ: + default: + // AL is our "false" for now. The other two need more compares. + return AArch64CC::AL; + case CmpInst::ICMP_EQ: + case CmpInst::FCMP_OEQ: + return AArch64CC::EQ; + case CmpInst::ICMP_SGT: + case CmpInst::FCMP_OGT: + return AArch64CC::GT; + case CmpInst::ICMP_SGE: + case CmpInst::FCMP_OGE: + return AArch64CC::GE; + case CmpInst::ICMP_UGT: + case CmpInst::FCMP_UGT: + return AArch64CC::HI; + case CmpInst::FCMP_OLT: + return AArch64CC::MI; + case CmpInst::ICMP_ULE: + case CmpInst::FCMP_OLE: + return AArch64CC::LS; + case CmpInst::FCMP_ORD: + return AArch64CC::VC; + case CmpInst::FCMP_UNO: + return AArch64CC::VS; + case CmpInst::FCMP_UGE: + return AArch64CC::PL; + case CmpInst::ICMP_SLT: + case CmpInst::FCMP_ULT: + return AArch64CC::LT; + case CmpInst::ICMP_SLE: + case CmpInst::FCMP_ULE: + return AArch64CC::LE; + case CmpInst::FCMP_UNE: + case CmpInst::ICMP_NE: + return AArch64CC::NE; + case CmpInst::ICMP_UGE: + return AArch64CC::HS; + case CmpInst::ICMP_ULT: + return AArch64CC::LO; + } +} + +bool AArch64FastISel::SelectBranch(const Instruction *I) { + const BranchInst *BI = cast(I); + MachineBasicBlock *TBB = FuncInfo.MBBMap[BI->getSuccessor(0)]; + MachineBasicBlock *FBB = FuncInfo.MBBMap[BI->getSuccessor(1)]; + + if (const CmpInst *CI = dyn_cast(BI->getCondition())) { + if (CI->hasOneUse() && (CI->getParent() == I->getParent())) { + // We may not handle every CC for now. + AArch64CC::CondCode CC = getCompareCC(CI->getPredicate()); + if (CC == AArch64CC::AL) + return false; + + // Emit the cmp. + if (!EmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned())) + return false; + + // Emit the branch. 
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc)) + .addImm(CC) + .addMBB(TBB); + FuncInfo.MBB->addSuccessor(TBB); + + FastEmitBranch(FBB, DbgLoc); + return true; + } + } else if (TruncInst *TI = dyn_cast(BI->getCondition())) { + MVT SrcVT; + if (TI->hasOneUse() && TI->getParent() == I->getParent() && + (isLoadStoreTypeLegal(TI->getOperand(0)->getType(), SrcVT))) { + unsigned CondReg = getRegForValue(TI->getOperand(0)); + if (CondReg == 0) + return false; + + // Issue an extract_subreg to get the lower 32-bits. + if (SrcVT == MVT::i64) + CondReg = FastEmitInst_extractsubreg(MVT::i32, CondReg, /*Kill=*/true, + AArch64::sub_32); + + MRI.constrainRegClass(CondReg, &AArch64::GPR32RegClass); + unsigned ANDReg = createResultReg(&AArch64::GPR32spRegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(AArch64::ANDWri), ANDReg) + .addReg(CondReg) + .addImm(AArch64_AM::encodeLogicalImmediate(1, 32)); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(AArch64::SUBSWri)) + .addReg(ANDReg) + .addReg(ANDReg) + .addImm(0) + .addImm(0); + + unsigned CC = AArch64CC::NE; + if (FuncInfo.MBB->isLayoutSuccessor(TBB)) { + std::swap(TBB, FBB); + CC = AArch64CC::EQ; + } + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc)) + .addImm(CC) + .addMBB(TBB); + FuncInfo.MBB->addSuccessor(TBB); + FastEmitBranch(FBB, DbgLoc); + return true; + } + } else if (const ConstantInt *CI = + dyn_cast(BI->getCondition())) { + uint64_t Imm = CI->getZExtValue(); + MachineBasicBlock *Target = (Imm == 0) ? FBB : TBB; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::B)) + .addMBB(Target); + FuncInfo.MBB->addSuccessor(Target); + return true; + } + + unsigned CondReg = getRegForValue(BI->getCondition()); + if (CondReg == 0) + return false; + + // We've been divorced from our compare! Our block was split, and + // now our compare lives in a predecessor block. We musn't + // re-compare here, as the children of the compare aren't guaranteed + // live across the block boundary (we *could* check for this). + // Regardless, the compare has been done in the predecessor block, + // and it left a value for us in a virtual register. Ergo, we test + // the one-bit value left in the virtual register. + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::SUBSWri), + AArch64::WZR) + .addReg(CondReg) + .addImm(0) + .addImm(0); + + unsigned CC = AArch64CC::NE; + if (FuncInfo.MBB->isLayoutSuccessor(TBB)) { + std::swap(TBB, FBB); + CC = AArch64CC::EQ; + } + + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc)) + .addImm(CC) + .addMBB(TBB); + FuncInfo.MBB->addSuccessor(TBB); + FastEmitBranch(FBB, DbgLoc); + return true; +} + +bool AArch64FastISel::SelectIndirectBr(const Instruction *I) { + const IndirectBrInst *BI = cast(I); + unsigned AddrReg = getRegForValue(BI->getOperand(0)); + if (AddrReg == 0) + return false; + + // Emit the indirect branch. + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::BR)) + .addReg(AddrReg); + + // Make sure the CFG is up-to-date. 
+ for (unsigned i = 0, e = BI->getNumSuccessors(); i != e; ++i) + FuncInfo.MBB->addSuccessor(FuncInfo.MBBMap[BI->getSuccessor(i)]); + + return true; +} + +bool AArch64FastISel::EmitCmp(Value *Src1Value, Value *Src2Value, bool isZExt) { + Type *Ty = Src1Value->getType(); + EVT SrcEVT = TLI.getValueType(Ty, true); + if (!SrcEVT.isSimple()) + return false; + MVT SrcVT = SrcEVT.getSimpleVT(); + + // Check to see if the 2nd operand is a constant that we can encode directly + // in the compare. + uint64_t Imm; + bool UseImm = false; + bool isNegativeImm = false; + if (const ConstantInt *ConstInt = dyn_cast(Src2Value)) { + if (SrcVT == MVT::i64 || SrcVT == MVT::i32 || SrcVT == MVT::i16 || + SrcVT == MVT::i8 || SrcVT == MVT::i1) { + const APInt &CIVal = ConstInt->getValue(); + + Imm = (isZExt) ? CIVal.getZExtValue() : CIVal.getSExtValue(); + if (CIVal.isNegative()) { + isNegativeImm = true; + Imm = -Imm; + } + // FIXME: We can handle more immediates using shifts. + UseImm = ((Imm & 0xfff) == Imm); + } + } else if (const ConstantFP *ConstFP = dyn_cast(Src2Value)) { + if (SrcVT == MVT::f32 || SrcVT == MVT::f64) + if (ConstFP->isZero() && !ConstFP->isNegative()) + UseImm = true; + } + + unsigned ZReg; + unsigned CmpOpc; + bool isICmp = true; + bool needsExt = false; + switch (SrcVT.SimpleTy) { + default: + return false; + case MVT::i1: + case MVT::i8: + case MVT::i16: + needsExt = true; + // Intentional fall-through. + case MVT::i32: + ZReg = AArch64::WZR; + if (UseImm) + CmpOpc = isNegativeImm ? AArch64::ADDSWri : AArch64::SUBSWri; + else + CmpOpc = AArch64::SUBSWrr; + break; + case MVT::i64: + ZReg = AArch64::XZR; + if (UseImm) + CmpOpc = isNegativeImm ? AArch64::ADDSXri : AArch64::SUBSXri; + else + CmpOpc = AArch64::SUBSXrr; + break; + case MVT::f32: + isICmp = false; + CmpOpc = UseImm ? AArch64::FCMPSri : AArch64::FCMPSrr; + break; + case MVT::f64: + isICmp = false; + CmpOpc = UseImm ? AArch64::FCMPDri : AArch64::FCMPDrr; + break; + } + + unsigned SrcReg1 = getRegForValue(Src1Value); + if (SrcReg1 == 0) + return false; + + unsigned SrcReg2; + if (!UseImm) { + SrcReg2 = getRegForValue(Src2Value); + if (SrcReg2 == 0) + return false; + } + + // We have i1, i8, or i16, we need to either zero extend or sign extend. + if (needsExt) { + SrcReg1 = EmitIntExt(SrcVT, SrcReg1, MVT::i32, isZExt); + if (SrcReg1 == 0) + return false; + if (!UseImm) { + SrcReg2 = EmitIntExt(SrcVT, SrcReg2, MVT::i32, isZExt); + if (SrcReg2 == 0) + return false; + } + } + + if (isICmp) { + if (UseImm) + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc)) + .addReg(ZReg) + .addReg(SrcReg1) + .addImm(Imm) + .addImm(0); + else + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc)) + .addReg(ZReg) + .addReg(SrcReg1) + .addReg(SrcReg2); + } else { + if (UseImm) + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc)) + .addReg(SrcReg1); + else + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc)) + .addReg(SrcReg1) + .addReg(SrcReg2); + } + return true; +} + +bool AArch64FastISel::SelectCmp(const Instruction *I) { + const CmpInst *CI = cast(I); + + // We may not handle every CC for now. + AArch64CC::CondCode CC = getCompareCC(CI->getPredicate()); + if (CC == AArch64CC::AL) + return false; + + // Emit the cmp. + if (!EmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned())) + return false; + + // Now set a register based on the comparison. 
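+  // CSINC Wd, WZR, WZR, cond yields 0 when cond holds and 1 otherwise, so
+  // passing the inverted condition below is the expansion of CSET Wd, CC:
+  // the result is 1 exactly when the comparison succeeded.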
+ AArch64CC::CondCode invertedCC = getInvertedCondCode(CC); + unsigned ResultReg = createResultReg(&AArch64::GPR32RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::CSINCWr), + ResultReg) + .addReg(AArch64::WZR) + .addReg(AArch64::WZR) + .addImm(invertedCC); + + UpdateValueMap(I, ResultReg); + return true; +} + +bool AArch64FastISel::SelectSelect(const Instruction *I) { + const SelectInst *SI = cast(I); + + EVT DestEVT = TLI.getValueType(SI->getType(), true); + if (!DestEVT.isSimple()) + return false; + + MVT DestVT = DestEVT.getSimpleVT(); + if (DestVT != MVT::i32 && DestVT != MVT::i64 && DestVT != MVT::f32 && + DestVT != MVT::f64) + return false; + + unsigned CondReg = getRegForValue(SI->getCondition()); + if (CondReg == 0) + return false; + unsigned TrueReg = getRegForValue(SI->getTrueValue()); + if (TrueReg == 0) + return false; + unsigned FalseReg = getRegForValue(SI->getFalseValue()); + if (FalseReg == 0) + return false; + + + MRI.constrainRegClass(CondReg, &AArch64::GPR32RegClass); + unsigned ANDReg = createResultReg(&AArch64::GPR32spRegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ANDWri), + ANDReg) + .addReg(CondReg) + .addImm(AArch64_AM::encodeLogicalImmediate(1, 32)); + + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::SUBSWri)) + .addReg(ANDReg) + .addReg(ANDReg) + .addImm(0) + .addImm(0); + + unsigned SelectOpc; + switch (DestVT.SimpleTy) { + default: + return false; + case MVT::i32: + SelectOpc = AArch64::CSELWr; + break; + case MVT::i64: + SelectOpc = AArch64::CSELXr; + break; + case MVT::f32: + SelectOpc = AArch64::FCSELSrrr; + break; + case MVT::f64: + SelectOpc = AArch64::FCSELDrrr; + break; + } + + unsigned ResultReg = createResultReg(TLI.getRegClassFor(DestVT)); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SelectOpc), + ResultReg) + .addReg(TrueReg) + .addReg(FalseReg) + .addImm(AArch64CC::NE); + + UpdateValueMap(I, ResultReg); + return true; +} + +bool AArch64FastISel::SelectFPExt(const Instruction *I) { + Value *V = I->getOperand(0); + if (!I->getType()->isDoubleTy() || !V->getType()->isFloatTy()) + return false; + + unsigned Op = getRegForValue(V); + if (Op == 0) + return false; + + unsigned ResultReg = createResultReg(&AArch64::FPR64RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::FCVTDSr), + ResultReg).addReg(Op); + UpdateValueMap(I, ResultReg); + return true; +} + +bool AArch64FastISel::SelectFPTrunc(const Instruction *I) { + Value *V = I->getOperand(0); + if (!I->getType()->isFloatTy() || !V->getType()->isDoubleTy()) + return false; + + unsigned Op = getRegForValue(V); + if (Op == 0) + return false; + + unsigned ResultReg = createResultReg(&AArch64::FPR32RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::FCVTSDr), + ResultReg).addReg(Op); + UpdateValueMap(I, ResultReg); + return true; +} + +// FPToUI and FPToSI +bool AArch64FastISel::SelectFPToInt(const Instruction *I, bool Signed) { + MVT DestVT; + if (!isTypeLegal(I->getType(), DestVT) || DestVT.isVector()) + return false; + + unsigned SrcReg = getRegForValue(I->getOperand(0)); + if (SrcReg == 0) + return false; + + EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType(), true); + if (SrcVT == MVT::f128) + return false; + + unsigned Opc; + if (SrcVT == MVT::f64) { + if (Signed) + Opc = (DestVT == MVT::i32) ? AArch64::FCVTZSUWDr : AArch64::FCVTZSUXDr; + else + Opc = (DestVT == MVT::i32) ? 
AArch64::FCVTZUUWDr : AArch64::FCVTZUUXDr; + } else { + if (Signed) + Opc = (DestVT == MVT::i32) ? AArch64::FCVTZSUWSr : AArch64::FCVTZSUXSr; + else + Opc = (DestVT == MVT::i32) ? AArch64::FCVTZUUWSr : AArch64::FCVTZUUXSr; + } + unsigned ResultReg = createResultReg( + DestVT == MVT::i32 ? &AArch64::GPR32RegClass : &AArch64::GPR64RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) + .addReg(SrcReg); + UpdateValueMap(I, ResultReg); + return true; +} + +bool AArch64FastISel::SelectIntToFP(const Instruction *I, bool Signed) { + MVT DestVT; + if (!isTypeLegal(I->getType(), DestVT) || DestVT.isVector()) + return false; + assert ((DestVT == MVT::f32 || DestVT == MVT::f64) && + "Unexpected value type."); + + unsigned SrcReg = getRegForValue(I->getOperand(0)); + if (SrcReg == 0) + return false; + + EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType(), true); + + // Handle sign-extension. + if (SrcVT == MVT::i16 || SrcVT == MVT::i8 || SrcVT == MVT::i1) { + SrcReg = + EmitIntExt(SrcVT.getSimpleVT(), SrcReg, MVT::i32, /*isZExt*/ !Signed); + if (SrcReg == 0) + return false; + } + + MRI.constrainRegClass(SrcReg, SrcVT == MVT::i64 ? &AArch64::GPR64RegClass + : &AArch64::GPR32RegClass); + + unsigned Opc; + if (SrcVT == MVT::i64) { + if (Signed) + Opc = (DestVT == MVT::f32) ? AArch64::SCVTFUXSri : AArch64::SCVTFUXDri; + else + Opc = (DestVT == MVT::f32) ? AArch64::UCVTFUXSri : AArch64::UCVTFUXDri; + } else { + if (Signed) + Opc = (DestVT == MVT::f32) ? AArch64::SCVTFUWSri : AArch64::SCVTFUWDri; + else + Opc = (DestVT == MVT::f32) ? AArch64::UCVTFUWSri : AArch64::UCVTFUWDri; + } + + unsigned ResultReg = createResultReg(TLI.getRegClassFor(DestVT)); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) + .addReg(SrcReg); + UpdateValueMap(I, ResultReg); + return true; +} + +bool AArch64FastISel::ProcessCallArgs( + SmallVectorImpl &Args, SmallVectorImpl &ArgRegs, + SmallVectorImpl &ArgVTs, SmallVectorImpl &ArgFlags, + SmallVectorImpl &RegArgs, CallingConv::ID CC, + unsigned &NumBytes) { + SmallVector ArgLocs; + CCState CCInfo(CC, false, *FuncInfo.MF, TM, ArgLocs, *Context); + CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, CCAssignFnForCall(CC)); + + // Get a count of how many bytes are to be pushed on the stack. + NumBytes = CCInfo.getNextStackOffset(); + + // Issue CALLSEQ_START + unsigned AdjStackDown = TII.getCallFrameSetupOpcode(); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown)) + .addImm(NumBytes); + + // Process the args. + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + CCValAssign &VA = ArgLocs[i]; + unsigned Arg = ArgRegs[VA.getValNo()]; + MVT ArgVT = ArgVTs[VA.getValNo()]; + + // Handle arg promotion: SExt, ZExt, AExt. + switch (VA.getLocInfo()) { + case CCValAssign::Full: + break; + case CCValAssign::SExt: { + MVT DestVT = VA.getLocVT(); + MVT SrcVT = ArgVT; + Arg = EmitIntExt(SrcVT, Arg, DestVT, /*isZExt*/ false); + if (Arg == 0) + return false; + ArgVT = DestVT; + break; + } + case CCValAssign::AExt: + // Intentional fall-through. + case CCValAssign::ZExt: { + MVT DestVT = VA.getLocVT(); + MVT SrcVT = ArgVT; + Arg = EmitIntExt(SrcVT, Arg, DestVT, /*isZExt*/ true); + if (Arg == 0) + return false; + ArgVT = DestVT; + break; + } + default: + llvm_unreachable("Unknown arg promotion!"); + } + + // Now copy/store arg to correct locations. 
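+    // Register arguments become plain COPYs into the physreg chosen by the
+    // calling convention; everything else is stored to the outgoing argument
+    // area at [SP + offset].  On big-endian targets a small argument is
+    // shifted to the far end of its 8-byte slot, e.g. a 2-byte value gets
+    // BEAlign = 8 - 2 = 6 added to its offset.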
+    if (VA.isRegLoc() && !VA.needsCustom()) {
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(TargetOpcode::COPY), VA.getLocReg()).addReg(Arg);
+      RegArgs.push_back(VA.getLocReg());
+    } else if (VA.needsCustom()) {
+      // FIXME: Handle custom args.
+      return false;
+    } else {
+      assert(VA.isMemLoc() && "Assuming store on stack.");
+
+      // Need to store on the stack.
+      unsigned ArgSize = VA.getLocVT().getSizeInBits() / 8;
+
+      unsigned BEAlign = 0;
+      if (ArgSize < 8 && !Subtarget->isLittleEndian())
+        BEAlign = 8 - ArgSize;
+
+      Address Addr;
+      Addr.setKind(Address::RegBase);
+      Addr.setReg(AArch64::SP);
+      Addr.setOffset(VA.getLocMemOffset() + BEAlign);
+
+      if (!EmitStore(ArgVT, Arg, Addr))
+        return false;
+    }
+  }
+  return true;
+}
+
+bool AArch64FastISel::FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs,
+                                 const Instruction *I, CallingConv::ID CC,
+                                 unsigned &NumBytes) {
+  // Issue CALLSEQ_END
+  unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackUp))
+      .addImm(NumBytes)
+      .addImm(0);
+
+  // Now the return value.
+  if (RetVT != MVT::isVoid) {
+    SmallVector<CCValAssign, 16> RVLocs;
+    CCState CCInfo(CC, false, *FuncInfo.MF, TM, RVLocs, *Context);
+    CCInfo.AnalyzeCallResult(RetVT, CCAssignFnForCall(CC));
+
+    // Only handle a single return value.
+    if (RVLocs.size() != 1)
+      return false;
+
+    // Copy all of the result registers out of their specified physreg.
+    MVT CopyVT = RVLocs[0].getValVT();
+    unsigned ResultReg = createResultReg(TLI.getRegClassFor(CopyVT));
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY),
+            ResultReg).addReg(RVLocs[0].getLocReg());
+    UsedRegs.push_back(RVLocs[0].getLocReg());
+
+    // Finally update the result.
+    UpdateValueMap(I, ResultReg);
+  }
+
+  return true;
+}
+
+bool AArch64FastISel::SelectCall(const Instruction *I,
+                                 const char *IntrMemName = nullptr) {
+  const CallInst *CI = cast<CallInst>(I);
+  const Value *Callee = CI->getCalledValue();
+
+  // Don't handle inline asm or intrinsics.
+  if (isa<InlineAsm>(Callee))
+    return false;
+
+  // Only handle global variable Callees.
+  const GlobalValue *GV = dyn_cast<GlobalValue>(Callee);
+  if (!GV)
+    return false;
+
+  // Check the calling convention.
+  ImmutableCallSite CS(CI);
+  CallingConv::ID CC = CS.getCallingConv();
+
+  // Let SDISel handle vararg functions.
+  PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType());
+  FunctionType *FTy = cast<FunctionType>(PT->getElementType());
+  if (FTy->isVarArg())
+    return false;
+
+  // Handle *simple* calls for now.
+  MVT RetVT;
+  Type *RetTy = I->getType();
+  if (RetTy->isVoidTy())
+    RetVT = MVT::isVoid;
+  else if (!isTypeLegal(RetTy, RetVT))
+    return false;
+
+  // Set up the argument vectors.
+  SmallVector<Value *, 8> Args;
+  SmallVector<unsigned, 8> ArgRegs;
+  SmallVector<MVT, 8> ArgVTs;
+  SmallVector<ISD::ArgFlagsTy, 8> ArgFlags;
+  Args.reserve(CS.arg_size());
+  ArgRegs.reserve(CS.arg_size());
+  ArgVTs.reserve(CS.arg_size());
+  ArgFlags.reserve(CS.arg_size());
+
+  for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end();
+       i != e; ++i) {
+    // If we're lowering a memory intrinsic instead of a regular call, skip the
+    // last two arguments, which shouldn't be passed to the underlying function.
+    if (IntrMemName && e - i <= 2)
+      break;
+
+    unsigned Arg = getRegForValue(*i);
+    if (Arg == 0)
+      return false;
+
+    ISD::ArgFlagsTy Flags;
+    unsigned AttrInd = i - CS.arg_begin() + 1;
+    if (CS.paramHasAttr(AttrInd, Attribute::SExt))
+      Flags.setSExt();
+    if (CS.paramHasAttr(AttrInd, Attribute::ZExt))
+      Flags.setZExt();
+
+    // FIXME: Only handle *easy* calls for now.
+ if (CS.paramHasAttr(AttrInd, Attribute::InReg) || + CS.paramHasAttr(AttrInd, Attribute::StructRet) || + CS.paramHasAttr(AttrInd, Attribute::Nest) || + CS.paramHasAttr(AttrInd, Attribute::ByVal)) + return false; + + MVT ArgVT; + Type *ArgTy = (*i)->getType(); + if (!isTypeLegal(ArgTy, ArgVT) && + !(ArgVT == MVT::i1 || ArgVT == MVT::i8 || ArgVT == MVT::i16)) + return false; + + // We don't handle vector parameters yet. + if (ArgVT.isVector() || ArgVT.getSizeInBits() > 64) + return false; + + unsigned OriginalAlignment = DL.getABITypeAlignment(ArgTy); + Flags.setOrigAlign(OriginalAlignment); + + Args.push_back(*i); + ArgRegs.push_back(Arg); + ArgVTs.push_back(ArgVT); + ArgFlags.push_back(Flags); + } + + // Handle the arguments now that we've gotten them. + SmallVector RegArgs; + unsigned NumBytes; + if (!ProcessCallArgs(Args, ArgRegs, ArgVTs, ArgFlags, RegArgs, CC, NumBytes)) + return false; + + // Issue the call. + MachineInstrBuilder MIB; + MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::BL)); + if (!IntrMemName) + MIB.addGlobalAddress(GV, 0, 0); + else + MIB.addExternalSymbol(IntrMemName, 0); + + // Add implicit physical register uses to the call. + for (unsigned i = 0, e = RegArgs.size(); i != e; ++i) + MIB.addReg(RegArgs[i], RegState::Implicit); + + // Add a register mask with the call-preserved registers. + // Proper defs for return values will be added by setPhysRegsDeadExcept(). + MIB.addRegMask(TRI.getCallPreservedMask(CS.getCallingConv())); + + // Finish off the call including any return values. + SmallVector UsedRegs; + if (!FinishCall(RetVT, UsedRegs, I, CC, NumBytes)) + return false; + + // Set all unused physreg defs as dead. + static_cast(MIB)->setPhysRegsDeadExcept(UsedRegs, TRI); + + return true; +} + +bool AArch64FastISel::IsMemCpySmall(uint64_t Len, unsigned Alignment) { + if (Alignment) + return Len / Alignment <= 4; + else + return Len < 32; +} + +bool AArch64FastISel::TryEmitSmallMemCpy(Address Dest, Address Src, + uint64_t Len, unsigned Alignment) { + // Make sure we don't bloat code by inlining very large memcpy's. + if (!IsMemCpySmall(Len, Alignment)) + return false; + + int64_t UnscaledOffset = 0; + Address OrigDest = Dest; + Address OrigSrc = Src; + + while (Len) { + MVT VT; + if (!Alignment || Alignment >= 8) { + if (Len >= 8) + VT = MVT::i64; + else if (Len >= 4) + VT = MVT::i32; + else if (Len >= 2) + VT = MVT::i16; + else { + VT = MVT::i8; + } + } else { + // Bound based on alignment. + if (Len >= 4 && Alignment == 4) + VT = MVT::i32; + else if (Len >= 2 && Alignment == 2) + VT = MVT::i16; + else { + VT = MVT::i8; + } + } + + bool RV; + unsigned ResultReg; + RV = EmitLoad(VT, ResultReg, Src); + assert(RV == true && "Should be able to handle this load."); + RV = EmitStore(VT, ResultReg, Dest); + assert(RV == true && "Should be able to handle this store."); + (void)RV; + + int64_t Size = VT.getSizeInBits() / 8; + Len -= Size; + UnscaledOffset += Size; + + // We need to recompute the unscaled offset for each iteration. + Dest.setOffset(OrigDest.getOffset() + UnscaledOffset); + Src.setOffset(OrigSrc.getOffset() + UnscaledOffset); + } + + return true; +} + +bool AArch64FastISel::SelectIntrinsicCall(const IntrinsicInst &I) { + // FIXME: Handle more intrinsics. + switch (I.getIntrinsicID()) { + default: + return false; + case Intrinsic::memcpy: + case Intrinsic::memmove: { + const MemTransferInst &MTI = cast(I); + // Don't handle volatile. 
+    if (MTI.isVolatile())
+      return false;
+
+    // Disable inlining for memmove before calls to ComputeAddress. Otherwise,
+    // we would emit dead code because we don't currently handle memmoves.
+    bool isMemCpy = (I.getIntrinsicID() == Intrinsic::memcpy);
+    if (isa<ConstantInt>(MTI.getLength()) && isMemCpy) {
+      // Small memcpy's are common enough that we want to do them without a call
+      // if possible.
+      uint64_t Len = cast<ConstantInt>(MTI.getLength())->getZExtValue();
+      unsigned Alignment = MTI.getAlignment();
+      if (IsMemCpySmall(Len, Alignment)) {
+        Address Dest, Src;
+        if (!ComputeAddress(MTI.getRawDest(), Dest) ||
+            !ComputeAddress(MTI.getRawSource(), Src))
+          return false;
+        if (TryEmitSmallMemCpy(Dest, Src, Len, Alignment))
+          return true;
+      }
+    }
+
+    if (!MTI.getLength()->getType()->isIntegerTy(64))
+      return false;
+
+    if (MTI.getSourceAddressSpace() > 255 || MTI.getDestAddressSpace() > 255)
+      // Fast instruction selection doesn't support the special
+      // address spaces.
+      return false;
+
+    const char *IntrMemName = isa<MemCpyInst>(I) ? "memcpy" : "memmove";
+    return SelectCall(&I, IntrMemName);
+  }
+  case Intrinsic::memset: {
+    const MemSetInst &MSI = cast<MemSetInst>(I);
+    // Don't handle volatile.
+    if (MSI.isVolatile())
+      return false;
+
+    if (!MSI.getLength()->getType()->isIntegerTy(64))
+      return false;
+
+    if (MSI.getDestAddressSpace() > 255)
+      // Fast instruction selection doesn't support the special
+      // address spaces.
+      return false;
+
+    return SelectCall(&I, "memset");
+  }
+  case Intrinsic::trap: {
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::BRK))
+        .addImm(1);
+    return true;
+  }
+  }
+  return false;
+}
+
+bool AArch64FastISel::SelectRet(const Instruction *I) {
+  const ReturnInst *Ret = cast<ReturnInst>(I);
+  const Function &F = *I->getParent()->getParent();
+
+  if (!FuncInfo.CanLowerReturn)
+    return false;
+
+  if (F.isVarArg())
+    return false;
+
+  // Build a list of return value registers.
+  SmallVector<unsigned, 4> RetRegs;
+
+  if (Ret->getNumOperands() > 0) {
+    CallingConv::ID CC = F.getCallingConv();
+    SmallVector<ISD::OutputArg, 4> Outs;
+    GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI);
+
+    // Analyze operands of the call, assigning locations to each operand.
+    SmallVector<CCValAssign, 16> ValLocs;
+    CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, TM, ValLocs,
+                   I->getContext());
+    CCAssignFn *RetCC = CC == CallingConv::WebKit_JS ? RetCC_AArch64_WebKit_JS
+                                                     : RetCC_AArch64_AAPCS;
+    CCInfo.AnalyzeReturn(Outs, RetCC);
+
+    // Only handle a single return value for now.
+    if (ValLocs.size() != 1)
+      return false;
+
+    CCValAssign &VA = ValLocs[0];
+    const Value *RV = Ret->getOperand(0);
+
+    // Don't bother handling odd stuff for now.
+    if (VA.getLocInfo() != CCValAssign::Full)
+      return false;
+    // Only handle register returns for now.
+    if (!VA.isRegLoc())
+      return false;
+    unsigned Reg = getRegForValue(RV);
+    if (Reg == 0)
+      return false;
+
+    unsigned SrcReg = Reg + VA.getValNo();
+    unsigned DestReg = VA.getLocReg();
+    // Avoid a cross-class copy. This is very unlikely.
+    if (!MRI.getRegClass(SrcReg)->contains(DestReg))
+      return false;
+
+    EVT RVEVT = TLI.getValueType(RV->getType());
+    if (!RVEVT.isSimple())
+      return false;
+
+    // Vectors (of > 1 lane) in big endian need tricky handling.
+    if (RVEVT.isVector() && RVEVT.getVectorNumElements() > 1)
+      return false;
+
+    MVT RVVT = RVEVT.getSimpleVT();
+    if (RVVT == MVT::f128)
+      return false;
+    MVT DestVT = VA.getValVT();
+    // Special handling for extended integers.
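+    // For example, a function declared to return a zeroext i8 has RVVT ==
+    // MVT::i8 but DestVT == MVT::i32, so the result register is widened with
+    // a zero-extend (a UBFM of bits 0..7) before the copy into the return
+    // register below.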
+ if (RVVT != DestVT) { + if (RVVT != MVT::i1 && RVVT != MVT::i8 && RVVT != MVT::i16) + return false; + + if (!Outs[0].Flags.isZExt() && !Outs[0].Flags.isSExt()) + return false; + + bool isZExt = Outs[0].Flags.isZExt(); + SrcReg = EmitIntExt(RVVT, SrcReg, DestVT, isZExt); + if (SrcReg == 0) + return false; + } + + // Make the copy. + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), DestReg).addReg(SrcReg); + + // Add register to return instruction. + RetRegs.push_back(VA.getLocReg()); + } + + MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(AArch64::RET_ReallyLR)); + for (unsigned i = 0, e = RetRegs.size(); i != e; ++i) + MIB.addReg(RetRegs[i], RegState::Implicit); + return true; +} + +bool AArch64FastISel::SelectTrunc(const Instruction *I) { + Type *DestTy = I->getType(); + Value *Op = I->getOperand(0); + Type *SrcTy = Op->getType(); + + EVT SrcEVT = TLI.getValueType(SrcTy, true); + EVT DestEVT = TLI.getValueType(DestTy, true); + if (!SrcEVT.isSimple()) + return false; + if (!DestEVT.isSimple()) + return false; + + MVT SrcVT = SrcEVT.getSimpleVT(); + MVT DestVT = DestEVT.getSimpleVT(); + + if (SrcVT != MVT::i64 && SrcVT != MVT::i32 && SrcVT != MVT::i16 && + SrcVT != MVT::i8) + return false; + if (DestVT != MVT::i32 && DestVT != MVT::i16 && DestVT != MVT::i8 && + DestVT != MVT::i1) + return false; + + unsigned SrcReg = getRegForValue(Op); + if (!SrcReg) + return false; + + // If we're truncating from i64 to a smaller non-legal type then generate an + // AND. Otherwise, we know the high bits are undefined and a truncate doesn't + // generate any code. + if (SrcVT == MVT::i64) { + uint64_t Mask = 0; + switch (DestVT.SimpleTy) { + default: + // Trunc i64 to i32 is handled by the target-independent fast-isel. + return false; + case MVT::i1: + Mask = 0x1; + break; + case MVT::i8: + Mask = 0xff; + break; + case MVT::i16: + Mask = 0xffff; + break; + } + // Issue an extract_subreg to get the lower 32-bits. + unsigned Reg32 = FastEmitInst_extractsubreg(MVT::i32, SrcReg, /*Kill=*/true, + AArch64::sub_32); + MRI.constrainRegClass(Reg32, &AArch64::GPR32RegClass); + // Create the AND instruction which performs the actual truncation. + unsigned ANDReg = createResultReg(&AArch64::GPR32spRegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ANDWri), + ANDReg) + .addReg(Reg32) + .addImm(AArch64_AM::encodeLogicalImmediate(Mask, 32)); + SrcReg = ANDReg; + } + + UpdateValueMap(I, SrcReg); + return true; +} + +unsigned AArch64FastISel::Emiti1Ext(unsigned SrcReg, MVT DestVT, bool isZExt) { + assert((DestVT == MVT::i8 || DestVT == MVT::i16 || DestVT == MVT::i32 || + DestVT == MVT::i64) && + "Unexpected value type."); + // Handle i8 and i16 as i32. + if (DestVT == MVT::i8 || DestVT == MVT::i16) + DestVT = MVT::i32; + + if (isZExt) { + MRI.constrainRegClass(SrcReg, &AArch64::GPR32RegClass); + unsigned ResultReg = createResultReg(&AArch64::GPR32spRegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ANDWri), + ResultReg) + .addReg(SrcReg) + .addImm(AArch64_AM::encodeLogicalImmediate(1, 32)); + + if (DestVT == MVT::i64) { + // We're ZExt i1 to i64. The ANDWri Wd, Ws, #1 implicitly clears the + // upper 32 bits. Emit a SUBREG_TO_REG to extend from Wd to Xd. 
+ unsigned Reg64 = MRI.createVirtualRegister(&AArch64::GPR64RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(AArch64::SUBREG_TO_REG), Reg64) + .addImm(0) + .addReg(ResultReg) + .addImm(AArch64::sub_32); + ResultReg = Reg64; + } + return ResultReg; + } else { + if (DestVT == MVT::i64) { + // FIXME: We're SExt i1 to i64. + return 0; + } + unsigned ResultReg = createResultReg(&AArch64::GPR32RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::SBFMWri), + ResultReg) + .addReg(SrcReg) + .addImm(0) + .addImm(0); + return ResultReg; + } +} + +unsigned AArch64FastISel::EmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, + bool isZExt) { + assert(DestVT != MVT::i1 && "ZeroExt/SignExt an i1?"); + unsigned Opc; + unsigned Imm = 0; + + switch (SrcVT.SimpleTy) { + default: + return 0; + case MVT::i1: + return Emiti1Ext(SrcReg, DestVT, isZExt); + case MVT::i8: + if (DestVT == MVT::i64) + Opc = isZExt ? AArch64::UBFMXri : AArch64::SBFMXri; + else + Opc = isZExt ? AArch64::UBFMWri : AArch64::SBFMWri; + Imm = 7; + break; + case MVT::i16: + if (DestVT == MVT::i64) + Opc = isZExt ? AArch64::UBFMXri : AArch64::SBFMXri; + else + Opc = isZExt ? AArch64::UBFMWri : AArch64::SBFMWri; + Imm = 15; + break; + case MVT::i32: + assert(DestVT == MVT::i64 && "IntExt i32 to i32?!?"); + Opc = isZExt ? AArch64::UBFMXri : AArch64::SBFMXri; + Imm = 31; + break; + } + + // Handle i8 and i16 as i32. + if (DestVT == MVT::i8 || DestVT == MVT::i16) + DestVT = MVT::i32; + else if (DestVT == MVT::i64) { + unsigned Src64 = MRI.createVirtualRegister(&AArch64::GPR64RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(AArch64::SUBREG_TO_REG), Src64) + .addImm(0) + .addReg(SrcReg) + .addImm(AArch64::sub_32); + SrcReg = Src64; + } + + unsigned ResultReg = createResultReg(TLI.getRegClassFor(DestVT)); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) + .addReg(SrcReg) + .addImm(0) + .addImm(Imm); + + return ResultReg; +} + +bool AArch64FastISel::SelectIntExt(const Instruction *I) { + // On ARM, in general, integer casts don't involve legal types; this code + // handles promotable integers. The high bits for a type smaller than + // the register size are assumed to be undefined. + Type *DestTy = I->getType(); + Value *Src = I->getOperand(0); + Type *SrcTy = Src->getType(); + + bool isZExt = isa(I); + unsigned SrcReg = getRegForValue(Src); + if (!SrcReg) + return false; + + EVT SrcEVT = TLI.getValueType(SrcTy, true); + EVT DestEVT = TLI.getValueType(DestTy, true); + if (!SrcEVT.isSimple()) + return false; + if (!DestEVT.isSimple()) + return false; + + MVT SrcVT = SrcEVT.getSimpleVT(); + MVT DestVT = DestEVT.getSimpleVT(); + unsigned ResultReg = EmitIntExt(SrcVT, SrcReg, DestVT, isZExt); + if (ResultReg == 0) + return false; + UpdateValueMap(I, ResultReg); + return true; +} + +bool AArch64FastISel::SelectRem(const Instruction *I, unsigned ISDOpcode) { + EVT DestEVT = TLI.getValueType(I->getType(), true); + if (!DestEVT.isSimple()) + return false; + + MVT DestVT = DestEVT.getSimpleVT(); + if (DestVT != MVT::i64 && DestVT != MVT::i32) + return false; + + unsigned DivOpc; + bool is64bit = (DestVT == MVT::i64); + switch (ISDOpcode) { + default: + return false; + case ISD::SREM: + DivOpc = is64bit ? AArch64::SDIVXr : AArch64::SDIVWr; + break; + case ISD::UREM: + DivOpc = is64bit ? AArch64::UDIVXr : AArch64::UDIVWr; + break; + } + unsigned MSubOpc = is64bit ? 
AArch64::MSUBXrrr : AArch64::MSUBWrrr; + unsigned Src0Reg = getRegForValue(I->getOperand(0)); + if (!Src0Reg) + return false; + + unsigned Src1Reg = getRegForValue(I->getOperand(1)); + if (!Src1Reg) + return false; + + unsigned QuotReg = createResultReg(TLI.getRegClassFor(DestVT)); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(DivOpc), QuotReg) + .addReg(Src0Reg) + .addReg(Src1Reg); + // The remainder is computed as numerator - (quotient * denominator) using the + // MSUB instruction. + unsigned ResultReg = createResultReg(TLI.getRegClassFor(DestVT)); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(MSubOpc), ResultReg) + .addReg(QuotReg) + .addReg(Src1Reg) + .addReg(Src0Reg); + UpdateValueMap(I, ResultReg); + return true; +} + +bool AArch64FastISel::SelectMul(const Instruction *I) { + EVT SrcEVT = TLI.getValueType(I->getOperand(0)->getType(), true); + if (!SrcEVT.isSimple()) + return false; + MVT SrcVT = SrcEVT.getSimpleVT(); + + // Must be simple value type. Don't handle vectors. + if (SrcVT != MVT::i64 && SrcVT != MVT::i32 && SrcVT != MVT::i16 && + SrcVT != MVT::i8) + return false; + + unsigned Opc; + unsigned ZReg; + switch (SrcVT.SimpleTy) { + default: + return false; + case MVT::i8: + case MVT::i16: + case MVT::i32: + ZReg = AArch64::WZR; + Opc = AArch64::MADDWrrr; + break; + case MVT::i64: + ZReg = AArch64::XZR; + Opc = AArch64::MADDXrrr; + break; + } + + unsigned Src0Reg = getRegForValue(I->getOperand(0)); + if (!Src0Reg) + return false; + + unsigned Src1Reg = getRegForValue(I->getOperand(1)); + if (!Src1Reg) + return false; + + // Create the base instruction, then add the operands. + unsigned ResultReg = createResultReg(TLI.getRegClassFor(SrcVT)); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) + .addReg(Src0Reg) + .addReg(Src1Reg) + .addReg(ZReg); + UpdateValueMap(I, ResultReg); + return true; +} + +bool AArch64FastISel::TargetSelectInstruction(const Instruction *I) { + switch (I->getOpcode()) { + default: + break; + case Instruction::Load: + return SelectLoad(I); + case Instruction::Store: + return SelectStore(I); + case Instruction::Br: + return SelectBranch(I); + case Instruction::IndirectBr: + return SelectIndirectBr(I); + case Instruction::FCmp: + case Instruction::ICmp: + return SelectCmp(I); + case Instruction::Select: + return SelectSelect(I); + case Instruction::FPExt: + return SelectFPExt(I); + case Instruction::FPTrunc: + return SelectFPTrunc(I); + case Instruction::FPToSI: + return SelectFPToInt(I, /*Signed=*/true); + case Instruction::FPToUI: + return SelectFPToInt(I, /*Signed=*/false); + case Instruction::SIToFP: + return SelectIntToFP(I, /*Signed=*/true); + case Instruction::UIToFP: + return SelectIntToFP(I, /*Signed=*/false); + case Instruction::SRem: + return SelectRem(I, ISD::SREM); + case Instruction::URem: + return SelectRem(I, ISD::UREM); + case Instruction::Call: + if (const IntrinsicInst *II = dyn_cast(I)) + return SelectIntrinsicCall(*II); + return SelectCall(I); + case Instruction::Ret: + return SelectRet(I); + case Instruction::Trunc: + return SelectTrunc(I); + case Instruction::ZExt: + case Instruction::SExt: + return SelectIntExt(I); + case Instruction::Mul: + // FIXME: This really should be handled by the target-independent selector. + return SelectMul(I); + } + return false; + // Silence warnings. 
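+  // The statement below is never executed; taking the address of the
+  // TableGen-generated Darwin vararg calling-convention function merely
+  // keeps the compiler from emitting an unused-function warning, since it is
+  // not otherwise referenced here.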
+ (void)&CC_AArch64_DarwinPCS_VarArg; +} + +namespace llvm { +llvm::FastISel *AArch64::createFastISel(FunctionLoweringInfo &funcInfo, + const TargetLibraryInfo *libInfo) { + return new AArch64FastISel(funcInfo, libInfo); +} +} diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp new file mode 100644 index 00000000000..deb306a506d --- /dev/null +++ b/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -0,0 +1,891 @@ +//===- AArch64FrameLowering.cpp - AArch64 Frame Lowering -------*- C++ -*-====// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the AArch64 implementation of TargetFrameLowering class. +// +//===----------------------------------------------------------------------===// + +#include "AArch64FrameLowering.h" +#include "AArch64InstrInfo.h" +#include "AArch64MachineFunctionInfo.h" +#include "AArch64Subtarget.h" +#include "AArch64TargetMachine.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Function.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "frame-info" + +static cl::opt EnableRedZone("aarch64-redzone", + cl::desc("enable use of redzone on AArch64"), + cl::init(false), cl::Hidden); + +STATISTIC(NumRedZoneFunctions, "Number of functions using red zone"); + +static unsigned estimateStackSize(MachineFunction &MF) { + const MachineFrameInfo *FFI = MF.getFrameInfo(); + int Offset = 0; + for (int i = FFI->getObjectIndexBegin(); i != 0; ++i) { + int FixedOff = -FFI->getObjectOffset(i); + if (FixedOff > Offset) + Offset = FixedOff; + } + for (unsigned i = 0, e = FFI->getObjectIndexEnd(); i != e; ++i) { + if (FFI->isDeadObjectIndex(i)) + continue; + Offset += FFI->getObjectSize(i); + unsigned Align = FFI->getObjectAlignment(i); + // Adjust to alignment boundary + Offset = (Offset + Align - 1) / Align * Align; + } + // This does not include the 16 bytes used for fp and lr. + return (unsigned)Offset; +} + +bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const { + if (!EnableRedZone) + return false; + // Don't use the red zone if the function explicitly asks us not to. + // This is typically used for kernel code. + if (MF.getFunction()->getAttributes().hasAttribute( + AttributeSet::FunctionIndex, Attribute::NoRedZone)) + return false; + + const MachineFrameInfo *MFI = MF.getFrameInfo(); + const AArch64FunctionInfo *AFI = MF.getInfo(); + unsigned NumBytes = AFI->getLocalStackSize(); + + // Note: currently hasFP() is always true for hasCalls(), but that's an + // implementation detail of the current code, not a strict requirement, + // so stay safe here and check both. + if (MFI->hasCalls() || hasFP(MF) || NumBytes > 128) + return false; + return true; +} + +/// hasFP - Return true if the specified function should have a dedicated frame +/// pointer register. 
+bool AArch64FrameLowering::hasFP(const MachineFunction &MF) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + +#ifndef NDEBUG + const TargetRegisterInfo *RegInfo = MF.getTarget().getRegisterInfo(); + assert(!RegInfo->needsStackRealignment(MF) && + "No stack realignment on AArch64!"); +#endif + + return (MFI->hasCalls() || MFI->hasVarSizedObjects() || + MFI->isFrameAddressTaken()); +} + +/// hasReservedCallFrame - Under normal circumstances, when a frame pointer is +/// not required, we reserve argument space for call sites in the function +/// immediately on entry to the current function. This eliminates the need for +/// add/sub sp brackets around call sites. Returns true if the call frame is +/// included as part of the stack frame. +bool +AArch64FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const { + return !MF.getFrameInfo()->hasVarSizedObjects(); +} + +void AArch64FrameLowering::eliminateCallFramePseudoInstr( + MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const { + const AArch64InstrInfo *TII = + static_cast(MF.getTarget().getInstrInfo()); + DebugLoc DL = I->getDebugLoc(); + int Opc = I->getOpcode(); + bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode(); + uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0; + + const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + if (!TFI->hasReservedCallFrame(MF)) { + unsigned Align = getStackAlignment(); + + int64_t Amount = I->getOperand(0).getImm(); + Amount = RoundUpToAlignment(Amount, Align); + if (!IsDestroy) + Amount = -Amount; + + // N.b. if CalleePopAmount is valid but zero (i.e. callee would pop, but it + // doesn't have to pop anything), then the first operand will be zero too so + // this adjustment is a no-op. + if (CalleePopAmount == 0) { + // FIXME: in-function stack adjustment for calls is limited to 24-bits + // because there's no guaranteed temporary register available. + // + // ADD/SUB (immediate) has only LSL #0 and LSL #12 avaiable. + // 1) For offset <= 12-bit, we use LSL #0 + // 2) For 12-bit <= offset <= 24-bit, we use two instructions. One uses + // LSL #0, and the other uses LSL #12. + // + // Mostly call frames will be allocated at the start of a function so + // this is OK, but it is a limitation that needs dealing with. + assert(Amount > -0xffffff && Amount < 0xffffff && "call frame too large"); + emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, Amount, TII); + } + } else if (CalleePopAmount != 0) { + // If the calling convention demands that the callee pops arguments from the + // stack, we want to add it back if we have a reserved call frame. + assert(CalleePopAmount < 0xffffff && "call frame too large"); + emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, -CalleePopAmount, + TII); + } + MBB.erase(I); +} + +void AArch64FrameLowering::emitCalleeSavedFrameMoves( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + unsigned FramePtr) const { + MachineFunction &MF = *MBB.getParent(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + MachineModuleInfo &MMI = MF.getMMI(); + const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo(); + const AArch64InstrInfo *TII = TM.getInstrInfo(); + DebugLoc DL = MBB.findDebugLoc(MBBI); + + // Add callee saved registers to move list. + const std::vector &CSI = MFI->getCalleeSavedInfo(); + if (CSI.empty()) + return; + + const DataLayout *TD = MF.getTarget().getDataLayout(); + bool HasFP = hasFP(MF); + + // Calculate amount of bytes used for return address storing. 
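+  // With 8-byte pointers this is -8, so saveAreaOffset below is -16 when a
+  // frame pointer (FP + LR) has been saved and -8 otherwise.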
+ int stackGrowth = -TD->getPointerSize(0); + + // Calculate offsets. + int64_t saveAreaOffset = (HasFP ? 2 : 1) * stackGrowth; + unsigned TotalSkipped = 0; + for (const auto &Info : CSI) { + unsigned Reg = Info.getReg(); + int64_t Offset = MFI->getObjectOffset(Info.getFrameIdx()) - + getOffsetOfLocalArea() + saveAreaOffset; + + // Don't output a new CFI directive if we're re-saving the frame pointer or + // link register. This happens when the PrologEpilogInserter has inserted an + // extra "STP" of the frame pointer and link register -- the "emitPrologue" + // method automatically generates the directives when frame pointers are + // used. If we generate CFI directives for the extra "STP"s, the linker will + // lose track of the correct values for the frame pointer and link register. + if (HasFP && (FramePtr == Reg || Reg == AArch64::LR)) { + TotalSkipped += stackGrowth; + continue; + } + + unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true); + unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset( + nullptr, DwarfReg, Offset - TotalSkipped)); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + } +} + +void AArch64FrameLowering::emitPrologue(MachineFunction &MF) const { + MachineBasicBlock &MBB = MF.front(); // Prologue goes in entry BB. + MachineBasicBlock::iterator MBBI = MBB.begin(); + const MachineFrameInfo *MFI = MF.getFrameInfo(); + const Function *Fn = MF.getFunction(); + const AArch64RegisterInfo *RegInfo = TM.getRegisterInfo(); + const AArch64InstrInfo *TII = TM.getInstrInfo(); + MachineModuleInfo &MMI = MF.getMMI(); + AArch64FunctionInfo *AFI = MF.getInfo(); + bool needsFrameMoves = MMI.hasDebugInfo() || Fn->needsUnwindTableEntry(); + bool HasFP = hasFP(MF); + DebugLoc DL = MBB.findDebugLoc(MBBI); + + int NumBytes = (int)MFI->getStackSize(); + if (!AFI->hasStackFrame()) { + assert(!HasFP && "unexpected function without stack frame but with FP"); + + // All of the stack allocation is for locals. + AFI->setLocalStackSize(NumBytes); + + // Label used to tie together the PROLOG_LABEL and the MachineMoves. + MCSymbol *FrameLabel = MMI.getContext().CreateTempSymbol(); + + // REDZONE: If the stack size is less than 128 bytes, we don't need + // to actually allocate. + if (NumBytes && !canUseRedZone(MF)) { + emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII, + MachineInstr::FrameSetup); + + // Encode the stack size of the leaf function. + unsigned CFIIndex = MMI.addFrameInst( + MCCFIInstruction::createDefCfaOffset(FrameLabel, -NumBytes)); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + } else if (NumBytes) { + ++NumRedZoneFunctions; + } + + return; + } + + // Only set up FP if we actually need to. + int FPOffset = 0; + if (HasFP) { + // First instruction must a) allocate the stack and b) have an immediate + // that is a multiple of -2. + assert((MBBI->getOpcode() == AArch64::STPXpre || + MBBI->getOpcode() == AArch64::STPDpre) && + MBBI->getOperand(3).getReg() == AArch64::SP && + MBBI->getOperand(4).getImm() < 0 && + (MBBI->getOperand(4).getImm() & 1) == 0); + + // Frame pointer is fp = sp - 16. Since the STPXpre subtracts the space + // required for the callee saved register area we get the frame pointer + // by addding that offset - 16 = -getImm()*8 - 2*8 = -(getImm() + 2) * 8. + FPOffset = -(MBBI->getOperand(4).getImm() + 2) * 8; + assert(FPOffset >= 0 && "Bad Framepointer Offset"); + } + + // Move past the saves of the callee-saved registers. 
+ while (MBBI->getOpcode() == AArch64::STPXi || + MBBI->getOpcode() == AArch64::STPDi || + MBBI->getOpcode() == AArch64::STPXpre || + MBBI->getOpcode() == AArch64::STPDpre) { + ++MBBI; + NumBytes -= 16; + } + assert(NumBytes >= 0 && "Negative stack allocation size!?"); + if (HasFP) { + // Issue sub fp, sp, FPOffset or + // mov fp,sp when FPOffset is zero. + // Note: All stores of callee-saved registers are marked as "FrameSetup". + // This code marks the instruction(s) that set the FP also. + emitFrameOffset(MBB, MBBI, DL, AArch64::FP, AArch64::SP, FPOffset, TII, + MachineInstr::FrameSetup); + } + + // All of the remaining stack allocations are for locals. + AFI->setLocalStackSize(NumBytes); + + // Allocate space for the rest of the frame. + if (NumBytes) { + // If we're a leaf function, try using the red zone. + if (!canUseRedZone(MF)) + emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII, + MachineInstr::FrameSetup); + } + + // If we need a base pointer, set it up here. It's whatever the value of the + // stack pointer is at this point. Any variable size objects will be allocated + // after this, so we can still use the base pointer to reference locals. + // + // FIXME: Clarify FrameSetup flags here. + // Note: Use emitFrameOffset() like above for FP if the FrameSetup flag is + // needed. + // + if (RegInfo->hasBasePointer(MF)) + TII->copyPhysReg(MBB, MBBI, DL, AArch64::X19, AArch64::SP, false); + + if (needsFrameMoves) { + const DataLayout *TD = MF.getTarget().getDataLayout(); + const int StackGrowth = -TD->getPointerSize(0); + unsigned FramePtr = RegInfo->getFrameRegister(MF); + + // An example of the prologue: + // + // .globl __foo + // .align 2 + // __foo: + // Ltmp0: + // .cfi_startproc + // .cfi_personality 155, ___gxx_personality_v0 + // Leh_func_begin: + // .cfi_lsda 16, Lexception33 + // + // stp xa,bx, [sp, -#offset]! + // ... + // stp x28, x27, [sp, #offset-32] + // stp fp, lr, [sp, #offset-16] + // add fp, sp, #offset - 16 + // sub sp, sp, #1360 + // + // The Stack: + // +-------------------------------------------+ + // 10000 | ........ | ........ | ........ | ........ | + // 10004 | ........ | ........ | ........ | ........ | + // +-------------------------------------------+ + // 10008 | ........ | ........ | ........ | ........ | + // 1000c | ........ | ........ | ........ | ........ | + // +===========================================+ + // 10010 | X28 Register | + // 10014 | X28 Register | + // +-------------------------------------------+ + // 10018 | X27 Register | + // 1001c | X27 Register | + // +===========================================+ + // 10020 | Frame Pointer | + // 10024 | Frame Pointer | + // +-------------------------------------------+ + // 10028 | Link Register | + // 1002c | Link Register | + // +===========================================+ + // 10030 | ........ | ........ | ........ | ........ | + // 10034 | ........ | ........ | ........ | ........ | + // +-------------------------------------------+ + // 10038 | ........ | ........ | ........ | ........ | + // 1003c | ........ | ........ | ........ | ........ | + // +-------------------------------------------+ + // + // [sp] = 10030 :: >>initial value<< + // sp = 10020 :: stp fp, lr, [sp, #-16]! + // fp = sp == 10020 :: mov fp, sp + // [sp] == 10020 :: stp x28, x27, [sp, #-16]! + // sp == 10010 :: >>final value<< + // + // The frame pointer (w29) points to address 10020. 
If we use an offset of + // '16' from 'w29', we get the CFI offsets of -8 for w30, -16 for w29, -24 + // for w27, and -32 for w28: + // + // Ltmp1: + // .cfi_def_cfa w29, 16 + // Ltmp2: + // .cfi_offset w30, -8 + // Ltmp3: + // .cfi_offset w29, -16 + // Ltmp4: + // .cfi_offset w27, -24 + // Ltmp5: + // .cfi_offset w28, -32 + + if (HasFP) { + // Define the current CFA rule to use the provided FP. + unsigned Reg = RegInfo->getDwarfRegNum(FramePtr, true); + unsigned CFIIndex = MMI.addFrameInst( + MCCFIInstruction::createDefCfa(nullptr, Reg, 2 * StackGrowth)); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + + // Record the location of the stored LR + unsigned LR = RegInfo->getDwarfRegNum(AArch64::LR, true); + CFIIndex = MMI.addFrameInst( + MCCFIInstruction::createOffset(nullptr, LR, StackGrowth)); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + + // Record the location of the stored FP + CFIIndex = MMI.addFrameInst( + MCCFIInstruction::createOffset(nullptr, Reg, 2 * StackGrowth)); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + } else { + // Encode the stack size of the leaf function. + unsigned CFIIndex = MMI.addFrameInst( + MCCFIInstruction::createDefCfaOffset(nullptr, -MFI->getStackSize())); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + } + + // Now emit the moves for whatever callee saved regs we have. + emitCalleeSavedFrameMoves(MBB, MBBI, FramePtr); + } +} + +static bool isCalleeSavedRegister(unsigned Reg, const MCPhysReg *CSRegs) { + for (unsigned i = 0; CSRegs[i]; ++i) + if (Reg == CSRegs[i]) + return true; + return false; +} + +static bool isCSRestore(MachineInstr *MI, const MCPhysReg *CSRegs) { + unsigned RtIdx = 0; + if (MI->getOpcode() == AArch64::LDPXpost || + MI->getOpcode() == AArch64::LDPDpost) + RtIdx = 1; + + if (MI->getOpcode() == AArch64::LDPXpost || + MI->getOpcode() == AArch64::LDPDpost || + MI->getOpcode() == AArch64::LDPXi || MI->getOpcode() == AArch64::LDPDi) { + if (!isCalleeSavedRegister(MI->getOperand(RtIdx).getReg(), CSRegs) || + !isCalleeSavedRegister(MI->getOperand(RtIdx + 1).getReg(), CSRegs) || + MI->getOperand(RtIdx + 2).getReg() != AArch64::SP) + return false; + return true; + } + + return false; +} + +void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, + MachineBasicBlock &MBB) const { + MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); + assert(MBBI->isReturn() && "Can only insert epilog into returning blocks"); + MachineFrameInfo *MFI = MF.getFrameInfo(); + const AArch64InstrInfo *TII = + static_cast(MF.getTarget().getInstrInfo()); + const AArch64RegisterInfo *RegInfo = static_cast( + MF.getTarget().getRegisterInfo()); + DebugLoc DL = MBBI->getDebugLoc(); + unsigned RetOpcode = MBBI->getOpcode(); + + int NumBytes = MFI->getStackSize(); + const AArch64FunctionInfo *AFI = MF.getInfo(); + + // Initial and residual are named for consitency with the prologue. Note that + // in the epilogue, the residual adjustment is executed first. + uint64_t ArgumentPopSize = 0; + if (RetOpcode == AArch64::TCRETURNdi || RetOpcode == AArch64::TCRETURNri) { + MachineOperand &StackAdjust = MBBI->getOperand(1); + + // For a tail-call in a callee-pops-arguments environment, some or all of + // the stack may actually be in use for the call's arguments, this is + // calculated during LowerCall and consumed here... + ArgumentPopSize = StackAdjust.getImm(); + } else { + // ... 
otherwise the amount to pop is *all* of the argument space, + // conveniently stored in the MachineFunctionInfo by + // LowerFormalArguments. This will, of course, be zero for the C calling + // convention. + ArgumentPopSize = AFI->getArgumentStackToRestore(); + } + + // The stack frame should be like below, + // + // ---------------------- --- + // | | | + // | BytesInStackArgArea| CalleeArgStackSize + // | (NumReusableBytes) | (of tail call) + // | | --- + // | | | + // ---------------------| --- | + // | | | | + // | CalleeSavedReg | | | + // | (NumRestores * 16) | | | + // | | | | + // ---------------------| | NumBytes + // | | StackSize (StackAdjustUp) + // | LocalStackSize | | | + // | (covering callee | | | + // | args) | | | + // | | | | + // ---------------------- --- --- + // + // So NumBytes = StackSize + BytesInStackArgArea - CalleeArgStackSize + // = StackSize + ArgumentPopSize + // + // AArch64TargetLowering::LowerCall figures out ArgumentPopSize and keeps + // it as the 2nd argument of AArch64ISD::TC_RETURN. + NumBytes += ArgumentPopSize; + + unsigned NumRestores = 0; + // Move past the restores of the callee-saved registers. + MachineBasicBlock::iterator LastPopI = MBBI; + const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF); + if (LastPopI != MBB.begin()) { + do { + ++NumRestores; + --LastPopI; + } while (LastPopI != MBB.begin() && isCSRestore(LastPopI, CSRegs)); + if (!isCSRestore(LastPopI, CSRegs)) { + ++LastPopI; + --NumRestores; + } + } + NumBytes -= NumRestores * 16; + assert(NumBytes >= 0 && "Negative stack allocation size!?"); + + if (!hasFP(MF)) { + // If this was a redzone leaf function, we don't need to restore the + // stack pointer. + if (!canUseRedZone(MF)) + emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, NumBytes, + TII); + return; + } + + // Restore the original stack pointer. + // FIXME: Rather than doing the math here, we should instead just use + // non-post-indexed loads for the restores if we aren't actually going to + // be able to save any instructions. + if (NumBytes || MFI->hasVarSizedObjects()) + emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::FP, + -(NumRestores - 1) * 16, TII, MachineInstr::NoFlags); +} + +/// getFrameIndexOffset - Returns the displacement from the frame register to +/// the stack frame of the specified index. +int AArch64FrameLowering::getFrameIndexOffset(const MachineFunction &MF, + int FI) const { + unsigned FrameReg; + return getFrameIndexReference(MF, FI, FrameReg); +} + +/// getFrameIndexReference - Provide a base+offset reference to an FI slot for +/// debug info. It's the same as what we use for resolving the code-gen +/// references for now. FIXME: This can go wrong when references are +/// SP-relative and simple call frames aren't used. +int AArch64FrameLowering::getFrameIndexReference(const MachineFunction &MF, + int FI, + unsigned &FrameReg) const { + return resolveFrameIndexReference(MF, FI, FrameReg); +} + +int AArch64FrameLowering::resolveFrameIndexReference(const MachineFunction &MF, + int FI, unsigned &FrameReg, + bool PreferFP) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + const AArch64RegisterInfo *RegInfo = static_cast( + MF.getTarget().getRegisterInfo()); + const AArch64FunctionInfo *AFI = MF.getInfo(); + int FPOffset = MFI->getObjectOffset(FI) + 16; + int Offset = MFI->getObjectOffset(FI) + MFI->getStackSize(); + bool isFixed = MFI->isFixedObjectIndex(FI); + + // Use frame pointer to reference fixed objects. 
Use it for locals if + // there are VLAs (and thus the SP isn't reliable as a base). + // Make sure useFPForScavengingIndex() does the right thing for the emergency + // spill slot. + bool UseFP = false; + if (AFI->hasStackFrame()) { + // Note: Keeping the following as multiple 'if' statements rather than + // merging to a single expression for readability. + // + // Argument access should always use the FP. + if (isFixed) { + UseFP = hasFP(MF); + } else if (hasFP(MF) && !RegInfo->hasBasePointer(MF)) { + // Use SP or FP, whichever gives us the best chance of the offset + // being in range for direct access. If the FPOffset is positive, + // that'll always be best, as the SP will be even further away. + // If the FPOffset is negative, we have to keep in mind that the + // available offset range for negative offsets is smaller than for + // positive ones. If we have variable sized objects, we're stuck with + // using the FP regardless, though, as the SP offset is unknown + // and we don't have a base pointer available. If an offset is + // available via the FP and the SP, use whichever is closest. + if (PreferFP || MFI->hasVarSizedObjects() || FPOffset >= 0 || + (FPOffset >= -256 && Offset > -FPOffset)) + UseFP = true; + } + } + + if (UseFP) { + FrameReg = RegInfo->getFrameRegister(MF); + return FPOffset; + } + + // Use the base pointer if we have one. + if (RegInfo->hasBasePointer(MF)) + FrameReg = RegInfo->getBaseRegister(); + else { + FrameReg = AArch64::SP; + // If we're using the red zone for this function, the SP won't actually + // be adjusted, so the offsets will be negative. They're also all + // within range of the signed 9-bit immediate instructions. + if (canUseRedZone(MF)) + Offset -= AFI->getLocalStackSize(); + } + + return Offset; +} + +static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) { + if (Reg != AArch64::LR) + return getKillRegState(true); + + // LR maybe referred to later by an @llvm.returnaddress intrinsic. + bool LRLiveIn = MF.getRegInfo().isLiveIn(AArch64::LR); + bool LRKill = !(LRLiveIn && MF.getFrameInfo()->isReturnAddressTaken()); + return getKillRegState(LRKill); +} + +bool AArch64FrameLowering::spillCalleeSavedRegisters( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + const std::vector &CSI, + const TargetRegisterInfo *TRI) const { + MachineFunction &MF = *MBB.getParent(); + const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); + unsigned Count = CSI.size(); + DebugLoc DL; + assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!"); + + if (MI != MBB.end()) + DL = MI->getDebugLoc(); + + for (unsigned i = 0; i < Count; i += 2) { + unsigned idx = Count - i - 2; + unsigned Reg1 = CSI[idx].getReg(); + unsigned Reg2 = CSI[idx + 1].getReg(); + // GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI + // list to come in sorted by frame index so that we can issue the store + // pair instructions directly. Assert if we see anything otherwise. + // + // The order of the registers in the list is controlled by + // getCalleeSavedRegs(), so they will always be in-order, as well. + assert(CSI[idx].getFrameIdx() + 1 == CSI[idx + 1].getFrameIdx() && + "Out of order callee saved regs!"); + unsigned StrOpc; + assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!"); + assert((i & 1) == 0 && "Odd index for callee-saved reg spill!"); + // Issue sequence of non-sp increment and pi sp spills for cs regs. The + // first spill is a pre-increment that allocates the stack. 
+ // For example: + // stp x22, x21, [sp, #-48]! // addImm(-6) + // stp x20, x19, [sp, #16] // addImm(+2) + // stp fp, lr, [sp, #32] // addImm(+4) + // Rationale: This sequence saves uop updates compared to a sequence of + // pre-increment spills like stp xi,xj,[sp,#-16]! + // Note: Similar rational and sequence for restores in epilog. + if (AArch64::GPR64RegClass.contains(Reg1)) { + assert(AArch64::GPR64RegClass.contains(Reg2) && + "Expected GPR64 callee-saved register pair!"); + // For first spill use pre-increment store. + if (i == 0) + StrOpc = AArch64::STPXpre; + else + StrOpc = AArch64::STPXi; + } else if (AArch64::FPR64RegClass.contains(Reg1)) { + assert(AArch64::FPR64RegClass.contains(Reg2) && + "Expected FPR64 callee-saved register pair!"); + // For first spill use pre-increment store. + if (i == 0) + StrOpc = AArch64::STPDpre; + else + StrOpc = AArch64::STPDi; + } else + llvm_unreachable("Unexpected callee saved register!"); + DEBUG(dbgs() << "CSR spill: (" << TRI->getName(Reg1) << ", " + << TRI->getName(Reg2) << ") -> fi#(" << CSI[idx].getFrameIdx() + << ", " << CSI[idx + 1].getFrameIdx() << ")\n"); + // Compute offset: i = 0 => offset = -Count; + // i = 2 => offset = -(Count - 2) + Count = 2 = i; etc. + const int Offset = (i == 0) ? -Count : i; + assert((Offset >= -64 && Offset <= 63) && + "Offset out of bounds for STP immediate"); + MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc)); + if (StrOpc == AArch64::STPDpre || StrOpc == AArch64::STPXpre) + MIB.addReg(AArch64::SP, RegState::Define); + + MIB.addReg(Reg2, getPrologueDeath(MF, Reg2)) + .addReg(Reg1, getPrologueDeath(MF, Reg1)) + .addReg(AArch64::SP) + .addImm(Offset) // [sp, #offset * 8], where factor * 8 is implicit + .setMIFlag(MachineInstr::FrameSetup); + } + return true; +} + +bool AArch64FrameLowering::restoreCalleeSavedRegisters( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + const std::vector &CSI, + const TargetRegisterInfo *TRI) const { + MachineFunction &MF = *MBB.getParent(); + const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); + unsigned Count = CSI.size(); + DebugLoc DL; + assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!"); + + if (MI != MBB.end()) + DL = MI->getDebugLoc(); + + for (unsigned i = 0; i < Count; i += 2) { + unsigned Reg1 = CSI[i].getReg(); + unsigned Reg2 = CSI[i + 1].getReg(); + // GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI + // list to come in sorted by frame index so that we can issue the store + // pair instructions directly. Assert if we see anything otherwise. + assert(CSI[i].getFrameIdx() + 1 == CSI[i + 1].getFrameIdx() && + "Out of order callee saved regs!"); + // Issue sequence of non-sp increment and sp-pi restores for cs regs. 
Only + // the last load is sp-pi post-increment and de-allocates the stack: + // For example: + // ldp fp, lr, [sp, #32] // addImm(+4) + // ldp x20, x19, [sp, #16] // addImm(+2) + // ldp x22, x21, [sp], #48 // addImm(+6) + // Note: see comment in spillCalleeSavedRegisters() + unsigned LdrOpc; + + assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!"); + assert((i & 1) == 0 && "Odd index for callee-saved reg spill!"); + if (AArch64::GPR64RegClass.contains(Reg1)) { + assert(AArch64::GPR64RegClass.contains(Reg2) && + "Expected GPR64 callee-saved register pair!"); + if (i == Count - 2) + LdrOpc = AArch64::LDPXpost; + else + LdrOpc = AArch64::LDPXi; + } else if (AArch64::FPR64RegClass.contains(Reg1)) { + assert(AArch64::FPR64RegClass.contains(Reg2) && + "Expected FPR64 callee-saved register pair!"); + if (i == Count - 2) + LdrOpc = AArch64::LDPDpost; + else + LdrOpc = AArch64::LDPDi; + } else + llvm_unreachable("Unexpected callee saved register!"); + DEBUG(dbgs() << "CSR restore: (" << TRI->getName(Reg1) << ", " + << TRI->getName(Reg2) << ") -> fi#(" << CSI[i].getFrameIdx() + << ", " << CSI[i + 1].getFrameIdx() << ")\n"); + + // Compute offset: i = 0 => offset = Count - 2; i = 2 => offset = Count - 4; + // etc. + const int Offset = (i == Count - 2) ? Count : Count - i - 2; + assert((Offset >= -64 && Offset <= 63) && + "Offset out of bounds for LDP immediate"); + MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(LdrOpc)); + if (LdrOpc == AArch64::LDPXpost || LdrOpc == AArch64::LDPDpost) + MIB.addReg(AArch64::SP, RegState::Define); + + MIB.addReg(Reg2, getDefRegState(true)) + .addReg(Reg1, getDefRegState(true)) + .addReg(AArch64::SP) + .addImm(Offset); // [sp], #offset * 8 or [sp, #offset * 8] + // where the factor * 8 is implicit + } + return true; +} + +void AArch64FrameLowering::processFunctionBeforeCalleeSavedScan( + MachineFunction &MF, RegScavenger *RS) const { + const AArch64RegisterInfo *RegInfo = static_cast( + MF.getTarget().getRegisterInfo()); + AArch64FunctionInfo *AFI = MF.getInfo(); + MachineRegisterInfo *MRI = &MF.getRegInfo(); + SmallVector UnspilledCSGPRs; + SmallVector UnspilledCSFPRs; + + // The frame record needs to be created by saving the appropriate registers + if (hasFP(MF)) { + MRI->setPhysRegUsed(AArch64::FP); + MRI->setPhysRegUsed(AArch64::LR); + } + + // Spill the BasePtr if it's used. Do this first thing so that the + // getCalleeSavedRegs() below will get the right answer. + if (RegInfo->hasBasePointer(MF)) + MRI->setPhysRegUsed(RegInfo->getBaseRegister()); + + // If any callee-saved registers are used, the frame cannot be eliminated. + unsigned NumGPRSpilled = 0; + unsigned NumFPRSpilled = 0; + bool ExtraCSSpill = false; + bool CanEliminateFrame = true; + DEBUG(dbgs() << "*** processFunctionBeforeCalleeSavedScan\nUsed CSRs:"); + const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF); + + // Check pairs of consecutive callee-saved registers. 
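+  // The list returned by getCalleeSavedRegs() is laid out as STP-able pairs,
+  // e.g. (LR, FP) followed by adjacent GPR pairs such as (X19, X20) and FPR
+  // pairs such as (D8, D9); the loop below therefore walks it two registers
+  // at a time.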
+ for (unsigned i = 0; CSRegs[i]; i += 2) { + assert(CSRegs[i + 1] && "Odd number of callee-saved registers!"); + + const unsigned OddReg = CSRegs[i]; + const unsigned EvenReg = CSRegs[i + 1]; + assert((AArch64::GPR64RegClass.contains(OddReg) && + AArch64::GPR64RegClass.contains(EvenReg)) ^ + (AArch64::FPR64RegClass.contains(OddReg) && + AArch64::FPR64RegClass.contains(EvenReg)) && + "Register class mismatch!"); + + const bool OddRegUsed = MRI->isPhysRegUsed(OddReg); + const bool EvenRegUsed = MRI->isPhysRegUsed(EvenReg); + + // Early exit if none of the registers in the register pair is actually + // used. + if (!OddRegUsed && !EvenRegUsed) { + if (AArch64::GPR64RegClass.contains(OddReg)) { + UnspilledCSGPRs.push_back(OddReg); + UnspilledCSGPRs.push_back(EvenReg); + } else { + UnspilledCSFPRs.push_back(OddReg); + UnspilledCSFPRs.push_back(EvenReg); + } + continue; + } + + unsigned Reg = AArch64::NoRegister; + // If only one of the registers of the register pair is used, make sure to + // mark the other one as used as well. + if (OddRegUsed ^ EvenRegUsed) { + // Find out which register is the additional spill. + Reg = OddRegUsed ? EvenReg : OddReg; + MRI->setPhysRegUsed(Reg); + } + + DEBUG(dbgs() << ' ' << PrintReg(OddReg, RegInfo)); + DEBUG(dbgs() << ' ' << PrintReg(EvenReg, RegInfo)); + + assert(((OddReg == AArch64::LR && EvenReg == AArch64::FP) || + (RegInfo->getEncodingValue(OddReg) + 1 == + RegInfo->getEncodingValue(EvenReg))) && + "Register pair of non-adjacent registers!"); + if (AArch64::GPR64RegClass.contains(OddReg)) { + NumGPRSpilled += 2; + // If it's not a reserved register, we can use it in lieu of an + // emergency spill slot for the register scavenger. + // FIXME: It would be better to instead keep looking and choose another + // unspilled register that isn't reserved, if there is one. + if (Reg != AArch64::NoRegister && !RegInfo->isReservedReg(MF, Reg)) + ExtraCSSpill = true; + } else + NumFPRSpilled += 2; + + CanEliminateFrame = false; + } + + // FIXME: Set BigStack if any stack slot references may be out of range. + // For now, just conservatively guestimate based on unscaled indexing + // range. We'll end up allocating an unnecessary spill slot a lot, but + // realistically that's not a big deal at this stage of the game. + // The CSR spill slots have not been allocated yet, so estimateStackSize + // won't include them. + MachineFrameInfo *MFI = MF.getFrameInfo(); + unsigned CFSize = estimateStackSize(MF) + 8 * (NumGPRSpilled + NumFPRSpilled); + DEBUG(dbgs() << "Estimated stack frame size: " << CFSize << " bytes.\n"); + bool BigStack = (CFSize >= 256); + if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF)) + AFI->setHasStackFrame(true); + + // Estimate if we might need to scavenge a register at some point in order + // to materialize a stack offset. If so, either spill one additional + // callee-saved register or reserve a special spill slot to facilitate + // register scavenging. If we already spilled an extra callee-saved register + // above to keep the number of spills even, we don't need to do anything else + // here. + if (BigStack && !ExtraCSSpill) { + + // If we're adding a register to spill here, we have to add two of them + // to keep the number of regs to spill even. 
+ assert(((UnspilledCSGPRs.size() & 1) == 0) && "Odd number of registers!"); + unsigned Count = 0; + while (!UnspilledCSGPRs.empty() && Count < 2) { + unsigned Reg = UnspilledCSGPRs.back(); + UnspilledCSGPRs.pop_back(); + DEBUG(dbgs() << "Spilling " << PrintReg(Reg, RegInfo) + << " to get a scratch register.\n"); + MRI->setPhysRegUsed(Reg); + ExtraCSSpill = true; + ++Count; + } + + // If we didn't find an extra callee-saved register to spill, create + // an emergency spill slot. + if (!ExtraCSSpill) { + const TargetRegisterClass *RC = &AArch64::GPR64RegClass; + int FI = MFI->CreateStackObject(RC->getSize(), RC->getAlignment(), false); + RS->addScavengingFrameIndex(FI); + DEBUG(dbgs() << "No available CS registers, allocated fi#" << FI + << " as the emergency spill slot.\n"); + } + } +} diff --git a/lib/Target/AArch64/AArch64FrameLowering.h b/lib/Target/AArch64/AArch64FrameLowering.h new file mode 100644 index 00000000000..0e00d168003 --- /dev/null +++ b/lib/Target/AArch64/AArch64FrameLowering.h @@ -0,0 +1,75 @@ +//==-- AArch64FrameLowering.h - TargetFrameLowering for AArch64 --*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// +// +//===----------------------------------------------------------------------===// + +#ifndef AArch64_FRAMELOWERING_H +#define AArch64_FRAMELOWERING_H + +#include "llvm/Target/TargetFrameLowering.h" + +namespace llvm { + +class AArch64Subtarget; +class AArch64TargetMachine; + +class AArch64FrameLowering : public TargetFrameLowering { + const AArch64TargetMachine &TM; + +public: + explicit AArch64FrameLowering(const AArch64TargetMachine &TM, + const AArch64Subtarget &STI) + : TargetFrameLowering(StackGrowsDown, 16, 0, 16, + false /*StackRealignable*/), + TM(TM) {} + + void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + unsigned FramePtr) const; + + void eliminateCallFramePseudoInstr(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const override; + + /// emitProlog/emitEpilog - These methods insert prolog and epilog code into + /// the function. + void emitPrologue(MachineFunction &MF) const override; + void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; + + int getFrameIndexOffset(const MachineFunction &MF, int FI) const override; + int getFrameIndexReference(const MachineFunction &MF, int FI, + unsigned &FrameReg) const override; + int resolveFrameIndexReference(const MachineFunction &MF, int FI, + unsigned &FrameReg, + bool PreferFP = false) const; + bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector &CSI, + const TargetRegisterInfo *TRI) const override; + + bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector &CSI, + const TargetRegisterInfo *TRI) const override; + + /// \brief Can this function use the red zone for local allocations. 
+ bool canUseRedZone(const MachineFunction &MF) const; + + bool hasFP(const MachineFunction &MF) const override; + bool hasReservedCallFrame(const MachineFunction &MF) const override; + + void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, + RegScavenger *RS) const override; +}; + +} // End llvm namespace + +#endif diff --git a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp new file mode 100644 index 00000000000..7007ffcce29 --- /dev/null +++ b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -0,0 +1,3035 @@ +//===-- AArch64ISelDAGToDAG.cpp - A dag to dag inst selector for AArch64 --===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines an instruction selector for the AArch64 target. +// +//===----------------------------------------------------------------------===// + +#include "AArch64TargetMachine.h" +#include "MCTargetDesc/AArch64AddressingModes.h" +#include "llvm/ADT/APSInt.h" +#include "llvm/CodeGen/SelectionDAGISel.h" +#include "llvm/IR/Function.h" // To access function attributes. +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-isel" + +//===--------------------------------------------------------------------===// +/// AArch64DAGToDAGISel - AArch64 specific code to select AArch64 machine +/// instructions for SelectionDAG operations. +/// +namespace { + +class AArch64DAGToDAGISel : public SelectionDAGISel { + AArch64TargetMachine &TM; + + /// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can + /// make the right decision when generating code for different targets. + const AArch64Subtarget *Subtarget; + + bool ForCodeSize; + +public: + explicit AArch64DAGToDAGISel(AArch64TargetMachine &tm, + CodeGenOpt::Level OptLevel) + : SelectionDAGISel(tm, OptLevel), TM(tm), Subtarget(nullptr), + ForCodeSize(false) {} + + const char *getPassName() const override { + return "AArch64 Instruction Selection"; + } + + bool runOnMachineFunction(MachineFunction &MF) override { + AttributeSet FnAttrs = MF.getFunction()->getAttributes(); + ForCodeSize = + FnAttrs.hasAttribute(AttributeSet::FunctionIndex, + Attribute::OptimizeForSize) || + FnAttrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize); + Subtarget = &TM.getSubtarget(); + return SelectionDAGISel::runOnMachineFunction(MF); + } + + SDNode *Select(SDNode *Node) override; + + /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for + /// inline asm expressions. 
+ bool SelectInlineAsmMemoryOperand(const SDValue &Op, + char ConstraintCode, + std::vector &OutOps) override; + + SDNode *SelectMLAV64LaneV128(SDNode *N); + SDNode *SelectMULLV64LaneV128(unsigned IntNo, SDNode *N); + bool SelectArithExtendedRegister(SDValue N, SDValue &Reg, SDValue &Shift); + bool SelectArithImmed(SDValue N, SDValue &Val, SDValue &Shift); + bool SelectNegArithImmed(SDValue N, SDValue &Val, SDValue &Shift); + bool SelectArithShiftedRegister(SDValue N, SDValue &Reg, SDValue &Shift) { + return SelectShiftedRegister(N, false, Reg, Shift); + } + bool SelectLogicalShiftedRegister(SDValue N, SDValue &Reg, SDValue &Shift) { + return SelectShiftedRegister(N, true, Reg, Shift); + } + bool SelectAddrModeIndexed8(SDValue N, SDValue &Base, SDValue &OffImm) { + return SelectAddrModeIndexed(N, 1, Base, OffImm); + } + bool SelectAddrModeIndexed16(SDValue N, SDValue &Base, SDValue &OffImm) { + return SelectAddrModeIndexed(N, 2, Base, OffImm); + } + bool SelectAddrModeIndexed32(SDValue N, SDValue &Base, SDValue &OffImm) { + return SelectAddrModeIndexed(N, 4, Base, OffImm); + } + bool SelectAddrModeIndexed64(SDValue N, SDValue &Base, SDValue &OffImm) { + return SelectAddrModeIndexed(N, 8, Base, OffImm); + } + bool SelectAddrModeIndexed128(SDValue N, SDValue &Base, SDValue &OffImm) { + return SelectAddrModeIndexed(N, 16, Base, OffImm); + } + bool SelectAddrModeUnscaled8(SDValue N, SDValue &Base, SDValue &OffImm) { + return SelectAddrModeUnscaled(N, 1, Base, OffImm); + } + bool SelectAddrModeUnscaled16(SDValue N, SDValue &Base, SDValue &OffImm) { + return SelectAddrModeUnscaled(N, 2, Base, OffImm); + } + bool SelectAddrModeUnscaled32(SDValue N, SDValue &Base, SDValue &OffImm) { + return SelectAddrModeUnscaled(N, 4, Base, OffImm); + } + bool SelectAddrModeUnscaled64(SDValue N, SDValue &Base, SDValue &OffImm) { + return SelectAddrModeUnscaled(N, 8, Base, OffImm); + } + bool SelectAddrModeUnscaled128(SDValue N, SDValue &Base, SDValue &OffImm) { + return SelectAddrModeUnscaled(N, 16, Base, OffImm); + } + + template + bool SelectAddrModeWRO(SDValue N, SDValue &Base, SDValue &Offset, + SDValue &SignExtend, SDValue &DoShift) { + return SelectAddrModeWRO(N, Width / 8, Base, Offset, SignExtend, DoShift); + } + + template + bool SelectAddrModeXRO(SDValue N, SDValue &Base, SDValue &Offset, + SDValue &SignExtend, SDValue &DoShift) { + return SelectAddrModeXRO(N, Width / 8, Base, Offset, SignExtend, DoShift); + } + + + /// Form sequences of consecutive 64/128-bit registers for use in NEON + /// instructions making use of a vector-list (e.g. ldN, tbl). Vecs must have + /// between 1 and 4 elements. If it contains a single element that is returned + /// unchanged; otherwise a REG_SEQUENCE value is returned. + SDValue createDTuple(ArrayRef Vecs); + SDValue createQTuple(ArrayRef Vecs); + + /// Generic helper for the createDTuple/createQTuple + /// functions. Those should almost always be called instead. 
+ SDValue createTuple(ArrayRef Vecs, unsigned RegClassIDs[], + unsigned SubRegs[]); + + SDNode *SelectTable(SDNode *N, unsigned NumVecs, unsigned Opc, bool isExt); + + SDNode *SelectIndexedLoad(SDNode *N, bool &Done); + + SDNode *SelectLoad(SDNode *N, unsigned NumVecs, unsigned Opc, + unsigned SubRegIdx); + SDNode *SelectPostLoad(SDNode *N, unsigned NumVecs, unsigned Opc, + unsigned SubRegIdx); + SDNode *SelectLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc); + SDNode *SelectPostLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc); + + SDNode *SelectStore(SDNode *N, unsigned NumVecs, unsigned Opc); + SDNode *SelectPostStore(SDNode *N, unsigned NumVecs, unsigned Opc); + SDNode *SelectStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc); + SDNode *SelectPostStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc); + + SDNode *SelectSIMDAddSubNarrowing(unsigned IntNo, SDNode *Node); + SDNode *SelectSIMDXtnNarrowing(unsigned IntNo, SDNode *Node); + + SDNode *SelectBitfieldExtractOp(SDNode *N); + SDNode *SelectBitfieldInsertOp(SDNode *N); + + SDNode *SelectLIBM(SDNode *N); + +// Include the pieces autogenerated from the target description. +#include "AArch64GenDAGISel.inc" + +private: + bool SelectShiftedRegister(SDValue N, bool AllowROR, SDValue &Reg, + SDValue &Shift); + bool SelectAddrModeIndexed(SDValue N, unsigned Size, SDValue &Base, + SDValue &OffImm); + bool SelectAddrModeUnscaled(SDValue N, unsigned Size, SDValue &Base, + SDValue &OffImm); + bool SelectAddrModeWRO(SDValue N, unsigned Size, SDValue &Base, + SDValue &Offset, SDValue &SignExtend, + SDValue &DoShift); + bool SelectAddrModeXRO(SDValue N, unsigned Size, SDValue &Base, + SDValue &Offset, SDValue &SignExtend, + SDValue &DoShift); + bool isWorthFolding(SDValue V) const; + bool SelectExtendedSHL(SDValue N, unsigned Size, bool WantExtend, + SDValue &Offset, SDValue &SignExtend); + + template + bool SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos) { + return SelectCVTFixedPosOperand(N, FixedPos, RegWidth); + } + + bool SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos, unsigned Width); +}; +} // end anonymous namespace + +/// isIntImmediate - This method tests to see if the node is a constant +/// operand. If so Imm will receive the 32-bit value. +static bool isIntImmediate(const SDNode *N, uint64_t &Imm) { + if (const ConstantSDNode *C = dyn_cast(N)) { + Imm = C->getZExtValue(); + return true; + } + return false; +} + +// isIntImmediate - This method tests to see if a constant operand. +// If so Imm will receive the value. +static bool isIntImmediate(SDValue N, uint64_t &Imm) { + return isIntImmediate(N.getNode(), Imm); +} + +// isOpcWithIntImmediate - This method tests to see if the node is a specific +// opcode and that it has a immediate integer right operand. +// If so Imm will receive the 32 bit value. +static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc, + uint64_t &Imm) { + return N->getOpcode() == Opc && + isIntImmediate(N->getOperand(1).getNode(), Imm); +} + +bool AArch64DAGToDAGISel::SelectInlineAsmMemoryOperand( + const SDValue &Op, char ConstraintCode, std::vector &OutOps) { + assert(ConstraintCode == 'm' && "unexpected asm memory constraint"); + // Require the address to be in a register. That is safe for all AArch64 + // variants and it is hard to do anything much smarter without knowing + // how the operand is used. + OutOps.push_back(Op); + return false; +} + +/// SelectArithImmed - Select an immediate value that can be represented as +/// a 12-bit value shifted left by either 0 or 12. 
If so, return true with +/// Val set to the 12-bit value and Shift set to the shifter operand. +bool AArch64DAGToDAGISel::SelectArithImmed(SDValue N, SDValue &Val, + SDValue &Shift) { + // This function is called from the addsub_shifted_imm ComplexPattern, + // which lists [imm] as the list of opcodes it's interested in; however, + // we still need to check whether the operand is actually an immediate + // here because the ComplexPattern opcode list is only used in + // root-level opcode matching. + if (!isa<ConstantSDNode>(N.getNode())) + return false; + + uint64_t Immed = cast<ConstantSDNode>(N.getNode())->getZExtValue(); + unsigned ShiftAmt; + + if (Immed >> 12 == 0) { + ShiftAmt = 0; + } else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) { + ShiftAmt = 12; + Immed = Immed >> 12; + } else + return false; + + unsigned ShVal = AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt); + Val = CurDAG->getTargetConstant(Immed, MVT::i32); + Shift = CurDAG->getTargetConstant(ShVal, MVT::i32); + return true; +} + +/// SelectNegArithImmed - As above, but negates the value before trying to +/// select it. +bool AArch64DAGToDAGISel::SelectNegArithImmed(SDValue N, SDValue &Val, + SDValue &Shift) { + // This function is called from the addsub_shifted_imm ComplexPattern, + // which lists [imm] as the list of opcodes it's interested in; however, + // we still need to check whether the operand is actually an immediate + // here because the ComplexPattern opcode list is only used in + // root-level opcode matching. + if (!isa<ConstantSDNode>(N.getNode())) + return false; + + // The immediate operand must be a 24-bit zero-extended immediate. + uint64_t Immed = cast<ConstantSDNode>(N.getNode())->getZExtValue(); + + // This negation is almost always valid, but "cmp wN, #0" and "cmn wN, #0" + // have the opposite effect on the C flag, so this pattern mustn't match under + // those circumstances. + if (Immed == 0) + return false; + + if (N.getValueType() == MVT::i32) + Immed = ~((uint32_t)Immed) + 1; + else + Immed = ~Immed + 1ULL; + if (Immed & 0xFFFFFFFFFF000000ULL) + return false; + + Immed &= 0xFFFFFFULL; + return SelectArithImmed(CurDAG->getConstant(Immed, MVT::i32), Val, Shift); +} + +/// getShiftTypeForNode - Translate a shift node to the corresponding +/// ShiftType value. +static AArch64_AM::ShiftExtendType getShiftTypeForNode(SDValue N) { + switch (N.getOpcode()) { + default: + return AArch64_AM::InvalidShiftExtend; + case ISD::SHL: + return AArch64_AM::LSL; + case ISD::SRL: + return AArch64_AM::LSR; + case ISD::SRA: + return AArch64_AM::ASR; + case ISD::ROTR: + return AArch64_AM::ROR; + } +} + +/// \brief Determine whether it is worth folding V into an extended register. +bool AArch64DAGToDAGISel::isWorthFolding(SDValue V) const { + // It hurts if a value is used at least twice, unless we are optimizing + // for code size. + if (ForCodeSize || V.hasOneUse()) + return true; + return false; +} + +/// SelectShiftedRegister - Select a "shifted register" operand. If the value +/// is not shifted, set the Shift operand to the default of "LSL 0". The logical +/// instructions allow the shifted register to be rotated, but the arithmetic +/// instructions do not. The AllowROR parameter specifies whether ROR is +/// supported.
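// Illustrative aside (not from the patch): a typical fold is
//   (add w1, (shl w2, 2))  ->  add w0, w1, w2, lsl #2
// where the shl below becomes the Shift immediate operand. AllowROR is only
// true for the logical instructions (AND/ORR/EOR/...); ADD/SUB cannot encode
// a rotated register, which is why ROR is rejected for them.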
+bool AArch64DAGToDAGISel::SelectShiftedRegister(SDValue N, bool AllowROR, + SDValue &Reg, SDValue &Shift) { + AArch64_AM::ShiftExtendType ShType = getShiftTypeForNode(N); + if (ShType == AArch64_AM::InvalidShiftExtend) + return false; + if (!AllowROR && ShType == AArch64_AM::ROR) + return false; + + if (ConstantSDNode *RHS = dyn_cast(N.getOperand(1))) { + unsigned BitSize = N.getValueType().getSizeInBits(); + unsigned Val = RHS->getZExtValue() & (BitSize - 1); + unsigned ShVal = AArch64_AM::getShifterImm(ShType, Val); + + Reg = N.getOperand(0); + Shift = CurDAG->getTargetConstant(ShVal, MVT::i32); + return isWorthFolding(N); + } + + return false; +} + +/// getExtendTypeForNode - Translate an extend node to the corresponding +/// ExtendType value. +static AArch64_AM::ShiftExtendType +getExtendTypeForNode(SDValue N, bool IsLoadStore = false) { + if (N.getOpcode() == ISD::SIGN_EXTEND || + N.getOpcode() == ISD::SIGN_EXTEND_INREG) { + EVT SrcVT; + if (N.getOpcode() == ISD::SIGN_EXTEND_INREG) + SrcVT = cast(N.getOperand(1))->getVT(); + else + SrcVT = N.getOperand(0).getValueType(); + + if (!IsLoadStore && SrcVT == MVT::i8) + return AArch64_AM::SXTB; + else if (!IsLoadStore && SrcVT == MVT::i16) + return AArch64_AM::SXTH; + else if (SrcVT == MVT::i32) + return AArch64_AM::SXTW; + assert(SrcVT != MVT::i64 && "extend from 64-bits?"); + + return AArch64_AM::InvalidShiftExtend; + } else if (N.getOpcode() == ISD::ZERO_EXTEND || + N.getOpcode() == ISD::ANY_EXTEND) { + EVT SrcVT = N.getOperand(0).getValueType(); + if (!IsLoadStore && SrcVT == MVT::i8) + return AArch64_AM::UXTB; + else if (!IsLoadStore && SrcVT == MVT::i16) + return AArch64_AM::UXTH; + else if (SrcVT == MVT::i32) + return AArch64_AM::UXTW; + assert(SrcVT != MVT::i64 && "extend from 64-bits?"); + + return AArch64_AM::InvalidShiftExtend; + } else if (N.getOpcode() == ISD::AND) { + ConstantSDNode *CSD = dyn_cast(N.getOperand(1)); + if (!CSD) + return AArch64_AM::InvalidShiftExtend; + uint64_t AndMask = CSD->getZExtValue(); + + switch (AndMask) { + default: + return AArch64_AM::InvalidShiftExtend; + case 0xFF: + return !IsLoadStore ? AArch64_AM::UXTB : AArch64_AM::InvalidShiftExtend; + case 0xFFFF: + return !IsLoadStore ? AArch64_AM::UXTH : AArch64_AM::InvalidShiftExtend; + case 0xFFFFFFFF: + return AArch64_AM::UXTW; + } + } + + return AArch64_AM::InvalidShiftExtend; +} + +// Helper for SelectMLAV64LaneV128 - Recognize high lane extracts. +static bool checkHighLaneIndex(SDNode *DL, SDValue &LaneOp, int &LaneIdx) { + if (DL->getOpcode() != AArch64ISD::DUPLANE16 && + DL->getOpcode() != AArch64ISD::DUPLANE32) + return false; + + SDValue SV = DL->getOperand(0); + if (SV.getOpcode() != ISD::INSERT_SUBVECTOR) + return false; + + SDValue EV = SV.getOperand(1); + if (EV.getOpcode() != ISD::EXTRACT_SUBVECTOR) + return false; + + ConstantSDNode *DLidx = cast(DL->getOperand(1).getNode()); + ConstantSDNode *EVidx = cast(EV.getOperand(1).getNode()); + LaneIdx = DLidx->getSExtValue() + EVidx->getSExtValue(); + LaneOp = EV.getOperand(0); + + return true; +} + +// Helper for SelectOpcV64LaneV128 - Recogzine operatinos where one operand is a +// high lane extract. 
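// Illustrative aside (not from the patch): these helpers exist so that e.g.
//   mla v0.4h, v1.4h, v2.h[6]
// can be selected directly when the DAG multiplies a 64-bit vector by a lane
// taken from the high half of a 128-bit vector; the original lane number is
// recovered as DLidx + EVidx above (roughly 2 + 4 for lane 6), so no separate
// lane-extract instruction is needed.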
+static bool checkV64LaneV128(SDValue Op0, SDValue Op1, SDValue &StdOp, + SDValue &LaneOp, int &LaneIdx) { + + if (!checkHighLaneIndex(Op0.getNode(), LaneOp, LaneIdx)) { + std::swap(Op0, Op1); + if (!checkHighLaneIndex(Op0.getNode(), LaneOp, LaneIdx)) + return false; + } + StdOp = Op1; + return true; +} + +/// SelectMLAV64LaneV128 - AArch64 supports vector MLAs where one multiplicand +/// is a lane in the upper half of a 128-bit vector. Recognize and select this +/// so that we don't emit unnecessary lane extracts. +SDNode *AArch64DAGToDAGISel::SelectMLAV64LaneV128(SDNode *N) { + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + SDValue MLAOp1; // Will hold ordinary multiplicand for MLA. + SDValue MLAOp2; // Will hold lane-accessed multiplicand for MLA. + int LaneIdx = -1; // Will hold the lane index. + + if (Op1.getOpcode() != ISD::MUL || + !checkV64LaneV128(Op1.getOperand(0), Op1.getOperand(1), MLAOp1, MLAOp2, + LaneIdx)) { + std::swap(Op0, Op1); + if (Op1.getOpcode() != ISD::MUL || + !checkV64LaneV128(Op1.getOperand(0), Op1.getOperand(1), MLAOp1, MLAOp2, + LaneIdx)) + return nullptr; + } + + SDValue LaneIdxVal = CurDAG->getTargetConstant(LaneIdx, MVT::i64); + + SDValue Ops[] = { Op0, MLAOp1, MLAOp2, LaneIdxVal }; + + unsigned MLAOpc = ~0U; + + switch (N->getSimpleValueType(0).SimpleTy) { + default: + llvm_unreachable("Unrecognized MLA."); + case MVT::v4i16: + MLAOpc = AArch64::MLAv4i16_indexed; + break; + case MVT::v8i16: + MLAOpc = AArch64::MLAv8i16_indexed; + break; + case MVT::v2i32: + MLAOpc = AArch64::MLAv2i32_indexed; + break; + case MVT::v4i32: + MLAOpc = AArch64::MLAv4i32_indexed; + break; + } + + return CurDAG->getMachineNode(MLAOpc, SDLoc(N), N->getValueType(0), Ops); +} + +SDNode *AArch64DAGToDAGISel::SelectMULLV64LaneV128(unsigned IntNo, SDNode *N) { + SDValue SMULLOp0; + SDValue SMULLOp1; + int LaneIdx; + + if (!checkV64LaneV128(N->getOperand(1), N->getOperand(2), SMULLOp0, SMULLOp1, + LaneIdx)) + return nullptr; + + SDValue LaneIdxVal = CurDAG->getTargetConstant(LaneIdx, MVT::i64); + + SDValue Ops[] = { SMULLOp0, SMULLOp1, LaneIdxVal }; + + unsigned SMULLOpc = ~0U; + + if (IntNo == Intrinsic::aarch64_neon_smull) { + switch (N->getSimpleValueType(0).SimpleTy) { + default: + llvm_unreachable("Unrecognized SMULL."); + case MVT::v4i32: + SMULLOpc = AArch64::SMULLv4i16_indexed; + break; + case MVT::v2i64: + SMULLOpc = AArch64::SMULLv2i32_indexed; + break; + } + } else if (IntNo == Intrinsic::aarch64_neon_umull) { + switch (N->getSimpleValueType(0).SimpleTy) { + default: + llvm_unreachable("Unrecognized SMULL."); + case MVT::v4i32: + SMULLOpc = AArch64::UMULLv4i16_indexed; + break; + case MVT::v2i64: + SMULLOpc = AArch64::UMULLv2i32_indexed; + break; + } + } else + llvm_unreachable("Unrecognized intrinsic."); + + return CurDAG->getMachineNode(SMULLOpc, SDLoc(N), N->getValueType(0), Ops); +} + +/// Instructions that accept extend modifiers like UXTW expect the register +/// being extended to be a GPR32, but the incoming DAG might be acting on a +/// GPR64 (either via SEXT_INREG or AND). Extract the appropriate low bits if +/// this is the case. +static SDValue narrowIfNeeded(SelectionDAG *CurDAG, SDValue N) { + if (N.getValueType() == MVT::i32) + return N; + + SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, MVT::i32); + MachineSDNode *Node = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, + SDLoc(N), MVT::i32, N, SubReg); + return SDValue(Node, 0); +} + + +/// SelectArithExtendedRegister - Select a "extended register" operand. 
This +/// operand folds in an extend followed by an optional left shift. +bool AArch64DAGToDAGISel::SelectArithExtendedRegister(SDValue N, SDValue &Reg, + SDValue &Shift) { + unsigned ShiftVal = 0; + AArch64_AM::ShiftExtendType Ext; + + if (N.getOpcode() == ISD::SHL) { + ConstantSDNode *CSD = dyn_cast(N.getOperand(1)); + if (!CSD) + return false; + ShiftVal = CSD->getZExtValue(); + if (ShiftVal > 4) + return false; + + Ext = getExtendTypeForNode(N.getOperand(0)); + if (Ext == AArch64_AM::InvalidShiftExtend) + return false; + + Reg = N.getOperand(0).getOperand(0); + } else { + Ext = getExtendTypeForNode(N); + if (Ext == AArch64_AM::InvalidShiftExtend) + return false; + + Reg = N.getOperand(0); + } + + // AArch64 mandates that the RHS of the operation must use the smallest + // register classs that could contain the size being extended from. Thus, + // if we're folding a (sext i8), we need the RHS to be a GPR32, even though + // there might not be an actual 32-bit value in the program. We can + // (harmlessly) synthesize one by injected an EXTRACT_SUBREG here. + assert(Ext != AArch64_AM::UXTX && Ext != AArch64_AM::SXTX); + Reg = narrowIfNeeded(CurDAG, Reg); + Shift = CurDAG->getTargetConstant(getArithExtendImm(Ext, ShiftVal), MVT::i32); + return isWorthFolding(N); +} + +/// SelectAddrModeIndexed - Select a "register plus scaled unsigned 12-bit +/// immediate" address. The "Size" argument is the size in bytes of the memory +/// reference, which determines the scale. +bool AArch64DAGToDAGISel::SelectAddrModeIndexed(SDValue N, unsigned Size, + SDValue &Base, SDValue &OffImm) { + const TargetLowering *TLI = getTargetLowering(); + if (N.getOpcode() == ISD::FrameIndex) { + int FI = cast(N)->getIndex(); + Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy()); + OffImm = CurDAG->getTargetConstant(0, MVT::i64); + return true; + } + + if (N.getOpcode() == AArch64ISD::ADDlow) { + GlobalAddressSDNode *GAN = + dyn_cast(N.getOperand(1).getNode()); + Base = N.getOperand(0); + OffImm = N.getOperand(1); + if (!GAN) + return true; + + const GlobalValue *GV = GAN->getGlobal(); + unsigned Alignment = GV->getAlignment(); + const DataLayout *DL = TLI->getDataLayout(); + if (Alignment == 0 && !Subtarget->isTargetDarwin()) + Alignment = DL->getABITypeAlignment(GV->getType()->getElementType()); + + if (Alignment >= Size) + return true; + } + + if (CurDAG->isBaseWithConstantOffset(N)) { + if (ConstantSDNode *RHS = dyn_cast(N.getOperand(1))) { + int64_t RHSC = (int64_t)RHS->getZExtValue(); + unsigned Scale = Log2_32(Size); + if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Scale)) { + Base = N.getOperand(0); + if (Base.getOpcode() == ISD::FrameIndex) { + int FI = cast(Base)->getIndex(); + Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy()); + } + OffImm = CurDAG->getTargetConstant(RHSC >> Scale, MVT::i64); + return true; + } + } + } + + // Before falling back to our general case, check if the unscaled + // instructions can handle this. If so, that's preferable. + if (SelectAddrModeUnscaled(N, Size, Base, OffImm)) + return false; + + // Base only. The address will be materialized into a register before + // the memory is accessed. + // add x0, Xbase, #offset + // ldr x0, [x0] + Base = N; + OffImm = CurDAG->getTargetConstant(0, MVT::i64); + return true; +} + +/// SelectAddrModeUnscaled - Select a "register plus unscaled signed 9-bit +/// immediate" address. This should only match when there is an offset that +/// is not valid for a scaled immediate addressing mode. 
The "Size" argument +/// is the size in bytes of the memory reference, which is needed here to know +/// what is valid for a scaled immediate. +bool AArch64DAGToDAGISel::SelectAddrModeUnscaled(SDValue N, unsigned Size, + SDValue &Base, + SDValue &OffImm) { + if (!CurDAG->isBaseWithConstantOffset(N)) + return false; + if (ConstantSDNode *RHS = dyn_cast(N.getOperand(1))) { + int64_t RHSC = RHS->getSExtValue(); + // If the offset is valid as a scaled immediate, don't match here. + if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && + RHSC < (0x1000 << Log2_32(Size))) + return false; + if (RHSC >= -256 && RHSC < 256) { + Base = N.getOperand(0); + if (Base.getOpcode() == ISD::FrameIndex) { + int FI = cast(Base)->getIndex(); + const TargetLowering *TLI = getTargetLowering(); + Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy()); + } + OffImm = CurDAG->getTargetConstant(RHSC, MVT::i64); + return true; + } + } + return false; +} + +static SDValue Widen(SelectionDAG *CurDAG, SDValue N) { + SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, MVT::i32); + SDValue ImpDef = SDValue( + CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SDLoc(N), MVT::i64), + 0); + MachineSDNode *Node = CurDAG->getMachineNode( + TargetOpcode::INSERT_SUBREG, SDLoc(N), MVT::i64, ImpDef, N, SubReg); + return SDValue(Node, 0); +} + +/// \brief Check if the given SHL node (\p N), can be used to form an +/// extended register for an addressing mode. +bool AArch64DAGToDAGISel::SelectExtendedSHL(SDValue N, unsigned Size, + bool WantExtend, SDValue &Offset, + SDValue &SignExtend) { + assert(N.getOpcode() == ISD::SHL && "Invalid opcode."); + ConstantSDNode *CSD = dyn_cast(N.getOperand(1)); + if (!CSD || (CSD->getZExtValue() & 0x7) != CSD->getZExtValue()) + return false; + + if (WantExtend) { + AArch64_AM::ShiftExtendType Ext = + getExtendTypeForNode(N.getOperand(0), true); + if (Ext == AArch64_AM::InvalidShiftExtend) + return false; + + Offset = narrowIfNeeded(CurDAG, N.getOperand(0).getOperand(0)); + SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, MVT::i32); + } else { + Offset = N.getOperand(0); + SignExtend = CurDAG->getTargetConstant(0, MVT::i32); + } + + unsigned LegalShiftVal = Log2_32(Size); + unsigned ShiftVal = CSD->getZExtValue(); + + if (ShiftVal != 0 && ShiftVal != LegalShiftVal) + return false; + + if (isWorthFolding(N)) + return true; + + return false; +} + +bool AArch64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size, + SDValue &Base, SDValue &Offset, + SDValue &SignExtend, + SDValue &DoShift) { + if (N.getOpcode() != ISD::ADD) + return false; + SDValue LHS = N.getOperand(0); + SDValue RHS = N.getOperand(1); + + // We don't want to match immediate adds here, because they are better lowered + // to the register-immediate addressing modes. + if (isa(LHS) || isa(RHS)) + return false; + + // Check if this particular node is reused in any non-memory related + // operation. If yes, do not try to fold this node into the address + // computation, since the computation will be kept. + const SDNode *Node = N.getNode(); + for (SDNode *UI : Node->uses()) { + if (!isa(*UI)) + return false; + } + + // Remember if it is worth folding N when it produces extended register. + bool IsExtendedRegisterWorthFolding = isWorthFolding(N); + + // Try to match a shifted extend on the RHS. 
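// Illustrative aside (not from the patch): "WRO" is the W-register-offset
// addressing mode, e.g.
//   ldr w0, [x1, w2, sxtw #2]
// i.e. base plus a sign- or zero-extended 32-bit index, optionally shifted by
// log2 of the access size. The code below tries a shifted extend on either
// operand of the add first, then falls back to an unshifted extend.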
+ if (IsExtendedRegisterWorthFolding && RHS.getOpcode() == ISD::SHL && + SelectExtendedSHL(RHS, Size, true, Offset, SignExtend)) { + Base = LHS; + DoShift = CurDAG->getTargetConstant(true, MVT::i32); + return true; + } + + // Try to match a shifted extend on the LHS. + if (IsExtendedRegisterWorthFolding && LHS.getOpcode() == ISD::SHL && + SelectExtendedSHL(LHS, Size, true, Offset, SignExtend)) { + Base = RHS; + DoShift = CurDAG->getTargetConstant(true, MVT::i32); + return true; + } + + // There was no shift, whatever else we find. + DoShift = CurDAG->getTargetConstant(false, MVT::i32); + + AArch64_AM::ShiftExtendType Ext = AArch64_AM::InvalidShiftExtend; + // Try to match an unshifted extend on the LHS. + if (IsExtendedRegisterWorthFolding && + (Ext = getExtendTypeForNode(LHS, true)) != + AArch64_AM::InvalidShiftExtend) { + Base = RHS; + Offset = narrowIfNeeded(CurDAG, LHS.getOperand(0)); + SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, MVT::i32); + if (isWorthFolding(LHS)) + return true; + } + + // Try to match an unshifted extend on the RHS. + if (IsExtendedRegisterWorthFolding && + (Ext = getExtendTypeForNode(RHS, true)) != + AArch64_AM::InvalidShiftExtend) { + Base = LHS; + Offset = narrowIfNeeded(CurDAG, RHS.getOperand(0)); + SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, MVT::i32); + if (isWorthFolding(RHS)) + return true; + } + + return false; +} + +bool AArch64DAGToDAGISel::SelectAddrModeXRO(SDValue N, unsigned Size, + SDValue &Base, SDValue &Offset, + SDValue &SignExtend, + SDValue &DoShift) { + if (N.getOpcode() != ISD::ADD) + return false; + SDValue LHS = N.getOperand(0); + SDValue RHS = N.getOperand(1); + + // We don't want to match immediate adds here, because they are better lowered + // to the register-immediate addressing modes. + if (isa(LHS) || isa(RHS)) + return false; + + // Check if this particular node is reused in any non-memory related + // operation. If yes, do not try to fold this node into the address + // computation, since the computation will be kept. + const SDNode *Node = N.getNode(); + for (SDNode *UI : Node->uses()) { + if (!isa(*UI)) + return false; + } + + // Remember if it is worth folding N when it produces extended register. + bool IsExtendedRegisterWorthFolding = isWorthFolding(N); + + // Try to match a shifted extend on the RHS. + if (IsExtendedRegisterWorthFolding && RHS.getOpcode() == ISD::SHL && + SelectExtendedSHL(RHS, Size, false, Offset, SignExtend)) { + Base = LHS; + DoShift = CurDAG->getTargetConstant(true, MVT::i32); + return true; + } + + // Try to match a shifted extend on the LHS. + if (IsExtendedRegisterWorthFolding && LHS.getOpcode() == ISD::SHL && + SelectExtendedSHL(LHS, Size, false, Offset, SignExtend)) { + Base = RHS; + DoShift = CurDAG->getTargetConstant(true, MVT::i32); + return true; + } + + // Match any non-shifted, non-extend, non-immediate add expression. + Base = LHS; + Offset = RHS; + SignExtend = CurDAG->getTargetConstant(false, MVT::i32); + DoShift = CurDAG->getTargetConstant(false, MVT::i32); + // Reg1 + Reg2 is free: no check needed. 
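// Illustrative aside (not from the patch): this is the plain
//   ldr x0, [x1, x2]
// form -- a 64-bit index with no extend and no shift -- which costs nothing
// extra, so it is accepted unconditionally.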
+ return true; +} + +SDValue AArch64DAGToDAGISel::createDTuple(ArrayRef Regs) { + static unsigned RegClassIDs[] = { + AArch64::DDRegClassID, AArch64::DDDRegClassID, AArch64::DDDDRegClassID}; + static unsigned SubRegs[] = { AArch64::dsub0, AArch64::dsub1, + AArch64::dsub2, AArch64::dsub3 }; + + return createTuple(Regs, RegClassIDs, SubRegs); +} + +SDValue AArch64DAGToDAGISel::createQTuple(ArrayRef Regs) { + static unsigned RegClassIDs[] = { + AArch64::QQRegClassID, AArch64::QQQRegClassID, AArch64::QQQQRegClassID}; + static unsigned SubRegs[] = { AArch64::qsub0, AArch64::qsub1, + AArch64::qsub2, AArch64::qsub3 }; + + return createTuple(Regs, RegClassIDs, SubRegs); +} + +SDValue AArch64DAGToDAGISel::createTuple(ArrayRef Regs, + unsigned RegClassIDs[], + unsigned SubRegs[]) { + // There's no special register-class for a vector-list of 1 element: it's just + // a vector. + if (Regs.size() == 1) + return Regs[0]; + + assert(Regs.size() >= 2 && Regs.size() <= 4); + + SDLoc DL(Regs[0].getNode()); + + SmallVector Ops; + + // First operand of REG_SEQUENCE is the desired RegClass. + Ops.push_back( + CurDAG->getTargetConstant(RegClassIDs[Regs.size() - 2], MVT::i32)); + + // Then we get pairs of source & subregister-position for the components. + for (unsigned i = 0; i < Regs.size(); ++i) { + Ops.push_back(Regs[i]); + Ops.push_back(CurDAG->getTargetConstant(SubRegs[i], MVT::i32)); + } + + SDNode *N = + CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, MVT::Untyped, Ops); + return SDValue(N, 0); +} + +SDNode *AArch64DAGToDAGISel::SelectTable(SDNode *N, unsigned NumVecs, + unsigned Opc, bool isExt) { + SDLoc dl(N); + EVT VT = N->getValueType(0); + + unsigned ExtOff = isExt; + + // Form a REG_SEQUENCE to force register allocation. + unsigned Vec0Off = ExtOff + 1; + SmallVector Regs(N->op_begin() + Vec0Off, + N->op_begin() + Vec0Off + NumVecs); + SDValue RegSeq = createQTuple(Regs); + + SmallVector Ops; + if (isExt) + Ops.push_back(N->getOperand(1)); + Ops.push_back(RegSeq); + Ops.push_back(N->getOperand(NumVecs + ExtOff + 1)); + return CurDAG->getMachineNode(Opc, dl, VT, Ops); +} + +SDNode *AArch64DAGToDAGISel::SelectIndexedLoad(SDNode *N, bool &Done) { + LoadSDNode *LD = cast(N); + if (LD->isUnindexed()) + return nullptr; + EVT VT = LD->getMemoryVT(); + EVT DstVT = N->getValueType(0); + ISD::MemIndexedMode AM = LD->getAddressingMode(); + bool IsPre = AM == ISD::PRE_INC || AM == ISD::PRE_DEC; + + // We're not doing validity checking here. That was done when checking + // if we should mark the load as indexed or not. We're just selecting + // the right instruction. + unsigned Opcode = 0; + + ISD::LoadExtType ExtType = LD->getExtensionType(); + bool InsertTo64 = false; + if (VT == MVT::i64) + Opcode = IsPre ? AArch64::LDRXpre : AArch64::LDRXpost; + else if (VT == MVT::i32) { + if (ExtType == ISD::NON_EXTLOAD) + Opcode = IsPre ? AArch64::LDRWpre : AArch64::LDRWpost; + else if (ExtType == ISD::SEXTLOAD) + Opcode = IsPre ? AArch64::LDRSWpre : AArch64::LDRSWpost; + else { + Opcode = IsPre ? AArch64::LDRWpre : AArch64::LDRWpost; + InsertTo64 = true; + // The result of the load is only i32. It's the subreg_to_reg that makes + // it into an i64. + DstVT = MVT::i32; + } + } else if (VT == MVT::i16) { + if (ExtType == ISD::SEXTLOAD) { + if (DstVT == MVT::i64) + Opcode = IsPre ? AArch64::LDRSHXpre : AArch64::LDRSHXpost; + else + Opcode = IsPre ? AArch64::LDRSHWpre : AArch64::LDRSHWpost; + } else { + Opcode = IsPre ? 
AArch64::LDRHHpre : AArch64::LDRHHpost; + InsertTo64 = DstVT == MVT::i64; + // The result of the load is only i32. It's the subreg_to_reg that makes + // it into an i64. + DstVT = MVT::i32; + } + } else if (VT == MVT::i8) { + if (ExtType == ISD::SEXTLOAD) { + if (DstVT == MVT::i64) + Opcode = IsPre ? AArch64::LDRSBXpre : AArch64::LDRSBXpost; + else + Opcode = IsPre ? AArch64::LDRSBWpre : AArch64::LDRSBWpost; + } else { + Opcode = IsPre ? AArch64::LDRBBpre : AArch64::LDRBBpost; + InsertTo64 = DstVT == MVT::i64; + // The result of the load is only i32. It's the subreg_to_reg that makes + // it into an i64. + DstVT = MVT::i32; + } + } else if (VT == MVT::f32) { + Opcode = IsPre ? AArch64::LDRSpre : AArch64::LDRSpost; + } else if (VT == MVT::f64 || VT.is64BitVector()) { + Opcode = IsPre ? AArch64::LDRDpre : AArch64::LDRDpost; + } else if (VT.is128BitVector()) { + Opcode = IsPre ? AArch64::LDRQpre : AArch64::LDRQpost; + } else + return nullptr; + SDValue Chain = LD->getChain(); + SDValue Base = LD->getBasePtr(); + ConstantSDNode *OffsetOp = cast(LD->getOffset()); + int OffsetVal = (int)OffsetOp->getZExtValue(); + SDValue Offset = CurDAG->getTargetConstant(OffsetVal, MVT::i64); + SDValue Ops[] = { Base, Offset, Chain }; + SDNode *Res = CurDAG->getMachineNode(Opcode, SDLoc(N), MVT::i64, DstVT, + MVT::Other, Ops); + // Either way, we're replacing the node, so tell the caller that. + Done = true; + SDValue LoadedVal = SDValue(Res, 1); + if (InsertTo64) { + SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, MVT::i32); + LoadedVal = + SDValue(CurDAG->getMachineNode( + AArch64::SUBREG_TO_REG, SDLoc(N), MVT::i64, + CurDAG->getTargetConstant(0, MVT::i64), LoadedVal, SubReg), + 0); + } + + ReplaceUses(SDValue(N, 0), LoadedVal); + ReplaceUses(SDValue(N, 1), SDValue(Res, 0)); + ReplaceUses(SDValue(N, 2), SDValue(Res, 2)); + + return nullptr; +} + +SDNode *AArch64DAGToDAGISel::SelectLoad(SDNode *N, unsigned NumVecs, + unsigned Opc, unsigned SubRegIdx) { + SDLoc dl(N); + EVT VT = N->getValueType(0); + SDValue Chain = N->getOperand(0); + + SmallVector Ops; + Ops.push_back(N->getOperand(2)); // Mem operand; + Ops.push_back(Chain); + + std::vector ResTys; + ResTys.push_back(MVT::Untyped); + ResTys.push_back(MVT::Other); + + SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); + SDValue SuperReg = SDValue(Ld, 0); + for (unsigned i = 0; i < NumVecs; ++i) + ReplaceUses(SDValue(N, i), + CurDAG->getTargetExtractSubreg(SubRegIdx + i, dl, VT, SuperReg)); + + ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 1)); + return nullptr; +} + +SDNode *AArch64DAGToDAGISel::SelectPostLoad(SDNode *N, unsigned NumVecs, + unsigned Opc, unsigned SubRegIdx) { + SDLoc dl(N); + EVT VT = N->getValueType(0); + SDValue Chain = N->getOperand(0); + + SmallVector Ops; + Ops.push_back(N->getOperand(1)); // Mem operand + Ops.push_back(N->getOperand(2)); // Incremental + Ops.push_back(Chain); + + std::vector ResTys; + ResTys.push_back(MVT::i64); // Type of the write back register + ResTys.push_back(MVT::Untyped); + ResTys.push_back(MVT::Other); + + SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); + + // Update uses of write back register + ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 0)); + + // Update uses of vector list + SDValue SuperReg = SDValue(Ld, 1); + if (NumVecs == 1) + ReplaceUses(SDValue(N, 0), SuperReg); + else + for (unsigned i = 0; i < NumVecs; ++i) + ReplaceUses(SDValue(N, i), + CurDAG->getTargetExtractSubreg(SubRegIdx + i, dl, VT, SuperReg)); + + // Update the chain + ReplaceUses(SDValue(N, NumVecs + 1), 
SDValue(Ld, 2)); + return nullptr; +} + +SDNode *AArch64DAGToDAGISel::SelectStore(SDNode *N, unsigned NumVecs, + unsigned Opc) { + SDLoc dl(N); + EVT VT = N->getOperand(2)->getValueType(0); + + // Form a REG_SEQUENCE to force register allocation. + bool Is128Bit = VT.getSizeInBits() == 128; + SmallVector Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs); + SDValue RegSeq = Is128Bit ? createQTuple(Regs) : createDTuple(Regs); + + SmallVector Ops; + Ops.push_back(RegSeq); + Ops.push_back(N->getOperand(NumVecs + 2)); + Ops.push_back(N->getOperand(0)); + SDNode *St = CurDAG->getMachineNode(Opc, dl, N->getValueType(0), Ops); + + return St; +} + +SDNode *AArch64DAGToDAGISel::SelectPostStore(SDNode *N, unsigned NumVecs, + unsigned Opc) { + SDLoc dl(N); + EVT VT = N->getOperand(2)->getValueType(0); + SmallVector ResTys; + ResTys.push_back(MVT::i64); // Type of the write back register + ResTys.push_back(MVT::Other); // Type for the Chain + + // Form a REG_SEQUENCE to force register allocation. + bool Is128Bit = VT.getSizeInBits() == 128; + SmallVector Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs); + SDValue RegSeq = Is128Bit ? createQTuple(Regs) : createDTuple(Regs); + + SmallVector Ops; + Ops.push_back(RegSeq); + Ops.push_back(N->getOperand(NumVecs + 1)); // base register + Ops.push_back(N->getOperand(NumVecs + 2)); // Incremental + Ops.push_back(N->getOperand(0)); // Chain + SDNode *St = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); + + return St; +} + +/// WidenVector - Given a value in the V64 register class, produce the +/// equivalent value in the V128 register class. +class WidenVector { + SelectionDAG &DAG; + +public: + WidenVector(SelectionDAG &DAG) : DAG(DAG) {} + + SDValue operator()(SDValue V64Reg) { + EVT VT = V64Reg.getValueType(); + unsigned NarrowSize = VT.getVectorNumElements(); + MVT EltTy = VT.getVectorElementType().getSimpleVT(); + MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize); + SDLoc DL(V64Reg); + + SDValue Undef = + SDValue(DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, WideTy), 0); + return DAG.getTargetInsertSubreg(AArch64::dsub, DL, WideTy, Undef, V64Reg); + } +}; + +/// NarrowVector - Given a value in the V128 register class, produce the +/// equivalent value in the V64 register class. +static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) { + EVT VT = V128Reg.getValueType(); + unsigned WideSize = VT.getVectorNumElements(); + MVT EltTy = VT.getVectorElementType().getSimpleVT(); + MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2); + + return DAG.getTargetExtractSubreg(AArch64::dsub, SDLoc(V128Reg), NarrowTy, + V128Reg); +} + +SDNode *AArch64DAGToDAGISel::SelectLoadLane(SDNode *N, unsigned NumVecs, + unsigned Opc) { + SDLoc dl(N); + EVT VT = N->getValueType(0); + bool Narrow = VT.getSizeInBits() == 64; + + // Form a REG_SEQUENCE to force register allocation. 
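// Illustrative aside (not from the patch): the single-lane forms of LD1..LD4
// (e.g. ld2 {v0.h, v1.h}[5], [x0]) index into full 128-bit registers, so any
// 64-bit vector operands are first widened into the low half of a Q register
// with WidenVector (defined above) and narrowed back via NarrowVector once
// the per-register results are extracted below.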
+ SmallVector Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs); + + if (Narrow) + std::transform(Regs.begin(), Regs.end(), Regs.begin(), + WidenVector(*CurDAG)); + + SDValue RegSeq = createQTuple(Regs); + + std::vector ResTys; + ResTys.push_back(MVT::Untyped); + ResTys.push_back(MVT::Other); + + unsigned LaneNo = + cast(N->getOperand(NumVecs + 2))->getZExtValue(); + + SmallVector Ops; + Ops.push_back(RegSeq); + Ops.push_back(CurDAG->getTargetConstant(LaneNo, MVT::i64)); + Ops.push_back(N->getOperand(NumVecs + 3)); + Ops.push_back(N->getOperand(0)); + SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); + SDValue SuperReg = SDValue(Ld, 0); + + EVT WideVT = RegSeq.getOperand(1)->getValueType(0); + static unsigned QSubs[] = { AArch64::qsub0, AArch64::qsub1, AArch64::qsub2, + AArch64::qsub3 }; + for (unsigned i = 0; i < NumVecs; ++i) { + SDValue NV = CurDAG->getTargetExtractSubreg(QSubs[i], dl, WideVT, SuperReg); + if (Narrow) + NV = NarrowVector(NV, *CurDAG); + ReplaceUses(SDValue(N, i), NV); + } + + ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 1)); + + return Ld; +} + +SDNode *AArch64DAGToDAGISel::SelectPostLoadLane(SDNode *N, unsigned NumVecs, + unsigned Opc) { + SDLoc dl(N); + EVT VT = N->getValueType(0); + bool Narrow = VT.getSizeInBits() == 64; + + // Form a REG_SEQUENCE to force register allocation. + SmallVector Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs); + + if (Narrow) + std::transform(Regs.begin(), Regs.end(), Regs.begin(), + WidenVector(*CurDAG)); + + SDValue RegSeq = createQTuple(Regs); + + std::vector ResTys; + ResTys.push_back(MVT::i64); // Type of the write back register + ResTys.push_back(MVT::Untyped); + ResTys.push_back(MVT::Other); + + unsigned LaneNo = + cast(N->getOperand(NumVecs + 1))->getZExtValue(); + + SmallVector Ops; + Ops.push_back(RegSeq); + Ops.push_back(CurDAG->getTargetConstant(LaneNo, MVT::i64)); // Lane Number + Ops.push_back(N->getOperand(NumVecs + 2)); // Base register + Ops.push_back(N->getOperand(NumVecs + 3)); // Incremental + Ops.push_back(N->getOperand(0)); + SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); + + // Update uses of the write back register + ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 0)); + + // Update uses of the vector list + SDValue SuperReg = SDValue(Ld, 1); + if (NumVecs == 1) { + ReplaceUses(SDValue(N, 0), + Narrow ? NarrowVector(SuperReg, *CurDAG) : SuperReg); + } else { + EVT WideVT = RegSeq.getOperand(1)->getValueType(0); + static unsigned QSubs[] = { AArch64::qsub0, AArch64::qsub1, AArch64::qsub2, + AArch64::qsub3 }; + for (unsigned i = 0; i < NumVecs; ++i) { + SDValue NV = CurDAG->getTargetExtractSubreg(QSubs[i], dl, WideVT, + SuperReg); + if (Narrow) + NV = NarrowVector(NV, *CurDAG); + ReplaceUses(SDValue(N, i), NV); + } + } + + // Update the Chain + ReplaceUses(SDValue(N, NumVecs + 1), SDValue(Ld, 2)); + + return Ld; +} + +SDNode *AArch64DAGToDAGISel::SelectStoreLane(SDNode *N, unsigned NumVecs, + unsigned Opc) { + SDLoc dl(N); + EVT VT = N->getOperand(2)->getValueType(0); + bool Narrow = VT.getSizeInBits() == 64; + + // Form a REG_SEQUENCE to force register allocation. 
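// Illustrative aside (not from the patch): the register list of st2/st3/st4
// (e.g. st2 {v4.s, v5.s}[1], [x0]) must live in consecutive registers, which
// is what building a single QQ/QQQ/QQQQ REG_SEQUENCE value enforces: the
// register allocator is forced to assign an adjacent block.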
+ SmallVector Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs); + + if (Narrow) + std::transform(Regs.begin(), Regs.end(), Regs.begin(), + WidenVector(*CurDAG)); + + SDValue RegSeq = createQTuple(Regs); + + unsigned LaneNo = + cast(N->getOperand(NumVecs + 2))->getZExtValue(); + + SmallVector Ops; + Ops.push_back(RegSeq); + Ops.push_back(CurDAG->getTargetConstant(LaneNo, MVT::i64)); + Ops.push_back(N->getOperand(NumVecs + 3)); + Ops.push_back(N->getOperand(0)); + SDNode *St = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops); + + // Transfer memoperands. + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast(N)->getMemOperand(); + cast(St)->setMemRefs(MemOp, MemOp + 1); + + return St; +} + +SDNode *AArch64DAGToDAGISel::SelectPostStoreLane(SDNode *N, unsigned NumVecs, + unsigned Opc) { + SDLoc dl(N); + EVT VT = N->getOperand(2)->getValueType(0); + bool Narrow = VT.getSizeInBits() == 64; + + // Form a REG_SEQUENCE to force register allocation. + SmallVector Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs); + + if (Narrow) + std::transform(Regs.begin(), Regs.end(), Regs.begin(), + WidenVector(*CurDAG)); + + SDValue RegSeq = createQTuple(Regs); + + SmallVector ResTys; + ResTys.push_back(MVT::i64); // Type of the write back register + ResTys.push_back(MVT::Other); + + unsigned LaneNo = + cast(N->getOperand(NumVecs + 1))->getZExtValue(); + + SmallVector Ops; + Ops.push_back(RegSeq); + Ops.push_back(CurDAG->getTargetConstant(LaneNo, MVT::i64)); + Ops.push_back(N->getOperand(NumVecs + 2)); // Base Register + Ops.push_back(N->getOperand(NumVecs + 3)); // Incremental + Ops.push_back(N->getOperand(0)); + SDNode *St = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); + + // Transfer memoperands. + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast(N)->getMemOperand(); + cast(St)->setMemRefs(MemOp, MemOp + 1); + + return St; +} + +static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N, + unsigned &Opc, SDValue &Opd0, + unsigned &LSB, unsigned &MSB, + unsigned NumberOfIgnoredLowBits, + bool BiggerPattern) { + assert(N->getOpcode() == ISD::AND && + "N must be a AND operation to call this function"); + + EVT VT = N->getValueType(0); + + // Here we can test the type of VT and return false when the type does not + // match, but since it is done prior to that call in the current context + // we turned that into an assert to avoid redundant code. + assert((VT == MVT::i32 || VT == MVT::i64) && + "Type checking must have been done before calling this function"); + + // FIXME: simplify-demanded-bits in DAGCombine will probably have + // changed the AND node to a 32-bit mask operation. We'll have to + // undo that as part of the transform here if we want to catch all + // the opportunities. + // Currently the NumberOfIgnoredLowBits argument helps to recover + // form these situations when matching bigger pattern (bitfield insert). + + // For unsigned extracts, check for a shift right and mask + uint64_t And_imm = 0; + if (!isOpcWithIntImmediate(N, ISD::AND, And_imm)) + return false; + + const SDNode *Op0 = N->getOperand(0).getNode(); + + // Because of simplify-demanded-bits in DAGCombine, the mask may have been + // simplified. Try to undo that + And_imm |= (1 << NumberOfIgnoredLowBits) - 1; + + // The immediate is a mask of the low bits iff imm & (imm+1) == 0 + if (And_imm & (And_imm + 1)) + return false; + + bool ClampMSB = false; + uint64_t Srl_imm = 0; + // Handle the SRL + ANY_EXTEND case. 
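// Illustrative aside (not from the patch): the baseline pattern handled by
// this function, before the extend/truncate special cases below, is e.g.
//   (and (srl x, 4), 0xff)
// where Srl_imm == 4 and CountTrailingOnes(0xff) == 8, giving LSB = 4 and
// MSB = 4 + 8 - 1 = 11, i.e. "ubfx w0, w0, #4, #8".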
+ if (VT == MVT::i64 && Op0->getOpcode() == ISD::ANY_EXTEND && + isOpcWithIntImmediate(Op0->getOperand(0).getNode(), ISD::SRL, Srl_imm)) { + // Extend the incoming operand of the SRL to 64-bit. + Opd0 = Widen(CurDAG, Op0->getOperand(0).getOperand(0)); + // Make sure to clamp the MSB so that we preserve the semantics of the + // original operations. + ClampMSB = true; + } else if (VT == MVT::i32 && Op0->getOpcode() == ISD::TRUNCATE && + isOpcWithIntImmediate(Op0->getOperand(0).getNode(), ISD::SRL, + Srl_imm)) { + // If the shift result was truncated, we can still combine them. + Opd0 = Op0->getOperand(0).getOperand(0); + + // Use the type of SRL node. + VT = Opd0->getValueType(0); + } else if (isOpcWithIntImmediate(Op0, ISD::SRL, Srl_imm)) { + Opd0 = Op0->getOperand(0); + } else if (BiggerPattern) { + // Let's pretend a 0 shift right has been performed. + // The resulting code will be at least as good as the original one + // plus it may expose more opportunities for bitfield insert pattern. + // FIXME: Currently we limit this to the bigger pattern, because + // some optimizations expect AND and not UBFM + Opd0 = N->getOperand(0); + } else + return false; + + assert((BiggerPattern || (Srl_imm > 0 && Srl_imm < VT.getSizeInBits())) && + "bad amount in shift node!"); + + LSB = Srl_imm; + MSB = Srl_imm + (VT == MVT::i32 ? CountTrailingOnes_32(And_imm) + : CountTrailingOnes_64(And_imm)) - + 1; + if (ClampMSB) + // Since we're moving the extend before the right shift operation, we need + // to clamp the MSB to make sure we don't shift in undefined bits instead of + // the zeros which would get shifted in with the original right shift + // operation. + MSB = MSB > 31 ? 31 : MSB; + + Opc = VT == MVT::i32 ? AArch64::UBFMWri : AArch64::UBFMXri; + return true; +} + +static bool isOneBitExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0, + unsigned &LSB, unsigned &MSB) { + // We are looking for the following pattern which basically extracts a single + // bit from the source value and places it in the LSB of the destination + // value, all other bits of the destination value or set to zero: + // + // Value2 = AND Value, MaskImm + // SRL Value2, ShiftImm + // + // with MaskImm >> ShiftImm == 1. + // + // This gets selected into a single UBFM: + // + // UBFM Value, ShiftImm, ShiftImm + // + + if (N->getOpcode() != ISD::SRL) + return false; + + uint64_t And_mask = 0; + if (!isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::AND, And_mask)) + return false; + + Opd0 = N->getOperand(0).getOperand(0); + + uint64_t Srl_imm = 0; + if (!isIntImmediate(N->getOperand(1), Srl_imm)) + return false; + + // Check whether we really have a one bit extract here. + if (And_mask >> Srl_imm == 0x1) { + if (N->getValueType(0) == MVT::i32) + Opc = AArch64::UBFMWri; + else + Opc = AArch64::UBFMXri; + + LSB = MSB = Srl_imm; + + return true; + } + + return false; +} + +static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0, + unsigned &LSB, unsigned &MSB, + bool BiggerPattern) { + assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) && + "N must be a SHR/SRA operation to call this function"); + + EVT VT = N->getValueType(0); + + // Here we can test the type of VT and return false when the type does not + // match, but since it is done prior to that call in the current context + // we turned that into an assert to avoid redundant code. 
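// Illustrative aside (not from the patch): the shift-of-shift shape this
// function targets is e.g.
//   (srl (shl x, 8), 20)   ; i32
// which extracts bits [23:12] of x; the computation further down gives
// sLSB = 20 - 8 = 12, Width = 32 - 20 - 1 = 11, MSB = 12 + 11 = 23, selected
// as "ubfx w0, w0, #12, #12" (SBFM instead of UBFM when the shift is an SRA).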
+ assert((VT == MVT::i32 || VT == MVT::i64) && + "Type checking must have been done before calling this function"); + + // Check for AND + SRL doing a one bit extract. + if (isOneBitExtractOpFromShr(N, Opc, Opd0, LSB, MSB)) + return true; + + // we're looking for a shift of a shift + uint64_t Shl_imm = 0; + uint64_t Trunc_bits = 0; + if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SHL, Shl_imm)) { + Opd0 = N->getOperand(0).getOperand(0); + } else if (VT == MVT::i32 && N->getOpcode() == ISD::SRL && + N->getOperand(0).getNode()->getOpcode() == ISD::TRUNCATE) { + // We are looking for a shift of truncate. Truncate from i64 to i32 could + // be considered as setting high 32 bits as zero. Our strategy here is to + // always generate 64bit UBFM. This consistency will help the CSE pass + // later find more redundancy. + Opd0 = N->getOperand(0).getOperand(0); + Trunc_bits = Opd0->getValueType(0).getSizeInBits() - VT.getSizeInBits(); + VT = Opd0->getValueType(0); + assert(VT == MVT::i64 && "the promoted type should be i64"); + } else if (BiggerPattern) { + // Let's pretend a 0 shift left has been performed. + // FIXME: Currently we limit this to the bigger pattern case, + // because some optimizations expect AND and not UBFM + Opd0 = N->getOperand(0); + } else + return false; + + assert(Shl_imm < VT.getSizeInBits() && "bad amount in shift node!"); + uint64_t Srl_imm = 0; + if (!isIntImmediate(N->getOperand(1), Srl_imm)) + return false; + + assert(Srl_imm > 0 && Srl_imm < VT.getSizeInBits() && + "bad amount in shift node!"); + // Note: The width operand is encoded as width-1. + unsigned Width = VT.getSizeInBits() - Trunc_bits - Srl_imm - 1; + int sLSB = Srl_imm - Shl_imm; + if (sLSB < 0) + return false; + LSB = sLSB; + MSB = LSB + Width; + // SRA requires a signed extraction + if (VT == MVT::i32) + Opc = N->getOpcode() == ISD::SRA ? AArch64::SBFMWri : AArch64::UBFMWri; + else + Opc = N->getOpcode() == ISD::SRA ? AArch64::SBFMXri : AArch64::UBFMXri; + return true; +} + +static bool isBitfieldExtractOp(SelectionDAG *CurDAG, SDNode *N, unsigned &Opc, + SDValue &Opd0, unsigned &LSB, unsigned &MSB, + unsigned NumberOfIgnoredLowBits = 0, + bool BiggerPattern = false) { + if (N->getValueType(0) != MVT::i32 && N->getValueType(0) != MVT::i64) + return false; + + switch (N->getOpcode()) { + default: + if (!N->isMachineOpcode()) + return false; + break; + case ISD::AND: + return isBitfieldExtractOpFromAnd(CurDAG, N, Opc, Opd0, LSB, MSB, + NumberOfIgnoredLowBits, BiggerPattern); + case ISD::SRL: + case ISD::SRA: + return isBitfieldExtractOpFromShr(N, Opc, Opd0, LSB, MSB, BiggerPattern); + } + + unsigned NOpc = N->getMachineOpcode(); + switch (NOpc) { + default: + return false; + case AArch64::SBFMWri: + case AArch64::UBFMWri: + case AArch64::SBFMXri: + case AArch64::UBFMXri: + Opc = NOpc; + Opd0 = N->getOperand(0); + LSB = cast(N->getOperand(1).getNode())->getZExtValue(); + MSB = cast(N->getOperand(2).getNode())->getZExtValue(); + return true; + } + // Unreachable + return false; +} + +SDNode *AArch64DAGToDAGISel::SelectBitfieldExtractOp(SDNode *N) { + unsigned Opc, LSB, MSB; + SDValue Opd0; + if (!isBitfieldExtractOp(CurDAG, N, Opc, Opd0, LSB, MSB)) + return nullptr; + + EVT VT = N->getValueType(0); + + // If the bit extract operation is 64bit but the original type is 32bit, we + // need to add one EXTRACT_SUBREG. 
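// Illustrative aside (not from the patch): this covers the truncate-of-shift
// cases above, which deliberately keep working on 64 bits; the UBFMXri that
// gets built defines an X register, so the i32 result is recovered as its
// sub_32 subregister, roughly
//   ubfx x8, x9, #2, #12    ; users then read w8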
+ if ((Opc == AArch64::SBFMXri || Opc == AArch64::UBFMXri) && VT == MVT::i32) { + SDValue Ops64[] = {Opd0, CurDAG->getTargetConstant(LSB, MVT::i64), + CurDAG->getTargetConstant(MSB, MVT::i64)}; + + SDNode *BFM = CurDAG->getMachineNode(Opc, SDLoc(N), MVT::i64, Ops64); + SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, MVT::i32); + MachineSDNode *Node = + CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SDLoc(N), MVT::i32, + SDValue(BFM, 0), SubReg); + return Node; + } + + SDValue Ops[] = {Opd0, CurDAG->getTargetConstant(LSB, VT), + CurDAG->getTargetConstant(MSB, VT)}; + return CurDAG->SelectNodeTo(N, Opc, VT, Ops); +} + +/// Does DstMask form a complementary pair with the mask provided by +/// BitsToBeInserted, suitable for use in a BFI instruction. Roughly speaking, +/// this asks whether DstMask zeroes precisely those bits that will be set by +/// the other half. +static bool isBitfieldDstMask(uint64_t DstMask, APInt BitsToBeInserted, + unsigned NumberOfIgnoredHighBits, EVT VT) { + assert((VT == MVT::i32 || VT == MVT::i64) && + "i32 or i64 mask type expected!"); + unsigned BitWidth = VT.getSizeInBits() - NumberOfIgnoredHighBits; + + APInt SignificantDstMask = APInt(BitWidth, DstMask); + APInt SignificantBitsToBeInserted = BitsToBeInserted.zextOrTrunc(BitWidth); + + return (SignificantDstMask & SignificantBitsToBeInserted) == 0 && + (SignificantDstMask | SignificantBitsToBeInserted).isAllOnesValue(); +} + +// Look for bits that will be useful for later uses. +// A bit is consider useless as soon as it is dropped and never used +// before it as been dropped. +// E.g., looking for useful bit of x +// 1. y = x & 0x7 +// 2. z = y >> 2 +// After #1, x useful bits are 0x7, then the useful bits of x, live through +// y. +// After #2, the useful bits of x are 0x4. +// However, if x is used on an unpredicatable instruction, then all its bits +// are useful. +// E.g. +// 1. y = x & 0x7 +// 2. z = y >> 2 +// 3. 
str x, [@x] +static void getUsefulBits(SDValue Op, APInt &UsefulBits, unsigned Depth = 0); + +static void getUsefulBitsFromAndWithImmediate(SDValue Op, APInt &UsefulBits, + unsigned Depth) { + uint64_t Imm = + cast(Op.getOperand(1).getNode())->getZExtValue(); + Imm = AArch64_AM::decodeLogicalImmediate(Imm, UsefulBits.getBitWidth()); + UsefulBits &= APInt(UsefulBits.getBitWidth(), Imm); + getUsefulBits(Op, UsefulBits, Depth + 1); +} + +static void getUsefulBitsFromBitfieldMoveOpd(SDValue Op, APInt &UsefulBits, + uint64_t Imm, uint64_t MSB, + unsigned Depth) { + // inherit the bitwidth value + APInt OpUsefulBits(UsefulBits); + OpUsefulBits = 1; + + if (MSB >= Imm) { + OpUsefulBits = OpUsefulBits.shl(MSB - Imm + 1); + --OpUsefulBits; + // The interesting part will be in the lower part of the result + getUsefulBits(Op, OpUsefulBits, Depth + 1); + // The interesting part was starting at Imm in the argument + OpUsefulBits = OpUsefulBits.shl(Imm); + } else { + OpUsefulBits = OpUsefulBits.shl(MSB + 1); + --OpUsefulBits; + // The interesting part will be shifted in the result + OpUsefulBits = OpUsefulBits.shl(OpUsefulBits.getBitWidth() - Imm); + getUsefulBits(Op, OpUsefulBits, Depth + 1); + // The interesting part was at zero in the argument + OpUsefulBits = OpUsefulBits.lshr(OpUsefulBits.getBitWidth() - Imm); + } + + UsefulBits &= OpUsefulBits; +} + +static void getUsefulBitsFromUBFM(SDValue Op, APInt &UsefulBits, + unsigned Depth) { + uint64_t Imm = + cast(Op.getOperand(1).getNode())->getZExtValue(); + uint64_t MSB = + cast(Op.getOperand(2).getNode())->getZExtValue(); + + getUsefulBitsFromBitfieldMoveOpd(Op, UsefulBits, Imm, MSB, Depth); +} + +static void getUsefulBitsFromOrWithShiftedReg(SDValue Op, APInt &UsefulBits, + unsigned Depth) { + uint64_t ShiftTypeAndValue = + cast(Op.getOperand(2).getNode())->getZExtValue(); + APInt Mask(UsefulBits); + Mask.clearAllBits(); + Mask.flipAllBits(); + + if (AArch64_AM::getShiftType(ShiftTypeAndValue) == AArch64_AM::LSL) { + // Shift Left + uint64_t ShiftAmt = AArch64_AM::getShiftValue(ShiftTypeAndValue); + Mask = Mask.shl(ShiftAmt); + getUsefulBits(Op, Mask, Depth + 1); + Mask = Mask.lshr(ShiftAmt); + } else if (AArch64_AM::getShiftType(ShiftTypeAndValue) == AArch64_AM::LSR) { + // Shift Right + // We do not handle AArch64_AM::ASR, because the sign will change the + // number of useful bits + uint64_t ShiftAmt = AArch64_AM::getShiftValue(ShiftTypeAndValue); + Mask = Mask.lshr(ShiftAmt); + getUsefulBits(Op, Mask, Depth + 1); + Mask = Mask.shl(ShiftAmt); + } else + return; + + UsefulBits &= Mask; +} + +static void getUsefulBitsFromBFM(SDValue Op, SDValue Orig, APInt &UsefulBits, + unsigned Depth) { + uint64_t Imm = + cast(Op.getOperand(2).getNode())->getZExtValue(); + uint64_t MSB = + cast(Op.getOperand(3).getNode())->getZExtValue(); + + if (Op.getOperand(1) == Orig) + return getUsefulBitsFromBitfieldMoveOpd(Op, UsefulBits, Imm, MSB, Depth); + + APInt OpUsefulBits(UsefulBits); + OpUsefulBits = 1; + + if (MSB >= Imm) { + OpUsefulBits = OpUsefulBits.shl(MSB - Imm + 1); + --OpUsefulBits; + UsefulBits &= ~OpUsefulBits; + getUsefulBits(Op, UsefulBits, Depth + 1); + } else { + OpUsefulBits = OpUsefulBits.shl(MSB + 1); + --OpUsefulBits; + UsefulBits = ~(OpUsefulBits.shl(OpUsefulBits.getBitWidth() - Imm)); + getUsefulBits(Op, UsefulBits, Depth + 1); + } +} + +static void getUsefulBitsForUse(SDNode *UserNode, APInt &UsefulBits, + SDValue Orig, unsigned Depth) { + + // Users of this node should have already been instruction selected + // FIXME: Can we turn that into 
an assert? + if (!UserNode->isMachineOpcode()) + return; + + switch (UserNode->getMachineOpcode()) { + default: + return; + case AArch64::ANDSWri: + case AArch64::ANDSXri: + case AArch64::ANDWri: + case AArch64::ANDXri: + // We increment Depth only when we call the getUsefulBits + return getUsefulBitsFromAndWithImmediate(SDValue(UserNode, 0), UsefulBits, + Depth); + case AArch64::UBFMWri: + case AArch64::UBFMXri: + return getUsefulBitsFromUBFM(SDValue(UserNode, 0), UsefulBits, Depth); + + case AArch64::ORRWrs: + case AArch64::ORRXrs: + if (UserNode->getOperand(1) != Orig) + return; + return getUsefulBitsFromOrWithShiftedReg(SDValue(UserNode, 0), UsefulBits, + Depth); + case AArch64::BFMWri: + case AArch64::BFMXri: + return getUsefulBitsFromBFM(SDValue(UserNode, 0), Orig, UsefulBits, Depth); + } +} + +static void getUsefulBits(SDValue Op, APInt &UsefulBits, unsigned Depth) { + if (Depth >= 6) + return; + // Initialize UsefulBits + if (!Depth) { + unsigned Bitwidth = Op.getValueType().getScalarType().getSizeInBits(); + // At the beginning, assume every produced bits is useful + UsefulBits = APInt(Bitwidth, 0); + UsefulBits.flipAllBits(); + } + APInt UsersUsefulBits(UsefulBits.getBitWidth(), 0); + + for (SDNode *Node : Op.getNode()->uses()) { + // A use cannot produce useful bits + APInt UsefulBitsForUse = APInt(UsefulBits); + getUsefulBitsForUse(Node, UsefulBitsForUse, Op, Depth); + UsersUsefulBits |= UsefulBitsForUse; + } + // UsefulBits contains the produced bits that are meaningful for the + // current definition, thus a user cannot make a bit meaningful at + // this point + UsefulBits &= UsersUsefulBits; +} + +/// Create a machine node performing a notional SHL of Op by ShlAmount. If +/// ShlAmount is negative, do a (logical) right-shift instead. If ShlAmount is +/// 0, return Op unchanged. +static SDValue getLeftShift(SelectionDAG *CurDAG, SDValue Op, int ShlAmount) { + if (ShlAmount == 0) + return Op; + + EVT VT = Op.getValueType(); + unsigned BitWidth = VT.getSizeInBits(); + unsigned UBFMOpc = BitWidth == 32 ? AArch64::UBFMWri : AArch64::UBFMXri; + + SDNode *ShiftNode; + if (ShlAmount > 0) { + // LSL wD, wN, #Amt == UBFM wD, wN, #32-Amt, #31-Amt + ShiftNode = CurDAG->getMachineNode( + UBFMOpc, SDLoc(Op), VT, Op, + CurDAG->getTargetConstant(BitWidth - ShlAmount, VT), + CurDAG->getTargetConstant(BitWidth - 1 - ShlAmount, VT)); + } else { + // LSR wD, wN, #Amt == UBFM wD, wN, #Amt, #32-1 + assert(ShlAmount < 0 && "expected right shift"); + int ShrAmount = -ShlAmount; + ShiftNode = CurDAG->getMachineNode( + UBFMOpc, SDLoc(Op), VT, Op, CurDAG->getTargetConstant(ShrAmount, VT), + CurDAG->getTargetConstant(BitWidth - 1, VT)); + } + + return SDValue(ShiftNode, 0); +} + +/// Does this tree qualify as an attempt to move a bitfield into position, +/// essentially "(and (shl VAL, N), Mask)". +static bool isBitfieldPositioningOp(SelectionDAG *CurDAG, SDValue Op, + SDValue &Src, int &ShiftAmount, + int &MaskWidth) { + EVT VT = Op.getValueType(); + unsigned BitWidth = VT.getSizeInBits(); + (void)BitWidth; + assert(BitWidth == 32 || BitWidth == 64); + + APInt KnownZero, KnownOne; + CurDAG->computeKnownBits(Op, KnownZero, KnownOne); + + // Non-zero in the sense that they're not provably zero, which is the key + // point if we want to use this value + uint64_t NonZeroBits = (~KnownZero).getZExtValue(); + + // Discard a constant AND mask if present. It's safe because the node will + // already have been factored into the computeKnownBits calculation above. 
+ uint64_t AndImm; + if (isOpcWithIntImmediate(Op.getNode(), ISD::AND, AndImm)) { + assert((~APInt(BitWidth, AndImm) & ~KnownZero) == 0); + Op = Op.getOperand(0); + } + + uint64_t ShlImm; + if (!isOpcWithIntImmediate(Op.getNode(), ISD::SHL, ShlImm)) + return false; + Op = Op.getOperand(0); + + if (!isShiftedMask_64(NonZeroBits)) + return false; + + ShiftAmount = countTrailingZeros(NonZeroBits); + MaskWidth = CountTrailingOnes_64(NonZeroBits >> ShiftAmount); + + // BFI encompasses sufficiently many nodes that it's worth inserting an extra + // LSL/LSR if the mask in NonZeroBits doesn't quite match up with the ISD::SHL + // amount. + Src = getLeftShift(CurDAG, Op, ShlImm - ShiftAmount); + + return true; +} + +// Given a OR operation, check if we have the following pattern +// ubfm c, b, imm, imm2 (or something that does the same jobs, see +// isBitfieldExtractOp) +// d = e & mask2 ; where mask is a binary sequence of 1..10..0 and +// countTrailingZeros(mask2) == imm2 - imm + 1 +// f = d | c +// if yes, given reference arguments will be update so that one can replace +// the OR instruction with: +// f = Opc Opd0, Opd1, LSB, MSB ; where Opc is a BFM, LSB = imm, and MSB = imm2 +static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Dst, + SDValue &Src, unsigned &ImmR, + unsigned &ImmS, SelectionDAG *CurDAG) { + assert(N->getOpcode() == ISD::OR && "Expect a OR operation"); + + // Set Opc + EVT VT = N->getValueType(0); + if (VT == MVT::i32) + Opc = AArch64::BFMWri; + else if (VT == MVT::i64) + Opc = AArch64::BFMXri; + else + return false; + + // Because of simplify-demanded-bits in DAGCombine, involved masks may not + // have the expected shape. Try to undo that. + APInt UsefulBits; + getUsefulBits(SDValue(N, 0), UsefulBits); + + unsigned NumberOfIgnoredLowBits = UsefulBits.countTrailingZeros(); + unsigned NumberOfIgnoredHighBits = UsefulBits.countLeadingZeros(); + + // OR is commutative, check both possibilities (does llvm provide a + // way to do that directely, e.g., via code matcher?) + SDValue OrOpd1Val = N->getOperand(1); + SDNode *OrOpd0 = N->getOperand(0).getNode(); + SDNode *OrOpd1 = N->getOperand(1).getNode(); + for (int i = 0; i < 2; + ++i, std::swap(OrOpd0, OrOpd1), OrOpd1Val = N->getOperand(0)) { + unsigned BFXOpc; + int DstLSB, Width; + if (isBitfieldExtractOp(CurDAG, OrOpd0, BFXOpc, Src, ImmR, ImmS, + NumberOfIgnoredLowBits, true)) { + // Check that the returned opcode is compatible with the pattern, + // i.e., same type and zero extended (U and not S) + if ((BFXOpc != AArch64::UBFMXri && VT == MVT::i64) || + (BFXOpc != AArch64::UBFMWri && VT == MVT::i32)) + continue; + + // Compute the width of the bitfield insertion + DstLSB = 0; + Width = ImmS - ImmR + 1; + // FIXME: This constraint is to catch bitfield insertion we may + // want to widen the pattern if we want to grab general bitfied + // move case + if (Width <= 0) + continue; + + // If the mask on the insertee is correct, we have a BFXIL operation. We + // can share the ImmR and ImmS values from the already-computed UBFM. + } else if (isBitfieldPositioningOp(CurDAG, SDValue(OrOpd0, 0), Src, + DstLSB, Width)) { + ImmR = (VT.getSizeInBits() - DstLSB) % VT.getSizeInBits(); + ImmS = Width - 1; + } else + continue; + + // Check the second part of the pattern + EVT VT = OrOpd1->getValueType(0); + assert((VT == MVT::i32 || VT == MVT::i64) && "unexpected OR operand"); + + // Compute the Known Zero for the candidate of the first operand. 
+ // This allows to catch more general case than just looking for + // AND with imm. Indeed, simplify-demanded-bits may have removed + // the AND instruction because it proves it was useless. + APInt KnownZero, KnownOne; + CurDAG->computeKnownBits(OrOpd1Val, KnownZero, KnownOne); + + // Check if there is enough room for the second operand to appear + // in the first one + APInt BitsToBeInserted = + APInt::getBitsSet(KnownZero.getBitWidth(), DstLSB, DstLSB + Width); + + if ((BitsToBeInserted & ~KnownZero) != 0) + continue; + + // Set the first operand + uint64_t Imm; + if (isOpcWithIntImmediate(OrOpd1, ISD::AND, Imm) && + isBitfieldDstMask(Imm, BitsToBeInserted, NumberOfIgnoredHighBits, VT)) + // In that case, we can eliminate the AND + Dst = OrOpd1->getOperand(0); + else + // Maybe the AND has been removed by simplify-demanded-bits + // or is useful because it discards more bits + Dst = OrOpd1Val; + + // both parts match + return true; + } + + return false; +} + +SDNode *AArch64DAGToDAGISel::SelectBitfieldInsertOp(SDNode *N) { + if (N->getOpcode() != ISD::OR) + return nullptr; + + unsigned Opc; + unsigned LSB, MSB; + SDValue Opd0, Opd1; + + if (!isBitfieldInsertOpFromOr(N, Opc, Opd0, Opd1, LSB, MSB, CurDAG)) + return nullptr; + + EVT VT = N->getValueType(0); + SDValue Ops[] = { Opd0, + Opd1, + CurDAG->getTargetConstant(LSB, VT), + CurDAG->getTargetConstant(MSB, VT) }; + return CurDAG->SelectNodeTo(N, Opc, VT, Ops); +} + +SDNode *AArch64DAGToDAGISel::SelectLIBM(SDNode *N) { + EVT VT = N->getValueType(0); + unsigned Variant; + unsigned Opc; + unsigned FRINTXOpcs[] = { AArch64::FRINTXSr, AArch64::FRINTXDr }; + + if (VT == MVT::f32) { + Variant = 0; + } else if (VT == MVT::f64) { + Variant = 1; + } else + return nullptr; // Unrecognized argument type. Fall back on default codegen. + + // Pick the FRINTX variant needed to set the flags. + unsigned FRINTXOpc = FRINTXOpcs[Variant]; + + switch (N->getOpcode()) { + default: + return nullptr; // Unrecognized libm ISD node. Fall back on default codegen. + case ISD::FCEIL: { + unsigned FRINTPOpcs[] = { AArch64::FRINTPSr, AArch64::FRINTPDr }; + Opc = FRINTPOpcs[Variant]; + break; + } + case ISD::FFLOOR: { + unsigned FRINTMOpcs[] = { AArch64::FRINTMSr, AArch64::FRINTMDr }; + Opc = FRINTMOpcs[Variant]; + break; + } + case ISD::FTRUNC: { + unsigned FRINTZOpcs[] = { AArch64::FRINTZSr, AArch64::FRINTZDr }; + Opc = FRINTZOpcs[Variant]; + break; + } + case ISD::FROUND: { + unsigned FRINTAOpcs[] = { AArch64::FRINTASr, AArch64::FRINTADr }; + Opc = FRINTAOpcs[Variant]; + break; + } + } + + SDLoc dl(N); + SDValue In = N->getOperand(0); + SmallVector Ops; + Ops.push_back(In); + + if (!TM.Options.UnsafeFPMath) { + SDNode *FRINTX = CurDAG->getMachineNode(FRINTXOpc, dl, VT, MVT::Glue, In); + Ops.push_back(SDValue(FRINTX, 1)); + } + + return CurDAG->getMachineNode(Opc, dl, VT, Ops); +} + +bool +AArch64DAGToDAGISel::SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos, + unsigned RegWidth) { + APFloat FVal(0.0); + if (ConstantFPSDNode *CN = dyn_cast(N)) + FVal = CN->getValueAPF(); + else if (LoadSDNode *LN = dyn_cast(N)) { + // Some otherwise illegal constants are allowed in this case. 
+    if (LN->getOperand(1).getOpcode() != AArch64ISD::ADDlow ||
+        !isa<ConstantPoolSDNode>(LN->getOperand(1)->getOperand(1)))
+      return false;
+
+    ConstantPoolSDNode *CN =
+        dyn_cast<ConstantPoolSDNode>(LN->getOperand(1)->getOperand(1));
+    FVal = cast<ConstantFP>(CN->getConstVal())->getValueAPF();
+  } else
+    return false;
+
+  // An FCVT[SU] instruction performs: convertToInt(Val * 2^fbits) where fbits
+  // is between 1 and 32 for a destination w-register, or 1 and 64 for an
+  // x-register.
+  //
+  // By this stage, we've detected (fp_to_[su]int (fmul Val, THIS_NODE)) so we
+  // want THIS_NODE to be 2^fbits. This is much easier to deal with using
+  // integers.
+  bool IsExact;
+
+  // fbits is between 1 and 64 in the worst-case, which means the fmul
+  // could have 2^64 as an actual operand. Need 65 bits of precision.
+  APSInt IntVal(65, true);
+  FVal.convertToInteger(IntVal, APFloat::rmTowardZero, &IsExact);
+
+  // N.b. isPowerOf2 also checks for > 0.
+  if (!IsExact || !IntVal.isPowerOf2()) return false;
+  unsigned FBits = IntVal.logBase2();
+
+  // Checks above should have guaranteed that we haven't lost information in
+  // finding FBits, but it must still be in range.
+  if (FBits == 0 || FBits > RegWidth) return false;
+
+  FixedPos = CurDAG->getTargetConstant(FBits, MVT::i32);
+  return true;
+}
+
+SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
+  // Dump information about the Node being selected
+  DEBUG(errs() << "Selecting: ");
+  DEBUG(Node->dump(CurDAG));
+  DEBUG(errs() << "\n");
+
+  // If we have a custom node, we already have selected!
+  if (Node->isMachineOpcode()) {
+    DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
+    Node->setNodeId(-1);
+    return nullptr;
+  }
+
+  // Few custom selection stuff.
+  SDNode *ResNode = nullptr;
+  EVT VT = Node->getValueType(0);
+
+  switch (Node->getOpcode()) {
+  default:
+    break;
+
+  case ISD::ADD:
+    if (SDNode *I = SelectMLAV64LaneV128(Node))
+      return I;
+    break;
+
+  case ISD::LOAD: {
+    // Try to select as an indexed load. Fall through to normal processing
+    // if we can't.
+    bool Done = false;
+    SDNode *I = SelectIndexedLoad(Node, Done);
+    if (Done)
+      return I;
+    break;
+  }
+
+  case ISD::SRL:
+  case ISD::AND:
+  case ISD::SRA:
+    if (SDNode *I = SelectBitfieldExtractOp(Node))
+      return I;
+    break;
+
+  case ISD::OR:
+    if (SDNode *I = SelectBitfieldInsertOp(Node))
+      return I;
+    break;
+
+  case ISD::EXTRACT_VECTOR_ELT: {
+    // Extracting lane zero is a special case where we can just use a plain
+    // EXTRACT_SUBREG instruction, which will become FMOV. This is easier for
+    // the rest of the compiler, especially the register allocator and copy
+    // propagation, to reason about, so is preferred when it's possible to
+    // use it.
+    ConstantSDNode *LaneNode = cast<ConstantSDNode>(Node->getOperand(1));
+    // Bail and use the default Select() for non-zero lanes.
+    if (LaneNode->getZExtValue() != 0)
+      break;
+    // If the element type is not the same as the result type, likewise
+    // bail and use the default Select(), as there's more to do than just
+    // a cross-class COPY. This catches extracts of i8 and i16 elements
+    // since they will need an explicit zext.
+ if (VT != Node->getOperand(0).getValueType().getVectorElementType()) + break; + unsigned SubReg; + switch (Node->getOperand(0) + .getValueType() + .getVectorElementType() + .getSizeInBits()) { + default: + assert(0 && "Unexpected vector element type!"); + case 64: + SubReg = AArch64::dsub; + break; + case 32: + SubReg = AArch64::ssub; + break; + case 16: // FALLTHROUGH + case 8: + llvm_unreachable("unexpected zext-requiring extract element!"); + } + SDValue Extract = CurDAG->getTargetExtractSubreg(SubReg, SDLoc(Node), VT, + Node->getOperand(0)); + DEBUG(dbgs() << "ISEL: Custom selection!\n=> "); + DEBUG(Extract->dumpr(CurDAG)); + DEBUG(dbgs() << "\n"); + return Extract.getNode(); + } + case ISD::Constant: { + // Materialize zero constants as copies from WZR/XZR. This allows + // the coalescer to propagate these into other instructions. + ConstantSDNode *ConstNode = cast(Node); + if (ConstNode->isNullValue()) { + if (VT == MVT::i32) + return CurDAG->getCopyFromReg(CurDAG->getEntryNode(), SDLoc(Node), + AArch64::WZR, MVT::i32).getNode(); + else if (VT == MVT::i64) + return CurDAG->getCopyFromReg(CurDAG->getEntryNode(), SDLoc(Node), + AArch64::XZR, MVT::i64).getNode(); + } + break; + } + + case ISD::FrameIndex: { + // Selects to ADDXri FI, 0 which in turn will become ADDXri SP, imm. + int FI = cast(Node)->getIndex(); + unsigned Shifter = AArch64_AM::getShifterImm(AArch64_AM::LSL, 0); + const TargetLowering *TLI = getTargetLowering(); + SDValue TFI = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy()); + SDValue Ops[] = { TFI, CurDAG->getTargetConstant(0, MVT::i32), + CurDAG->getTargetConstant(Shifter, MVT::i32) }; + return CurDAG->SelectNodeTo(Node, AArch64::ADDXri, MVT::i64, Ops); + } + case ISD::INTRINSIC_W_CHAIN: { + unsigned IntNo = cast(Node->getOperand(1))->getZExtValue(); + switch (IntNo) { + default: + break; + case Intrinsic::aarch64_ldaxp: + case Intrinsic::aarch64_ldxp: { + unsigned Op = + IntNo == Intrinsic::aarch64_ldaxp ? AArch64::LDAXPX : AArch64::LDXPX; + SDValue MemAddr = Node->getOperand(2); + SDLoc DL(Node); + SDValue Chain = Node->getOperand(0); + + SDNode *Ld = CurDAG->getMachineNode(Op, DL, MVT::i64, MVT::i64, + MVT::Other, MemAddr, Chain); + + // Transfer memoperands. + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast(Node)->getMemOperand(); + cast(Ld)->setMemRefs(MemOp, MemOp + 1); + return Ld; + } + case Intrinsic::aarch64_stlxp: + case Intrinsic::aarch64_stxp: { + unsigned Op = + IntNo == Intrinsic::aarch64_stlxp ? AArch64::STLXPX : AArch64::STXPX; + SDLoc DL(Node); + SDValue Chain = Node->getOperand(0); + SDValue ValLo = Node->getOperand(2); + SDValue ValHi = Node->getOperand(3); + SDValue MemAddr = Node->getOperand(4); + + // Place arguments in the right order. + SmallVector Ops; + Ops.push_back(ValLo); + Ops.push_back(ValHi); + Ops.push_back(MemAddr); + Ops.push_back(Chain); + + SDNode *St = CurDAG->getMachineNode(Op, DL, MVT::i32, MVT::Other, Ops); + // Transfer memoperands. 
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast(Node)->getMemOperand(); + cast(St)->setMemRefs(MemOp, MemOp + 1); + + return St; + } + case Intrinsic::aarch64_neon_ld1x2: + if (VT == MVT::v8i8) + return SelectLoad(Node, 2, AArch64::LD1Twov8b, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectLoad(Node, 2, AArch64::LD1Twov16b, AArch64::qsub0); + else if (VT == MVT::v4i16) + return SelectLoad(Node, 2, AArch64::LD1Twov4h, AArch64::dsub0); + else if (VT == MVT::v8i16) + return SelectLoad(Node, 2, AArch64::LD1Twov8h, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectLoad(Node, 2, AArch64::LD1Twov2s, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectLoad(Node, 2, AArch64::LD1Twov4s, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectLoad(Node, 2, AArch64::LD1Twov1d, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectLoad(Node, 2, AArch64::LD1Twov2d, AArch64::qsub0); + break; + case Intrinsic::aarch64_neon_ld1x3: + if (VT == MVT::v8i8) + return SelectLoad(Node, 3, AArch64::LD1Threev8b, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectLoad(Node, 3, AArch64::LD1Threev16b, AArch64::qsub0); + else if (VT == MVT::v4i16) + return SelectLoad(Node, 3, AArch64::LD1Threev4h, AArch64::dsub0); + else if (VT == MVT::v8i16) + return SelectLoad(Node, 3, AArch64::LD1Threev8h, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectLoad(Node, 3, AArch64::LD1Threev2s, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectLoad(Node, 3, AArch64::LD1Threev4s, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectLoad(Node, 3, AArch64::LD1Threev1d, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectLoad(Node, 3, AArch64::LD1Threev2d, AArch64::qsub0); + break; + case Intrinsic::aarch64_neon_ld1x4: + if (VT == MVT::v8i8) + return SelectLoad(Node, 4, AArch64::LD1Fourv8b, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectLoad(Node, 4, AArch64::LD1Fourv16b, AArch64::qsub0); + else if (VT == MVT::v4i16) + return SelectLoad(Node, 4, AArch64::LD1Fourv4h, AArch64::dsub0); + else if (VT == MVT::v8i16) + return SelectLoad(Node, 4, AArch64::LD1Fourv8h, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectLoad(Node, 4, AArch64::LD1Fourv2s, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectLoad(Node, 4, AArch64::LD1Fourv4s, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectLoad(Node, 4, AArch64::LD1Fourv1d, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectLoad(Node, 4, AArch64::LD1Fourv2d, AArch64::qsub0); + break; + case Intrinsic::aarch64_neon_ld2: + if (VT == MVT::v8i8) + return SelectLoad(Node, 2, AArch64::LD2Twov8b, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectLoad(Node, 2, AArch64::LD2Twov16b, AArch64::qsub0); + else if (VT == MVT::v4i16) + return SelectLoad(Node, 2, AArch64::LD2Twov4h, AArch64::dsub0); + else if (VT == MVT::v8i16) + return SelectLoad(Node, 2, AArch64::LD2Twov8h, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectLoad(Node, 2, AArch64::LD2Twov2s, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectLoad(Node, 2, AArch64::LD2Twov4s, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return 
SelectLoad(Node, 2, AArch64::LD1Twov1d, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectLoad(Node, 2, AArch64::LD2Twov2d, AArch64::qsub0); + break; + case Intrinsic::aarch64_neon_ld3: + if (VT == MVT::v8i8) + return SelectLoad(Node, 3, AArch64::LD3Threev8b, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectLoad(Node, 3, AArch64::LD3Threev16b, AArch64::qsub0); + else if (VT == MVT::v4i16) + return SelectLoad(Node, 3, AArch64::LD3Threev4h, AArch64::dsub0); + else if (VT == MVT::v8i16) + return SelectLoad(Node, 3, AArch64::LD3Threev8h, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectLoad(Node, 3, AArch64::LD3Threev2s, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectLoad(Node, 3, AArch64::LD3Threev4s, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectLoad(Node, 3, AArch64::LD1Threev1d, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectLoad(Node, 3, AArch64::LD3Threev2d, AArch64::qsub0); + break; + case Intrinsic::aarch64_neon_ld4: + if (VT == MVT::v8i8) + return SelectLoad(Node, 4, AArch64::LD4Fourv8b, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectLoad(Node, 4, AArch64::LD4Fourv16b, AArch64::qsub0); + else if (VT == MVT::v4i16) + return SelectLoad(Node, 4, AArch64::LD4Fourv4h, AArch64::dsub0); + else if (VT == MVT::v8i16) + return SelectLoad(Node, 4, AArch64::LD4Fourv8h, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectLoad(Node, 4, AArch64::LD4Fourv2s, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectLoad(Node, 4, AArch64::LD4Fourv4s, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectLoad(Node, 4, AArch64::LD1Fourv1d, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectLoad(Node, 4, AArch64::LD4Fourv2d, AArch64::qsub0); + break; + case Intrinsic::aarch64_neon_ld2r: + if (VT == MVT::v8i8) + return SelectLoad(Node, 2, AArch64::LD2Rv8b, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectLoad(Node, 2, AArch64::LD2Rv16b, AArch64::qsub0); + else if (VT == MVT::v4i16) + return SelectLoad(Node, 2, AArch64::LD2Rv4h, AArch64::dsub0); + else if (VT == MVT::v8i16) + return SelectLoad(Node, 2, AArch64::LD2Rv8h, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectLoad(Node, 2, AArch64::LD2Rv2s, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectLoad(Node, 2, AArch64::LD2Rv4s, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectLoad(Node, 2, AArch64::LD2Rv1d, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectLoad(Node, 2, AArch64::LD2Rv2d, AArch64::qsub0); + break; + case Intrinsic::aarch64_neon_ld3r: + if (VT == MVT::v8i8) + return SelectLoad(Node, 3, AArch64::LD3Rv8b, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectLoad(Node, 3, AArch64::LD3Rv16b, AArch64::qsub0); + else if (VT == MVT::v4i16) + return SelectLoad(Node, 3, AArch64::LD3Rv4h, AArch64::dsub0); + else if (VT == MVT::v8i16) + return SelectLoad(Node, 3, AArch64::LD3Rv8h, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectLoad(Node, 3, AArch64::LD3Rv2s, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectLoad(Node, 3, AArch64::LD3Rv4s, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectLoad(Node, 3, 
AArch64::LD3Rv1d, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectLoad(Node, 3, AArch64::LD3Rv2d, AArch64::qsub0); + break; + case Intrinsic::aarch64_neon_ld4r: + if (VT == MVT::v8i8) + return SelectLoad(Node, 4, AArch64::LD4Rv8b, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectLoad(Node, 4, AArch64::LD4Rv16b, AArch64::qsub0); + else if (VT == MVT::v4i16) + return SelectLoad(Node, 4, AArch64::LD4Rv4h, AArch64::dsub0); + else if (VT == MVT::v8i16) + return SelectLoad(Node, 4, AArch64::LD4Rv8h, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectLoad(Node, 4, AArch64::LD4Rv2s, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectLoad(Node, 4, AArch64::LD4Rv4s, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectLoad(Node, 4, AArch64::LD4Rv1d, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectLoad(Node, 4, AArch64::LD4Rv2d, AArch64::qsub0); + break; + case Intrinsic::aarch64_neon_ld2lane: + if (VT == MVT::v16i8 || VT == MVT::v8i8) + return SelectLoadLane(Node, 2, AArch64::LD2i8); + else if (VT == MVT::v8i16 || VT == MVT::v4i16) + return SelectLoadLane(Node, 2, AArch64::LD2i16); + else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) + return SelectLoadLane(Node, 2, AArch64::LD2i32); + else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) + return SelectLoadLane(Node, 2, AArch64::LD2i64); + break; + case Intrinsic::aarch64_neon_ld3lane: + if (VT == MVT::v16i8 || VT == MVT::v8i8) + return SelectLoadLane(Node, 3, AArch64::LD3i8); + else if (VT == MVT::v8i16 || VT == MVT::v4i16) + return SelectLoadLane(Node, 3, AArch64::LD3i16); + else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) + return SelectLoadLane(Node, 3, AArch64::LD3i32); + else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) + return SelectLoadLane(Node, 3, AArch64::LD3i64); + break; + case Intrinsic::aarch64_neon_ld4lane: + if (VT == MVT::v16i8 || VT == MVT::v8i8) + return SelectLoadLane(Node, 4, AArch64::LD4i8); + else if (VT == MVT::v8i16 || VT == MVT::v4i16) + return SelectLoadLane(Node, 4, AArch64::LD4i16); + else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) + return SelectLoadLane(Node, 4, AArch64::LD4i32); + else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) + return SelectLoadLane(Node, 4, AArch64::LD4i64); + break; + } + } break; + case ISD::INTRINSIC_WO_CHAIN: { + unsigned IntNo = cast(Node->getOperand(0))->getZExtValue(); + switch (IntNo) { + default: + break; + case Intrinsic::aarch64_neon_tbl2: + return SelectTable(Node, 2, VT == MVT::v8i8 ? AArch64::TBLv8i8Two + : AArch64::TBLv16i8Two, + false); + case Intrinsic::aarch64_neon_tbl3: + return SelectTable(Node, 3, VT == MVT::v8i8 ? AArch64::TBLv8i8Three + : AArch64::TBLv16i8Three, + false); + case Intrinsic::aarch64_neon_tbl4: + return SelectTable(Node, 4, VT == MVT::v8i8 ? AArch64::TBLv8i8Four + : AArch64::TBLv16i8Four, + false); + case Intrinsic::aarch64_neon_tbx2: + return SelectTable(Node, 2, VT == MVT::v8i8 ? AArch64::TBXv8i8Two + : AArch64::TBXv16i8Two, + true); + case Intrinsic::aarch64_neon_tbx3: + return SelectTable(Node, 3, VT == MVT::v8i8 ? AArch64::TBXv8i8Three + : AArch64::TBXv16i8Three, + true); + case Intrinsic::aarch64_neon_tbx4: + return SelectTable(Node, 4, VT == MVT::v8i8 ? 
AArch64::TBXv8i8Four + : AArch64::TBXv16i8Four, + true); + case Intrinsic::aarch64_neon_smull: + case Intrinsic::aarch64_neon_umull: + if (SDNode *N = SelectMULLV64LaneV128(IntNo, Node)) + return N; + break; + } + break; + } + case ISD::INTRINSIC_VOID: { + unsigned IntNo = cast(Node->getOperand(1))->getZExtValue(); + if (Node->getNumOperands() >= 3) + VT = Node->getOperand(2)->getValueType(0); + switch (IntNo) { + default: + break; + case Intrinsic::aarch64_neon_st1x2: { + if (VT == MVT::v8i8) + return SelectStore(Node, 2, AArch64::ST1Twov8b); + else if (VT == MVT::v16i8) + return SelectStore(Node, 2, AArch64::ST1Twov16b); + else if (VT == MVT::v4i16) + return SelectStore(Node, 2, AArch64::ST1Twov4h); + else if (VT == MVT::v8i16) + return SelectStore(Node, 2, AArch64::ST1Twov8h); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectStore(Node, 2, AArch64::ST1Twov2s); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectStore(Node, 2, AArch64::ST1Twov4s); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectStore(Node, 2, AArch64::ST1Twov2d); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectStore(Node, 2, AArch64::ST1Twov1d); + break; + } + case Intrinsic::aarch64_neon_st1x3: { + if (VT == MVT::v8i8) + return SelectStore(Node, 3, AArch64::ST1Threev8b); + else if (VT == MVT::v16i8) + return SelectStore(Node, 3, AArch64::ST1Threev16b); + else if (VT == MVT::v4i16) + return SelectStore(Node, 3, AArch64::ST1Threev4h); + else if (VT == MVT::v8i16) + return SelectStore(Node, 3, AArch64::ST1Threev8h); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectStore(Node, 3, AArch64::ST1Threev2s); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectStore(Node, 3, AArch64::ST1Threev4s); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectStore(Node, 3, AArch64::ST1Threev2d); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectStore(Node, 3, AArch64::ST1Threev1d); + break; + } + case Intrinsic::aarch64_neon_st1x4: { + if (VT == MVT::v8i8) + return SelectStore(Node, 4, AArch64::ST1Fourv8b); + else if (VT == MVT::v16i8) + return SelectStore(Node, 4, AArch64::ST1Fourv16b); + else if (VT == MVT::v4i16) + return SelectStore(Node, 4, AArch64::ST1Fourv4h); + else if (VT == MVT::v8i16) + return SelectStore(Node, 4, AArch64::ST1Fourv8h); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectStore(Node, 4, AArch64::ST1Fourv2s); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectStore(Node, 4, AArch64::ST1Fourv4s); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectStore(Node, 4, AArch64::ST1Fourv2d); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectStore(Node, 4, AArch64::ST1Fourv1d); + break; + } + case Intrinsic::aarch64_neon_st2: { + if (VT == MVT::v8i8) + return SelectStore(Node, 2, AArch64::ST2Twov8b); + else if (VT == MVT::v16i8) + return SelectStore(Node, 2, AArch64::ST2Twov16b); + else if (VT == MVT::v4i16) + return SelectStore(Node, 2, AArch64::ST2Twov4h); + else if (VT == MVT::v8i16) + return SelectStore(Node, 2, AArch64::ST2Twov8h); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectStore(Node, 2, AArch64::ST2Twov2s); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectStore(Node, 2, AArch64::ST2Twov4s); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectStore(Node, 2, AArch64::ST2Twov2d); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectStore(Node, 2, AArch64::ST1Twov1d); + break; + } + case 
Intrinsic::aarch64_neon_st3: { + if (VT == MVT::v8i8) + return SelectStore(Node, 3, AArch64::ST3Threev8b); + else if (VT == MVT::v16i8) + return SelectStore(Node, 3, AArch64::ST3Threev16b); + else if (VT == MVT::v4i16) + return SelectStore(Node, 3, AArch64::ST3Threev4h); + else if (VT == MVT::v8i16) + return SelectStore(Node, 3, AArch64::ST3Threev8h); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectStore(Node, 3, AArch64::ST3Threev2s); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectStore(Node, 3, AArch64::ST3Threev4s); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectStore(Node, 3, AArch64::ST3Threev2d); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectStore(Node, 3, AArch64::ST1Threev1d); + break; + } + case Intrinsic::aarch64_neon_st4: { + if (VT == MVT::v8i8) + return SelectStore(Node, 4, AArch64::ST4Fourv8b); + else if (VT == MVT::v16i8) + return SelectStore(Node, 4, AArch64::ST4Fourv16b); + else if (VT == MVT::v4i16) + return SelectStore(Node, 4, AArch64::ST4Fourv4h); + else if (VT == MVT::v8i16) + return SelectStore(Node, 4, AArch64::ST4Fourv8h); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectStore(Node, 4, AArch64::ST4Fourv2s); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectStore(Node, 4, AArch64::ST4Fourv4s); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectStore(Node, 4, AArch64::ST4Fourv2d); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectStore(Node, 4, AArch64::ST1Fourv1d); + break; + } + case Intrinsic::aarch64_neon_st2lane: { + if (VT == MVT::v16i8 || VT == MVT::v8i8) + return SelectStoreLane(Node, 2, AArch64::ST2i8); + else if (VT == MVT::v8i16 || VT == MVT::v4i16) + return SelectStoreLane(Node, 2, AArch64::ST2i16); + else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) + return SelectStoreLane(Node, 2, AArch64::ST2i32); + else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) + return SelectStoreLane(Node, 2, AArch64::ST2i64); + break; + } + case Intrinsic::aarch64_neon_st3lane: { + if (VT == MVT::v16i8 || VT == MVT::v8i8) + return SelectStoreLane(Node, 3, AArch64::ST3i8); + else if (VT == MVT::v8i16 || VT == MVT::v4i16) + return SelectStoreLane(Node, 3, AArch64::ST3i16); + else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) + return SelectStoreLane(Node, 3, AArch64::ST3i32); + else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) + return SelectStoreLane(Node, 3, AArch64::ST3i64); + break; + } + case Intrinsic::aarch64_neon_st4lane: { + if (VT == MVT::v16i8 || VT == MVT::v8i8) + return SelectStoreLane(Node, 4, AArch64::ST4i8); + else if (VT == MVT::v8i16 || VT == MVT::v4i16) + return SelectStoreLane(Node, 4, AArch64::ST4i16); + else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) + return SelectStoreLane(Node, 4, AArch64::ST4i32); + else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) + return SelectStoreLane(Node, 4, AArch64::ST4i64); + break; + } + } + } + case AArch64ISD::LD2post: { + if (VT == MVT::v8i8) + return SelectPostLoad(Node, 2, AArch64::LD2Twov8b_POST, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectPostLoad(Node, 2, AArch64::LD2Twov16b_POST, AArch64::qsub0); + else if (VT == MVT::v4i16) + return SelectPostLoad(Node, 2, AArch64::LD2Twov4h_POST, AArch64::dsub0); + else if (VT == MVT::v8i16) + return 
SelectPostLoad(Node, 2, AArch64::LD2Twov8h_POST, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostLoad(Node, 2, AArch64::LD2Twov2s_POST, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostLoad(Node, 2, AArch64::LD2Twov4s_POST, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostLoad(Node, 2, AArch64::LD1Twov1d_POST, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostLoad(Node, 2, AArch64::LD2Twov2d_POST, AArch64::qsub0); + break; + } + case AArch64ISD::LD3post: { + if (VT == MVT::v8i8) + return SelectPostLoad(Node, 3, AArch64::LD3Threev8b_POST, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectPostLoad(Node, 3, AArch64::LD3Threev16b_POST, AArch64::qsub0); + else if (VT == MVT::v4i16) + return SelectPostLoad(Node, 3, AArch64::LD3Threev4h_POST, AArch64::dsub0); + else if (VT == MVT::v8i16) + return SelectPostLoad(Node, 3, AArch64::LD3Threev8h_POST, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostLoad(Node, 3, AArch64::LD3Threev2s_POST, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostLoad(Node, 3, AArch64::LD3Threev4s_POST, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostLoad(Node, 3, AArch64::LD1Threev1d_POST, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostLoad(Node, 3, AArch64::LD3Threev2d_POST, AArch64::qsub0); + break; + } + case AArch64ISD::LD4post: { + if (VT == MVT::v8i8) + return SelectPostLoad(Node, 4, AArch64::LD4Fourv8b_POST, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectPostLoad(Node, 4, AArch64::LD4Fourv16b_POST, AArch64::qsub0); + else if (VT == MVT::v4i16) + return SelectPostLoad(Node, 4, AArch64::LD4Fourv4h_POST, AArch64::dsub0); + else if (VT == MVT::v8i16) + return SelectPostLoad(Node, 4, AArch64::LD4Fourv8h_POST, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostLoad(Node, 4, AArch64::LD4Fourv2s_POST, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostLoad(Node, 4, AArch64::LD4Fourv4s_POST, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostLoad(Node, 4, AArch64::LD1Fourv1d_POST, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostLoad(Node, 4, AArch64::LD4Fourv2d_POST, AArch64::qsub0); + break; + } + case AArch64ISD::LD1x2post: { + if (VT == MVT::v8i8) + return SelectPostLoad(Node, 2, AArch64::LD1Twov8b_POST, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectPostLoad(Node, 2, AArch64::LD1Twov16b_POST, AArch64::qsub0); + else if (VT == MVT::v4i16) + return SelectPostLoad(Node, 2, AArch64::LD1Twov4h_POST, AArch64::dsub0); + else if (VT == MVT::v8i16) + return SelectPostLoad(Node, 2, AArch64::LD1Twov8h_POST, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostLoad(Node, 2, AArch64::LD1Twov2s_POST, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostLoad(Node, 2, AArch64::LD1Twov4s_POST, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostLoad(Node, 2, AArch64::LD1Twov1d_POST, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostLoad(Node, 2, AArch64::LD1Twov2d_POST, AArch64::qsub0); + break; + } + case AArch64ISD::LD1x3post: { + if (VT == MVT::v8i8) + return SelectPostLoad(Node, 3, 
AArch64::LD1Threev8b_POST, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectPostLoad(Node, 3, AArch64::LD1Threev16b_POST, AArch64::qsub0); + else if (VT == MVT::v4i16) + return SelectPostLoad(Node, 3, AArch64::LD1Threev4h_POST, AArch64::dsub0); + else if (VT == MVT::v8i16) + return SelectPostLoad(Node, 3, AArch64::LD1Threev8h_POST, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostLoad(Node, 3, AArch64::LD1Threev2s_POST, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostLoad(Node, 3, AArch64::LD1Threev4s_POST, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostLoad(Node, 3, AArch64::LD1Threev1d_POST, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostLoad(Node, 3, AArch64::LD1Threev2d_POST, AArch64::qsub0); + break; + } + case AArch64ISD::LD1x4post: { + if (VT == MVT::v8i8) + return SelectPostLoad(Node, 4, AArch64::LD1Fourv8b_POST, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectPostLoad(Node, 4, AArch64::LD1Fourv16b_POST, AArch64::qsub0); + else if (VT == MVT::v4i16) + return SelectPostLoad(Node, 4, AArch64::LD1Fourv4h_POST, AArch64::dsub0); + else if (VT == MVT::v8i16) + return SelectPostLoad(Node, 4, AArch64::LD1Fourv8h_POST, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostLoad(Node, 4, AArch64::LD1Fourv2s_POST, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostLoad(Node, 4, AArch64::LD1Fourv4s_POST, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostLoad(Node, 4, AArch64::LD1Fourv1d_POST, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostLoad(Node, 4, AArch64::LD1Fourv2d_POST, AArch64::qsub0); + break; + } + case AArch64ISD::LD1DUPpost: { + if (VT == MVT::v8i8) + return SelectPostLoad(Node, 1, AArch64::LD1Rv8b_POST, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectPostLoad(Node, 1, AArch64::LD1Rv16b_POST, AArch64::qsub0); + else if (VT == MVT::v4i16) + return SelectPostLoad(Node, 1, AArch64::LD1Rv4h_POST, AArch64::dsub0); + else if (VT == MVT::v8i16) + return SelectPostLoad(Node, 1, AArch64::LD1Rv8h_POST, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostLoad(Node, 1, AArch64::LD1Rv2s_POST, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostLoad(Node, 1, AArch64::LD1Rv4s_POST, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostLoad(Node, 1, AArch64::LD1Rv1d_POST, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostLoad(Node, 1, AArch64::LD1Rv2d_POST, AArch64::qsub0); + break; + } + case AArch64ISD::LD2DUPpost: { + if (VT == MVT::v8i8) + return SelectPostLoad(Node, 2, AArch64::LD2Rv8b_POST, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectPostLoad(Node, 2, AArch64::LD2Rv16b_POST, AArch64::qsub0); + else if (VT == MVT::v4i16) + return SelectPostLoad(Node, 2, AArch64::LD2Rv4h_POST, AArch64::dsub0); + else if (VT == MVT::v8i16) + return SelectPostLoad(Node, 2, AArch64::LD2Rv8h_POST, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostLoad(Node, 2, AArch64::LD2Rv2s_POST, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostLoad(Node, 2, AArch64::LD2Rv4s_POST, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostLoad(Node, 2, 
AArch64::LD2Rv1d_POST, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostLoad(Node, 2, AArch64::LD2Rv2d_POST, AArch64::qsub0); + break; + } + case AArch64ISD::LD3DUPpost: { + if (VT == MVT::v8i8) + return SelectPostLoad(Node, 3, AArch64::LD3Rv8b_POST, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectPostLoad(Node, 3, AArch64::LD3Rv16b_POST, AArch64::qsub0); + else if (VT == MVT::v4i16) + return SelectPostLoad(Node, 3, AArch64::LD3Rv4h_POST, AArch64::dsub0); + else if (VT == MVT::v8i16) + return SelectPostLoad(Node, 3, AArch64::LD3Rv8h_POST, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostLoad(Node, 3, AArch64::LD3Rv2s_POST, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostLoad(Node, 3, AArch64::LD3Rv4s_POST, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostLoad(Node, 3, AArch64::LD3Rv1d_POST, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostLoad(Node, 3, AArch64::LD3Rv2d_POST, AArch64::qsub0); + break; + } + case AArch64ISD::LD4DUPpost: { + if (VT == MVT::v8i8) + return SelectPostLoad(Node, 4, AArch64::LD4Rv8b_POST, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectPostLoad(Node, 4, AArch64::LD4Rv16b_POST, AArch64::qsub0); + else if (VT == MVT::v4i16) + return SelectPostLoad(Node, 4, AArch64::LD4Rv4h_POST, AArch64::dsub0); + else if (VT == MVT::v8i16) + return SelectPostLoad(Node, 4, AArch64::LD4Rv8h_POST, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostLoad(Node, 4, AArch64::LD4Rv2s_POST, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostLoad(Node, 4, AArch64::LD4Rv4s_POST, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostLoad(Node, 4, AArch64::LD4Rv1d_POST, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostLoad(Node, 4, AArch64::LD4Rv2d_POST, AArch64::qsub0); + break; + } + case AArch64ISD::LD1LANEpost: { + if (VT == MVT::v16i8 || VT == MVT::v8i8) + return SelectPostLoadLane(Node, 1, AArch64::LD1i8_POST); + else if (VT == MVT::v8i16 || VT == MVT::v4i16) + return SelectPostLoadLane(Node, 1, AArch64::LD1i16_POST); + else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) + return SelectPostLoadLane(Node, 1, AArch64::LD1i32_POST); + else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) + return SelectPostLoadLane(Node, 1, AArch64::LD1i64_POST); + break; + } + case AArch64ISD::LD2LANEpost: { + if (VT == MVT::v16i8 || VT == MVT::v8i8) + return SelectPostLoadLane(Node, 2, AArch64::LD2i8_POST); + else if (VT == MVT::v8i16 || VT == MVT::v4i16) + return SelectPostLoadLane(Node, 2, AArch64::LD2i16_POST); + else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) + return SelectPostLoadLane(Node, 2, AArch64::LD2i32_POST); + else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) + return SelectPostLoadLane(Node, 2, AArch64::LD2i64_POST); + break; + } + case AArch64ISD::LD3LANEpost: { + if (VT == MVT::v16i8 || VT == MVT::v8i8) + return SelectPostLoadLane(Node, 3, AArch64::LD3i8_POST); + else if (VT == MVT::v8i16 || VT == MVT::v4i16) + return SelectPostLoadLane(Node, 3, AArch64::LD3i16_POST); + else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) + return SelectPostLoadLane(Node, 3, 
AArch64::LD3i32_POST); + else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) + return SelectPostLoadLane(Node, 3, AArch64::LD3i64_POST); + break; + } + case AArch64ISD::LD4LANEpost: { + if (VT == MVT::v16i8 || VT == MVT::v8i8) + return SelectPostLoadLane(Node, 4, AArch64::LD4i8_POST); + else if (VT == MVT::v8i16 || VT == MVT::v4i16) + return SelectPostLoadLane(Node, 4, AArch64::LD4i16_POST); + else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) + return SelectPostLoadLane(Node, 4, AArch64::LD4i32_POST); + else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) + return SelectPostLoadLane(Node, 4, AArch64::LD4i64_POST); + break; + } + case AArch64ISD::ST2post: { + VT = Node->getOperand(1).getValueType(); + if (VT == MVT::v8i8) + return SelectPostStore(Node, 2, AArch64::ST2Twov8b_POST); + else if (VT == MVT::v16i8) + return SelectPostStore(Node, 2, AArch64::ST2Twov16b_POST); + else if (VT == MVT::v4i16) + return SelectPostStore(Node, 2, AArch64::ST2Twov4h_POST); + else if (VT == MVT::v8i16) + return SelectPostStore(Node, 2, AArch64::ST2Twov8h_POST); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostStore(Node, 2, AArch64::ST2Twov2s_POST); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostStore(Node, 2, AArch64::ST2Twov4s_POST); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostStore(Node, 2, AArch64::ST2Twov2d_POST); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostStore(Node, 2, AArch64::ST1Twov1d_POST); + break; + } + case AArch64ISD::ST3post: { + VT = Node->getOperand(1).getValueType(); + if (VT == MVT::v8i8) + return SelectPostStore(Node, 3, AArch64::ST3Threev8b_POST); + else if (VT == MVT::v16i8) + return SelectPostStore(Node, 3, AArch64::ST3Threev16b_POST); + else if (VT == MVT::v4i16) + return SelectPostStore(Node, 3, AArch64::ST3Threev4h_POST); + else if (VT == MVT::v8i16) + return SelectPostStore(Node, 3, AArch64::ST3Threev8h_POST); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostStore(Node, 3, AArch64::ST3Threev2s_POST); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostStore(Node, 3, AArch64::ST3Threev4s_POST); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostStore(Node, 3, AArch64::ST3Threev2d_POST); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostStore(Node, 3, AArch64::ST1Threev1d_POST); + break; + } + case AArch64ISD::ST4post: { + VT = Node->getOperand(1).getValueType(); + if (VT == MVT::v8i8) + return SelectPostStore(Node, 4, AArch64::ST4Fourv8b_POST); + else if (VT == MVT::v16i8) + return SelectPostStore(Node, 4, AArch64::ST4Fourv16b_POST); + else if (VT == MVT::v4i16) + return SelectPostStore(Node, 4, AArch64::ST4Fourv4h_POST); + else if (VT == MVT::v8i16) + return SelectPostStore(Node, 4, AArch64::ST4Fourv8h_POST); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostStore(Node, 4, AArch64::ST4Fourv2s_POST); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostStore(Node, 4, AArch64::ST4Fourv4s_POST); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostStore(Node, 4, AArch64::ST4Fourv2d_POST); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostStore(Node, 4, AArch64::ST1Fourv1d_POST); + break; + } + case AArch64ISD::ST1x2post: { + VT = Node->getOperand(1).getValueType(); + if (VT == MVT::v8i8) + return SelectPostStore(Node, 2, AArch64::ST1Twov8b_POST); + else 
if (VT == MVT::v16i8) + return SelectPostStore(Node, 2, AArch64::ST1Twov16b_POST); + else if (VT == MVT::v4i16) + return SelectPostStore(Node, 2, AArch64::ST1Twov4h_POST); + else if (VT == MVT::v8i16) + return SelectPostStore(Node, 2, AArch64::ST1Twov8h_POST); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostStore(Node, 2, AArch64::ST1Twov2s_POST); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostStore(Node, 2, AArch64::ST1Twov4s_POST); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostStore(Node, 2, AArch64::ST1Twov1d_POST); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostStore(Node, 2, AArch64::ST1Twov2d_POST); + break; + } + case AArch64ISD::ST1x3post: { + VT = Node->getOperand(1).getValueType(); + if (VT == MVT::v8i8) + return SelectPostStore(Node, 3, AArch64::ST1Threev8b_POST); + else if (VT == MVT::v16i8) + return SelectPostStore(Node, 3, AArch64::ST1Threev16b_POST); + else if (VT == MVT::v4i16) + return SelectPostStore(Node, 3, AArch64::ST1Threev4h_POST); + else if (VT == MVT::v8i16) + return SelectPostStore(Node, 3, AArch64::ST1Threev8h_POST); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostStore(Node, 3, AArch64::ST1Threev2s_POST); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostStore(Node, 3, AArch64::ST1Threev4s_POST); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostStore(Node, 3, AArch64::ST1Threev1d_POST); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostStore(Node, 3, AArch64::ST1Threev2d_POST); + break; + } + case AArch64ISD::ST1x4post: { + VT = Node->getOperand(1).getValueType(); + if (VT == MVT::v8i8) + return SelectPostStore(Node, 4, AArch64::ST1Fourv8b_POST); + else if (VT == MVT::v16i8) + return SelectPostStore(Node, 4, AArch64::ST1Fourv16b_POST); + else if (VT == MVT::v4i16) + return SelectPostStore(Node, 4, AArch64::ST1Fourv4h_POST); + else if (VT == MVT::v8i16) + return SelectPostStore(Node, 4, AArch64::ST1Fourv8h_POST); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostStore(Node, 4, AArch64::ST1Fourv2s_POST); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostStore(Node, 4, AArch64::ST1Fourv4s_POST); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostStore(Node, 4, AArch64::ST1Fourv1d_POST); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostStore(Node, 4, AArch64::ST1Fourv2d_POST); + break; + } + case AArch64ISD::ST2LANEpost: { + VT = Node->getOperand(1).getValueType(); + if (VT == MVT::v16i8 || VT == MVT::v8i8) + return SelectPostStoreLane(Node, 2, AArch64::ST2i8_POST); + else if (VT == MVT::v8i16 || VT == MVT::v4i16) + return SelectPostStoreLane(Node, 2, AArch64::ST2i16_POST); + else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) + return SelectPostStoreLane(Node, 2, AArch64::ST2i32_POST); + else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) + return SelectPostStoreLane(Node, 2, AArch64::ST2i64_POST); + break; + } + case AArch64ISD::ST3LANEpost: { + VT = Node->getOperand(1).getValueType(); + if (VT == MVT::v16i8 || VT == MVT::v8i8) + return SelectPostStoreLane(Node, 3, AArch64::ST3i8_POST); + else if (VT == MVT::v8i16 || VT == MVT::v4i16) + return SelectPostStoreLane(Node, 3, AArch64::ST3i16_POST); + else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) + return SelectPostStoreLane(Node, 3, AArch64::ST3i32_POST); + else if (VT == 
MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) + return SelectPostStoreLane(Node, 3, AArch64::ST3i64_POST); + break; + } + case AArch64ISD::ST4LANEpost: { + VT = Node->getOperand(1).getValueType(); + if (VT == MVT::v16i8 || VT == MVT::v8i8) + return SelectPostStoreLane(Node, 4, AArch64::ST4i8_POST); + else if (VT == MVT::v8i16 || VT == MVT::v4i16) + return SelectPostStoreLane(Node, 4, AArch64::ST4i16_POST); + else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) + return SelectPostStoreLane(Node, 4, AArch64::ST4i32_POST); + else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) + return SelectPostStoreLane(Node, 4, AArch64::ST4i64_POST); + break; + } + + case ISD::FCEIL: + case ISD::FFLOOR: + case ISD::FTRUNC: + case ISD::FROUND: + if (SDNode *I = SelectLIBM(Node)) + return I; + break; + } + + // Select the default instruction + ResNode = SelectCode(Node); + + DEBUG(errs() << "=> "); + if (ResNode == nullptr || ResNode == Node) + DEBUG(Node->dump(CurDAG)); + else + DEBUG(ResNode->dump(CurDAG)); + DEBUG(errs() << "\n"); + + return ResNode; +} + +/// createAArch64ISelDag - This pass converts a legalized DAG into a +/// AArch64-specific DAG, ready for instruction scheduling. +FunctionPass *llvm::createAArch64ISelDag(AArch64TargetMachine &TM, + CodeGenOpt::Level OptLevel) { + return new AArch64DAGToDAGISel(TM, OptLevel); +} diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp new file mode 100644 index 00000000000..4ddba007339 --- /dev/null +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -0,0 +1,7926 @@ +//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the AArch64TargetLowering class. 
+// +//===----------------------------------------------------------------------===// + +#include "AArch64ISelLowering.h" +#include "AArch64PerfectShuffle.h" +#include "AArch64Subtarget.h" +#include "AArch64CallingConv.h" +#include "AArch64MachineFunctionInfo.h" +#include "AArch64TargetMachine.h" +#include "AArch64TargetObjectFile.h" +#include "MCTargetDesc/AArch64AddressingModes.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Type.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetOptions.h" +using namespace llvm; + +#define DEBUG_TYPE "aarch64-lower" + +STATISTIC(NumTailCalls, "Number of tail calls"); +STATISTIC(NumShiftInserts, "Number of vector shift inserts"); + +enum AlignMode { + StrictAlign, + NoStrictAlign +}; + +static cl::opt +Align(cl::desc("Load/store alignment support"), + cl::Hidden, cl::init(NoStrictAlign), + cl::values( + clEnumValN(StrictAlign, "aarch64-strict-align", + "Disallow all unaligned memory accesses"), + clEnumValN(NoStrictAlign, "aarch64-no-strict-align", + "Allow unaligned memory accesses"), + clEnumValEnd)); + +// Place holder until extr generation is tested fully. +static cl::opt +EnableAArch64ExtrGeneration("aarch64-extr-generation", cl::Hidden, + cl::desc("Allow AArch64 (or (shift)(shift))->extract"), + cl::init(true)); + +static cl::opt +EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden, + cl::desc("Allow AArch64 SLI/SRI formation"), + cl::init(false)); + +//===----------------------------------------------------------------------===// +// AArch64 Lowering public interface. +//===----------------------------------------------------------------------===// +static TargetLoweringObjectFile *createTLOF(TargetMachine &TM) { + if (TM.getSubtarget().isTargetDarwin()) + return new AArch64_MachoTargetObjectFile(); + + return new AArch64_ELFTargetObjectFile(); +} + +AArch64TargetLowering::AArch64TargetLowering(AArch64TargetMachine &TM) + : TargetLowering(TM, createTLOF(TM)) { + Subtarget = &TM.getSubtarget(); + + // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so + // we have to make something up. Arbitrarily, choose ZeroOrOne. + setBooleanContents(ZeroOrOneBooleanContent); + // When comparing vectors the result sets the different elements in the + // vector to all-one or all-zero. + setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); + + // Set up the register classes. + addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass); + addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass); + + if (Subtarget->hasFPARMv8()) { + addRegisterClass(MVT::f16, &AArch64::FPR16RegClass); + addRegisterClass(MVT::f32, &AArch64::FPR32RegClass); + addRegisterClass(MVT::f64, &AArch64::FPR64RegClass); + addRegisterClass(MVT::f128, &AArch64::FPR128RegClass); + } + + if (Subtarget->hasNEON()) { + addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass); + addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass); + // Someone set us up the NEON. 
+ addDRTypeForNEON(MVT::v2f32); + addDRTypeForNEON(MVT::v8i8); + addDRTypeForNEON(MVT::v4i16); + addDRTypeForNEON(MVT::v2i32); + addDRTypeForNEON(MVT::v1i64); + addDRTypeForNEON(MVT::v1f64); + + addQRTypeForNEON(MVT::v4f32); + addQRTypeForNEON(MVT::v2f64); + addQRTypeForNEON(MVT::v16i8); + addQRTypeForNEON(MVT::v8i16); + addQRTypeForNEON(MVT::v4i32); + addQRTypeForNEON(MVT::v2i64); + } + + // Compute derived properties from the register classes + computeRegisterProperties(); + + // Provide all sorts of operation actions + setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); + setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom); + setOperationAction(ISD::SETCC, MVT::i32, Custom); + setOperationAction(ISD::SETCC, MVT::i64, Custom); + setOperationAction(ISD::SETCC, MVT::f32, Custom); + setOperationAction(ISD::SETCC, MVT::f64, Custom); + setOperationAction(ISD::BRCOND, MVT::Other, Expand); + setOperationAction(ISD::BR_CC, MVT::i32, Custom); + setOperationAction(ISD::BR_CC, MVT::i64, Custom); + setOperationAction(ISD::BR_CC, MVT::f32, Custom); + setOperationAction(ISD::BR_CC, MVT::f64, Custom); + setOperationAction(ISD::SELECT, MVT::i32, Custom); + setOperationAction(ISD::SELECT, MVT::i64, Custom); + setOperationAction(ISD::SELECT, MVT::f32, Custom); + setOperationAction(ISD::SELECT, MVT::f64, Custom); + setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); + setOperationAction(ISD::SELECT_CC, MVT::i64, Custom); + setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); + setOperationAction(ISD::SELECT_CC, MVT::f64, Custom); + setOperationAction(ISD::BR_JT, MVT::Other, Expand); + setOperationAction(ISD::JumpTable, MVT::i64, Custom); + + setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom); + setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom); + setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom); + + setOperationAction(ISD::FREM, MVT::f32, Expand); + setOperationAction(ISD::FREM, MVT::f64, Expand); + setOperationAction(ISD::FREM, MVT::f80, Expand); + + // Custom lowering hooks are needed for XOR + // to fold it into CSINC/CSINV. + setOperationAction(ISD::XOR, MVT::i32, Custom); + setOperationAction(ISD::XOR, MVT::i64, Custom); + + // Virtually no operation on f128 is legal, but LLVM can't expand them when + // there's a valid register class, so we need custom operations in most cases. + setOperationAction(ISD::FABS, MVT::f128, Expand); + setOperationAction(ISD::FADD, MVT::f128, Custom); + setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand); + setOperationAction(ISD::FCOS, MVT::f128, Expand); + setOperationAction(ISD::FDIV, MVT::f128, Custom); + setOperationAction(ISD::FMA, MVT::f128, Expand); + setOperationAction(ISD::FMUL, MVT::f128, Custom); + setOperationAction(ISD::FNEG, MVT::f128, Expand); + setOperationAction(ISD::FPOW, MVT::f128, Expand); + setOperationAction(ISD::FREM, MVT::f128, Expand); + setOperationAction(ISD::FRINT, MVT::f128, Expand); + setOperationAction(ISD::FSIN, MVT::f128, Expand); + setOperationAction(ISD::FSINCOS, MVT::f128, Expand); + setOperationAction(ISD::FSQRT, MVT::f128, Expand); + setOperationAction(ISD::FSUB, MVT::f128, Custom); + setOperationAction(ISD::FTRUNC, MVT::f128, Expand); + setOperationAction(ISD::SETCC, MVT::f128, Custom); + setOperationAction(ISD::BR_CC, MVT::f128, Custom); + setOperationAction(ISD::SELECT, MVT::f128, Custom); + setOperationAction(ISD::SELECT_CC, MVT::f128, Custom); + setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom); + + // Lowering for many of the conversions is actually specified by the non-f128 + // type. 
The LowerXXX function will be trivial when f128 isn't involved. + setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom); + setOperationAction(ISD::FP_ROUND, MVT::f32, Custom); + setOperationAction(ISD::FP_ROUND, MVT::f64, Custom); + + // Variable arguments. + setOperationAction(ISD::VASTART, MVT::Other, Custom); + setOperationAction(ISD::VAARG, MVT::Other, Custom); + setOperationAction(ISD::VACOPY, MVT::Other, Custom); + setOperationAction(ISD::VAEND, MVT::Other, Expand); + + // Variable-sized objects. + setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); + setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); + setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand); + + // Exception handling. + // FIXME: These are guesses. Has this been defined yet? + setExceptionPointerRegister(AArch64::X0); + setExceptionSelectorRegister(AArch64::X1); + + // Constant pool entries + setOperationAction(ISD::ConstantPool, MVT::i64, Custom); + + // BlockAddress + setOperationAction(ISD::BlockAddress, MVT::i64, Custom); + + // Add/Sub overflow ops with MVT::Glues are lowered to NZCV dependences. + setOperationAction(ISD::ADDC, MVT::i32, Custom); + setOperationAction(ISD::ADDE, MVT::i32, Custom); + setOperationAction(ISD::SUBC, MVT::i32, Custom); + setOperationAction(ISD::SUBE, MVT::i32, Custom); + setOperationAction(ISD::ADDC, MVT::i64, Custom); + setOperationAction(ISD::ADDE, MVT::i64, Custom); + setOperationAction(ISD::SUBC, MVT::i64, Custom); + setOperationAction(ISD::SUBE, MVT::i64, Custom); + + // AArch64 lacks both left-rotate and popcount instructions. + setOperationAction(ISD::ROTL, MVT::i32, Expand); + setOperationAction(ISD::ROTL, MVT::i64, Expand); + + // AArch64 doesn't have {U|S}MUL_LOHI. + setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand); + setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand); + + + // Expand the undefined-at-zero variants to cttz/ctlz to their defined-at-zero + // counterparts, which AArch64 supports directly. + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand); + + setOperationAction(ISD::CTPOP, MVT::i32, Custom); + setOperationAction(ISD::CTPOP, MVT::i64, Custom); + + setOperationAction(ISD::SDIVREM, MVT::i32, Expand); + setOperationAction(ISD::SDIVREM, MVT::i64, Expand); + setOperationAction(ISD::SREM, MVT::i32, Expand); + setOperationAction(ISD::SREM, MVT::i64, Expand); + setOperationAction(ISD::UDIVREM, MVT::i32, Expand); + setOperationAction(ISD::UDIVREM, MVT::i64, Expand); + setOperationAction(ISD::UREM, MVT::i32, Expand); + setOperationAction(ISD::UREM, MVT::i64, Expand); + + // Custom lower Add/Sub/Mul with overflow. 
+ setOperationAction(ISD::SADDO, MVT::i32, Custom); + setOperationAction(ISD::SADDO, MVT::i64, Custom); + setOperationAction(ISD::UADDO, MVT::i32, Custom); + setOperationAction(ISD::UADDO, MVT::i64, Custom); + setOperationAction(ISD::SSUBO, MVT::i32, Custom); + setOperationAction(ISD::SSUBO, MVT::i64, Custom); + setOperationAction(ISD::USUBO, MVT::i32, Custom); + setOperationAction(ISD::USUBO, MVT::i64, Custom); + setOperationAction(ISD::SMULO, MVT::i32, Custom); + setOperationAction(ISD::SMULO, MVT::i64, Custom); + setOperationAction(ISD::UMULO, MVT::i32, Custom); + setOperationAction(ISD::UMULO, MVT::i64, Custom); + + setOperationAction(ISD::FSIN, MVT::f32, Expand); + setOperationAction(ISD::FSIN, MVT::f64, Expand); + setOperationAction(ISD::FCOS, MVT::f32, Expand); + setOperationAction(ISD::FCOS, MVT::f64, Expand); + setOperationAction(ISD::FPOW, MVT::f32, Expand); + setOperationAction(ISD::FPOW, MVT::f64, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); + setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); + + // AArch64 has implementations of a lot of rounding-like FP operations. + static MVT RoundingTypes[] = { MVT::f32, MVT::f64}; + for (unsigned I = 0; I < array_lengthof(RoundingTypes); ++I) { + MVT Ty = RoundingTypes[I]; + setOperationAction(ISD::FFLOOR, Ty, Legal); + setOperationAction(ISD::FNEARBYINT, Ty, Legal); + setOperationAction(ISD::FCEIL, Ty, Legal); + setOperationAction(ISD::FRINT, Ty, Legal); + setOperationAction(ISD::FTRUNC, Ty, Legal); + setOperationAction(ISD::FROUND, Ty, Legal); + } + + setOperationAction(ISD::PREFETCH, MVT::Other, Custom); + + if (Subtarget->isTargetMachO()) { + // For iOS, we don't want to the normal expansion of a libcall to + // sincos. We want to issue a libcall to __sincos_stret to avoid memory + // traffic. + setOperationAction(ISD::FSINCOS, MVT::f64, Custom); + setOperationAction(ISD::FSINCOS, MVT::f32, Custom); + } else { + setOperationAction(ISD::FSINCOS, MVT::f64, Expand); + setOperationAction(ISD::FSINCOS, MVT::f32, Expand); + } + + // AArch64 does not have floating-point extending loads, i1 sign-extending + // load, floating-point truncating stores, or v2i32->v2i16 truncating store. + setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::f64, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::f80, Expand); + setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Expand); + setTruncStoreAction(MVT::f32, MVT::f16, Expand); + setTruncStoreAction(MVT::f64, MVT::f32, Expand); + setTruncStoreAction(MVT::f64, MVT::f16, Expand); + setTruncStoreAction(MVT::f128, MVT::f80, Expand); + setTruncStoreAction(MVT::f128, MVT::f64, Expand); + setTruncStoreAction(MVT::f128, MVT::f32, Expand); + setTruncStoreAction(MVT::f128, MVT::f16, Expand); + // Indexed loads and stores are supported. + for (unsigned im = (unsigned)ISD::PRE_INC; + im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { + setIndexedLoadAction(im, MVT::i8, Legal); + setIndexedLoadAction(im, MVT::i16, Legal); + setIndexedLoadAction(im, MVT::i32, Legal); + setIndexedLoadAction(im, MVT::i64, Legal); + setIndexedLoadAction(im, MVT::f64, Legal); + setIndexedLoadAction(im, MVT::f32, Legal); + setIndexedStoreAction(im, MVT::i8, Legal); + setIndexedStoreAction(im, MVT::i16, Legal); + setIndexedStoreAction(im, MVT::i32, Legal); + setIndexedStoreAction(im, MVT::i64, Legal); + setIndexedStoreAction(im, MVT::f64, Legal); + setIndexedStoreAction(im, MVT::f32, Legal); + } + + // Trap. 
+ setOperationAction(ISD::TRAP, MVT::Other, Legal); + + // We combine OR nodes for bitfield operations. + setTargetDAGCombine(ISD::OR); + + // Vector add and sub nodes may conceal a high-half opportunity. + // Also, try to fold ADD into CSINC/CSINV.. + setTargetDAGCombine(ISD::ADD); + setTargetDAGCombine(ISD::SUB); + + setTargetDAGCombine(ISD::XOR); + setTargetDAGCombine(ISD::SINT_TO_FP); + setTargetDAGCombine(ISD::UINT_TO_FP); + + setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); + + setTargetDAGCombine(ISD::ANY_EXTEND); + setTargetDAGCombine(ISD::ZERO_EXTEND); + setTargetDAGCombine(ISD::SIGN_EXTEND); + setTargetDAGCombine(ISD::BITCAST); + setTargetDAGCombine(ISD::CONCAT_VECTORS); + setTargetDAGCombine(ISD::STORE); + + setTargetDAGCombine(ISD::MUL); + + setTargetDAGCombine(ISD::SELECT); + setTargetDAGCombine(ISD::VSELECT); + + setTargetDAGCombine(ISD::INTRINSIC_VOID); + setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); + setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); + + MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8; + MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4; + MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 4; + + setStackPointerRegisterToSaveRestore(AArch64::SP); + + setSchedulingPreference(Sched::Hybrid); + + // Enable TBZ/TBNZ + MaskAndBranchFoldingIsLegal = true; + + setMinFunctionAlignment(2); + + RequireStrictAlign = (Align == StrictAlign); + + setHasExtractBitsInsn(true); + + if (Subtarget->hasNEON()) { + // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to + // silliness like this: + setOperationAction(ISD::FABS, MVT::v1f64, Expand); + setOperationAction(ISD::FADD, MVT::v1f64, Expand); + setOperationAction(ISD::FCEIL, MVT::v1f64, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::v1f64, Expand); + setOperationAction(ISD::FCOS, MVT::v1f64, Expand); + setOperationAction(ISD::FDIV, MVT::v1f64, Expand); + setOperationAction(ISD::FFLOOR, MVT::v1f64, Expand); + setOperationAction(ISD::FMA, MVT::v1f64, Expand); + setOperationAction(ISD::FMUL, MVT::v1f64, Expand); + setOperationAction(ISD::FNEARBYINT, MVT::v1f64, Expand); + setOperationAction(ISD::FNEG, MVT::v1f64, Expand); + setOperationAction(ISD::FPOW, MVT::v1f64, Expand); + setOperationAction(ISD::FREM, MVT::v1f64, Expand); + setOperationAction(ISD::FROUND, MVT::v1f64, Expand); + setOperationAction(ISD::FRINT, MVT::v1f64, Expand); + setOperationAction(ISD::FSIN, MVT::v1f64, Expand); + setOperationAction(ISD::FSINCOS, MVT::v1f64, Expand); + setOperationAction(ISD::FSQRT, MVT::v1f64, Expand); + setOperationAction(ISD::FSUB, MVT::v1f64, Expand); + setOperationAction(ISD::FTRUNC, MVT::v1f64, Expand); + setOperationAction(ISD::SETCC, MVT::v1f64, Expand); + setOperationAction(ISD::BR_CC, MVT::v1f64, Expand); + setOperationAction(ISD::SELECT, MVT::v1f64, Expand); + setOperationAction(ISD::SELECT_CC, MVT::v1f64, Expand); + setOperationAction(ISD::FP_EXTEND, MVT::v1f64, Expand); + + setOperationAction(ISD::FP_TO_SINT, MVT::v1i64, Expand); + setOperationAction(ISD::FP_TO_UINT, MVT::v1i64, Expand); + setOperationAction(ISD::SINT_TO_FP, MVT::v1i64, Expand); + setOperationAction(ISD::UINT_TO_FP, MVT::v1i64, Expand); + setOperationAction(ISD::FP_ROUND, MVT::v1f64, Expand); + + setOperationAction(ISD::MUL, MVT::v1i64, Expand); + + // AArch64 doesn't have a direct vector ->f32 conversion instructions for + // elements smaller than i32, so promote the input to i32 first. 
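// As a scalar analogy for the promotion above (a sketch of the intent, not of
// the exact code the backend emits): each 8- or 16-bit lane is first widened
// to i32 (USHLL/SSHLL in practice) and only then converted with the ordinary
// i32 -> f32 instruction (UCVTF/SCVTF).
#include <cstdint>
static void u8_lanes_to_f32(const uint8_t in[4], float out[4]) {
  for (int i = 0; i != 4; ++i) {
    uint32_t widened = in[i];   // the "Promote" step: zero-extend the lane
    out[i] = (float)widened;    // legal i32 -> f32 conversion
  }
}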
+ setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Promote); + setOperationAction(ISD::SINT_TO_FP, MVT::v4i8, Promote); + setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Promote); + setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Promote); + // Similarly, there is no direct i32 -> f64 vector conversion instruction. + setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Custom); + + // AArch64 doesn't have MUL.2d: + setOperationAction(ISD::MUL, MVT::v2i64, Expand); + setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal); + setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand); + // Likewise, narrowing and extending vector loads/stores aren't handled + // directly. + for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; + VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) { + + setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT, + Expand); + + setOperationAction(ISD::MULHS, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::MULHU, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand); + + setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand); + + for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; + InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT) + setTruncStoreAction((MVT::SimpleValueType)VT, + (MVT::SimpleValueType)InnerVT, Expand); + setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand); + setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand); + setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand); + } + + // AArch64 has implementations of a lot of rounding-like FP operations. + static MVT RoundingVecTypes[] = {MVT::v2f32, MVT::v4f32, MVT::v2f64 }; + for (unsigned I = 0; I < array_lengthof(RoundingVecTypes); ++I) { + MVT Ty = RoundingVecTypes[I]; + setOperationAction(ISD::FFLOOR, Ty, Legal); + setOperationAction(ISD::FNEARBYINT, Ty, Legal); + setOperationAction(ISD::FCEIL, Ty, Legal); + setOperationAction(ISD::FRINT, Ty, Legal); + setOperationAction(ISD::FTRUNC, Ty, Legal); + setOperationAction(ISD::FROUND, Ty, Legal); + } + } +} + +void AArch64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) { + if (VT == MVT::v2f32) { + setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote); + AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i32); + + setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote); + AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i32); + } else if (VT == MVT::v2f64 || VT == MVT::v4f32) { + setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote); + AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i64); + + setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote); + AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i64); + } + + // Mark vector float intrinsics as expand. 
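// Rough model of what "Expand" means for the vector FP intrinsics marked just
// below: there is no vector sin/cos/log/exp instruction, so the legalizer
// unrolls the operation and each lane ends up as an ordinary scalar libm call
// (sketch, assuming a v4f32 llvm.sin).
#include <cmath>
static void sin_v4f32(const float in[4], float out[4]) {
  for (int i = 0; i != 4; ++i)
    out[i] = sinf(in[i]);       // one scalar libcall per lane
}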
+ if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) { + setOperationAction(ISD::FSIN, VT.getSimpleVT(), Expand); + setOperationAction(ISD::FCOS, VT.getSimpleVT(), Expand); + setOperationAction(ISD::FPOWI, VT.getSimpleVT(), Expand); + setOperationAction(ISD::FPOW, VT.getSimpleVT(), Expand); + setOperationAction(ISD::FLOG, VT.getSimpleVT(), Expand); + setOperationAction(ISD::FLOG2, VT.getSimpleVT(), Expand); + setOperationAction(ISD::FLOG10, VT.getSimpleVT(), Expand); + setOperationAction(ISD::FEXP, VT.getSimpleVT(), Expand); + setOperationAction(ISD::FEXP2, VT.getSimpleVT(), Expand); + } + + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT.getSimpleVT(), Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, VT.getSimpleVT(), Custom); + setOperationAction(ISD::BUILD_VECTOR, VT.getSimpleVT(), Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, VT.getSimpleVT(), Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, VT.getSimpleVT(), Custom); + setOperationAction(ISD::SRA, VT.getSimpleVT(), Custom); + setOperationAction(ISD::SRL, VT.getSimpleVT(), Custom); + setOperationAction(ISD::SHL, VT.getSimpleVT(), Custom); + setOperationAction(ISD::AND, VT.getSimpleVT(), Custom); + setOperationAction(ISD::OR, VT.getSimpleVT(), Custom); + setOperationAction(ISD::SETCC, VT.getSimpleVT(), Custom); + setOperationAction(ISD::CONCAT_VECTORS, VT.getSimpleVT(), Legal); + + setOperationAction(ISD::SELECT, VT.getSimpleVT(), Expand); + setOperationAction(ISD::SELECT_CC, VT.getSimpleVT(), Expand); + setOperationAction(ISD::VSELECT, VT.getSimpleVT(), Expand); + setLoadExtAction(ISD::EXTLOAD, VT.getSimpleVT(), Expand); + + // CNT supports only B element sizes. + if (VT != MVT::v8i8 && VT != MVT::v16i8) + setOperationAction(ISD::CTPOP, VT.getSimpleVT(), Expand); + + setOperationAction(ISD::UDIV, VT.getSimpleVT(), Expand); + setOperationAction(ISD::SDIV, VT.getSimpleVT(), Expand); + setOperationAction(ISD::UREM, VT.getSimpleVT(), Expand); + setOperationAction(ISD::SREM, VT.getSimpleVT(), Expand); + setOperationAction(ISD::FREM, VT.getSimpleVT(), Expand); + + setOperationAction(ISD::FP_TO_SINT, VT.getSimpleVT(), Custom); + setOperationAction(ISD::FP_TO_UINT, VT.getSimpleVT(), Custom); + + if (Subtarget->isLittleEndian()) { + for (unsigned im = (unsigned)ISD::PRE_INC; + im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { + setIndexedLoadAction(im, VT.getSimpleVT(), Legal); + setIndexedStoreAction(im, VT.getSimpleVT(), Legal); + } + } +} + +void AArch64TargetLowering::addDRTypeForNEON(MVT VT) { + addRegisterClass(VT, &AArch64::FPR64RegClass); + addTypeForNEON(VT, MVT::v2i32); +} + +void AArch64TargetLowering::addQRTypeForNEON(MVT VT) { + addRegisterClass(VT, &AArch64::FPR128RegClass); + addTypeForNEON(VT, MVT::v4i32); +} + +EVT AArch64TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const { + if (!VT.isVector()) + return MVT::i32; + return VT.changeVectorElementTypeToInteger(); +} + +/// computeKnownBitsForTargetNode - Determine which of the bits specified in +/// Mask are known to be either zero or one and return them in the +/// KnownZero/KnownOne bitsets. 
+void AArch64TargetLowering::computeKnownBitsForTargetNode(
+    const SDValue Op, APInt &KnownZero, APInt &KnownOne,
+    const SelectionDAG &DAG, unsigned Depth) const {
+  switch (Op.getOpcode()) {
+  default:
+    break;
+  case AArch64ISD::CSEL: {
+    APInt KnownZero2, KnownOne2;
+    DAG.computeKnownBits(Op->getOperand(0), KnownZero, KnownOne, Depth + 1);
+    DAG.computeKnownBits(Op->getOperand(1), KnownZero2, KnownOne2, Depth + 1);
+    KnownZero &= KnownZero2;
+    KnownOne &= KnownOne2;
+    break;
+  }
+  case ISD::INTRINSIC_W_CHAIN: {
+    ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
+    Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
+    switch (IntID) {
+    default: return;
+    case Intrinsic::aarch64_ldaxr:
+    case Intrinsic::aarch64_ldxr: {
+      unsigned BitWidth = KnownOne.getBitWidth();
+      EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
+      unsigned MemBits = VT.getScalarType().getSizeInBits();
+      KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
+      return;
+    }
+    }
+    break;
+  }
+  case ISD::INTRINSIC_WO_CHAIN:
+  case ISD::INTRINSIC_VOID: {
+    unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+    switch (IntNo) {
+    default:
+      break;
+    case Intrinsic::aarch64_neon_umaxv:
+    case Intrinsic::aarch64_neon_uminv: {
+      // Figure out the datatype of the vector operand. The UMINV instruction
+      // will zero extend the result, so we can mark as known zero all the
+      // bits larger than the element datatype. 32-bit or larger doesn't need
+      // this as those are legal types and will be handled by isel directly.
+      MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
+      unsigned BitWidth = KnownZero.getBitWidth();
+      if (VT == MVT::v8i8 || VT == MVT::v16i8) {
+        assert(BitWidth >= 8 && "Unexpected width!");
+        APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8);
+        KnownZero |= Mask;
+      } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
+        assert(BitWidth >= 16 && "Unexpected width!");
+        APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
+        KnownZero |= Mask;
+      }
+      break;
+    } break;
+    }
+  }
+  }
+}
+
+MVT AArch64TargetLowering::getScalarShiftAmountTy(EVT LHSTy) const {
+  return MVT::i64;
+}
+
+unsigned AArch64TargetLowering::getMaximalGlobalOffset() const {
+  // FIXME: On AArch64, this depends on the type.
+  // Basically, the addressable offsets are 0 to 4095 * Ty.getSizeInBytes(),
+  // and the offset has to be a multiple of the related size in bytes.
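// Worked example, assuming an i64 access (Ty.getSizeInBytes() == 8): the
// scaled 12-bit immediate reaches byte offsets 0, 8, 16, ..., 4095 * 8, so
// anything beyond 32760, or not a multiple of 8, has to be materialised in a
// register instead.
static_assert(4095 * 8 == 32760, "largest scaled offset for an 8-byte access");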
+ return 4095; +} + +FastISel * +AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, + const TargetLibraryInfo *libInfo) const { + return AArch64::createFastISel(funcInfo, libInfo); +} + +const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { + switch (Opcode) { + default: + return nullptr; + case AArch64ISD::CALL: return "AArch64ISD::CALL"; + case AArch64ISD::ADRP: return "AArch64ISD::ADRP"; + case AArch64ISD::ADDlow: return "AArch64ISD::ADDlow"; + case AArch64ISD::LOADgot: return "AArch64ISD::LOADgot"; + case AArch64ISD::RET_FLAG: return "AArch64ISD::RET_FLAG"; + case AArch64ISD::BRCOND: return "AArch64ISD::BRCOND"; + case AArch64ISD::CSEL: return "AArch64ISD::CSEL"; + case AArch64ISD::FCSEL: return "AArch64ISD::FCSEL"; + case AArch64ISD::CSINV: return "AArch64ISD::CSINV"; + case AArch64ISD::CSNEG: return "AArch64ISD::CSNEG"; + case AArch64ISD::CSINC: return "AArch64ISD::CSINC"; + case AArch64ISD::THREAD_POINTER: return "AArch64ISD::THREAD_POINTER"; + case AArch64ISD::TLSDESC_CALL: return "AArch64ISD::TLSDESC_CALL"; + case AArch64ISD::ADC: return "AArch64ISD::ADC"; + case AArch64ISD::SBC: return "AArch64ISD::SBC"; + case AArch64ISD::ADDS: return "AArch64ISD::ADDS"; + case AArch64ISD::SUBS: return "AArch64ISD::SUBS"; + case AArch64ISD::ADCS: return "AArch64ISD::ADCS"; + case AArch64ISD::SBCS: return "AArch64ISD::SBCS"; + case AArch64ISD::ANDS: return "AArch64ISD::ANDS"; + case AArch64ISD::FCMP: return "AArch64ISD::FCMP"; + case AArch64ISD::FMIN: return "AArch64ISD::FMIN"; + case AArch64ISD::FMAX: return "AArch64ISD::FMAX"; + case AArch64ISD::DUP: return "AArch64ISD::DUP"; + case AArch64ISD::DUPLANE8: return "AArch64ISD::DUPLANE8"; + case AArch64ISD::DUPLANE16: return "AArch64ISD::DUPLANE16"; + case AArch64ISD::DUPLANE32: return "AArch64ISD::DUPLANE32"; + case AArch64ISD::DUPLANE64: return "AArch64ISD::DUPLANE64"; + case AArch64ISD::MOVI: return "AArch64ISD::MOVI"; + case AArch64ISD::MOVIshift: return "AArch64ISD::MOVIshift"; + case AArch64ISD::MOVIedit: return "AArch64ISD::MOVIedit"; + case AArch64ISD::MOVImsl: return "AArch64ISD::MOVImsl"; + case AArch64ISD::FMOV: return "AArch64ISD::FMOV"; + case AArch64ISD::MVNIshift: return "AArch64ISD::MVNIshift"; + case AArch64ISD::MVNImsl: return "AArch64ISD::MVNImsl"; + case AArch64ISD::BICi: return "AArch64ISD::BICi"; + case AArch64ISD::ORRi: return "AArch64ISD::ORRi"; + case AArch64ISD::BSL: return "AArch64ISD::BSL"; + case AArch64ISD::NEG: return "AArch64ISD::NEG"; + case AArch64ISD::EXTR: return "AArch64ISD::EXTR"; + case AArch64ISD::ZIP1: return "AArch64ISD::ZIP1"; + case AArch64ISD::ZIP2: return "AArch64ISD::ZIP2"; + case AArch64ISD::UZP1: return "AArch64ISD::UZP1"; + case AArch64ISD::UZP2: return "AArch64ISD::UZP2"; + case AArch64ISD::TRN1: return "AArch64ISD::TRN1"; + case AArch64ISD::TRN2: return "AArch64ISD::TRN2"; + case AArch64ISD::REV16: return "AArch64ISD::REV16"; + case AArch64ISD::REV32: return "AArch64ISD::REV32"; + case AArch64ISD::REV64: return "AArch64ISD::REV64"; + case AArch64ISD::EXT: return "AArch64ISD::EXT"; + case AArch64ISD::VSHL: return "AArch64ISD::VSHL"; + case AArch64ISD::VLSHR: return "AArch64ISD::VLSHR"; + case AArch64ISD::VASHR: return "AArch64ISD::VASHR"; + case AArch64ISD::CMEQ: return "AArch64ISD::CMEQ"; + case AArch64ISD::CMGE: return "AArch64ISD::CMGE"; + case AArch64ISD::CMGT: return "AArch64ISD::CMGT"; + case AArch64ISD::CMHI: return "AArch64ISD::CMHI"; + case AArch64ISD::CMHS: return "AArch64ISD::CMHS"; + case AArch64ISD::FCMEQ: return "AArch64ISD::FCMEQ"; + case 
AArch64ISD::FCMGE: return "AArch64ISD::FCMGE"; + case AArch64ISD::FCMGT: return "AArch64ISD::FCMGT"; + case AArch64ISD::CMEQz: return "AArch64ISD::CMEQz"; + case AArch64ISD::CMGEz: return "AArch64ISD::CMGEz"; + case AArch64ISD::CMGTz: return "AArch64ISD::CMGTz"; + case AArch64ISD::CMLEz: return "AArch64ISD::CMLEz"; + case AArch64ISD::CMLTz: return "AArch64ISD::CMLTz"; + case AArch64ISD::FCMEQz: return "AArch64ISD::FCMEQz"; + case AArch64ISD::FCMGEz: return "AArch64ISD::FCMGEz"; + case AArch64ISD::FCMGTz: return "AArch64ISD::FCMGTz"; + case AArch64ISD::FCMLEz: return "AArch64ISD::FCMLEz"; + case AArch64ISD::FCMLTz: return "AArch64ISD::FCMLTz"; + case AArch64ISD::NOT: return "AArch64ISD::NOT"; + case AArch64ISD::BIT: return "AArch64ISD::BIT"; + case AArch64ISD::CBZ: return "AArch64ISD::CBZ"; + case AArch64ISD::CBNZ: return "AArch64ISD::CBNZ"; + case AArch64ISD::TBZ: return "AArch64ISD::TBZ"; + case AArch64ISD::TBNZ: return "AArch64ISD::TBNZ"; + case AArch64ISD::TC_RETURN: return "AArch64ISD::TC_RETURN"; + case AArch64ISD::SITOF: return "AArch64ISD::SITOF"; + case AArch64ISD::UITOF: return "AArch64ISD::UITOF"; + case AArch64ISD::SQSHL_I: return "AArch64ISD::SQSHL_I"; + case AArch64ISD::UQSHL_I: return "AArch64ISD::UQSHL_I"; + case AArch64ISD::SRSHR_I: return "AArch64ISD::SRSHR_I"; + case AArch64ISD::URSHR_I: return "AArch64ISD::URSHR_I"; + case AArch64ISD::SQSHLU_I: return "AArch64ISD::SQSHLU_I"; + case AArch64ISD::WrapperLarge: return "AArch64ISD::WrapperLarge"; + case AArch64ISD::LD2post: return "AArch64ISD::LD2post"; + case AArch64ISD::LD3post: return "AArch64ISD::LD3post"; + case AArch64ISD::LD4post: return "AArch64ISD::LD4post"; + case AArch64ISD::ST2post: return "AArch64ISD::ST2post"; + case AArch64ISD::ST3post: return "AArch64ISD::ST3post"; + case AArch64ISD::ST4post: return "AArch64ISD::ST4post"; + case AArch64ISD::LD1x2post: return "AArch64ISD::LD1x2post"; + case AArch64ISD::LD1x3post: return "AArch64ISD::LD1x3post"; + case AArch64ISD::LD1x4post: return "AArch64ISD::LD1x4post"; + case AArch64ISD::ST1x2post: return "AArch64ISD::ST1x2post"; + case AArch64ISD::ST1x3post: return "AArch64ISD::ST1x3post"; + case AArch64ISD::ST1x4post: return "AArch64ISD::ST1x4post"; + case AArch64ISD::LD1DUPpost: return "AArch64ISD::LD1DUPpost"; + case AArch64ISD::LD2DUPpost: return "AArch64ISD::LD2DUPpost"; + case AArch64ISD::LD3DUPpost: return "AArch64ISD::LD3DUPpost"; + case AArch64ISD::LD4DUPpost: return "AArch64ISD::LD4DUPpost"; + case AArch64ISD::LD1LANEpost: return "AArch64ISD::LD1LANEpost"; + case AArch64ISD::LD2LANEpost: return "AArch64ISD::LD2LANEpost"; + case AArch64ISD::LD3LANEpost: return "AArch64ISD::LD3LANEpost"; + case AArch64ISD::LD4LANEpost: return "AArch64ISD::LD4LANEpost"; + case AArch64ISD::ST2LANEpost: return "AArch64ISD::ST2LANEpost"; + case AArch64ISD::ST3LANEpost: return "AArch64ISD::ST3LANEpost"; + case AArch64ISD::ST4LANEpost: return "AArch64ISD::ST4LANEpost"; + } +} + +MachineBasicBlock * +AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI, + MachineBasicBlock *MBB) const { + // We materialise the F128CSEL pseudo-instruction as some control flow and a + // phi node: + + // OrigBB: + // [... previous instrs leading to comparison ...] 
+ // b.ne TrueBB + // b EndBB + // TrueBB: + // ; Fallthrough + // EndBB: + // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB] + + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + MachineFunction *MF = MBB->getParent(); + const BasicBlock *LLVM_BB = MBB->getBasicBlock(); + DebugLoc DL = MI->getDebugLoc(); + MachineFunction::iterator It = MBB; + ++It; + + unsigned DestReg = MI->getOperand(0).getReg(); + unsigned IfTrueReg = MI->getOperand(1).getReg(); + unsigned IfFalseReg = MI->getOperand(2).getReg(); + unsigned CondCode = MI->getOperand(3).getImm(); + bool NZCVKilled = MI->getOperand(4).isKill(); + + MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB); + MF->insert(It, TrueBB); + MF->insert(It, EndBB); + + // Transfer rest of current basic-block to EndBB + EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)), + MBB->end()); + EndBB->transferSuccessorsAndUpdatePHIs(MBB); + + BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB); + BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB); + MBB->addSuccessor(TrueBB); + MBB->addSuccessor(EndBB); + + // TrueBB falls through to the end. + TrueBB->addSuccessor(EndBB); + + if (!NZCVKilled) { + TrueBB->addLiveIn(AArch64::NZCV); + EndBB->addLiveIn(AArch64::NZCV); + } + + BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg) + .addReg(IfTrueReg) + .addMBB(TrueBB) + .addReg(IfFalseReg) + .addMBB(MBB); + + MI->eraseFromParent(); + return EndBB; +} + +MachineBasicBlock * +AArch64TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, + MachineBasicBlock *BB) const { + switch (MI->getOpcode()) { + default: +#ifndef NDEBUG + MI->dump(); +#endif + assert(0 && "Unexpected instruction for custom inserter!"); + break; + + case AArch64::F128CSEL: + return EmitF128CSEL(MI, BB); + + case TargetOpcode::STACKMAP: + case TargetOpcode::PATCHPOINT: + return emitPatchPoint(MI, BB); + } + llvm_unreachable("Unexpected instruction for custom inserter!"); +} + +//===----------------------------------------------------------------------===// +// AArch64 Lowering private implementation. +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Lowering Code +//===----------------------------------------------------------------------===// + +/// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64 +/// CC +static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) { + switch (CC) { + default: + llvm_unreachable("Unknown condition code!"); + case ISD::SETNE: + return AArch64CC::NE; + case ISD::SETEQ: + return AArch64CC::EQ; + case ISD::SETGT: + return AArch64CC::GT; + case ISD::SETGE: + return AArch64CC::GE; + case ISD::SETLT: + return AArch64CC::LT; + case ISD::SETLE: + return AArch64CC::LE; + case ISD::SETUGT: + return AArch64CC::HI; + case ISD::SETUGE: + return AArch64CC::HS; + case ISD::SETULT: + return AArch64CC::LO; + case ISD::SETULE: + return AArch64CC::LS; + } +} + +/// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC. 
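/// For example, SETONE ("ordered and not equal") has no single NZCV predicate
/// after an FCMP, so it is returned as two codes the caller tests in turn:
/// MI (LHS < RHS) and then GT (LHS > RHS); unordered inputs fail both.
/// SETUEQ similarly becomes EQ plus VS. A rough scalar model of those two
/// decompositions (illustrative helpers only):
///   bool one(double a, double b) { return (a < b) || (a > b); }
///   bool ueq(double a, double b) { return (a == b) || a != a || b != b; }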
+static void changeFPCCToAArch64CC(ISD::CondCode CC, + AArch64CC::CondCode &CondCode, + AArch64CC::CondCode &CondCode2) { + CondCode2 = AArch64CC::AL; + switch (CC) { + default: + llvm_unreachable("Unknown FP condition!"); + case ISD::SETEQ: + case ISD::SETOEQ: + CondCode = AArch64CC::EQ; + break; + case ISD::SETGT: + case ISD::SETOGT: + CondCode = AArch64CC::GT; + break; + case ISD::SETGE: + case ISD::SETOGE: + CondCode = AArch64CC::GE; + break; + case ISD::SETOLT: + CondCode = AArch64CC::MI; + break; + case ISD::SETOLE: + CondCode = AArch64CC::LS; + break; + case ISD::SETONE: + CondCode = AArch64CC::MI; + CondCode2 = AArch64CC::GT; + break; + case ISD::SETO: + CondCode = AArch64CC::VC; + break; + case ISD::SETUO: + CondCode = AArch64CC::VS; + break; + case ISD::SETUEQ: + CondCode = AArch64CC::EQ; + CondCode2 = AArch64CC::VS; + break; + case ISD::SETUGT: + CondCode = AArch64CC::HI; + break; + case ISD::SETUGE: + CondCode = AArch64CC::PL; + break; + case ISD::SETLT: + case ISD::SETULT: + CondCode = AArch64CC::LT; + break; + case ISD::SETLE: + case ISD::SETULE: + CondCode = AArch64CC::LE; + break; + case ISD::SETNE: + case ISD::SETUNE: + CondCode = AArch64CC::NE; + break; + } +} + +/// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 +/// CC usable with the vector instructions. Fewer operations are available +/// without a real NZCV register, so we have to use less efficient combinations +/// to get the same effect. +static void changeVectorFPCCToAArch64CC(ISD::CondCode CC, + AArch64CC::CondCode &CondCode, + AArch64CC::CondCode &CondCode2, + bool &Invert) { + Invert = false; + switch (CC) { + default: + // Mostly the scalar mappings work fine. + changeFPCCToAArch64CC(CC, CondCode, CondCode2); + break; + case ISD::SETUO: + Invert = true; // Fallthrough + case ISD::SETO: + CondCode = AArch64CC::MI; + CondCode2 = AArch64CC::GE; + break; + case ISD::SETUEQ: + case ISD::SETULT: + case ISD::SETULE: + case ISD::SETUGT: + case ISD::SETUGE: + // All of the compare-mask comparisons are ordered, but we can switch + // between the two by a double inversion. E.g. ULE == !OGT. + Invert = true; + changeFPCCToAArch64CC(getSetCCInverse(CC, false), CondCode, CondCode2); + break; + } +} + +static bool isLegalArithImmed(uint64_t C) { + // Matches AArch64DAGToDAGISel::SelectArithImmed(). + return (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0); +} + +static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, + SDLoc dl, SelectionDAG &DAG) { + EVT VT = LHS.getValueType(); + + if (VT.isFloatingPoint()) + return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS); + + // The CMP instruction is just an alias for SUBS, and representing it as + // SUBS means that it's possible to get CSE with subtract operations. + // A later phase can perform the optimization of setting the destination + // register to WZR/XZR if it ends up being unused. + unsigned Opcode = AArch64ISD::SUBS; + + if (RHS.getOpcode() == ISD::SUB && isa(RHS.getOperand(0)) && + cast(RHS.getOperand(0))->getZExtValue() == 0 && + (CC == ISD::SETEQ || CC == ISD::SETNE)) { + // We'd like to combine a (CMP op1, (sub 0, op2) into a CMN instruction on + // the grounds that "op1 - (-op2) == op1 + op2". However, the C and V flags + // can be set differently by this operation. It comes down to whether + // "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then + // everything is fine. If not then the optimization is wrong. Thus general + // comparisons are only valid if op2 != 0. 
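// A standalone sketch of why the fold is restricted this way: model the NZCV
// flags of SUBS (CMP) and ADDS (CMN) and note that with op2 == 0 the two agree
// on Z (so EQ/NE survive) but disagree on C, so an unsigned predicate such as
// HS would change meaning. The helper names below are made up for the sketch.
#include <cstdint>
struct NZCV { bool N, Z, C, V; };
static NZCV subs32(uint32_t a, uint32_t b) {              // CMP a, b
  uint32_t r = a - b;
  return { (int32_t)r < 0, r == 0, a >= b,                // C = no borrow
           (int32_t)((a ^ b) & (a ^ r)) < 0 };
}
static NZCV adds32(uint32_t a, uint32_t b) {              // CMN a, b
  uint32_t r = a + b;
  return { (int32_t)r < 0, r == 0, r < a,                 // C = carry out
           (int32_t)(~(a ^ b) & (a ^ r)) < 0 };
}
// With a = 5, op2 = 0: subs32(5, 0 - 0).C == 1 but adds32(5, 0).C == 0, while
// both report Z == 0, which is exactly the restriction described above.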
+ + // So, finally, the only LLVM-native comparisons that don't mention C and V + // are SETEQ and SETNE. They're the only ones we can safely use CMN for in + // the absence of information about op2. + Opcode = AArch64ISD::ADDS; + RHS = RHS.getOperand(1); + } else if (LHS.getOpcode() == ISD::AND && isa(RHS) && + cast(RHS)->getZExtValue() == 0 && + !isUnsignedIntSetCC(CC)) { + // Similarly, (CMP (and X, Y), 0) can be implemented with a TST + // (a.k.a. ANDS) except that the flags are only guaranteed to work for one + // of the signed comparisons. + Opcode = AArch64ISD::ANDS; + RHS = LHS.getOperand(1); + LHS = LHS.getOperand(0); + } + + return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS) + .getValue(1); +} + +static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, + SDValue &AArch64cc, SelectionDAG &DAG, SDLoc dl) { + if (ConstantSDNode *RHSC = dyn_cast(RHS.getNode())) { + EVT VT = RHS.getValueType(); + uint64_t C = RHSC->getZExtValue(); + if (!isLegalArithImmed(C)) { + // Constant does not fit, try adjusting it by one? + switch (CC) { + default: + break; + case ISD::SETLT: + case ISD::SETGE: + if ((VT == MVT::i32 && C != 0x80000000 && + isLegalArithImmed((uint32_t)(C - 1))) || + (VT == MVT::i64 && C != 0x80000000ULL && + isLegalArithImmed(C - 1ULL))) { + CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT; + C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1; + RHS = DAG.getConstant(C, VT); + } + break; + case ISD::SETULT: + case ISD::SETUGE: + if ((VT == MVT::i32 && C != 0 && + isLegalArithImmed((uint32_t)(C - 1))) || + (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) { + CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT; + C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1; + RHS = DAG.getConstant(C, VT); + } + break; + case ISD::SETLE: + case ISD::SETGT: + if ((VT == MVT::i32 && C != 0x7fffffff && + isLegalArithImmed((uint32_t)(C + 1))) || + (VT == MVT::i64 && C != 0x7ffffffffffffffULL && + isLegalArithImmed(C + 1ULL))) { + CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE; + C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1; + RHS = DAG.getConstant(C, VT); + } + break; + case ISD::SETULE: + case ISD::SETUGT: + if ((VT == MVT::i32 && C != 0xffffffff && + isLegalArithImmed((uint32_t)(C + 1))) || + (VT == MVT::i64 && C != 0xfffffffffffffffULL && + isLegalArithImmed(C + 1ULL))) { + CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE; + C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1; + RHS = DAG.getConstant(C, VT); + } + break; + } + } + } + + SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); + AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC); + AArch64cc = DAG.getConstant(AArch64CC, MVT::i32); + return Cmp; +} + +static std::pair +getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) { + assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) && + "Unsupported value type"); + SDValue Value, Overflow; + SDLoc DL(Op); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + unsigned Opc = 0; + switch (Op.getOpcode()) { + default: + llvm_unreachable("Unknown overflow instruction!"); + case ISD::SADDO: + Opc = AArch64ISD::ADDS; + CC = AArch64CC::VS; + break; + case ISD::UADDO: + Opc = AArch64ISD::ADDS; + CC = AArch64CC::HS; + break; + case ISD::SSUBO: + Opc = AArch64ISD::SUBS; + CC = AArch64CC::VS; + break; + case ISD::USUBO: + Opc = AArch64ISD::SUBS; + CC = AArch64CC::LO; + break; + // Multiply needs a little bit extra work. 
+ case ISD::SMULO: + case ISD::UMULO: { + CC = AArch64CC::NE; + bool IsSigned = (Op.getOpcode() == ISD::SMULO) ? true : false; + if (Op.getValueType() == MVT::i32) { + unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; + // For a 32 bit multiply with overflow check we want the instruction + // selector to generate a widening multiply (SMADDL/UMADDL). For that we + // need to generate the following pattern: + // (i64 add 0, (i64 mul (i64 sext|zext i32 %a), (i64 sext|zext i32 %b)) + LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS); + RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS); + SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS); + SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Mul, + DAG.getConstant(0, MVT::i64)); + // On AArch64 the upper 32 bits are always zero extended for a 32 bit + // operation. We need to clear out the upper 32 bits, because we used a + // widening multiply that wrote all 64 bits. In the end this should be a + // noop. + Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Add); + if (IsSigned) { + // The signed overflow check requires more than just a simple check for + // any bit set in the upper 32 bits of the result. These bits could be + // just the sign bits of a negative number. To perform the overflow + // check we have to arithmetic shift right the 32nd bit of the result by + // 31 bits. Then we compare the result to the upper 32 bits. + SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Add, + DAG.getConstant(32, MVT::i64)); + UpperBits = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, UpperBits); + SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i32, Value, + DAG.getConstant(31, MVT::i64)); + // It is important that LowerBits is last, otherwise the arithmetic + // shift will not be folded into the compare (SUBS). + SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32); + Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits) + .getValue(1); + } else { + // The overflow check for unsigned multiply is easy. We only need to + // check if any of the upper 32 bits are set. This can be done with a + // CMP (shifted register). For that we need to generate the following + // pattern: + // (i64 AArch64ISD::SUBS i64 0, (i64 srl i64 %Mul, i64 32) + SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Mul, + DAG.getConstant(32, MVT::i64)); + SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); + Overflow = + DAG.getNode(AArch64ISD::SUBS, DL, VTs, DAG.getConstant(0, MVT::i64), + UpperBits).getValue(1); + } + break; + } + assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type"); + // For the 64 bit multiply + Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS); + if (IsSigned) { + SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS); + SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value, + DAG.getConstant(63, MVT::i64)); + // It is important that LowerBits is last, otherwise the arithmetic + // shift will not be folded into the compare (SUBS). + SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); + Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits) + .getValue(1); + } else { + SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS); + SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); + Overflow = + DAG.getNode(AArch64ISD::SUBS, DL, VTs, DAG.getConstant(0, MVT::i64), + UpperBits).getValue(1); + } + break; + } + } // switch (...) + + if (Opc) { + SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32); + + // Emit the AArch64 operation with overflow check. 
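// Standalone sketch of the i32 SMULO check described above: widen to a 64-bit
// product, then compare its high half against the sign bits of the low half
// (ASR #31); they differ exactly when the multiply overflowed i32. The helper
// name and the sample values are made up for illustration.
#include <cstdint>
static bool smulo32(int32_t a, int32_t b, int32_t &lo) {
  int64_t p = (int64_t)a * b;           // widening multiply (SMULL/SMADDL)
  lo = (int32_t)p;                      // truncated i32 result
  int32_t upper = (int32_t)(p >> 32);   // "UpperBits"
  int32_t sign  = lo >> 31;             // "LowerBits": arithmetic shift by 31
  return upper != sign;                 // folded into the SUBS + NE above
}
// smulo32(0x40000000, 2, lo)  -> true  (2^31 does not fit in i32)
// smulo32(-0x40000000, 2, lo) -> false (-2^31 does)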
+ Value = DAG.getNode(Opc, DL, VTs, LHS, RHS); + Overflow = Value.getValue(1); + } + return std::make_pair(Value, Overflow); +} + +SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG, + RTLIB::Libcall Call) const { + SmallVector Ops; + for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) + Ops.push_back(Op.getOperand(i)); + + return makeLibCall(DAG, Call, MVT::f128, &Ops[0], Ops.size(), false, + SDLoc(Op)).first; +} + +static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) { + SDValue Sel = Op.getOperand(0); + SDValue Other = Op.getOperand(1); + + // If neither operand is a SELECT_CC, give up. + if (Sel.getOpcode() != ISD::SELECT_CC) + std::swap(Sel, Other); + if (Sel.getOpcode() != ISD::SELECT_CC) + return Op; + + // The folding we want to perform is: + // (xor x, (select_cc a, b, cc, 0, -1) ) + // --> + // (csel x, (xor x, -1), cc ...) + // + // The latter will get matched to a CSINV instruction. + + ISD::CondCode CC = cast(Sel.getOperand(4))->get(); + SDValue LHS = Sel.getOperand(0); + SDValue RHS = Sel.getOperand(1); + SDValue TVal = Sel.getOperand(2); + SDValue FVal = Sel.getOperand(3); + SDLoc dl(Sel); + + // FIXME: This could be generalized to non-integer comparisons. + if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64) + return Op; + + ConstantSDNode *CFVal = dyn_cast(FVal); + ConstantSDNode *CTVal = dyn_cast(TVal); + + // The the values aren't constants, this isn't the pattern we're looking for. + if (!CFVal || !CTVal) + return Op; + + // We can commute the SELECT_CC by inverting the condition. This + // might be needed to make this fit into a CSINV pattern. + if (CTVal->isAllOnesValue() && CFVal->isNullValue()) { + std::swap(TVal, FVal); + std::swap(CTVal, CFVal); + CC = ISD::getSetCCInverse(CC, true); + } + + // If the constants line up, perform the transform! + if (CTVal->isNullValue() && CFVal->isAllOnesValue()) { + SDValue CCVal; + SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl); + + FVal = Other; + TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other, + DAG.getConstant(-1ULL, Other.getValueType())); + + return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal, + CCVal, Cmp); + } + + return Op; +} + +static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { + EVT VT = Op.getValueType(); + + // Let legalize expand this if it isn't a legal type yet. + if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) + return SDValue(); + + SDVTList VTs = DAG.getVTList(VT, MVT::i32); + + unsigned Opc; + bool ExtraOp = false; + switch (Op.getOpcode()) { + default: + assert(0 && "Invalid code"); + case ISD::ADDC: + Opc = AArch64ISD::ADDS; + break; + case ISD::SUBC: + Opc = AArch64ISD::SUBS; + break; + case ISD::ADDE: + Opc = AArch64ISD::ADCS; + ExtraOp = true; + break; + case ISD::SUBE: + Opc = AArch64ISD::SBCS; + ExtraOp = true; + break; + } + + if (!ExtraOp) + return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1)); + return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1), + Op.getOperand(2)); +} + +static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { + // Let legalize expand this if it isn't a legal type yet. + if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType())) + return SDValue(); + + AArch64CC::CondCode CC; + // The actual operation that sets the overflow or carry flag. + SDValue Value, Overflow; + std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG); + + // We use 0 and 1 as false and true values. 
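// Sketch of the selection this sets up: CSINC Wd, WZR, WZR, cond computes
// "cond ? 0 : 0 + 1", so selecting on the inverted overflow condition leaves
// 1 in the register exactly when the operation overflowed. Helper names here
// are made up.
static unsigned csinc(unsigned a, unsigned b, bool cond) {
  return cond ? a : b + 1;                            // CSINC semantics
}
static unsigned overflowBit(bool overflowed) {
  return csinc(0, 0, /*invert(cond)=*/!overflowed);   // 1 if overflowed, else 0
}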
+ SDValue TVal = DAG.getConstant(1, MVT::i32); + SDValue FVal = DAG.getConstant(0, MVT::i32); + + // We use an inverted condition, because the conditional select is inverted + // too. This will allow it to be selected to a single instruction: + // CSINC Wd, WZR, WZR, invert(cond). + SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), MVT::i32); + Overflow = DAG.getNode(AArch64ISD::CSEL, SDLoc(Op), MVT::i32, FVal, TVal, + CCVal, Overflow); + + SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); + return DAG.getNode(ISD::MERGE_VALUES, SDLoc(Op), VTs, Value, Overflow); +} + +// Prefetch operands are: +// 1: Address to prefetch +// 2: bool isWrite +// 3: int locality (0 = no locality ... 3 = extreme locality) +// 4: bool isDataCache +static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) { + SDLoc DL(Op); + unsigned IsWrite = cast(Op.getOperand(2))->getZExtValue(); + unsigned Locality = cast(Op.getOperand(3))->getZExtValue(); + // The data thing is not used. + // unsigned isData = cast(Op.getOperand(4))->getZExtValue(); + + bool IsStream = !Locality; + // When the locality number is set + if (Locality) { + // The front-end should have filtered out the out-of-range values + assert(Locality <= 3 && "Prefetch locality out-of-range"); + // The locality degree is the opposite of the cache speed. + // Put the number the other way around. + // The encoding starts at 0 for level 1 + Locality = 3 - Locality; + } + + // built the mask value encoding the expected behavior. + unsigned PrfOp = (IsWrite << 4) | // Load/Store bit + (Locality << 1) | // Cache level bits + (unsigned)IsStream; // Stream bit + return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0), + DAG.getConstant(PrfOp, MVT::i32), Op.getOperand(1)); +} + +SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op, + SelectionDAG &DAG) const { + assert(Op.getValueType() == MVT::f128 && "Unexpected lowering"); + + RTLIB::Libcall LC; + LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType()); + + return LowerF128Call(Op, DAG, LC); +} + +SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op, + SelectionDAG &DAG) const { + if (Op.getOperand(0).getValueType() != MVT::f128) { + // It's legal except when f128 is involved + return Op; + } + + RTLIB::Libcall LC; + LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType()); + + // FP_ROUND node has a second operand indicating whether it is known to be + // precise. That doesn't take part in the LibCall so we can't directly use + // LowerF128Call. + SDValue SrcVal = Op.getOperand(0); + return makeLibCall(DAG, LC, Op.getValueType(), &SrcVal, 1, + /*isSigned*/ false, SDLoc(Op)).first; +} + +static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) { + // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp. + // Any additional optimization in this function should be recorded + // in the cost tables. + EVT InVT = Op.getOperand(0).getValueType(); + EVT VT = Op.getValueType(); + + // FP_TO_XINT conversion from the same type are legal. 
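// Worked example of the PrfOp immediate assembled in LowerPREFETCH above:
// (IsWrite << 4) | (Locality << 1) | IsStream, with the locality degree
// flipped so that locality 3 ("keep it close") selects cache level 1. The
// helper below and the PRFM operand names in its comments are for
// illustration only.
static unsigned prfOp(bool isWrite, unsigned locality) {
  bool isStream = locality == 0;
  if (locality)
    locality = 3 - locality;              // 3 -> L1, 2 -> L2, 1 -> L3
  return ((unsigned)isWrite << 4) | (locality << 1) | (unsigned)isStream;
}
// prfOp(false, 3) == 0   (read,  L1, keep   -- PLDL1KEEP)
// prfOp(false, 0) == 1   (read,  L1, stream -- PLDL1STRM)
// prfOp(true,  1) == 20  (write, L3, keep   -- PSTL3KEEP)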
+ if (VT.getSizeInBits() == InVT.getSizeInBits()) + return Op; + + if (InVT == MVT::v2f64 || InVT == MVT::v4f32) { + SDLoc dl(Op); + SDValue Cv = + DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(), + Op.getOperand(0)); + return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv); + } else if (InVT == MVT::v2f32) { + SDLoc dl(Op); + SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v2f64, Op.getOperand(0)); + return DAG.getNode(Op.getOpcode(), dl, VT, Ext); + } + + // Type changing conversions are illegal. + return SDValue(); +} + +SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op, + SelectionDAG &DAG) const { + if (Op.getOperand(0).getValueType().isVector()) + return LowerVectorFP_TO_INT(Op, DAG); + + if (Op.getOperand(0).getValueType() != MVT::f128) { + // It's legal except when f128 is involved + return Op; + } + + RTLIB::Libcall LC; + if (Op.getOpcode() == ISD::FP_TO_SINT) + LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), Op.getValueType()); + else + LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType()); + + SmallVector Ops; + for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) + Ops.push_back(Op.getOperand(i)); + + return makeLibCall(DAG, LC, Op.getValueType(), &Ops[0], Ops.size(), false, + SDLoc(Op)).first; +} + +static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) { + // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp. + // Any additional optimization in this function should be recorded + // in the cost tables. + EVT VT = Op.getValueType(); + SDLoc dl(Op); + SDValue In = Op.getOperand(0); + EVT InVT = In.getValueType(); + + // v2i32 to v2f32 is legal. + if (VT == MVT::v2f32 && InVT == MVT::v2i32) + return Op; + + // This function only handles v2f64 outputs. + if (VT == MVT::v2f64) { + // Extend the input argument to a v2i64 that we can feed into the + // floating point conversion. Zero or sign extend based on whether + // we're doing a signed or unsigned float conversion. + unsigned Opc = + Op.getOpcode() == ISD::UINT_TO_FP ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND; + assert(Op.getNumOperands() == 1 && "FP conversions take one argument"); + SDValue Promoted = DAG.getNode(Opc, dl, MVT::v2i64, Op.getOperand(0)); + return DAG.getNode(Op.getOpcode(), dl, Op.getValueType(), Promoted); + } + + // Scalarize v2i64 to v2f32 conversions. + std::vector BuildVectorOps; + for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) { + SDValue Sclr = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, In, + DAG.getConstant(i, MVT::i64)); + Sclr = DAG.getNode(Op->getOpcode(), dl, MVT::f32, Sclr); + BuildVectorOps.push_back(Sclr); + } + + return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, BuildVectorOps); +} + +SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op, + SelectionDAG &DAG) const { + if (Op.getValueType().isVector()) + return LowerVectorINT_TO_FP(Op, DAG); + + // i128 conversions are libcalls. + if (Op.getOperand(0).getValueType() == MVT::i128) + return SDValue(); + + // Other conversions are legal, unless it's to the completely software-based + // fp128. 
+ if (Op.getValueType() != MVT::f128) + return Op; + + RTLIB::Libcall LC; + if (Op.getOpcode() == ISD::SINT_TO_FP) + LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); + else + LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); + + return LowerF128Call(Op, DAG, LC); +} + +SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op, + SelectionDAG &DAG) const { + // For iOS, we want to call an alternative entry point: __sincos_stret, + // which returns the values in two S / D registers. + SDLoc dl(Op); + SDValue Arg = Op.getOperand(0); + EVT ArgVT = Arg.getValueType(); + Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); + + ArgListTy Args; + ArgListEntry Entry; + + Entry.Node = Arg; + Entry.Ty = ArgTy; + Entry.isSExt = false; + Entry.isZExt = false; + Args.push_back(Entry); + + const char *LibcallName = + (ArgVT == MVT::f64) ? "__sincos_stret" : "__sincosf_stret"; + SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy()); + + StructType *RetTy = StructType::get(ArgTy, ArgTy, NULL); + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()) + .setCallee(CallingConv::Fast, RetTy, Callee, &Args, 0); + + std::pair CallResult = LowerCallTo(CLI); + return CallResult.first; +} + +SDValue AArch64TargetLowering::LowerOperation(SDValue Op, + SelectionDAG &DAG) const { + switch (Op.getOpcode()) { + default: + llvm_unreachable("unimplemented operand"); + return SDValue(); + case ISD::GlobalAddress: + return LowerGlobalAddress(Op, DAG); + case ISD::GlobalTLSAddress: + return LowerGlobalTLSAddress(Op, DAG); + case ISD::SETCC: + return LowerSETCC(Op, DAG); + case ISD::BR_CC: + return LowerBR_CC(Op, DAG); + case ISD::SELECT: + return LowerSELECT(Op, DAG); + case ISD::SELECT_CC: + return LowerSELECT_CC(Op, DAG); + case ISD::JumpTable: + return LowerJumpTable(Op, DAG); + case ISD::ConstantPool: + return LowerConstantPool(Op, DAG); + case ISD::BlockAddress: + return LowerBlockAddress(Op, DAG); + case ISD::VASTART: + return LowerVASTART(Op, DAG); + case ISD::VACOPY: + return LowerVACOPY(Op, DAG); + case ISD::VAARG: + return LowerVAARG(Op, DAG); + case ISD::ADDC: + case ISD::ADDE: + case ISD::SUBC: + case ISD::SUBE: + return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); + case ISD::SADDO: + case ISD::UADDO: + case ISD::SSUBO: + case ISD::USUBO: + case ISD::SMULO: + case ISD::UMULO: + return LowerXALUO(Op, DAG); + case ISD::FADD: + return LowerF128Call(Op, DAG, RTLIB::ADD_F128); + case ISD::FSUB: + return LowerF128Call(Op, DAG, RTLIB::SUB_F128); + case ISD::FMUL: + return LowerF128Call(Op, DAG, RTLIB::MUL_F128); + case ISD::FDIV: + return LowerF128Call(Op, DAG, RTLIB::DIV_F128); + case ISD::FP_ROUND: + return LowerFP_ROUND(Op, DAG); + case ISD::FP_EXTEND: + return LowerFP_EXTEND(Op, DAG); + case ISD::FRAMEADDR: + return LowerFRAMEADDR(Op, DAG); + case ISD::RETURNADDR: + return LowerRETURNADDR(Op, DAG); + case ISD::INSERT_VECTOR_ELT: + return LowerINSERT_VECTOR_ELT(Op, DAG); + case ISD::EXTRACT_VECTOR_ELT: + return LowerEXTRACT_VECTOR_ELT(Op, DAG); + case ISD::BUILD_VECTOR: + return LowerBUILD_VECTOR(Op, DAG); + case ISD::VECTOR_SHUFFLE: + return LowerVECTOR_SHUFFLE(Op, DAG); + case ISD::EXTRACT_SUBVECTOR: + return LowerEXTRACT_SUBVECTOR(Op, DAG); + case ISD::SRA: + case ISD::SRL: + case ISD::SHL: + return LowerVectorSRA_SRL_SHL(Op, DAG); + case ISD::SHL_PARTS: + return LowerShiftLeftParts(Op, DAG); + case ISD::SRL_PARTS: + case ISD::SRA_PARTS: + return LowerShiftRightParts(Op, DAG); + case ISD::CTPOP: + return LowerCTPOP(Op, 
DAG); + case ISD::FCOPYSIGN: + return LowerFCOPYSIGN(Op, DAG); + case ISD::AND: + return LowerVectorAND(Op, DAG); + case ISD::OR: + return LowerVectorOR(Op, DAG); + case ISD::XOR: + return LowerXOR(Op, DAG); + case ISD::PREFETCH: + return LowerPREFETCH(Op, DAG); + case ISD::SINT_TO_FP: + case ISD::UINT_TO_FP: + return LowerINT_TO_FP(Op, DAG); + case ISD::FP_TO_SINT: + case ISD::FP_TO_UINT: + return LowerFP_TO_INT(Op, DAG); + case ISD::FSINCOS: + return LowerFSINCOS(Op, DAG); + } +} + +/// getFunctionAlignment - Return the Log2 alignment of this function. +unsigned AArch64TargetLowering::getFunctionAlignment(const Function *F) const { + return 2; +} + +//===----------------------------------------------------------------------===// +// Calling Convention Implementation +//===----------------------------------------------------------------------===// + +#include "AArch64GenCallingConv.inc" + +/// Selects the correct CCAssignFn for a the given CallingConvention +/// value. +CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC, + bool IsVarArg) const { + switch (CC) { + default: + llvm_unreachable("Unsupported calling convention."); + case CallingConv::WebKit_JS: + return CC_AArch64_WebKit_JS; + case CallingConv::C: + case CallingConv::Fast: + if (!Subtarget->isTargetDarwin()) + return CC_AArch64_AAPCS; + return IsVarArg ? CC_AArch64_DarwinPCS_VarArg : CC_AArch64_DarwinPCS; + } +} + +SDValue AArch64TargetLowering::LowerFormalArguments( + SDValue Chain, CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl &Ins, SDLoc DL, SelectionDAG &DAG, + SmallVectorImpl &InVals) const { + MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + + // Assign locations to all of the incoming arguments. + SmallVector ArgLocs; + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), ArgLocs, *DAG.getContext()); + + // At this point, Ins[].VT may already be promoted to i32. To correctly + // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and + // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT. + // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here + // we use a special version of AnalyzeFormalArguments to pass in ValVT and + // LocVT. + unsigned NumArgs = Ins.size(); + Function::const_arg_iterator CurOrigArg = MF.getFunction()->arg_begin(); + unsigned CurArgIdx = 0; + for (unsigned i = 0; i != NumArgs; ++i) { + MVT ValVT = Ins[i].VT; + std::advance(CurOrigArg, Ins[i].OrigArgIndex - CurArgIdx); + CurArgIdx = Ins[i].OrigArgIndex; + + // Get type of the original argument. + EVT ActualVT = getValueType(CurOrigArg->getType(), /*AllowUnknown*/ true); + MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other; + // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16. + MVT LocVT = ValVT; + if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8) + LocVT = MVT::i8; + else if (ActualMVT == MVT::i16) + LocVT = MVT::i16; + + CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false); + bool Res = + AssignFn(i, ValVT, LocVT, CCValAssign::Full, Ins[i].Flags, CCInfo); + assert(!Res && "Call operand has unhandled type"); + (void)Res; + } + assert(ArgLocs.size() == Ins.size()); + SmallVector ArgValues; + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + CCValAssign &VA = ArgLocs[i]; + + if (Ins[i].Flags.isByVal()) { + // Byval is used for HFAs in the PCS, but the system should work in a + // non-compliant manner for larger structs. 
+ EVT PtrTy = getPointerTy(); + int Size = Ins[i].Flags.getByValSize(); + unsigned NumRegs = (Size + 7) / 8; + + // FIXME: This works on big-endian for composite byvals, which are the common + // case. It should also work for fundamental types too. + unsigned FrameIdx = + MFI->CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false); + SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrTy); + InVals.push_back(FrameIdxN); + + continue; + } if (VA.isRegLoc()) { + // Arguments stored in registers. + EVT RegVT = VA.getLocVT(); + + SDValue ArgValue; + const TargetRegisterClass *RC; + + if (RegVT == MVT::i32) + RC = &AArch64::GPR32RegClass; + else if (RegVT == MVT::i64) + RC = &AArch64::GPR64RegClass; + else if (RegVT == MVT::f32) + RC = &AArch64::FPR32RegClass; + else if (RegVT == MVT::f64 || RegVT.is64BitVector()) + RC = &AArch64::FPR64RegClass; + else if (RegVT == MVT::f128 || RegVT.is128BitVector()) + RC = &AArch64::FPR128RegClass; + else + llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering"); + + // Transform the arguments in physical registers into virtual ones. + unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); + ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT); + + // If this is an 8, 16 or 32-bit value, it is really passed promoted + // to 64 bits. Insert an assert[sz]ext to capture this, then + // truncate to the right size. + switch (VA.getLocInfo()) { + default: + llvm_unreachable("Unknown loc info!"); + case CCValAssign::Full: + break; + case CCValAssign::BCvt: + ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue); + break; + case CCValAssign::SExt: + ArgValue = DAG.getNode(ISD::AssertSext, DL, RegVT, ArgValue, + DAG.getValueType(VA.getValVT())); + ArgValue = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), ArgValue); + break; + case CCValAssign::ZExt: + ArgValue = DAG.getNode(ISD::AssertZext, DL, RegVT, ArgValue, + DAG.getValueType(VA.getValVT())); + ArgValue = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), ArgValue); + break; + } + + InVals.push_back(ArgValue); + + } else { // VA.isRegLoc() + assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem"); + unsigned ArgOffset = VA.getLocMemOffset(); + unsigned ArgSize = VA.getLocVT().getSizeInBits() / 8; + + uint32_t BEAlign = 0; + if (ArgSize < 8 && !Subtarget->isLittleEndian()) + BEAlign = 8 - ArgSize; + + int FI = MFI->CreateFixedObject(ArgSize, ArgOffset + BEAlign, true); + + // Create load nodes to retrieve arguments from the stack. + SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); + SDValue ArgValue; + + // If the loc type and val type are not the same, create an anyext load. + if (VA.getLocVT().getSizeInBits() != VA.getValVT().getSizeInBits()) { + // We should only get here if this is a pure integer. + assert(!VA.getValVT().isVector() && VA.getValVT().isInteger() && + "Only integer extension supported!"); + ArgValue = DAG.getExtLoad(ISD::EXTLOAD, DL, VA.getValVT(), Chain, FIN, + MachinePointerInfo::getFixedStack(FI), + VA.getLocVT(), + false, false, false, 0); + } else { + ArgValue = DAG.getLoad(VA.getValVT(), DL, Chain, FIN, + MachinePointerInfo::getFixedStack(FI), false, + false, false, 0); + } + + InVals.push_back(ArgValue); + } + } + + // varargs + if (isVarArg) { + if (!Subtarget->isTargetDarwin()) { + // The AAPCS variadic function ABI is identical to the non-variadic + // one. As a result there may be more arguments in registers and we should + // save them for future reference. 
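As a source-level sketch of why this register save area exists (an illustration, not part of this commit; the function name is made up): a variadic callee cannot know which of x0-x7 / q0-q7 the caller actually used for the "..." arguments, so the saveVarArgRegisters call below spills the still-unallocated argument registers where va_arg can later find them.

    #include <cstdarg>

    // Hypothetical example: any of the "..." arguments may still be sitting
    // in x1..x7 (or q0..q7 for FP), so the prologue must store the unused
    // argument registers into the save area before va_arg walks them.
    int sum_ints(int count, ...) {
      va_list ap;
      va_start(ap, count);
      int total = 0;
      for (int i = 0; i < count; ++i)
        total += va_arg(ap, int);
      va_end(ap);
      return total;
    }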
+ saveVarArgRegisters(CCInfo, DAG, DL, Chain); + } + + AArch64FunctionInfo *AFI = MF.getInfo(); + // This will point to the next argument passed via stack. + unsigned StackOffset = CCInfo.getNextStackOffset(); + // We currently pass all varargs at 8-byte alignment. + StackOffset = ((StackOffset + 7) & ~7); + AFI->setVarArgsStackIndex(MFI->CreateFixedObject(4, StackOffset, true)); + } + + AArch64FunctionInfo *FuncInfo = MF.getInfo(); + unsigned StackArgSize = CCInfo.getNextStackOffset(); + bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; + if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) { + // This is a non-standard ABI so by fiat I say we're allowed to make full + // use of the stack area to be popped, which must be aligned to 16 bytes in + // any case: + StackArgSize = RoundUpToAlignment(StackArgSize, 16); + + // If we're expected to restore the stack (e.g. fastcc) then we'll be adding + // a multiple of 16. + FuncInfo->setArgumentStackToRestore(StackArgSize); + + // This realignment carries over to the available bytes below. Our own + // callers will guarantee the space is free by giving an aligned value to + // CALLSEQ_START. + } + // Even if we're not expected to free up the space, it's useful to know how + // much is there while considering tail calls (because we can reuse it). + FuncInfo->setBytesInStackArgArea(StackArgSize); + + return Chain; +} + +void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, + SelectionDAG &DAG, SDLoc DL, + SDValue &Chain) const { + MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + AArch64FunctionInfo *FuncInfo = MF.getInfo(); + + SmallVector MemOps; + + static const MCPhysReg GPRArgRegs[] = { AArch64::X0, AArch64::X1, AArch64::X2, + AArch64::X3, AArch64::X4, AArch64::X5, + AArch64::X6, AArch64::X7 }; + static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs); + unsigned FirstVariadicGPR = + CCInfo.getFirstUnallocated(GPRArgRegs, NumGPRArgRegs); + + unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR); + int GPRIdx = 0; + if (GPRSaveSize != 0) { + GPRIdx = MFI->CreateStackObject(GPRSaveSize, 8, false); + + SDValue FIN = DAG.getFrameIndex(GPRIdx, getPointerTy()); + + for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) { + unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass); + SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64); + SDValue Store = + DAG.getStore(Val.getValue(1), DL, Val, FIN, + MachinePointerInfo::getStack(i * 8), false, false, 0); + MemOps.push_back(Store); + FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN, + DAG.getConstant(8, getPointerTy())); + } + } + FuncInfo->setVarArgsGPRIndex(GPRIdx); + FuncInfo->setVarArgsGPRSize(GPRSaveSize); + + if (Subtarget->hasFPARMv8()) { + static const MCPhysReg FPRArgRegs[] = { + AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3, + AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7}; + static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs); + unsigned FirstVariadicFPR = + CCInfo.getFirstUnallocated(FPRArgRegs, NumFPRArgRegs); + + unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR); + int FPRIdx = 0; + if (FPRSaveSize != 0) { + FPRIdx = MFI->CreateStackObject(FPRSaveSize, 16, false); + + SDValue FIN = DAG.getFrameIndex(FPRIdx, getPointerTy()); + + for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) { + unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass); + SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128); + + SDValue Store 
= + DAG.getStore(Val.getValue(1), DL, Val, FIN, + MachinePointerInfo::getStack(i * 16), false, false, 0); + MemOps.push_back(Store); + FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN, + DAG.getConstant(16, getPointerTy())); + } + } + FuncInfo->setVarArgsFPRIndex(FPRIdx); + FuncInfo->setVarArgsFPRSize(FPRSaveSize); + } + + if (!MemOps.empty()) { + Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); + } +} + +/// LowerCallResult - Lower the result values of a call into the +/// appropriate copies out of appropriate physical registers. +SDValue AArch64TargetLowering::LowerCallResult( + SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl &Ins, SDLoc DL, SelectionDAG &DAG, + SmallVectorImpl &InVals, bool isThisReturn, + SDValue ThisVal) const { + CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS + ? RetCC_AArch64_WebKit_JS + : RetCC_AArch64_AAPCS; + // Assign locations to each value returned by this call. + SmallVector RVLocs; + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), RVLocs, *DAG.getContext()); + CCInfo.AnalyzeCallResult(Ins, RetCC); + + // Copy all of the result registers out of their specified physreg. + for (unsigned i = 0; i != RVLocs.size(); ++i) { + CCValAssign VA = RVLocs[i]; + + // Pass 'this' value directly from the argument to return value, to avoid + // reg unit interference + if (i == 0 && isThisReturn) { + assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 && + "unexpected return calling convention register assignment"); + InVals.push_back(ThisVal); + continue; + } + + SDValue Val = + DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag); + Chain = Val.getValue(1); + InFlag = Val.getValue(2); + + switch (VA.getLocInfo()) { + default: + llvm_unreachable("Unknown loc info!"); + case CCValAssign::Full: + break; + case CCValAssign::BCvt: + Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val); + break; + } + + InVals.push_back(Val); + } + + return Chain; +} + +bool AArch64TargetLowering::isEligibleForTailCallOptimization( + SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, + bool isCalleeStructRet, bool isCallerStructRet, + const SmallVectorImpl &Outs, + const SmallVectorImpl &OutVals, + const SmallVectorImpl &Ins, SelectionDAG &DAG) const { + // For CallingConv::C this function knows whether the ABI needs + // changing. That's not true for other conventions so they will have to opt in + // manually. + if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C) + return false; + + const MachineFunction &MF = DAG.getMachineFunction(); + const Function *CallerF = MF.getFunction(); + CallingConv::ID CallerCC = CallerF->getCallingConv(); + bool CCMatch = CallerCC == CalleeCC; + + // Byval parameters hand the function a pointer directly into the stack area + // we want to reuse during a tail call. Working around this *is* possible (see + // X86) but less efficient and uglier in LowerCall. + for (Function::const_arg_iterator i = CallerF->arg_begin(), + e = CallerF->arg_end(); + i != e; ++i) + if (i->hasByValAttr()) + return false; + + if (getTargetMachine().Options.GuaranteedTailCallOpt) { + if (IsTailCallConvention(CalleeCC) && CCMatch) + return true; + return false; + } + + // Now we search for cases where we can use a tail call without changing the + // ABI. Sibcall is used in some places (particularly gcc) to refer to this + // concept. + + // I want anyone implementing a new calling convention to think long and hard + // about this assert. 
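For reference, a minimal sketch of the sibling-call shape that the eligibility checks here (continuing below) are meant to accept, assuming both functions use the plain C convention; the function names are illustrative only.

    // Hypothetical functions: same calling convention, no byval or variadic
    // arguments, and no stack arguments, so under the rules checked here the
    // call can become a plain "b callee" without touching the caller's frame.
    int callee(int a, int b);

    int caller(int a, int b) { return callee(b, a); }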
+ assert((!isVarArg || CalleeCC == CallingConv::C) && + "Unexpected variadic calling convention"); + + if (isVarArg && !Outs.empty()) { + // At least two cases here: if caller is fastcc then we can't have any + // memory arguments (we'd be expected to clean up the stack afterwards). If + // caller is C then we could potentially use its argument area. + + // FIXME: for now we take the most conservative of these in both cases: + // disallow all variadic memory operands. + SmallVector ArgLocs; + CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), ArgLocs, *DAG.getContext()); + + CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true)); + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) + if (!ArgLocs[i].isRegLoc()) + return false; + } + + // If the calling conventions do not match, then we'd better make sure the + // results are returned in the same way as what the caller expects. + if (!CCMatch) { + SmallVector RVLocs1; + CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), + getTargetMachine(), RVLocs1, *DAG.getContext()); + CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForCall(CalleeCC, isVarArg)); + + SmallVector RVLocs2; + CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), + getTargetMachine(), RVLocs2, *DAG.getContext()); + CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForCall(CallerCC, isVarArg)); + + if (RVLocs1.size() != RVLocs2.size()) + return false; + for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { + if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) + return false; + if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) + return false; + if (RVLocs1[i].isRegLoc()) { + if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) + return false; + } else { + if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) + return false; + } + } + } + + // Nothing more to check if the callee is taking no arguments + if (Outs.empty()) + return true; + + SmallVector ArgLocs; + CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), ArgLocs, *DAG.getContext()); + + CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg)); + + const AArch64FunctionInfo *FuncInfo = MF.getInfo(); + + // If the stack arguments for this call would fit into our own save area then + // the call can be made tail. + return CCInfo.getNextStackOffset() <= FuncInfo->getBytesInStackArgArea(); +} + +SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain, + SelectionDAG &DAG, + MachineFrameInfo *MFI, + int ClobberedFI) const { + SmallVector ArgChains; + int64_t FirstByte = MFI->getObjectOffset(ClobberedFI); + int64_t LastByte = FirstByte + MFI->getObjectSize(ClobberedFI) - 1; + + // Include the original chain at the beginning of the list. When this is + // used by target LowerCall hooks, this helps legalize find the + // CALLSEQ_BEGIN node. 
+ ArgChains.push_back(Chain); + + // Add a chain value for each stack argument corresponding + for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(), + UE = DAG.getEntryNode().getNode()->use_end(); + U != UE; ++U) + if (LoadSDNode *L = dyn_cast(*U)) + if (FrameIndexSDNode *FI = dyn_cast(L->getBasePtr())) + if (FI->getIndex() < 0) { + int64_t InFirstByte = MFI->getObjectOffset(FI->getIndex()); + int64_t InLastByte = InFirstByte; + InLastByte += MFI->getObjectSize(FI->getIndex()) - 1; + + if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) || + (FirstByte <= InFirstByte && InFirstByte <= LastByte)) + ArgChains.push_back(SDValue(L, 1)); + } + + // Build a tokenfactor for all the chains. + return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains); +} + +bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC, + bool TailCallOpt) const { + return CallCC == CallingConv::Fast && TailCallOpt; +} + +bool AArch64TargetLowering::IsTailCallConvention(CallingConv::ID CallCC) const { + return CallCC == CallingConv::Fast; +} + +/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain, +/// and add input and output parameter nodes. +SDValue +AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, + SmallVectorImpl &InVals) const { + SelectionDAG &DAG = CLI.DAG; + SDLoc &DL = CLI.DL; + SmallVector &Outs = CLI.Outs; + SmallVector &OutVals = CLI.OutVals; + SmallVector &Ins = CLI.Ins; + SDValue Chain = CLI.Chain; + SDValue Callee = CLI.Callee; + bool &IsTailCall = CLI.IsTailCall; + CallingConv::ID CallConv = CLI.CallConv; + bool IsVarArg = CLI.IsVarArg; + + MachineFunction &MF = DAG.getMachineFunction(); + bool IsStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet(); + bool IsThisReturn = false; + + AArch64FunctionInfo *FuncInfo = MF.getInfo(); + bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; + bool IsSibCall = false; + + if (IsTailCall) { + // Check if it's really possible to do a tail call. + IsTailCall = isEligibleForTailCallOptimization( + Callee, CallConv, IsVarArg, IsStructRet, + MF.getFunction()->hasStructRetAttr(), Outs, OutVals, Ins, DAG); + if (!IsTailCall && CLI.CS && CLI.CS->isMustTailCall()) + report_fatal_error("failed to perform tail call elimination on a call " + "site marked musttail"); + + // A sibling call is one where we're under the usual C ABI and not planning + // to change that but can still do a tail call: + if (!TailCallOpt && IsTailCall) + IsSibCall = true; + + if (IsTailCall) + ++NumTailCalls; + } + + // Analyze operands of the call, assigning locations to each operand. + SmallVector ArgLocs; + CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), + getTargetMachine(), ArgLocs, *DAG.getContext()); + + if (IsVarArg) { + // Handle fixed and variable vector arguments differently. + // Variable vector arguments always go into memory. + unsigned NumArgs = Outs.size(); + + for (unsigned i = 0; i != NumArgs; ++i) { + MVT ArgVT = Outs[i].VT; + ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; + CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, + /*IsVarArg=*/ !Outs[i].IsFixed); + bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo); + assert(!Res && "Call operand has unhandled type"); + (void)Res; + } + } else { + // At this point, Outs[].VT may already be promoted to i32. To correctly + // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and + // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT. 
+ // Since AnalyzeCallOperands uses Ins[].VT for both ValVT and LocVT, here + // we use a special version of AnalyzeCallOperands to pass in ValVT and + // LocVT. + unsigned NumArgs = Outs.size(); + for (unsigned i = 0; i != NumArgs; ++i) { + MVT ValVT = Outs[i].VT; + // Get type of the original argument. + EVT ActualVT = getValueType(CLI.getArgs()[Outs[i].OrigArgIndex].Ty, + /*AllowUnknown*/ true); + MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ValVT; + ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; + // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16. + MVT LocVT = ValVT; + if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8) + LocVT = MVT::i8; + else if (ActualMVT == MVT::i16) + LocVT = MVT::i16; + + CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false); + bool Res = AssignFn(i, ValVT, LocVT, CCValAssign::Full, ArgFlags, CCInfo); + assert(!Res && "Call operand has unhandled type"); + (void)Res; + } + } + + // Get a count of how many bytes are to be pushed on the stack. + unsigned NumBytes = CCInfo.getNextStackOffset(); + + if (IsSibCall) { + // Since we're not changing the ABI to make this a tail call, the memory + // operands are already available in the caller's incoming argument space. + NumBytes = 0; + } + + // FPDiff is the byte offset of the call's argument area from the callee's. + // Stores to callee stack arguments will be placed in FixedStackSlots offset + // by this amount for a tail call. In a sibling call it must be 0 because the + // caller will deallocate the entire stack and the callee still expects its + // arguments to begin at SP+0. Completely unused for non-tail calls. + int FPDiff = 0; + + if (IsTailCall && !IsSibCall) { + unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea(); + + // Since callee will pop argument stack as a tail call, we must keep the + // popped size 16-byte aligned. + NumBytes = RoundUpToAlignment(NumBytes, 16); + + // FPDiff will be negative if this tail call requires more space than we + // would automatically have in our incoming argument space. Positive if we + // can actually shrink the stack. + FPDiff = NumReusableBytes - NumBytes; + + // The stack pointer must be 16-byte aligned at all times it's used for a + // memory operation, which in practice means at *all* times and in + // particular across call boundaries. Therefore our own arguments started at + // a 16-byte aligned SP and the delta applied for the tail call should + // satisfy the same constraint. + assert(FPDiff % 16 == 0 && "unaligned stack on tail call"); + } + + // Adjust the stack pointer for the new arguments... + // These operations are automatically eliminated by the prolog/epilog pass + if (!IsSibCall) + Chain = + DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true), DL); + + SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP, getPointerTy()); + + SmallVector, 8> RegsToPass; + SmallVector MemOpChains; + + // Walk the register/memloc assignments, inserting copies/loads. + for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e; + ++i, ++realArgIdx) { + CCValAssign &VA = ArgLocs[i]; + SDValue Arg = OutVals[realArgIdx]; + ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; + + // Promote the value if needed. 
+ switch (VA.getLocInfo()) { + default: + llvm_unreachable("Unknown loc info!"); + case CCValAssign::Full: + break; + case CCValAssign::SExt: + Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg); + break; + case CCValAssign::ZExt: + Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg); + break; + case CCValAssign::AExt: + Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg); + break; + case CCValAssign::BCvt: + Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); + break; + case CCValAssign::FPExt: + Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg); + break; + } + + if (VA.isRegLoc()) { + if (realArgIdx == 0 && Flags.isReturned() && Outs[0].VT == MVT::i64) { + assert(VA.getLocVT() == MVT::i64 && + "unexpected calling convention register assignment"); + assert(!Ins.empty() && Ins[0].VT == MVT::i64 && + "unexpected use of 'returned'"); + IsThisReturn = true; + } + RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); + } else { + assert(VA.isMemLoc()); + + SDValue DstAddr; + MachinePointerInfo DstInfo; + + // FIXME: This works on big-endian for composite byvals, which are the + // common case. It should also work for fundamental types too. + uint32_t BEAlign = 0; + unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8 + : VA.getLocVT().getSizeInBits(); + OpSize = (OpSize + 7) / 8; + if (!Subtarget->isLittleEndian() && !Flags.isByVal()) { + if (OpSize < 8) + BEAlign = 8 - OpSize; + } + unsigned LocMemOffset = VA.getLocMemOffset(); + int32_t Offset = LocMemOffset + BEAlign; + SDValue PtrOff = DAG.getIntPtrConstant(Offset); + PtrOff = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, PtrOff); + + if (IsTailCall) { + Offset = Offset + FPDiff; + int FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true); + + DstAddr = DAG.getFrameIndex(FI, getPointerTy()); + DstInfo = MachinePointerInfo::getFixedStack(FI); + + // Make sure any stack arguments overlapping with where we're storing + // are loaded before this eventual operation. Otherwise they'll be + // clobbered. + Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI); + } else { + SDValue PtrOff = DAG.getIntPtrConstant(Offset); + + DstAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, PtrOff); + DstInfo = MachinePointerInfo::getStack(LocMemOffset); + } + + if (Outs[i].Flags.isByVal()) { + SDValue SizeNode = + DAG.getConstant(Outs[i].Flags.getByValSize(), MVT::i64); + SDValue Cpy = DAG.getMemcpy( + Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(), + /*isVolatile = */ false, + /*alwaysInline = */ false, DstInfo, MachinePointerInfo()); + + MemOpChains.push_back(Cpy); + } else { + // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already + // promoted to a legal register type i32, we should truncate Arg back to + // i1/i8/i16. + if (Arg.getValueType().isSimple() && + Arg.getValueType().getSimpleVT() == MVT::i32 && + (VA.getLocVT() == MVT::i1 || VA.getLocVT() == MVT::i8 || + VA.getLocVT() == MVT::i16)) + Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getLocVT(), Arg); + + SDValue Store = + DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, false, false, 0); + MemOpChains.push_back(Store); + } + } + } + + if (!MemOpChains.empty()) + Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains); + + // Build a sequence of copy-to-reg nodes chained together with token chain + // and flag operands which copy the outgoing args into the appropriate regs. 
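A rough illustration of the truncate-and-store case just above, assuming the Darwin variant of the PCS in which small integers keep their natural size on the stack (the ninth integer argument no longer fits in x0-x7); the function is hypothetical.

    // Hypothetical example: "ninth" is passed on the stack; because it was
    // promoted to i32 in the DAG, it is truncated back to i8 before the
    // single-byte store described above.
    int take_nine(int a, int b, int c, int d, int e, int f, int g, int h,
                  signed char ninth) {
      return a + ninth;
    }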
+  SDValue InFlag;
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    Chain = DAG.getCopyToReg(Chain, DL, RegsToPass[i].first,
+                             RegsToPass[i].second, InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
+  // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
+  // node so that legalize doesn't hack it.
+  if (getTargetMachine().getCodeModel() == CodeModel::Large &&
+      Subtarget->isTargetMachO()) {
+    if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+      const GlobalValue *GV = G->getGlobal();
+      bool InternalLinkage = GV->hasInternalLinkage();
+      if (InternalLinkage)
+        Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0, 0);
+      else {
+        Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0,
+                                            AArch64II::MO_GOT);
+        Callee = DAG.getNode(AArch64ISD::LOADgot, DL, getPointerTy(), Callee);
+      }
+    } else if (ExternalSymbolSDNode *S =
+                   dyn_cast<ExternalSymbolSDNode>(Callee)) {
+      const char *Sym = S->getSymbol();
+      Callee =
+          DAG.getTargetExternalSymbol(Sym, getPointerTy(), AArch64II::MO_GOT);
+      Callee = DAG.getNode(AArch64ISD::LOADgot, DL, getPointerTy(), Callee);
+    }
+  } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+    const GlobalValue *GV = G->getGlobal();
+    Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0, 0);
+  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
+    const char *Sym = S->getSymbol();
+    Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy(), 0);
+  }
+
+  // We don't usually want to end the call-sequence here because we would tidy
+  // the frame up *after* the call, however in the ABI-changing tail-call case
+  // we've carefully laid out the parameters so that when sp is reset they'll be
+  // in the correct location.
+  if (IsTailCall && !IsSibCall) {
+    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
+                               DAG.getIntPtrConstant(0, true), InFlag, DL);
+    InFlag = Chain.getValue(1);
+  }
+
+  std::vector<SDValue> Ops;
+  Ops.push_back(Chain);
+  Ops.push_back(Callee);
+
+  if (IsTailCall) {
+    // Each tail call may have to adjust the stack by a different amount, so
+    // this information must travel along with the operation for eventual
+    // consumption by emitEpilogue.
+    Ops.push_back(DAG.getTargetConstant(FPDiff, MVT::i32));
+  }
+
+  // Add argument registers to the end of the list so that they are known live
+  // into the call.
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+                                  RegsToPass[i].second.getValueType()));
+
+  // Add a register mask operand representing the call-preserved registers.
+  const uint32_t *Mask;
+  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+  const AArch64RegisterInfo *ARI =
+      static_cast<const AArch64RegisterInfo *>(TRI);
+  if (IsThisReturn) {
+    // For 'this' returns, use the X0-preserving mask if applicable
+    Mask = ARI->getThisReturnPreservedMask(CallConv);
+    if (!Mask) {
+      IsThisReturn = false;
+      Mask = ARI->getCallPreservedMask(CallConv);
+    }
+  } else
+    Mask = ARI->getCallPreservedMask(CallConv);
+
+  assert(Mask && "Missing call preserved mask for calling convention");
+  Ops.push_back(DAG.getRegisterMask(Mask));
+
+  if (InFlag.getNode())
+    Ops.push_back(InFlag);
+
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+
+  // If we're doing a tail call, use a TC_RETURN here rather than an
+  // actual call instruction.
+  if (IsTailCall)
+    return DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops);
+
+  // Returns a chain and a flag for retval copy to use.
+ Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops); + InFlag = Chain.getValue(1); + + uint64_t CalleePopBytes = DoesCalleeRestoreStack(CallConv, TailCallOpt) + ? RoundUpToAlignment(NumBytes, 16) + : 0; + + Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), + DAG.getIntPtrConstant(CalleePopBytes, true), + InFlag, DL); + if (!Ins.empty()) + InFlag = Chain.getValue(1); + + // Handle result values, copying them out of physregs into vregs that we + // return. + return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG, + InVals, IsThisReturn, + IsThisReturn ? OutVals[0] : SDValue()); +} + +bool AArch64TargetLowering::CanLowerReturn( + CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, + const SmallVectorImpl &Outs, LLVMContext &Context) const { + CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS + ? RetCC_AArch64_WebKit_JS + : RetCC_AArch64_AAPCS; + SmallVector RVLocs; + CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), RVLocs, Context); + return CCInfo.CheckReturn(Outs, RetCC); +} + +SDValue +AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl &Outs, + const SmallVectorImpl &OutVals, + SDLoc DL, SelectionDAG &DAG) const { + CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS + ? RetCC_AArch64_WebKit_JS + : RetCC_AArch64_AAPCS; + SmallVector RVLocs; + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), RVLocs, *DAG.getContext()); + CCInfo.AnalyzeReturn(Outs, RetCC); + + // Copy the result values into the output registers. + SDValue Flag; + SmallVector RetOps(1, Chain); + for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size(); + ++i, ++realRVLocIdx) { + CCValAssign &VA = RVLocs[i]; + assert(VA.isRegLoc() && "Can only return in registers!"); + SDValue Arg = OutVals[realRVLocIdx]; + + switch (VA.getLocInfo()) { + default: + llvm_unreachable("Unknown loc info!"); + case CCValAssign::Full: + break; + case CCValAssign::BCvt: + Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); + break; + } + + Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag); + Flag = Chain.getValue(1); + RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); + } + + RetOps[0] = Chain; // Update chain. + + // Add the flag if we have it. + if (Flag.getNode()) + RetOps.push_back(Flag); + + return DAG.getNode(AArch64ISD::RET_FLAG, DL, MVT::Other, RetOps); +} + +//===----------------------------------------------------------------------===// +// Other Lowering Code +//===----------------------------------------------------------------------===// + +SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op, + SelectionDAG &DAG) const { + EVT PtrVT = getPointerTy(); + SDLoc DL(Op); + const GlobalValue *GV = cast(Op)->getGlobal(); + unsigned char OpFlags = + Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); + + assert(cast(Op)->getOffset() == 0 && + "unexpected offset in global node"); + + // This also catched the large code model case for Darwin. + if ((OpFlags & AArch64II::MO_GOT) != 0) { + SDValue GotAddr = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags); + // FIXME: Once remat is capable of dealing with instructions with register + // operands, expand this into two nodes instead of using a wrapper node. 
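A small example of the input LowerGlobalAddress deals with (illustrative only): a direct reference in the small code model becomes the ADRP/ADD or ADRP/LDR pair built further down, while a reference that must go through the GOT, for instance a preemptible symbol in position-independent code, takes this LOADgot path instead.

    // Hypothetical external symbol: depending on code model and whether the
    // symbol may be preempted, the load is materialized either via
    // adrp + :lo12: addressing or via a GOT entry (the MO_GOT case here).
    extern int external_counter;

    int read_counter() { return external_counter; }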
+ return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr); + } + + if (getTargetMachine().getCodeModel() == CodeModel::Large) { + const unsigned char MO_NC = AArch64II::MO_NC; + return DAG.getNode( + AArch64ISD::WrapperLarge, DL, PtrVT, + DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G3), + DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G2 | MO_NC), + DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G1 | MO_NC), + DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G0 | MO_NC)); + } else { + // Use ADRP/ADD or ADRP/LDR for everything else: the small model on ELF and + // the only correct model on Darwin. + SDValue Hi = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, + OpFlags | AArch64II::MO_PAGE); + unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC; + SDValue Lo = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, LoFlags); + + SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); + return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); + } +} + +/// \brief Convert a TLS address reference into the correct sequence of loads +/// and calls to compute the variable's address (for Darwin, currently) and +/// return an SDValue containing the final node. + +/// Darwin only has one TLS scheme which must be capable of dealing with the +/// fully general situation, in the worst case. This means: +/// + "extern __thread" declaration. +/// + Defined in a possibly unknown dynamic library. +/// +/// The general system is that each __thread variable has a [3 x i64] descriptor +/// which contains information used by the runtime to calculate the address. The +/// only part of this the compiler needs to know about is the first xword, which +/// contains a function pointer that must be called with the address of the +/// entire descriptor in "x0". +/// +/// Since this descriptor may be in a different unit, in general even the +/// descriptor must be accessed via an indirect load. The "ideal" code sequence +/// is: +/// adrp x0, _var@TLVPPAGE +/// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor +/// ldr x1, [x0] ; x1 contains 1st entry of descriptor, +/// ; the function pointer +/// blr x1 ; Uses descriptor address in x0 +/// ; Address of _var is now in x0. +/// +/// If the address of _var's descriptor *is* known to the linker, then it can +/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for +/// a slight efficiency gain. +SDValue +AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op, + SelectionDAG &DAG) const { + assert(Subtarget->isTargetDarwin() && "TLS only supported on Darwin"); + + SDLoc DL(Op); + MVT PtrVT = getPointerTy(); + const GlobalValue *GV = cast(Op)->getGlobal(); + + SDValue TLVPAddr = + DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS); + SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr); + + // The first entry in the descriptor is a function pointer that we must call + // to obtain the address of the variable. + SDValue Chain = DAG.getEntryNode(); + SDValue FuncTLVGet = + DAG.getLoad(MVT::i64, DL, Chain, DescAddr, MachinePointerInfo::getGOT(), + false, true, true, 8); + Chain = FuncTLVGet.getValue(1); + + MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); + MFI->setAdjustsStack(true); + + // TLS calls preserve all registers except those that absolutely must be + // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be + // silly). 
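For orientation, the source pattern that reaches this Darwin lowering is simply a thread-local access; the variable and function names below are made up.

    // Hypothetical example: each access loads the descriptor's function
    // pointer and calls it with the descriptor address in x0, as in the
    // sequence sketched in the comment above.
    extern __thread int counter;

    int bump() { return ++counter; }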
+ const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); + const AArch64RegisterInfo *ARI = + static_cast(TRI); + const uint32_t *Mask = ARI->getTLSCallPreservedMask(); + + // Finally, we can make the call. This is just a degenerate version of a + // normal AArch64 call node: x0 takes the address of the descriptor, and + // returns the address of the variable in this thread. + Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue()); + Chain = + DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue), + Chain, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64), + DAG.getRegisterMask(Mask), Chain.getValue(1)); + return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1)); +} + +/// When accessing thread-local variables under either the general-dynamic or +/// local-dynamic system, we make a "TLS-descriptor" call. The variable will +/// have a descriptor, accessible via a PC-relative ADRP, and whose first entry +/// is a function pointer to carry out the resolution. This function takes the +/// address of the descriptor in X0 and returns the TPIDR_EL0 offset in X0. All +/// other registers (except LR, NZCV) are preserved. +/// +/// Thus, the ideal call sequence on AArch64 is: +/// +/// adrp x0, :tlsdesc:thread_var +/// ldr x8, [x0, :tlsdesc_lo12:thread_var] +/// add x0, x0, :tlsdesc_lo12:thread_var +/// .tlsdesccall thread_var +/// blr x8 +/// (TPIDR_EL0 offset now in x0). +/// +/// The ".tlsdesccall" directive instructs the assembler to insert a particular +/// relocation to help the linker relax this sequence if it turns out to be too +/// conservative. +/// +/// FIXME: we currently produce an extra, duplicated, ADRP instruction, but this +/// is harmless. +SDValue AArch64TargetLowering::LowerELFTLSDescCall(SDValue SymAddr, + SDValue DescAddr, SDLoc DL, + SelectionDAG &DAG) const { + EVT PtrVT = getPointerTy(); + + // The function we need to call is simply the first entry in the GOT for this + // descriptor, load it in preparation. + SDValue Func = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, SymAddr); + + // TLS calls preserve all registers except those that absolutely must be + // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be + // silly). + const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); + const AArch64RegisterInfo *ARI = + static_cast(TRI); + const uint32_t *Mask = ARI->getTLSCallPreservedMask(); + + // The function takes only one argument: the address of the descriptor itself + // in X0. 
+ SDValue Glue, Chain; + Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::X0, DescAddr, Glue); + Glue = Chain.getValue(1); + + // We're now ready to populate the argument list, as with a normal call: + SmallVector Ops; + Ops.push_back(Chain); + Ops.push_back(Func); + Ops.push_back(SymAddr); + Ops.push_back(DAG.getRegister(AArch64::X0, PtrVT)); + Ops.push_back(DAG.getRegisterMask(Mask)); + Ops.push_back(Glue); + + SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); + Chain = DAG.getNode(AArch64ISD::TLSDESC_CALL, DL, NodeTys, Ops); + Glue = Chain.getValue(1); + + return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue); +} + +SDValue +AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op, + SelectionDAG &DAG) const { + assert(Subtarget->isTargetELF() && "This function expects an ELF target"); + assert(getTargetMachine().getCodeModel() == CodeModel::Small && + "ELF TLS only supported in small memory model"); + const GlobalAddressSDNode *GA = cast(Op); + + TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal()); + + SDValue TPOff; + EVT PtrVT = getPointerTy(); + SDLoc DL(Op); + const GlobalValue *GV = GA->getGlobal(); + + SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT); + + if (Model == TLSModel::LocalExec) { + SDValue HiVar = DAG.getTargetGlobalAddress( + GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G1); + SDValue LoVar = DAG.getTargetGlobalAddress( + GV, DL, PtrVT, 0, + AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC); + + TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar, + DAG.getTargetConstant(16, MVT::i32)), + 0); + TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar, + DAG.getTargetConstant(0, MVT::i32)), + 0); + } else if (Model == TLSModel::InitialExec) { + TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS); + TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff); + } else if (Model == TLSModel::LocalDynamic) { + // Local-dynamic accesses proceed in two phases. A general-dynamic TLS + // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate + // the beginning of the module's TLS region, followed by a DTPREL offset + // calculation. + + // These accesses will need deduplicating if there's more than one. + AArch64FunctionInfo *MFI = + DAG.getMachineFunction().getInfo(); + MFI->incNumLocalDynamicTLSAccesses(); + + // Accesses used in this sequence go via the TLS descriptor which lives in + // the GOT. Prepare an address we can use to handle this. + SDValue HiDesc = DAG.getTargetExternalSymbol( + "_TLS_MODULE_BASE_", PtrVT, AArch64II::MO_TLS | AArch64II::MO_PAGE); + SDValue LoDesc = DAG.getTargetExternalSymbol( + "_TLS_MODULE_BASE_", PtrVT, + AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); + + // First argument to the descriptor call is the address of the descriptor + // itself. + SDValue DescAddr = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, HiDesc); + DescAddr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, DescAddr, LoDesc); + + // The call needs a relocation too for linker relaxation. It doesn't make + // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of + // the address. + SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, + AArch64II::MO_TLS); + + // Now we can calculate the offset from TPIDR_EL0 to this module's + // thread-local area. 
+ TPOff = LowerELFTLSDescCall(SymAddr, DescAddr, DL, DAG); + + // Now use :dtprel_whatever: operations to calculate this variable's offset + // in its thread-storage area. + SDValue HiVar = DAG.getTargetGlobalAddress( + GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_G1); + SDValue LoVar = DAG.getTargetGlobalAddress( + GV, DL, MVT::i64, 0, + AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC); + + SDValue DTPOff = + SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar, + DAG.getTargetConstant(16, MVT::i32)), + 0); + DTPOff = + SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, DTPOff, LoVar, + DAG.getTargetConstant(0, MVT::i32)), + 0); + + TPOff = DAG.getNode(ISD::ADD, DL, PtrVT, TPOff, DTPOff); + } else if (Model == TLSModel::GeneralDynamic) { + // Accesses used in this sequence go via the TLS descriptor which lives in + // the GOT. Prepare an address we can use to handle this. + SDValue HiDesc = DAG.getTargetGlobalAddress( + GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGE); + SDValue LoDesc = DAG.getTargetGlobalAddress( + GV, DL, PtrVT, 0, + AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); + + // First argument to the descriptor call is the address of the descriptor + // itself. + SDValue DescAddr = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, HiDesc); + DescAddr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, DescAddr, LoDesc); + + // The call needs a relocation too for linker relaxation. It doesn't make + // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of + // the address. + SDValue SymAddr = + DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS); + + // Finally we can make a call to calculate the offset from tpidr_el0. + TPOff = LowerELFTLSDescCall(SymAddr, DescAddr, DL, DAG); + } else + llvm_unreachable("Unsupported ELF TLS access model"); + + return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff); +} + +SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op, + SelectionDAG &DAG) const { + if (Subtarget->isTargetDarwin()) + return LowerDarwinGlobalTLSAddress(Op, DAG); + else if (Subtarget->isTargetELF()) + return LowerELFGlobalTLSAddress(Op, DAG); + + llvm_unreachable("Unexpected platform trying to use TLS"); +} +SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { + SDValue Chain = Op.getOperand(0); + ISD::CondCode CC = cast(Op.getOperand(1))->get(); + SDValue LHS = Op.getOperand(2); + SDValue RHS = Op.getOperand(3); + SDValue Dest = Op.getOperand(4); + SDLoc dl(Op); + + // Handle f128 first, since lowering it will result in comparing the return + // value of a libcall against zero, which is just what the rest of LowerBR_CC + // is expecting to deal with. + if (LHS.getValueType() == MVT::f128) { + softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); + + // If softenSetCCOperands returned a scalar, we need to compare the result + // against zero to select between true and false values. + if (!RHS.getNode()) { + RHS = DAG.getConstant(0, LHS.getValueType()); + CC = ISD::SETNE; + } + } + + // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch + // instruction. + unsigned Opc = LHS.getOpcode(); + if (LHS.getResNo() == 1 && isa(RHS) && + cast(RHS)->isOne() && + (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || + Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) { + assert((CC == ISD::SETEQ || CC == ISD::SETNE) && + "Unexpected condition code."); + // Only lower legal XALUO ops. 
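A sketch of the source pattern that produces an overflow result feeding a branch, assuming clang maps __builtin_add_overflow onto llvm.sadd.with.overflow for matching operand types; the function is hypothetical.

    #include <cstdint>
    #include <limits>

    // Hypothetical example: the overflow bit of the intrinsic feeds the
    // branch, so the lowering here can use the flag-setting add and a
    // conditional branch on the V flag instead of a separate compare.
    bool add_clamped(int32_t a, int32_t b, int32_t &out) {
      if (__builtin_add_overflow(a, b, &out)) {
        out = std::numeric_limits<int32_t>::max();
        return false;
      }
      return true;
    }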
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0))) + return SDValue(); + + // The actual operation with overflow check. + AArch64CC::CondCode OFCC; + SDValue Value, Overflow; + std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG); + + if (CC == ISD::SETNE) + OFCC = getInvertedCondCode(OFCC); + SDValue CCVal = DAG.getConstant(OFCC, MVT::i32); + + return DAG.getNode(AArch64ISD::BRCOND, SDLoc(LHS), MVT::Other, Chain, Dest, + CCVal, Overflow); + } + + if (LHS.getValueType().isInteger()) { + assert((LHS.getValueType() == RHS.getValueType()) && + (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64)); + + // If the RHS of the comparison is zero, we can potentially fold this + // to a specialized branch. + const ConstantSDNode *RHSC = dyn_cast(RHS); + if (RHSC && RHSC->getZExtValue() == 0) { + if (CC == ISD::SETEQ) { + // See if we can use a TBZ to fold in an AND as well. + // TBZ has a smaller branch displacement than CBZ. If the offset is + // out of bounds, a late MI-layer pass rewrites branches. + // 403.gcc is an example that hits this case. + if (LHS.getOpcode() == ISD::AND && + isa(LHS.getOperand(1)) && + isPowerOf2_64(LHS.getConstantOperandVal(1))) { + SDValue Test = LHS.getOperand(0); + uint64_t Mask = LHS.getConstantOperandVal(1); + + // TBZ only operates on i64's, but the ext should be free. + if (Test.getValueType() == MVT::i32) + Test = DAG.getAnyExtOrTrunc(Test, dl, MVT::i64); + + return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test, + DAG.getConstant(Log2_64(Mask), MVT::i64), Dest); + } + + return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest); + } else if (CC == ISD::SETNE) { + // See if we can use a TBZ to fold in an AND as well. + // TBZ has a smaller branch displacement than CBZ. If the offset is + // out of bounds, a late MI-layer pass rewrites branches. + // 403.gcc is an example that hits this case. + if (LHS.getOpcode() == ISD::AND && + isa(LHS.getOperand(1)) && + isPowerOf2_64(LHS.getConstantOperandVal(1))) { + SDValue Test = LHS.getOperand(0); + uint64_t Mask = LHS.getConstantOperandVal(1); + + // TBNZ only operates on i64's, but the ext should be free. + if (Test.getValueType() == MVT::i32) + Test = DAG.getAnyExtOrTrunc(Test, dl, MVT::i64); + + return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test, + DAG.getConstant(Log2_64(Mask), MVT::i64), Dest); + } + + return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest); + } + } + + SDValue CCVal; + SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl); + return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, + Cmp); + } + + assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); + + // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally + // clean. Some of them require two branches to implement. 
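One predicate that needs the two-branch form handled below is "ordered and not equal" (fcmp one), which __builtin_islessgreater typically produces; a minimal, hypothetical sketch:

    // Hypothetical example: there is no single NZCV condition for
    // "less than or greater than", so the branch is split into two
    // conditional branches (or, if if-converted, two CSELs).
    int pick(double a, double b, int x, int y) {
      if (__builtin_islessgreater(a, b))
        return x;
      return y;
    }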
+  SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
+  AArch64CC::CondCode CC1, CC2;
+  changeFPCCToAArch64CC(CC, CC1, CC2);
+  SDValue CC1Val = DAG.getConstant(CC1, MVT::i32);
+  SDValue BR1 =
+      DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp);
+  if (CC2 != AArch64CC::AL) {
+    SDValue CC2Val = DAG.getConstant(CC2, MVT::i32);
+    return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val,
+                       Cmp);
+  }
+
+  return BR1;
+}
+
+SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  SDLoc DL(Op);
+
+  SDValue In1 = Op.getOperand(0);
+  SDValue In2 = Op.getOperand(1);
+  EVT SrcVT = In2.getValueType();
+  if (SrcVT != VT) {
+    if (SrcVT == MVT::f32 && VT == MVT::f64)
+      In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2);
+    else if (SrcVT == MVT::f64 && VT == MVT::f32)
+      In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0));
+    else
+      // FIXME: Src type is different, bail out for now. Can VT really be a
+      // vector type?
+      return SDValue();
+  }
+
+  EVT VecVT;
+  EVT EltVT;
+  SDValue EltMask, VecVal1, VecVal2;
+  if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) {
+    EltVT = MVT::i32;
+    VecVT = MVT::v4i32;
+    EltMask = DAG.getConstant(0x80000000ULL, EltVT);
+
+    if (!VT.isVector()) {
+      VecVal1 = DAG.getTargetInsertSubreg(AArch64::ssub, DL, VecVT,
+                                          DAG.getUNDEF(VecVT), In1);
+      VecVal2 = DAG.getTargetInsertSubreg(AArch64::ssub, DL, VecVT,
+                                          DAG.getUNDEF(VecVT), In2);
+    } else {
+      VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1);
+      VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2);
+    }
+  } else if (VT == MVT::f64 || VT == MVT::v2f64) {
+    EltVT = MVT::i64;
+    VecVT = MVT::v2i64;
+
+    // We want to materialize a mask with the high bit set, but the AdvSIMD
+    // immediate moves cannot materialize that in a single instruction for
+    // 64-bit elements. Instead, materialize zero and then negate it.
+    EltMask = DAG.getConstant(0, EltVT);
+
+    if (!VT.isVector()) {
+      VecVal1 = DAG.getTargetInsertSubreg(AArch64::dsub, DL, VecVT,
+                                          DAG.getUNDEF(VecVT), In1);
+      VecVal2 = DAG.getTargetInsertSubreg(AArch64::dsub, DL, VecVT,
+                                          DAG.getUNDEF(VecVT), In2);
+    } else {
+      VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1);
+      VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2);
+    }
+  } else {
+    llvm_unreachable("Invalid type for copysign!");
+  }
+
+  std::vector<SDValue> BuildVectorOps;
+  for (unsigned i = 0; i < VecVT.getVectorNumElements(); ++i)
+    BuildVectorOps.push_back(EltMask);
+
+  SDValue BuildVec = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, BuildVectorOps);
+
+  // If we couldn't materialize the mask above, then the mask vector will be
+  // the zero vector, and we need to negate it here.
+ if (VT == MVT::f64 || VT == MVT::v2f64) { + BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, BuildVec); + BuildVec = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, BuildVec); + BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, BuildVec); + } + + SDValue Sel = + DAG.getNode(AArch64ISD::BIT, DL, VecVT, VecVal1, VecVal2, BuildVec); + + if (VT == MVT::f32) + return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, Sel); + else if (VT == MVT::f64) + return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, Sel); + else + return DAG.getNode(ISD::BITCAST, DL, VT, Sel); +} + +SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const { + if (DAG.getMachineFunction().getFunction()->getAttributes().hasAttribute( + AttributeSet::FunctionIndex, Attribute::NoImplicitFloat)) + return SDValue(); + + // While there is no integer popcount instruction, it can + // be more efficiently lowered to the following sequence that uses + // AdvSIMD registers/instructions as long as the copies to/from + // the AdvSIMD registers are cheap. + // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd + // CNT V0.8B, V0.8B // 8xbyte pop-counts + // ADDV B0, V0.8B // sum 8xbyte pop-counts + // UMOV X0, V0.B[0] // copy byte result back to integer reg + SDValue Val = Op.getOperand(0); + SDLoc DL(Op); + EVT VT = Op.getValueType(); + SDValue ZeroVec = DAG.getUNDEF(MVT::v8i8); + + SDValue VecVal; + if (VT == MVT::i32) { + VecVal = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Val); + VecVal = DAG.getTargetInsertSubreg(AArch64::ssub, DL, MVT::v8i8, ZeroVec, + VecVal); + } else { + VecVal = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val); + } + + SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, VecVal); + SDValue UaddLV = DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32, + DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, MVT::i32), CtPop); + + if (VT == MVT::i64) + UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV); + return UaddLV; +} + +SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { + + if (Op.getValueType().isVector()) + return LowerVSETCC(Op, DAG); + + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + ISD::CondCode CC = cast(Op.getOperand(2))->get(); + SDLoc dl(Op); + + // We chose ZeroOrOneBooleanContents, so use zero and one. + EVT VT = Op.getValueType(); + SDValue TVal = DAG.getConstant(1, VT); + SDValue FVal = DAG.getConstant(0, VT); + + // Handle f128 first, since one possible outcome is a normal integer + // comparison which gets picked up by the next if statement. + if (LHS.getValueType() == MVT::f128) { + softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); + + // If softenSetCCOperands returned a scalar, use it. + if (!RHS.getNode()) { + assert(LHS.getValueType() == Op.getValueType() && + "Unexpected setcc expansion!"); + return LHS; + } + } + + if (LHS.getValueType().isInteger()) { + SDValue CCVal; + SDValue Cmp = + getAArch64Cmp(LHS, RHS, ISD::getSetCCInverse(CC, true), CCVal, DAG, dl); + + // Note that we inverted the condition above, so we reverse the order of + // the true and false operands here. This will allow the setcc to be + // matched to a single CSINC instruction. + return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp); + } + + // Now we know we're dealing with FP values. + assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); + + // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead + // and do the comparison. 
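A minimal sketch of a boolean produced from an FP comparison, which takes the fcmp-plus-CSEL path prepared below; as noted above for the integer case, the inverted operand order lets the common case match a single CSINC. The function is hypothetical.

    // Hypothetical example: fcmp + cset (an alias of CSINC) for a simple
    // ordered comparison; awkward predicates fall through to the
    // two-CSEL form handled afterwards.
    bool not_less_than(double a, double b) { return a >= b; }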
+ SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); + + AArch64CC::CondCode CC1, CC2; + changeFPCCToAArch64CC(CC, CC1, CC2); + if (CC2 == AArch64CC::AL) { + changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, false), CC1, CC2); + SDValue CC1Val = DAG.getConstant(CC1, MVT::i32); + + // Note that we inverted the condition above, so we reverse the order of + // the true and false operands here. This will allow the setcc to be + // matched to a single CSINC instruction. + return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp); + } else { + // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't + // totally clean. Some of them require two CSELs to implement. As is in + // this case, we emit the first CSEL and then emit a second using the output + // of the first as the RHS. We're effectively OR'ing the two CC's together. + + // FIXME: It would be nice if we could match the two CSELs to two CSINCs. + SDValue CC1Val = DAG.getConstant(CC1, MVT::i32); + SDValue CS1 = + DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp); + + SDValue CC2Val = DAG.getConstant(CC2, MVT::i32); + return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp); + } +} + +/// A SELECT_CC operation is really some kind of max or min if both values being +/// compared are, in some sense, equal to the results in either case. However, +/// it is permissible to compare f32 values and produce directly extended f64 +/// values. +/// +/// Extending the comparison operands would also be allowed, but is less likely +/// to happen in practice since their use is right here. Note that truncate +/// operations would *not* be semantically equivalent. +static bool selectCCOpsAreFMaxCompatible(SDValue Cmp, SDValue Result) { + if (Cmp == Result) + return true; + + ConstantFPSDNode *CCmp = dyn_cast(Cmp); + ConstantFPSDNode *CResult = dyn_cast(Result); + if (CCmp && CResult && Cmp.getValueType() == MVT::f32 && + Result.getValueType() == MVT::f64) { + bool Lossy; + APFloat CmpVal = CCmp->getValueAPF(); + CmpVal.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, &Lossy); + return CResult->getValueAPF().bitwiseIsEqual(CmpVal); + } + + return Result->getOpcode() == ISD::FP_EXTEND && Result->getOperand(0) == Cmp; +} + +SDValue AArch64TargetLowering::LowerSELECT(SDValue Op, + SelectionDAG &DAG) const { + SDValue CC = Op->getOperand(0); + SDValue TVal = Op->getOperand(1); + SDValue FVal = Op->getOperand(2); + SDLoc DL(Op); + + unsigned Opc = CC.getOpcode(); + // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select + // instruction. + if (CC.getResNo() == 1 && + (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || + Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) { + // Only lower legal XALUO ops. 
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(CC->getValueType(0))) + return SDValue(); + + AArch64CC::CondCode OFCC; + SDValue Value, Overflow; + std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CC.getValue(0), DAG); + SDValue CCVal = DAG.getConstant(OFCC, MVT::i32); + + return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal, + CCVal, Overflow); + } + + if (CC.getOpcode() == ISD::SETCC) + return DAG.getSelectCC(DL, CC.getOperand(0), CC.getOperand(1), TVal, FVal, + cast(CC.getOperand(2))->get()); + else + return DAG.getSelectCC(DL, CC, DAG.getConstant(0, CC.getValueType()), TVal, + FVal, ISD::SETNE); +} + +SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op, + SelectionDAG &DAG) const { + ISD::CondCode CC = cast(Op.getOperand(4))->get(); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + SDValue TVal = Op.getOperand(2); + SDValue FVal = Op.getOperand(3); + SDLoc dl(Op); + + // Handle f128 first, because it will result in a comparison of some RTLIB + // call result against zero. + if (LHS.getValueType() == MVT::f128) { + softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); + + // If softenSetCCOperands returned a scalar, we need to compare the result + // against zero to select between true and false values. + if (!RHS.getNode()) { + RHS = DAG.getConstant(0, LHS.getValueType()); + CC = ISD::SETNE; + } + } + + // Handle integers first. + if (LHS.getValueType().isInteger()) { + assert((LHS.getValueType() == RHS.getValueType()) && + (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64)); + + unsigned Opcode = AArch64ISD::CSEL; + + // If both the TVal and the FVal are constants, see if we can swap them in + // order to for a CSINV or CSINC out of them. + ConstantSDNode *CFVal = dyn_cast(FVal); + ConstantSDNode *CTVal = dyn_cast(TVal); + + if (CTVal && CFVal && CTVal->isAllOnesValue() && CFVal->isNullValue()) { + std::swap(TVal, FVal); + std::swap(CTVal, CFVal); + CC = ISD::getSetCCInverse(CC, true); + } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isNullValue()) { + std::swap(TVal, FVal); + std::swap(CTVal, CFVal); + CC = ISD::getSetCCInverse(CC, true); + } else if (TVal.getOpcode() == ISD::XOR) { + // If TVal is a NOT we want to swap TVal and FVal so that we can match + // with a CSINV rather than a CSEL. + ConstantSDNode *CVal = dyn_cast(TVal.getOperand(1)); + + if (CVal && CVal->isAllOnesValue()) { + std::swap(TVal, FVal); + std::swap(CTVal, CFVal); + CC = ISD::getSetCCInverse(CC, true); + } + } else if (TVal.getOpcode() == ISD::SUB) { + // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so + // that we can match with a CSNEG rather than a CSEL. + ConstantSDNode *CVal = dyn_cast(TVal.getOperand(0)); + + if (CVal && CVal->isNullValue()) { + std::swap(TVal, FVal); + std::swap(CTVal, CFVal); + CC = ISD::getSetCCInverse(CC, true); + } + } else if (CTVal && CFVal) { + const int64_t TrueVal = CTVal->getSExtValue(); + const int64_t FalseVal = CFVal->getSExtValue(); + bool Swap = false; + + // If both TVal and FVal are constants, see if FVal is the + // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC + // instead of a CSEL in that case. + if (TrueVal == ~FalseVal) { + Opcode = AArch64ISD::CSINV; + } else if (TrueVal == -FalseVal) { + Opcode = AArch64ISD::CSNEG; + } else if (TVal.getValueType() == MVT::i32) { + // If our operands are only 32-bit wide, make sure we use 32-bit + // arithmetic for the check whether we can use CSINC. 
This ensures that + // the addition in the check will wrap around properly in case there is + // an overflow (which would not be the case if we do the check with + // 64-bit arithmetic). + const uint32_t TrueVal32 = CTVal->getZExtValue(); + const uint32_t FalseVal32 = CFVal->getZExtValue(); + + if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) { + Opcode = AArch64ISD::CSINC; + + if (TrueVal32 > FalseVal32) { + Swap = true; + } + } + // 64-bit check whether we can use CSINC. + } else if ((TrueVal == FalseVal + 1) || (TrueVal + 1 == FalseVal)) { + Opcode = AArch64ISD::CSINC; + + if (TrueVal > FalseVal) { + Swap = true; + } + } + + // Swap TVal and FVal if necessary. + if (Swap) { + std::swap(TVal, FVal); + std::swap(CTVal, CFVal); + CC = ISD::getSetCCInverse(CC, true); + } + + if (Opcode != AArch64ISD::CSEL) { + // Drop FVal since we can get its value by simply inverting/negating + // TVal. + FVal = TVal; + } + } + + SDValue CCVal; + SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl); + + EVT VT = Op.getValueType(); + return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp); + } + + // Now we know we're dealing with FP values. + assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); + assert(LHS.getValueType() == RHS.getValueType()); + EVT VT = Op.getValueType(); + + // Try to match this select into a max/min operation, which have dedicated + // opcode in the instruction set. + // FIXME: This is not correct in the presence of NaNs, so we only enable this + // in no-NaNs mode. + if (getTargetMachine().Options.NoNaNsFPMath) { + SDValue MinMaxLHS = TVal, MinMaxRHS = FVal; + if (selectCCOpsAreFMaxCompatible(LHS, MinMaxRHS) && + selectCCOpsAreFMaxCompatible(RHS, MinMaxLHS)) { + CC = ISD::getSetCCSwappedOperands(CC); + std::swap(MinMaxLHS, MinMaxRHS); + } + + if (selectCCOpsAreFMaxCompatible(LHS, MinMaxLHS) && + selectCCOpsAreFMaxCompatible(RHS, MinMaxRHS)) { + switch (CC) { + default: + break; + case ISD::SETGT: + case ISD::SETGE: + case ISD::SETUGT: + case ISD::SETUGE: + case ISD::SETOGT: + case ISD::SETOGE: + return DAG.getNode(AArch64ISD::FMAX, dl, VT, MinMaxLHS, MinMaxRHS); + break; + case ISD::SETLT: + case ISD::SETLE: + case ISD::SETULT: + case ISD::SETULE: + case ISD::SETOLT: + case ISD::SETOLE: + return DAG.getNode(AArch64ISD::FMIN, dl, VT, MinMaxLHS, MinMaxRHS); + break; + } + } + } + + // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead + // and do the comparison. + SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); + + // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally + // clean. Some of them require two CSELs to implement. + AArch64CC::CondCode CC1, CC2; + changeFPCCToAArch64CC(CC, CC1, CC2); + SDValue CC1Val = DAG.getConstant(CC1, MVT::i32); + SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp); + + // If we need a second CSEL, emit it, using the output of the first as the + // RHS. We're effectively OR'ing the two CC's together. + if (CC2 != AArch64CC::AL) { + SDValue CC2Val = DAG.getConstant(CC2, MVT::i32); + return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp); + } + + // Otherwise, return the output of the first CSEL. + return CS1; +} + +SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op, + SelectionDAG &DAG) const { + // Jump table entries as PC relative offsets. No additional tweaking + // is necessary here. Just get the address of the jump table. 
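A dense switch is the usual source of these jump-table nodes; under the default small code model the table address is built with the ADRP/ADDlow pair emitted below (assumed output shape, label name invented):

    void f0(); void f1(); void f2(); void f3(); void f4(); void f5();
    void dispatch(int x) {
      switch (x) {             // dense switch: typically lowered through a jump table
      case 0: f0(); break;
      case 1: f1(); break;
      case 2: f2(); break;
      case 3: f3(); break;
      case 4: f4(); break;
      case 5: f5(); break;
      }
    }
    //   adrp x9, .LJTI0_0               ; page of the table       (MO_PAGE)
    //   add  x9, x9, :lo12:.LJTI0_0     ; offset within the page  (MO_PAGEOFF | MO_NC)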
+ JumpTableSDNode *JT = cast(Op); + EVT PtrVT = getPointerTy(); + SDLoc DL(Op); + + if (getTargetMachine().getCodeModel() == CodeModel::Large && + !Subtarget->isTargetMachO()) { + const unsigned char MO_NC = AArch64II::MO_NC; + return DAG.getNode( + AArch64ISD::WrapperLarge, DL, PtrVT, + DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G3), + DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G2 | MO_NC), + DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G1 | MO_NC), + DAG.getTargetJumpTable(JT->getIndex(), PtrVT, + AArch64II::MO_G0 | MO_NC)); + } + + SDValue Hi = + DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_PAGE); + SDValue Lo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, + AArch64II::MO_PAGEOFF | AArch64II::MO_NC); + SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); + return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); +} + +SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op, + SelectionDAG &DAG) const { + ConstantPoolSDNode *CP = cast(Op); + EVT PtrVT = getPointerTy(); + SDLoc DL(Op); + + if (getTargetMachine().getCodeModel() == CodeModel::Large) { + // Use the GOT for the large code model on iOS. + if (Subtarget->isTargetMachO()) { + SDValue GotAddr = DAG.getTargetConstantPool( + CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), + AArch64II::MO_GOT); + return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr); + } + + const unsigned char MO_NC = AArch64II::MO_NC; + return DAG.getNode( + AArch64ISD::WrapperLarge, DL, PtrVT, + DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), + CP->getOffset(), AArch64II::MO_G3), + DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), + CP->getOffset(), AArch64II::MO_G2 | MO_NC), + DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), + CP->getOffset(), AArch64II::MO_G1 | MO_NC), + DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), + CP->getOffset(), AArch64II::MO_G0 | MO_NC)); + } else { + // Use ADRP/ADD or ADRP/LDR for everything else: the small memory model on + // ELF, the only valid one on Darwin. 
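An FP literal that cannot be encoded as an FMOV immediate is the typical client of this path; in the small code model it becomes a page/pageoff pair around the pool entry (sketch, label name invented):

    double pi() { return 3.141592653589793; }
    //   adrp x8, .LCPI0_0
    //   ldr  d0, [x8, :lo12:.LCPI0_0]   ; MO_PAGE / MO_PAGEOFF|MO_NC, as below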
+ SDValue Hi = + DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), + CP->getOffset(), AArch64II::MO_PAGE); + SDValue Lo = DAG.getTargetConstantPool( + CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), + AArch64II::MO_PAGEOFF | AArch64II::MO_NC); + + SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); + return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); + } +} + +SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op, + SelectionDAG &DAG) const { + const BlockAddress *BA = cast(Op)->getBlockAddress(); + EVT PtrVT = getPointerTy(); + SDLoc DL(Op); + if (getTargetMachine().getCodeModel() == CodeModel::Large && + !Subtarget->isTargetMachO()) { + const unsigned char MO_NC = AArch64II::MO_NC; + return DAG.getNode( + AArch64ISD::WrapperLarge, DL, PtrVT, + DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G3), + DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G2 | MO_NC), + DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G1 | MO_NC), + DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G0 | MO_NC)); + } else { + SDValue Hi = DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_PAGE); + SDValue Lo = DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_PAGEOFF | + AArch64II::MO_NC); + SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); + return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); + } +} + +SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op, + SelectionDAG &DAG) const { + AArch64FunctionInfo *FuncInfo = + DAG.getMachineFunction().getInfo(); + + SDLoc DL(Op); + SDValue FR = + DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), getPointerTy()); + const Value *SV = cast(Op.getOperand(2))->getValue(); + return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), + MachinePointerInfo(SV), false, false, 0); +} + +SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, + SelectionDAG &DAG) const { + // The layout of the va_list struct is specified in the AArch64 Procedure Call + // Standard, section B.3. 
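The offsets stored below correspond to this C-level view of the AAPCS64 va_list (a reference sketch assuming an LP64 target; field names follow the ABI document):

    #include <cstddef>
    struct aapcs64_va_list {
      void *__stack;    // next stacked argument               (offset 0)
      void *__gr_top;   // end of the saved GP register area   (offset 8)
      void *__vr_top;   // end of the saved FP/SIMD save area  (offset 16)
      int   __gr_offs;  // -(GP register bytes left), i.e. -GPRSize  (offset 24)
      int   __vr_offs;  // -(FP/SIMD register bytes left)            (offset 28)
    };
    static_assert(sizeof(aapcs64_va_list) == 32, "matches the 32-byte copy in LowerVACOPY");
    static_assert(offsetof(aapcs64_va_list, __vr_offs) == 28, "matches the stores below");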
+ MachineFunction &MF = DAG.getMachineFunction(); + AArch64FunctionInfo *FuncInfo = MF.getInfo(); + SDLoc DL(Op); + + SDValue Chain = Op.getOperand(0); + SDValue VAList = Op.getOperand(1); + const Value *SV = cast(Op.getOperand(2))->getValue(); + SmallVector MemOps; + + // void *__stack at offset 0 + SDValue Stack = + DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), getPointerTy()); + MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList, + MachinePointerInfo(SV), false, false, 8)); + + // void *__gr_top at offset 8 + int GPRSize = FuncInfo->getVarArgsGPRSize(); + if (GPRSize > 0) { + SDValue GRTop, GRTopAddr; + + GRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, + DAG.getConstant(8, getPointerTy())); + + GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), getPointerTy()); + GRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), GRTop, + DAG.getConstant(GPRSize, getPointerTy())); + + MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr, + MachinePointerInfo(SV, 8), false, false, 8)); + } + + // void *__vr_top at offset 16 + int FPRSize = FuncInfo->getVarArgsFPRSize(); + if (FPRSize > 0) { + SDValue VRTop, VRTopAddr; + VRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, + DAG.getConstant(16, getPointerTy())); + + VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), getPointerTy()); + VRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), VRTop, + DAG.getConstant(FPRSize, getPointerTy())); + + MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr, + MachinePointerInfo(SV, 16), false, false, 8)); + } + + // int __gr_offs at offset 24 + SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, + DAG.getConstant(24, getPointerTy())); + MemOps.push_back(DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, MVT::i32), + GROffsAddr, MachinePointerInfo(SV, 24), false, + false, 4)); + + // int __vr_offs at offset 28 + SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, + DAG.getConstant(28, getPointerTy())); + MemOps.push_back(DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, MVT::i32), + VROffsAddr, MachinePointerInfo(SV, 28), false, + false, 4)); + + return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); +} + +SDValue AArch64TargetLowering::LowerVASTART(SDValue Op, + SelectionDAG &DAG) const { + return Subtarget->isTargetDarwin() ? LowerDarwin_VASTART(Op, DAG) + : LowerAAPCS_VASTART(Op, DAG); +} + +SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op, + SelectionDAG &DAG) const { + // AAPCS has three pointers and two ints (= 32 bytes), Darwin has single + // pointer. + unsigned VaListSize = Subtarget->isTargetDarwin() ? 
8 : 32; + const Value *DestSV = cast(Op.getOperand(3))->getValue(); + const Value *SrcSV = cast(Op.getOperand(4))->getValue(); + + return DAG.getMemcpy(Op.getOperand(0), SDLoc(Op), Op.getOperand(1), + Op.getOperand(2), DAG.getConstant(VaListSize, MVT::i32), + 8, false, false, MachinePointerInfo(DestSV), + MachinePointerInfo(SrcSV)); +} + +SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { + assert(Subtarget->isTargetDarwin() && + "automatic va_arg instruction only works on Darwin"); + + const Value *V = cast(Op.getOperand(2))->getValue(); + EVT VT = Op.getValueType(); + SDLoc DL(Op); + SDValue Chain = Op.getOperand(0); + SDValue Addr = Op.getOperand(1); + unsigned Align = Op.getConstantOperandVal(3); + + SDValue VAList = DAG.getLoad(getPointerTy(), DL, Chain, Addr, + MachinePointerInfo(V), false, false, false, 0); + Chain = VAList.getValue(1); + + if (Align > 8) { + assert(((Align & (Align - 1)) == 0) && "Expected Align to be a power of 2"); + VAList = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, + DAG.getConstant(Align - 1, getPointerTy())); + VAList = DAG.getNode(ISD::AND, DL, getPointerTy(), VAList, + DAG.getConstant(-(int64_t)Align, getPointerTy())); + } + + Type *ArgTy = VT.getTypeForEVT(*DAG.getContext()); + uint64_t ArgSize = getDataLayout()->getTypeAllocSize(ArgTy); + + // Scalar integer and FP values smaller than 64 bits are implicitly extended + // up to 64 bits. At the very least, we have to increase the striding of the + // vaargs list to match this, and for FP values we need to introduce + // FP_ROUND nodes as well. + if (VT.isInteger() && !VT.isVector()) + ArgSize = 8; + bool NeedFPTrunc = false; + if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) { + ArgSize = 8; + NeedFPTrunc = true; + } + + // Increment the pointer, VAList, to the next vaarg + SDValue VANext = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, + DAG.getConstant(ArgSize, getPointerTy())); + // Store the incremented VAList to the legalized pointer + SDValue APStore = DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V), + false, false, 0); + + // Load the actual argument out of the pointer VAList + if (NeedFPTrunc) { + // Load the value as an f64. + SDValue WideFP = DAG.getLoad(MVT::f64, DL, APStore, VAList, + MachinePointerInfo(), false, false, false, 0); + // Round the value down to an f32. + SDValue NarrowFP = DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0), + DAG.getIntPtrConstant(1)); + SDValue Ops[] = { NarrowFP, WideFP.getValue(1) }; + // Merge the rounded value with the chain output of the load. + return DAG.getMergeValues(Ops, DL); + } + + return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo(), false, + false, false, 0); +} + +SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op, + SelectionDAG &DAG) const { + MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); + MFI->setFrameAddressIsTaken(true); + + EVT VT = Op.getValueType(); + SDLoc DL(Op); + unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); + SDValue FrameAddr = + DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT); + while (Depth--) + FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr, + MachinePointerInfo(), false, false, false, 0); + return FrameAddr; +} + +// FIXME? Maybe this could be a TableGen attribute on some registers and +// this table could be generated automatically from RegInfo. 
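LowerFRAMEADDR above and LowerRETURNADDR below serve the usual builtins: depth 0 is a plain copy out of FP/LR, and deeper frames walk the saved-FP chain with loads (LR being found at offset 8 from the saved frame pointer, as the code assumes). A small usage example:

    void *where_am_i[2];
    void capture() {
      where_am_i[0] = __builtin_frame_address(0);   // CopyFromReg of x29 (AArch64::FP)
      where_am_i[1] = __builtin_return_address(0);  // implicit live-in copy of x30 (LR)
    }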
+unsigned AArch64TargetLowering::getRegisterByName(const char* RegName,
+                                                  EVT VT) const {
+  unsigned Reg = StringSwitch<unsigned>(RegName)
+                     .Case("sp", AArch64::SP)
+                     .Default(0);
+  if (Reg)
+    return Reg;
+  report_fatal_error("Invalid register name global variable");
+}
+
+SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MFI->setReturnAddressIsTaken(true);
+
+  EVT VT = Op.getValueType();
+  SDLoc DL(Op);
+  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  if (Depth) {
+    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
+    SDValue Offset = DAG.getConstant(8, getPointerTy());
+    return DAG.getLoad(VT, DL, DAG.getEntryNode(),
+                       DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset),
+                       MachinePointerInfo(), false, false, false, 0);
+  }
+
+  // Return LR, which contains the return address. Mark it an implicit live-in.
+  unsigned Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
+  return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
+}
+
+/// LowerShiftRightParts - Lower SRA_PARTS, which returns two
+/// i64 values and takes a 2 x i64 value to shift plus a shift amount.
+SDValue AArch64TargetLowering::LowerShiftRightParts(SDValue Op,
+                                                    SelectionDAG &DAG) const {
+  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
+  EVT VT = Op.getValueType();
+  unsigned VTBits = VT.getSizeInBits();
+  SDLoc dl(Op);
+  SDValue ShOpLo = Op.getOperand(0);
+  SDValue ShOpHi = Op.getOperand(1);
+  SDValue ShAmt = Op.getOperand(2);
+  SDValue ARMcc;
+  unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
+
+  assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
+
+  SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
+                                 DAG.getConstant(VTBits, MVT::i64), ShAmt);
+  SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
+  SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
+                                   DAG.getConstant(VTBits, MVT::i64));
+  SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
+
+  SDValue Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, MVT::i64),
+                               ISD::SETGE, dl, DAG);
+  SDValue CCVal = DAG.getConstant(AArch64CC::GE, MVT::i32);
+
+  SDValue FalseValLo = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
+  SDValue TrueValLo = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
+  SDValue Lo =
+      DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValLo, FalseValLo, CCVal, Cmp);
+
+  // AArch64 shifts larger than the register width are wrapped rather than
+  // clamped, so we can't just emit "hi >> x".
+  SDValue FalseValHi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
+  SDValue TrueValHi = Opc == ISD::SRA
+                          ? DAG.getNode(Opc, dl, VT, ShOpHi,
+                                        DAG.getConstant(VTBits - 1, MVT::i64))
+                          : DAG.getConstant(0, VT);
+  SDValue Hi =
+      DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValHi, FalseValHi, CCVal, Cmp);
+
+  SDValue Ops[2] = { Lo, Hi };
+  return DAG.getMergeValues(Ops, dl);
+}
+
+/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
+/// i64 values and takes a 2 x i64 value to shift plus a shift amount.
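As a portable model of what the two parts lowerings compute (LowerShiftRightParts above, LowerShiftLeftParts below), the right-shift case looks like this; the CSELs select between the small-shift and large-shift results, and the sketch assumes the usual arithmetic behaviour of signed >> on the host:

    #include <cstdint>
    // i128 arithmetic shift right, given as {lo, hi} halves.
    void ashr128(uint64_t lo, int64_t hi, unsigned amt,
                 uint64_t &outLo, int64_t &outHi) {
      amt &= 127;                       // shift amounts wrap rather than clamp
      if (amt == 0) { outLo = lo; outHi = hi; return; }
      if (amt < 64) {
        // FalseValLo: OR of the two partial shifts; FalseValHi: plain shift.
        outLo = (lo >> amt) | ((uint64_t)hi << (64 - amt));
        outHi = hi >> amt;
      } else {
        // TrueValLo: shift the high half by (amt - 64); TrueValHi: sign-fill.
        outLo = (uint64_t)(hi >> (amt - 64));
        outHi = hi >> 63;
      }
    }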
+SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op, + SelectionDAG &DAG) const { + assert(Op.getNumOperands() == 3 && "Not a double-shift!"); + EVT VT = Op.getValueType(); + unsigned VTBits = VT.getSizeInBits(); + SDLoc dl(Op); + SDValue ShOpLo = Op.getOperand(0); + SDValue ShOpHi = Op.getOperand(1); + SDValue ShAmt = Op.getOperand(2); + SDValue ARMcc; + + assert(Op.getOpcode() == ISD::SHL_PARTS); + SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, + DAG.getConstant(VTBits, MVT::i64), ShAmt); + SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt); + SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt, + DAG.getConstant(VTBits, MVT::i64)); + SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt); + SDValue Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt); + + SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); + + SDValue Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, MVT::i64), + ISD::SETGE, dl, DAG); + SDValue CCVal = DAG.getConstant(AArch64CC::GE, MVT::i32); + SDValue Hi = + DAG.getNode(AArch64ISD::CSEL, dl, VT, Tmp3, FalseVal, CCVal, Cmp); + + // AArch64 shifts of larger than register sizes are wrapped rather than + // clamped, so we can't just emit "lo << a" if a is too big. + SDValue TrueValLo = DAG.getConstant(0, VT); + SDValue FalseValLo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); + SDValue Lo = + DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValLo, FalseValLo, CCVal, Cmp); + + SDValue Ops[2] = { Lo, Hi }; + return DAG.getMergeValues(Ops, dl); +} + +bool AArch64TargetLowering::isOffsetFoldingLegal( + const GlobalAddressSDNode *GA) const { + // The AArch64 target doesn't support folding offsets into global addresses. + return false; +} + +bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { + // We can materialize #0.0 as fmov $Rd, XZR for 64-bit and 32-bit cases. + // FIXME: We should be able to handle f128 as well with a clever lowering. + if (Imm.isPosZero() && (VT == MVT::f64 || VT == MVT::f32)) + return true; + + if (VT == MVT::f64) + return AArch64_AM::getFP64Imm(Imm) != -1; + else if (VT == MVT::f32) + return AArch64_AM::getFP32Imm(Imm) != -1; + return false; +} + +//===----------------------------------------------------------------------===// +// AArch64 Optimization Hooks +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// AArch64 Inline Assembly Support +//===----------------------------------------------------------------------===// + +// Table of Constraints +// TODO: This is the current set of constraints supported by ARM for the +// compiler, not all of them may make sense, e.g. S may be difficult to support. 
+// +// r - A general register +// w - An FP/SIMD register of some size in the range v0-v31 +// x - An FP/SIMD register of some size in the range v0-v15 +// I - Constant that can be used with an ADD instruction +// J - Constant that can be used with a SUB instruction +// K - Constant that can be used with a 32-bit logical instruction +// L - Constant that can be used with a 64-bit logical instruction +// M - Constant that can be used as a 32-bit MOV immediate +// N - Constant that can be used as a 64-bit MOV immediate +// Q - A memory reference with base register and no offset +// S - A symbolic address +// Y - Floating point constant zero +// Z - Integer constant zero +// +// Note that general register operands will be output using their 64-bit x +// register name, whatever the size of the variable, unless the asm operand +// is prefixed by the %w modifier. Floating-point and SIMD register operands +// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or +// %q modifier. + +/// getConstraintType - Given a constraint letter, return the type of +/// constraint it is for this target. +AArch64TargetLowering::ConstraintType +AArch64TargetLowering::getConstraintType(const std::string &Constraint) const { + if (Constraint.size() == 1) { + switch (Constraint[0]) { + default: + break; + case 'z': + return C_Other; + case 'x': + case 'w': + return C_RegisterClass; + // An address with a single base register. Due to the way we + // currently handle addresses it is the same as 'r'. + case 'Q': + return C_Memory; + } + } + return TargetLowering::getConstraintType(Constraint); +} + +/// Examine constraint type and operand type and determine a weight value. +/// This object must already have been set up with the operand type +/// and the current alternative constraint selected. +TargetLowering::ConstraintWeight +AArch64TargetLowering::getSingleConstraintMatchWeight( + AsmOperandInfo &info, const char *constraint) const { + ConstraintWeight weight = CW_Invalid; + Value *CallOperandVal = info.CallOperandVal; + // If we don't have a value, we can't do a match, + // but allow it at the lowest weight. + if (!CallOperandVal) + return CW_Default; + Type *type = CallOperandVal->getType(); + // Look at the constraint type. + switch (*constraint) { + default: + weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); + break; + case 'x': + case 'w': + if (type->isFloatingPointTy() || type->isVectorTy()) + weight = CW_Register; + break; + case 'z': + weight = CW_Constant; + break; + } + return weight; +} + +std::pair +AArch64TargetLowering::getRegForInlineAsmConstraint( + const std::string &Constraint, MVT VT) const { + if (Constraint.size() == 1) { + switch (Constraint[0]) { + case 'r': + if (VT.getSizeInBits() == 64) + return std::make_pair(0U, &AArch64::GPR64commonRegClass); + return std::make_pair(0U, &AArch64::GPR32commonRegClass); + case 'w': + if (VT == MVT::f32) + return std::make_pair(0U, &AArch64::FPR32RegClass); + if (VT.getSizeInBits() == 64) + return std::make_pair(0U, &AArch64::FPR64RegClass); + if (VT.getSizeInBits() == 128) + return std::make_pair(0U, &AArch64::FPR128RegClass); + break; + // The instructions that this constraint is designed for can + // only take 128-bit registers so just use that regclass. 
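Usage-wise, these single-letter classes are what GNU-style inline asm constraints select, and they map onto the register classes chosen in getRegForInlineAsmConstraint here. A small 'r' versus 'w' example (only meaningful when compiling for AArch64, hence the guard; assumed, not taken from this patch):

    #if defined(__aarch64__)
    double twice(double x) {
      double r;
      // 'w' places operands in FP/SIMD registers; %d prints the d-form name.
      asm("fadd %d0, %d1, %d1" : "=w"(r) : "w"(x));
      return r;
    }
    long pass(long x) {
      long r;
      asm("mov %0, %1" : "=r"(r) : "r"(x));   // 'r': general register, x-form by default
      return r;
    }
    #endif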
+ case 'x': + if (VT.getSizeInBits() == 128) + return std::make_pair(0U, &AArch64::FPR128_loRegClass); + break; + } + } + if (StringRef("{cc}").equals_lower(Constraint)) + return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass); + + // Use the default implementation in TargetLowering to convert the register + // constraint into a member of a register class. + std::pair Res; + Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); + + // Not found as a standard register? + if (!Res.second) { + unsigned Size = Constraint.size(); + if ((Size == 4 || Size == 5) && Constraint[0] == '{' && + tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') { + const std::string Reg = + std::string(&Constraint[2], &Constraint[Size - 1]); + int RegNo = atoi(Reg.c_str()); + if (RegNo >= 0 && RegNo <= 31) { + // v0 - v31 are aliases of q0 - q31. + // By default we'll emit v0-v31 for this unless there's a modifier where + // we'll emit the correct register as well. + Res.first = AArch64::FPR128RegClass.getRegister(RegNo); + Res.second = &AArch64::FPR128RegClass; + } + } + } + + return Res; +} + +/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops +/// vector. If it is invalid, don't add anything to Ops. +void AArch64TargetLowering::LowerAsmOperandForConstraint( + SDValue Op, std::string &Constraint, std::vector &Ops, + SelectionDAG &DAG) const { + SDValue Result; + + // Currently only support length 1 constraints. + if (Constraint.length() != 1) + return; + + char ConstraintLetter = Constraint[0]; + switch (ConstraintLetter) { + default: + break; + + // This set of constraints deal with valid constants for various instructions. + // Validate and return a target constant for them if we can. + case 'z': { + // 'z' maps to xzr or wzr so it needs an input of 0. + ConstantSDNode *C = dyn_cast(Op); + if (!C || C->getZExtValue() != 0) + return; + + if (Op.getValueType() == MVT::i64) + Result = DAG.getRegister(AArch64::XZR, MVT::i64); + else + Result = DAG.getRegister(AArch64::WZR, MVT::i32); + break; + } + + case 'I': + case 'J': + case 'K': + case 'L': + case 'M': + case 'N': + ConstantSDNode *C = dyn_cast(Op); + if (!C) + return; + + // Grab the value and do some validation. + uint64_t CVal = C->getZExtValue(); + switch (ConstraintLetter) { + // The I constraint applies only to simple ADD or SUB immediate operands: + // i.e. 0 to 4095 with optional shift by 12 + // The J constraint applies only to ADD or SUB immediates that would be + // valid when negated, i.e. if [an add pattern] were to be output as a SUB + // instruction [or vice versa], in other words -1 to -4095 with optional + // left shift by 12. + case 'I': + if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal)) + break; + return; + case 'J': { + uint64_t NVal = -C->getSExtValue(); + if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) + break; + return; + } + // The K and L constraints apply *only* to logical immediates, including + // what used to be the MOVI alias for ORR (though the MOVI alias has now + // been removed and MOV should be used). So these constraints have to + // distinguish between bit patterns that are valid 32-bit or 64-bit + // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but + // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice + // versa. 
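For the immediate constraints, a hedged usage sketch ('I' and 'K' shown; exact assembler spelling of the substituted immediates may vary by toolchain):

    #if defined(__aarch64__)
    unsigned add_imm(unsigned x) {
      unsigned r;
      asm("add %w0, %w1, %2" : "=r"(r) : "r"(x), "I"(4095));         // 'I': 0..4095, optionally <<12
      return r;
    }
    unsigned mask_alternating(unsigned x) {
      unsigned r;
      asm("and %w0, %w1, %2" : "=r"(r) : "r"(x), "K"(0xaaaaaaaaU));  // 'K': valid bimm32, not bimm64
      return r;
    }
    #endif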
+ case 'K': + if (AArch64_AM::isLogicalImmediate(CVal, 32)) + break; + return; + case 'L': + if (AArch64_AM::isLogicalImmediate(CVal, 64)) + break; + return; + // The M and N constraints are a superset of K and L respectively, for use + // with the MOV (immediate) alias. As well as the logical immediates they + // also match 32 or 64-bit immediates that can be loaded either using a + // *single* MOVZ or MOVN , such as 32-bit 0x12340000, 0x00001234, 0xffffedca + // (M) or 64-bit 0x1234000000000000 (N) etc. + // As a note some of this code is liberally stolen from the asm parser. + case 'M': { + if (!isUInt<32>(CVal)) + return; + if (AArch64_AM::isLogicalImmediate(CVal, 32)) + break; + if ((CVal & 0xFFFF) == CVal) + break; + if ((CVal & 0xFFFF0000ULL) == CVal) + break; + uint64_t NCVal = ~(uint32_t)CVal; + if ((NCVal & 0xFFFFULL) == NCVal) + break; + if ((NCVal & 0xFFFF0000ULL) == NCVal) + break; + return; + } + case 'N': { + if (AArch64_AM::isLogicalImmediate(CVal, 64)) + break; + if ((CVal & 0xFFFFULL) == CVal) + break; + if ((CVal & 0xFFFF0000ULL) == CVal) + break; + if ((CVal & 0xFFFF00000000ULL) == CVal) + break; + if ((CVal & 0xFFFF000000000000ULL) == CVal) + break; + uint64_t NCVal = ~CVal; + if ((NCVal & 0xFFFFULL) == NCVal) + break; + if ((NCVal & 0xFFFF0000ULL) == NCVal) + break; + if ((NCVal & 0xFFFF00000000ULL) == NCVal) + break; + if ((NCVal & 0xFFFF000000000000ULL) == NCVal) + break; + return; + } + default: + return; + } + + // All assembler immediates are 64-bit integers. + Result = DAG.getTargetConstant(CVal, MVT::i64); + break; + } + + if (Result.getNode()) { + Ops.push_back(Result); + return; + } + + return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); +} + +//===----------------------------------------------------------------------===// +// AArch64 Advanced SIMD Support +//===----------------------------------------------------------------------===// + +/// WidenVector - Given a value in the V64 register class, produce the +/// equivalent value in the V128 register class. +static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) { + EVT VT = V64Reg.getValueType(); + unsigned NarrowSize = VT.getVectorNumElements(); + MVT EltTy = VT.getVectorElementType().getSimpleVT(); + MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize); + SDLoc DL(V64Reg); + + return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy), + V64Reg, DAG.getConstant(0, MVT::i32)); +} + +/// getExtFactor - Determine the adjustment factor for the position when +/// generating an "extract from vector registers" instruction. +static unsigned getExtFactor(SDValue &V) { + EVT EltType = V.getValueType().getVectorElementType(); + return EltType.getSizeInBits() / 8; +} + +/// NarrowVector - Given a value in the V128 register class, produce the +/// equivalent value in the V64 register class. +static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) { + EVT VT = V128Reg.getValueType(); + unsigned WideSize = VT.getVectorNumElements(); + MVT EltTy = VT.getVectorElementType().getSimpleVT(); + MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2); + SDLoc DL(V128Reg); + + return DAG.getTargetExtractSubreg(AArch64::dsub, DL, NarrowTy, V128Reg); +} + +// Gather data to see if the operation can be modelled as a +// shuffle in combination with VEXTs. 
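The BUILD_VECTOR-of-extracts input that ReconstructShuffle (below) targets typically originates from element-wise rearrangements written by hand, for example with the GCC/clang vector extension; the mask shown is roughly what the rebuild produces:

    typedef int v4i32 __attribute__((vector_size(16)));
    v4i32 pick(v4i32 a, v4i32 b) {
      v4i32 r;
      r[0] = a[1]; r[1] = a[2];   // extracts from source vector 0
      r[2] = b[0]; r[3] = b[3];   // extracts from source vector 1
      return r;                   // reconstructed as a two-source shuffle <1,2,4,7>
    }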
+SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, + SelectionDAG &DAG) const { + SDLoc dl(Op); + EVT VT = Op.getValueType(); + unsigned NumElts = VT.getVectorNumElements(); + + SmallVector SourceVecs; + SmallVector MinElts; + SmallVector MaxElts; + + for (unsigned i = 0; i < NumElts; ++i) { + SDValue V = Op.getOperand(i); + if (V.getOpcode() == ISD::UNDEF) + continue; + else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) { + // A shuffle can only come from building a vector from various + // elements of other vectors. + return SDValue(); + } + + // Record this extraction against the appropriate vector if possible... + SDValue SourceVec = V.getOperand(0); + unsigned EltNo = cast(V.getOperand(1))->getZExtValue(); + bool FoundSource = false; + for (unsigned j = 0; j < SourceVecs.size(); ++j) { + if (SourceVecs[j] == SourceVec) { + if (MinElts[j] > EltNo) + MinElts[j] = EltNo; + if (MaxElts[j] < EltNo) + MaxElts[j] = EltNo; + FoundSource = true; + break; + } + } + + // Or record a new source if not... + if (!FoundSource) { + SourceVecs.push_back(SourceVec); + MinElts.push_back(EltNo); + MaxElts.push_back(EltNo); + } + } + + // Currently only do something sane when at most two source vectors + // involved. + if (SourceVecs.size() > 2) + return SDValue(); + + SDValue ShuffleSrcs[2] = { DAG.getUNDEF(VT), DAG.getUNDEF(VT) }; + int VEXTOffsets[2] = { 0, 0 }; + + // This loop extracts the usage patterns of the source vectors + // and prepares appropriate SDValues for a shuffle if possible. + for (unsigned i = 0; i < SourceVecs.size(); ++i) { + if (SourceVecs[i].getValueType() == VT) { + // No VEXT necessary + ShuffleSrcs[i] = SourceVecs[i]; + VEXTOffsets[i] = 0; + continue; + } else if (SourceVecs[i].getValueType().getVectorNumElements() < NumElts) { + // We can pad out the smaller vector for free, so if it's part of a + // shuffle... + ShuffleSrcs[i] = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, SourceVecs[i], + DAG.getUNDEF(SourceVecs[i].getValueType())); + continue; + } + + // Don't attempt to extract subvectors from BUILD_VECTOR sources + // that expand or trunc the original value. + // TODO: We can try to bitcast and ANY_EXTEND the result but + // we need to consider the cost of vector ANY_EXTEND, and the + // legality of all the types. + if (SourceVecs[i].getValueType().getVectorElementType() != + VT.getVectorElementType()) + return SDValue(); + + // Since only 64-bit and 128-bit vectors are legal on ARM and + // we've eliminated the other cases... 
+ assert(SourceVecs[i].getValueType().getVectorNumElements() == 2 * NumElts && + "unexpected vector sizes in ReconstructShuffle"); + + if (MaxElts[i] - MinElts[i] >= NumElts) { + // Span too large for a VEXT to cope + return SDValue(); + } + + if (MinElts[i] >= NumElts) { + // The extraction can just take the second half + VEXTOffsets[i] = NumElts; + ShuffleSrcs[i] = + DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SourceVecs[i], + DAG.getIntPtrConstant(NumElts)); + } else if (MaxElts[i] < NumElts) { + // The extraction can just take the first half + VEXTOffsets[i] = 0; + ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, + SourceVecs[i], DAG.getIntPtrConstant(0)); + } else { + // An actual VEXT is needed + VEXTOffsets[i] = MinElts[i]; + SDValue VEXTSrc1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, + SourceVecs[i], DAG.getIntPtrConstant(0)); + SDValue VEXTSrc2 = + DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SourceVecs[i], + DAG.getIntPtrConstant(NumElts)); + unsigned Imm = VEXTOffsets[i] * getExtFactor(VEXTSrc1); + ShuffleSrcs[i] = DAG.getNode(AArch64ISD::EXT, dl, VT, VEXTSrc1, VEXTSrc2, + DAG.getConstant(Imm, MVT::i32)); + } + } + + SmallVector Mask; + + for (unsigned i = 0; i < NumElts; ++i) { + SDValue Entry = Op.getOperand(i); + if (Entry.getOpcode() == ISD::UNDEF) { + Mask.push_back(-1); + continue; + } + + SDValue ExtractVec = Entry.getOperand(0); + int ExtractElt = + cast(Op.getOperand(i).getOperand(1))->getSExtValue(); + if (ExtractVec == SourceVecs[0]) { + Mask.push_back(ExtractElt - VEXTOffsets[0]); + } else { + Mask.push_back(ExtractElt + NumElts - VEXTOffsets[1]); + } + } + + // Final check before we try to produce nonsense... + if (isShuffleMaskLegal(Mask, VT)) + return DAG.getVectorShuffle(VT, dl, ShuffleSrcs[0], ShuffleSrcs[1], + &Mask[0]); + + return SDValue(); +} + +// check if an EXT instruction can handle the shuffle mask when the +// vector sources of the shuffle are the same. +static bool isSingletonEXTMask(ArrayRef M, EVT VT, unsigned &Imm) { + unsigned NumElts = VT.getVectorNumElements(); + + // Assume that the first shuffle index is not UNDEF. Fail if it is. + if (M[0] < 0) + return false; + + Imm = M[0]; + + // If this is a VEXT shuffle, the immediate value is the index of the first + // element. The other shuffle indices must be the successive elements after + // the first one. + unsigned ExpectedElt = Imm; + for (unsigned i = 1; i < NumElts; ++i) { + // Increment the expected index. If it wraps around, just follow it + // back to index zero and keep going. + ++ExpectedElt; + if (ExpectedElt == NumElts) + ExpectedElt = 0; + + if (M[i] < 0) + continue; // ignore UNDEF indices + if (ExpectedElt != static_cast(M[i])) + return false; + } + + return true; +} + +// check if an EXT instruction can handle the shuffle mask when the +// vector sources of the shuffle are different. +static bool isEXTMask(ArrayRef M, EVT VT, bool &ReverseEXT, + unsigned &Imm) { + // Look for the first non-undef element. + const int *FirstRealElt = std::find_if(M.begin(), M.end(), + [](int Elt) {return Elt >= 0;}); + + // Benefit form APInt to handle overflow when calculating expected element. + unsigned NumElts = VT.getVectorNumElements(); + unsigned MaskBits = APInt(32, NumElts * 2).logBase2(); + APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1); + // The following shuffle indices must be the successive elements after the + // first real element. 
+ const int *FirstWrongElt = std::find_if(FirstRealElt + 1, M.end(), + [&](int Elt) {return Elt != ExpectedElt++ && Elt != -1;}); + if (FirstWrongElt != M.end()) + return false; + + // The index of an EXT is the first element if it is not UNDEF. + // Watch out for the beginning UNDEFs. The EXT index should be the expected + // value of the first element. E.g. + // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>. + // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>. + // ExpectedElt is the last mask index plus 1. + Imm = ExpectedElt.getZExtValue(); + + // There are two difference cases requiring to reverse input vectors. + // For example, for vector <4 x i32> we have the following cases, + // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>) + // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>) + // For both cases, we finally use mask <5, 6, 7, 0>, which requires + // to reverse two input vectors. + if (Imm < NumElts) + ReverseEXT = true; + else + Imm -= NumElts; + + return true; +} + +/// isREVMask - Check if a vector shuffle corresponds to a REV +/// instruction with the specified blocksize. (The order of the elements +/// within each block of the vector is reversed.) +static bool isREVMask(ArrayRef M, EVT VT, unsigned BlockSize) { + assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) && + "Only possible block sizes for REV are: 16, 32, 64"); + + unsigned EltSz = VT.getVectorElementType().getSizeInBits(); + if (EltSz == 64) + return false; + + unsigned NumElts = VT.getVectorNumElements(); + unsigned BlockElts = M[0] + 1; + // If the first shuffle index is UNDEF, be optimistic. + if (M[0] < 0) + BlockElts = BlockSize / EltSz; + + if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz) + return false; + + for (unsigned i = 0; i < NumElts; ++i) { + if (M[i] < 0) + continue; // ignore UNDEF indices + if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts)) + return false; + } + + return true; +} + +static bool isZIPMask(ArrayRef M, EVT VT, unsigned &WhichResult) { + unsigned NumElts = VT.getVectorNumElements(); + WhichResult = (M[0] == 0 ? 0 : 1); + unsigned Idx = WhichResult * NumElts / 2; + for (unsigned i = 0; i != NumElts; i += 2) { + if ((M[i] >= 0 && (unsigned)M[i] != Idx) || + (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx + NumElts)) + return false; + Idx += 1; + } + + return true; +} + +static bool isUZPMask(ArrayRef M, EVT VT, unsigned &WhichResult) { + unsigned NumElts = VT.getVectorNumElements(); + WhichResult = (M[0] == 0 ? 0 : 1); + for (unsigned i = 0; i != NumElts; ++i) { + if (M[i] < 0) + continue; // ignore UNDEF indices + if ((unsigned)M[i] != 2 * i + WhichResult) + return false; + } + + return true; +} + +static bool isTRNMask(ArrayRef M, EVT VT, unsigned &WhichResult) { + unsigned NumElts = VT.getVectorNumElements(); + WhichResult = (M[0] == 0 ? 0 : 1); + for (unsigned i = 0; i < NumElts; i += 2) { + if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) || + (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult)) + return false; + } + return true; +} + +/// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of +/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". +/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>. +static bool isZIP_v_undef_Mask(ArrayRef M, EVT VT, unsigned &WhichResult) { + unsigned NumElts = VT.getVectorNumElements(); + WhichResult = (M[0] == 0 ? 
0 : 1); + unsigned Idx = WhichResult * NumElts / 2; + for (unsigned i = 0; i != NumElts; i += 2) { + if ((M[i] >= 0 && (unsigned)M[i] != Idx) || + (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx)) + return false; + Idx += 1; + } + + return true; +} + +/// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of +/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". +/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>, +static bool isUZP_v_undef_Mask(ArrayRef M, EVT VT, unsigned &WhichResult) { + unsigned Half = VT.getVectorNumElements() / 2; + WhichResult = (M[0] == 0 ? 0 : 1); + for (unsigned j = 0; j != 2; ++j) { + unsigned Idx = WhichResult; + for (unsigned i = 0; i != Half; ++i) { + int MIdx = M[i + j * Half]; + if (MIdx >= 0 && (unsigned)MIdx != Idx) + return false; + Idx += 2; + } + } + + return true; +} + +/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of +/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". +/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>. +static bool isTRN_v_undef_Mask(ArrayRef M, EVT VT, unsigned &WhichResult) { + unsigned NumElts = VT.getVectorNumElements(); + WhichResult = (M[0] == 0 ? 0 : 1); + for (unsigned i = 0; i < NumElts; i += 2) { + if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) || + (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult)) + return false; + } + return true; +} + +static bool isINSMask(ArrayRef M, int NumInputElements, + bool &DstIsLeft, int &Anomaly) { + if (M.size() != static_cast(NumInputElements)) + return false; + + int NumLHSMatch = 0, NumRHSMatch = 0; + int LastLHSMismatch = -1, LastRHSMismatch = -1; + + for (int i = 0; i < NumInputElements; ++i) { + if (M[i] == -1) { + ++NumLHSMatch; + ++NumRHSMatch; + continue; + } + + if (M[i] == i) + ++NumLHSMatch; + else + LastLHSMismatch = i; + + if (M[i] == i + NumInputElements) + ++NumRHSMatch; + else + LastRHSMismatch = i; + } + + if (NumLHSMatch == NumInputElements - 1) { + DstIsLeft = true; + Anomaly = LastLHSMismatch; + return true; + } else if (NumRHSMatch == NumInputElements - 1) { + DstIsLeft = false; + Anomaly = LastRHSMismatch; + return true; + } + + return false; +} + +static bool isConcatMask(ArrayRef Mask, EVT VT, bool SplitLHS) { + if (VT.getSizeInBits() != 128) + return false; + + unsigned NumElts = VT.getVectorNumElements(); + + for (int I = 0, E = NumElts / 2; I != E; I++) { + if (Mask[I] != I) + return false; + } + + int Offset = NumElts / 2; + for (int I = NumElts / 2, E = NumElts; I != E; I++) { + if (Mask[I] != I + SplitLHS * Offset) + return false; + } + + return true; +} + +static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) { + SDLoc DL(Op); + EVT VT = Op.getValueType(); + SDValue V0 = Op.getOperand(0); + SDValue V1 = Op.getOperand(1); + ArrayRef Mask = cast(Op)->getMask(); + + if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() || + VT.getVectorElementType() != V1.getValueType().getVectorElementType()) + return SDValue(); + + bool SplitV0 = V0.getValueType().getSizeInBits() == 128; + + if (!isConcatMask(Mask, VT, SplitV0)) + return SDValue(); + + EVT CastVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), + VT.getVectorNumElements() / 2); + if (SplitV0) { + V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0, + DAG.getConstant(0, MVT::i64)); + } + if (V1.getValueType().getSizeInBits() == 128) { + V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1, + DAG.getConstant(0, MVT::i64)); + } + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, 
V1); +} + +/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit +/// the specified operations to build the shuffle. +static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, + SDValue RHS, SelectionDAG &DAG, + SDLoc dl) { + unsigned OpNum = (PFEntry >> 26) & 0x0F; + unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1); + unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1); + + enum { + OP_COPY = 0, // Copy, used for things like to say it is <0,1,2,3> + OP_VREV, + OP_VDUP0, + OP_VDUP1, + OP_VDUP2, + OP_VDUP3, + OP_VEXT1, + OP_VEXT2, + OP_VEXT3, + OP_VUZPL, // VUZP, left result + OP_VUZPR, // VUZP, right result + OP_VZIPL, // VZIP, left result + OP_VZIPR, // VZIP, right result + OP_VTRNL, // VTRN, left result + OP_VTRNR // VTRN, right result + }; + + if (OpNum == OP_COPY) { + if (LHSID == (1 * 9 + 2) * 9 + 3) + return LHS; + assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!"); + return RHS; + } + + SDValue OpLHS, OpRHS; + OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl); + OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl); + EVT VT = OpLHS.getValueType(); + + switch (OpNum) { + default: + llvm_unreachable("Unknown shuffle opcode!"); + case OP_VREV: + // VREV divides the vector in half and swaps within the half. + if (VT.getVectorElementType() == MVT::i32 || + VT.getVectorElementType() == MVT::f32) + return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS); + // vrev <4 x i16> -> REV32 + if (VT.getVectorElementType() == MVT::i16) + return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS); + // vrev <4 x i8> -> REV16 + assert(VT.getVectorElementType() == MVT::i8); + return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS); + case OP_VDUP0: + case OP_VDUP1: + case OP_VDUP2: + case OP_VDUP3: { + EVT EltTy = VT.getVectorElementType(); + unsigned Opcode; + if (EltTy == MVT::i8) + Opcode = AArch64ISD::DUPLANE8; + else if (EltTy == MVT::i16) + Opcode = AArch64ISD::DUPLANE16; + else if (EltTy == MVT::i32 || EltTy == MVT::f32) + Opcode = AArch64ISD::DUPLANE32; + else if (EltTy == MVT::i64 || EltTy == MVT::f64) + Opcode = AArch64ISD::DUPLANE64; + else + llvm_unreachable("Invalid vector element type?"); + + if (VT.getSizeInBits() == 64) + OpLHS = WidenVector(OpLHS, DAG); + SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, MVT::i64); + return DAG.getNode(Opcode, dl, VT, OpLHS, Lane); + } + case OP_VEXT1: + case OP_VEXT2: + case OP_VEXT3: { + unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS); + return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS, + DAG.getConstant(Imm, MVT::i32)); + } + case OP_VUZPL: + return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), OpLHS, + OpRHS); + case OP_VUZPR: + return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), OpLHS, + OpRHS); + case OP_VZIPL: + return DAG.getNode(AArch64ISD::ZIP1, dl, DAG.getVTList(VT, VT), OpLHS, + OpRHS); + case OP_VZIPR: + return DAG.getNode(AArch64ISD::ZIP2, dl, DAG.getVTList(VT, VT), OpLHS, + OpRHS); + case OP_VTRNL: + return DAG.getNode(AArch64ISD::TRN1, dl, DAG.getVTList(VT, VT), OpLHS, + OpRHS); + case OP_VTRNR: + return DAG.getNode(AArch64ISD::TRN2, dl, DAG.getVTList(VT, VT), OpLHS, + OpRHS); + } +} + +static SDValue GenerateTBL(SDValue Op, ArrayRef ShuffleMask, + SelectionDAG &DAG) { + // Check to see if we can use the TBL instruction. 
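TBL semantics, for reference: each result byte indexes a byte table built from the concatenated source registers, and out-of-range indices produce zero; the element mask is expanded into byte indices just below (BytesPerElt). A scalar model of the single-table form:

    #include <cstdint>
    void tbl1(const uint8_t tbl[16], const uint8_t idx[16], uint8_t out[16]) {
      for (int i = 0; i < 16; ++i)
        out[i] = idx[i] < 16 ? tbl[idx[i]] : 0;   // out-of-range lanes read as zero
    }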
+ SDValue V1 = Op.getOperand(0); + SDValue V2 = Op.getOperand(1); + SDLoc DL(Op); + + EVT EltVT = Op.getValueType().getVectorElementType(); + unsigned BytesPerElt = EltVT.getSizeInBits() / 8; + + SmallVector TBLMask; + for (int Val : ShuffleMask) { + for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) { + unsigned Offset = Byte + Val * BytesPerElt; + TBLMask.push_back(DAG.getConstant(Offset, MVT::i32)); + } + } + + MVT IndexVT = MVT::v8i8; + unsigned IndexLen = 8; + if (Op.getValueType().getSizeInBits() == 128) { + IndexVT = MVT::v16i8; + IndexLen = 16; + } + + SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1); + SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2); + + SDValue Shuffle; + if (V2.getNode()->getOpcode() == ISD::UNDEF) { + if (IndexLen == 8) + V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst); + Shuffle = DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, + DAG.getConstant(Intrinsic::aarch64_neon_tbl1, MVT::i32), V1Cst, + DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, + makeArrayRef(TBLMask.data(), IndexLen))); + } else { + if (IndexLen == 8) { + V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst); + Shuffle = DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, + DAG.getConstant(Intrinsic::aarch64_neon_tbl1, MVT::i32), V1Cst, + DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, + makeArrayRef(TBLMask.data(), IndexLen))); + } else { + // FIXME: We cannot, for the moment, emit a TBL2 instruction because we + // cannot currently represent the register constraints on the input + // table registers. + // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst, + // DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, + // &TBLMask[0], IndexLen)); + Shuffle = DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, + DAG.getConstant(Intrinsic::aarch64_neon_tbl2, MVT::i32), V1Cst, V2Cst, + DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, + makeArrayRef(TBLMask.data(), IndexLen))); + } + } + return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle); +} + +static unsigned getDUPLANEOp(EVT EltType) { + if (EltType == MVT::i8) + return AArch64ISD::DUPLANE8; + if (EltType == MVT::i16) + return AArch64ISD::DUPLANE16; + if (EltType == MVT::i32 || EltType == MVT::f32) + return AArch64ISD::DUPLANE32; + if (EltType == MVT::i64 || EltType == MVT::f64) + return AArch64ISD::DUPLANE64; + + llvm_unreachable("Invalid vector element type?"); +} + +SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, + SelectionDAG &DAG) const { + SDLoc dl(Op); + EVT VT = Op.getValueType(); + + ShuffleVectorSDNode *SVN = cast(Op.getNode()); + + // Convert shuffles that are directly supported on NEON to target-specific + // DAG nodes, instead of keeping them as shuffles and matching them again + // during code selection. This is more efficient and avoids the possibility + // of inconsistencies between legalization and selection. + ArrayRef ShuffleMask = SVN->getMask(); + + SDValue V1 = Op.getOperand(0); + SDValue V2 = Op.getOperand(1); + + if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0], + V1.getValueType().getSimpleVT())) { + int Lane = SVN->getSplatIndex(); + // If this is undef splat, generate it via "just" vdup, if possible. + if (Lane == -1) + Lane = 0; + + if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) + return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(), + V1.getOperand(0)); + // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non- + // constant. If so, we can just reference the lane's definition directly. 
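Splat-mask shuffles are what the isSplatMask path above recognizes; rather than a general permute they become a single DUP from a lane (builtin example; the commented asm is the expected shape, not verified):

    typedef int v4i32 __attribute__((vector_size(16)));
    v4i32 splat_lane1(v4i32 v) {
      return __builtin_shufflevector(v, v, 1, 1, 1, 1);   // dup v0.4s, v0.s[1]
    }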
+ if (V1.getOpcode() == ISD::BUILD_VECTOR && + !isa(V1.getOperand(Lane))) + return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane)); + + // Otherwise, duplicate from the lane of the input vector. + unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType()); + + // SelectionDAGBuilder may have "helpfully" already extracted or conatenated + // to make a vector of the same size as this SHUFFLE. We can ignore the + // extract entirely, and canonicalise the concat using WidenVector. + if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) { + Lane += cast(V1.getOperand(1))->getZExtValue(); + V1 = V1.getOperand(0); + } else if (V1.getOpcode() == ISD::CONCAT_VECTORS) { + unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2; + Lane -= Idx * VT.getVectorNumElements() / 2; + V1 = WidenVector(V1.getOperand(Idx), DAG); + } else if (VT.getSizeInBits() == 64) + V1 = WidenVector(V1, DAG); + + return DAG.getNode(Opcode, dl, VT, V1, DAG.getConstant(Lane, MVT::i64)); + } + + if (isREVMask(ShuffleMask, VT, 64)) + return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1, V2); + if (isREVMask(ShuffleMask, VT, 32)) + return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1, V2); + if (isREVMask(ShuffleMask, VT, 16)) + return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2); + + bool ReverseEXT = false; + unsigned Imm; + if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) { + if (ReverseEXT) + std::swap(V1, V2); + Imm *= getExtFactor(V1); + return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2, + DAG.getConstant(Imm, MVT::i32)); + } else if (V2->getOpcode() == ISD::UNDEF && + isSingletonEXTMask(ShuffleMask, VT, Imm)) { + Imm *= getExtFactor(V1); + return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1, + DAG.getConstant(Imm, MVT::i32)); + } + + unsigned WhichResult; + if (isZIPMask(ShuffleMask, VT, WhichResult)) { + unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2; + return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2); + } + if (isUZPMask(ShuffleMask, VT, WhichResult)) { + unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2; + return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2); + } + if (isTRNMask(ShuffleMask, VT, WhichResult)) { + unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2; + return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2); + } + + if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) { + unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2; + return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); + } + if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) { + unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2; + return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); + } + if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) { + unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2; + return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); + } + + SDValue Concat = tryFormConcatFromShuffle(Op, DAG); + if (Concat.getNode()) + return Concat; + + bool DstIsLeft; + int Anomaly; + int NumInputElements = V1.getValueType().getVectorNumElements(); + if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) { + SDValue DstVec = DstIsLeft ? 
V1 : V2; + SDValue DstLaneV = DAG.getConstant(Anomaly, MVT::i64); + + SDValue SrcVec = V1; + int SrcLane = ShuffleMask[Anomaly]; + if (SrcLane >= NumInputElements) { + SrcVec = V2; + SrcLane -= VT.getVectorNumElements(); + } + SDValue SrcLaneV = DAG.getConstant(SrcLane, MVT::i64); + + EVT ScalarVT = VT.getVectorElementType(); + if (ScalarVT.getSizeInBits() < 32) + ScalarVT = MVT::i32; + + return DAG.getNode( + ISD::INSERT_VECTOR_ELT, dl, VT, DstVec, + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV), + DstLaneV); + } + + // If the shuffle is not directly supported and it has 4 elements, use + // the PerfectShuffle-generated table to synthesize it from other shuffles. + unsigned NumElts = VT.getVectorNumElements(); + if (NumElts == 4) { + unsigned PFIndexes[4]; + for (unsigned i = 0; i != 4; ++i) { + if (ShuffleMask[i] < 0) + PFIndexes[i] = 8; + else + PFIndexes[i] = ShuffleMask[i]; + } + + // Compute the index in the perfect shuffle table. + unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 + + PFIndexes[2] * 9 + PFIndexes[3]; + unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; + unsigned Cost = (PFEntry >> 30); + + if (Cost <= 4) + return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); + } + + return GenerateTBL(Op, ShuffleMask, DAG); +} + +static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits, + APInt &UndefBits) { + EVT VT = BVN->getValueType(0); + APInt SplatBits, SplatUndef; + unsigned SplatBitSize; + bool HasAnyUndefs; + if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { + unsigned NumSplats = VT.getSizeInBits() / SplatBitSize; + + for (unsigned i = 0; i < NumSplats; ++i) { + CnstBits <<= SplatBitSize; + UndefBits <<= SplatBitSize; + CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits()); + UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits()); + } + + return true; + } + + return false; +} + +SDValue AArch64TargetLowering::LowerVectorAND(SDValue Op, + SelectionDAG &DAG) const { + BuildVectorSDNode *BVN = + dyn_cast(Op.getOperand(1).getNode()); + SDValue LHS = Op.getOperand(0); + SDLoc dl(Op); + EVT VT = Op.getValueType(); + + if (!BVN) + return Op; + + APInt CnstBits(VT.getSizeInBits(), 0); + APInt UndefBits(VT.getSizeInBits(), 0); + if (resolveBuildVector(BVN, CnstBits, UndefBits)) { + // We only have BIC vector immediate instruction, which is and-not. + CnstBits = ~CnstBits; + + // We make use of a little bit of goto ickiness in order to avoid having to + // duplicate the immediate matching logic for the undef toggled case. + bool SecondTry = false; + AttemptModImm: + + if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) { + CnstBits = CnstBits.zextOrTrunc(64); + uint64_t CnstVal = CnstBits.getZExtValue(); + + if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(0, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? 
MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(8, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(16, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(24, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; + SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(0, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; + SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(8, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + } + + if (SecondTry) + goto FailedModImm; + SecondTry = true; + CnstBits = ~UndefBits; + goto AttemptModImm; + } + +// We can always fall back to a non-immediate AND. +FailedModImm: + return Op; +} + +// Specialized code to quickly find if PotentialBVec is a BuildVector that +// consists of only the same constant int value, returned in reference arg +// ConstVal +static bool isAllConstantBuildVector(const SDValue &PotentialBVec, + uint64_t &ConstVal) { + BuildVectorSDNode *Bvec = dyn_cast(PotentialBVec); + if (!Bvec) + return false; + ConstantSDNode *FirstElt = dyn_cast(Bvec->getOperand(0)); + if (!FirstElt) + return false; + EVT VT = Bvec->getValueType(0); + unsigned NumElts = VT.getVectorNumElements(); + for (unsigned i = 1; i < NumElts; ++i) + if (dyn_cast(Bvec->getOperand(i)) != FirstElt) + return false; + ConstVal = FirstElt->getZExtValue(); + return true; +} + +static unsigned getIntrinsicID(const SDNode *N) { + unsigned Opcode = N->getOpcode(); + switch (Opcode) { + default: + return Intrinsic::not_intrinsic; + case ISD::INTRINSIC_WO_CHAIN: { + unsigned IID = cast(N->getOperand(0))->getZExtValue(); + if (IID < Intrinsic::num_intrinsics) + return IID; + return Intrinsic::not_intrinsic; + } + } +} + +// Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)), +// to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a +// BUILD_VECTORs with constant element C1, C2 is a constant, and C1 == ~C2. +// Also, logical shift right -> sri, with the same structure. +static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) { + EVT VT = N->getValueType(0); + + if (!VT.isVector()) + return SDValue(); + + SDLoc DL(N); + + // Is the first op an AND? + const SDValue And = N->getOperand(0); + if (And.getOpcode() != ISD::AND) + return SDValue(); + + // Is the second op an shl or lshr? 
+ SDValue Shift = N->getOperand(1); + // This will have been turned into: AArch64ISD::VSHL vector, #shift + // or AArch64ISD::VLSHR vector, #shift + unsigned ShiftOpc = Shift.getOpcode(); + if ((ShiftOpc != AArch64ISD::VSHL && ShiftOpc != AArch64ISD::VLSHR)) + return SDValue(); + bool IsShiftRight = ShiftOpc == AArch64ISD::VLSHR; + + // Is the shift amount constant? + ConstantSDNode *C2node = dyn_cast(Shift.getOperand(1)); + if (!C2node) + return SDValue(); + + // Is the and mask vector all constant? + uint64_t C1; + if (!isAllConstantBuildVector(And.getOperand(1), C1)) + return SDValue(); + + // Is C1 == ~C2, taking into account how much one can shift elements of a + // particular size? + uint64_t C2 = C2node->getZExtValue(); + unsigned ElemSizeInBits = VT.getVectorElementType().getSizeInBits(); + if (C2 > ElemSizeInBits) + return SDValue(); + unsigned ElemMask = (1 << ElemSizeInBits) - 1; + if ((C1 & ElemMask) != (~C2 & ElemMask)) + return SDValue(); + + SDValue X = And.getOperand(0); + SDValue Y = Shift.getOperand(0); + + unsigned Intrin = + IsShiftRight ? Intrinsic::aarch64_neon_vsri : Intrinsic::aarch64_neon_vsli; + SDValue ResultSLI = + DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, + DAG.getConstant(Intrin, MVT::i32), X, Y, Shift.getOperand(1)); + + DEBUG(dbgs() << "aarch64-lower: transformed: \n"); + DEBUG(N->dump(&DAG)); + DEBUG(dbgs() << "into: \n"); + DEBUG(ResultSLI->dump(&DAG)); + + ++NumShiftInserts; + return ResultSLI; +} + +SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op, + SelectionDAG &DAG) const { + // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2)) + if (EnableAArch64SlrGeneration) { + SDValue Res = tryLowerToSLI(Op.getNode(), DAG); + if (Res.getNode()) + return Res; + } + + BuildVectorSDNode *BVN = + dyn_cast(Op.getOperand(0).getNode()); + SDValue LHS = Op.getOperand(1); + SDLoc dl(Op); + EVT VT = Op.getValueType(); + + // OR commutes, so try swapping the operands. + if (!BVN) { + LHS = Op.getOperand(0); + BVN = dyn_cast(Op.getOperand(1).getNode()); + } + if (!BVN) + return Op; + + APInt CnstBits(VT.getSizeInBits(), 0); + APInt UndefBits(VT.getSizeInBits(), 0); + if (resolveBuildVector(BVN, CnstBits, UndefBits)) { + // We make use of a little bit of goto ickiness in order to avoid having to + // duplicate the immediate matching logic for the undef toggled case. + bool SecondTry = false; + AttemptModImm: + + if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) { + CnstBits = CnstBits.zextOrTrunc(64); + uint64_t CnstVal = CnstBits.getZExtValue(); + + if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(0, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(8, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? 
MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(16, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(24, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; + SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(0, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; + SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(8, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + } + + if (SecondTry) + goto FailedModImm; + SecondTry = true; + CnstBits = UndefBits; + goto AttemptModImm; + } + +// We can always fall back to a non-immediate OR. +FailedModImm: + return Op; +} + +SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, + SelectionDAG &DAG) const { + BuildVectorSDNode *BVN = cast(Op.getNode()); + SDLoc dl(Op); + EVT VT = Op.getValueType(); + + APInt CnstBits(VT.getSizeInBits(), 0); + APInt UndefBits(VT.getSizeInBits(), 0); + if (resolveBuildVector(BVN, CnstBits, UndefBits)) { + // We make use of a little bit of goto ickiness in order to avoid having to + // duplicate the immediate matching logic for the undef toggled case. + bool SecondTry = false; + AttemptModImm: + + if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) { + CnstBits = CnstBits.zextOrTrunc(64); + uint64_t CnstVal = CnstBits.getZExtValue(); + + // Certain magic vector constants (used to express things like NOT + // and NEG) are passed through unmodified. This allows codegen patterns + // for these operations to match. Special-purpose patterns will lower + // these immediates to MOVIs if it proves necessary. + if (VT.isInteger() && (CnstVal == 0 || CnstVal == ~0ULL)) + return Op; + + // The many faces of MOVI... + if (AArch64_AM::isAdvSIMDModImmType10(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType10(CnstVal); + if (VT.getSizeInBits() == 128) { + SDValue Mov = DAG.getNode(AArch64ISD::MOVIedit, dl, MVT::v2i64, + DAG.getConstant(CnstVal, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + // Support the V64 version via subregister insertion. + SDValue Mov = DAG.getNode(AArch64ISD::MOVIedit, dl, MVT::f64, + DAG.getConstant(CnstVal, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? 
MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(0, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(8, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(16, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(24, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; + SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(0, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; + SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(8, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType7(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType7(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::MOVImsl, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(264, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType8(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType8(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::MOVImsl, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(272, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType9(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType9(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8; + SDValue Mov = DAG.getNode(AArch64ISD::MOVI, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + // The few faces of FMOV... + if (AArch64_AM::isAdvSIMDModImmType11(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType11(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? 
MVT::v4f32 : MVT::v2f32; + SDValue Mov = DAG.getNode(AArch64ISD::FMOV, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType12(CnstVal) && + VT.getSizeInBits() == 128) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType12(CnstVal); + SDValue Mov = DAG.getNode(AArch64ISD::FMOV, dl, MVT::v2f64, + DAG.getConstant(CnstVal, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + // The many faces of MVNI... + CnstVal = ~CnstVal; + if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(0, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(8, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(16, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(24, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; + SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(0, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; + SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(8, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType7(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType7(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::MVNImsl, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(264, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType8(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType8(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? 
MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::MVNImsl, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(272, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + } + + if (SecondTry) + goto FailedModImm; + SecondTry = true; + CnstBits = UndefBits; + goto AttemptModImm; + } +FailedModImm: + + // Scan through the operands to find some interesting properties we can + // exploit: + // 1) If only one value is used, we can use a DUP, or + // 2) if only the low element is not undef, we can just insert that, or + // 3) if only one constant value is used (w/ some non-constant lanes), + // we can splat the constant value into the whole vector then fill + // in the non-constant lanes. + // 4) FIXME: If different constant values are used, but we can intelligently + // select the values we'll be overwriting for the non-constant + // lanes such that we can directly materialize the vector + // some other way (MOVI, e.g.), we can be sneaky. + unsigned NumElts = VT.getVectorNumElements(); + bool isOnlyLowElement = true; + bool usesOnlyOneValue = true; + bool usesOnlyOneConstantValue = true; + bool isConstant = true; + unsigned NumConstantLanes = 0; + SDValue Value; + SDValue ConstantValue; + for (unsigned i = 0; i < NumElts; ++i) { + SDValue V = Op.getOperand(i); + if (V.getOpcode() == ISD::UNDEF) + continue; + if (i > 0) + isOnlyLowElement = false; + if (!isa(V) && !isa(V)) + isConstant = false; + + if (isa(V) || isa(V)) { + ++NumConstantLanes; + if (!ConstantValue.getNode()) + ConstantValue = V; + else if (ConstantValue != V) + usesOnlyOneConstantValue = false; + } + + if (!Value.getNode()) + Value = V; + else if (V != Value) + usesOnlyOneValue = false; + } + + if (!Value.getNode()) + return DAG.getUNDEF(VT); + + if (isOnlyLowElement) + return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value); + + // Use DUP for non-constant splats. For f32 constant splats, reduce to + // i32 and try again. + if (usesOnlyOneValue) { + if (!isConstant) { + if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + Value.getValueType() != VT) + return DAG.getNode(AArch64ISD::DUP, dl, VT, Value); + + // This is actually a DUPLANExx operation, which keeps everything vectory. + + // DUPLANE works on 128-bit vectors, widen it if necessary. + SDValue Lane = Value.getOperand(1); + Value = Value.getOperand(0); + if (Value.getValueType().getSizeInBits() == 64) + Value = WidenVector(Value, DAG); + + unsigned Opcode = getDUPLANEOp(VT.getVectorElementType()); + return DAG.getNode(Opcode, dl, VT, Value, Lane); + } + + if (VT.getVectorElementType().isFloatingPoint()) { + SmallVector Ops; + MVT NewType = + (VT.getVectorElementType() == MVT::f32) ? MVT::i32 : MVT::i64; + for (unsigned i = 0; i < NumElts; ++i) + Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i))); + EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts); + SDValue Val = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, Ops); + Val = LowerBUILD_VECTOR(Val, DAG); + if (Val.getNode()) + return DAG.getNode(ISD::BITCAST, dl, VT, Val); + } + } + + // If there was only one constant value used and for more than one lane, + // start by splatting that value, then replace the non-constant lanes. This + // is better than the default, which will perform a separate initialization + // for each lane. + if (NumConstantLanes > 0 && usesOnlyOneConstantValue) { + SDValue Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue); + // Now insert the non-constant lanes. 
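+    // E.g. a v4i32 of the form {1, 1, 1, X} is built as a DUP of the
+    // constant 1 followed by a single INSERT_VECTOR_ELT of X into lane 3,
+    // rather than four separate lane initializations.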
+ for (unsigned i = 0; i < NumElts; ++i) { + SDValue V = Op.getOperand(i); + SDValue LaneIdx = DAG.getConstant(i, MVT::i64); + if (!isa(V) && !isa(V)) { + // Note that type legalization likely mucked about with the VT of the + // source operand, so we may have to convert it here before inserting. + Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx); + } + } + return Val; + } + + // If all elements are constants and the case above didn't get hit, fall back + // to the default expansion, which will generate a load from the constant + // pool. + if (isConstant) + return SDValue(); + + // Empirical tests suggest this is rarely worth it for vectors of length <= 2. + if (NumElts >= 4) { + SDValue shuffle = ReconstructShuffle(Op, DAG); + if (shuffle != SDValue()) + return shuffle; + } + + // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we + // know the default expansion would otherwise fall back on something even + // worse. For a vector with one or two non-undef values, that's + // scalar_to_vector for the elements followed by a shuffle (provided the + // shuffle is valid for the target) and materialization element by element + // on the stack followed by a load for everything else. + if (!isConstant && !usesOnlyOneValue) { + SDValue Vec = DAG.getUNDEF(VT); + SDValue Op0 = Op.getOperand(0); + unsigned ElemSize = VT.getVectorElementType().getSizeInBits(); + unsigned i = 0; + // For 32 and 64 bit types, use INSERT_SUBREG for lane zero to + // a) Avoid a RMW dependency on the full vector register, and + // b) Allow the register coalescer to fold away the copy if the + // value is already in an S or D register. + if (Op0.getOpcode() != ISD::UNDEF && (ElemSize == 32 || ElemSize == 64)) { + unsigned SubIdx = ElemSize == 32 ? AArch64::ssub : AArch64::dsub; + MachineSDNode *N = + DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl, VT, Vec, Op0, + DAG.getTargetConstant(SubIdx, MVT::i32)); + Vec = SDValue(N, 0); + ++i; + } + for (; i < NumElts; ++i) { + SDValue V = Op.getOperand(i); + if (V.getOpcode() == ISD::UNDEF) + continue; + SDValue LaneIdx = DAG.getConstant(i, MVT::i64); + Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx); + } + return Vec; + } + + // Just use the default expansion. We failed to find a better alternative. + return SDValue(); +} + +SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, + SelectionDAG &DAG) const { + assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!"); + + // Check for non-constant lane. + if (!isa(Op.getOperand(2))) + return SDValue(); + + EVT VT = Op.getOperand(0).getValueType(); + + // Insertion/extraction are legal for V128 types. + if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 || + VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64) + return Op; + + if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 && + VT != MVT::v1i64 && VT != MVT::v2f32) + return SDValue(); + + // For V64 types, we perform insertion by expanding the value + // to a V128 type and perform the insertion on that. + SDLoc DL(Op); + SDValue WideVec = WidenVector(Op.getOperand(0), DAG); + EVT WideTy = WideVec.getValueType(); + + SDValue Node = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, WideTy, WideVec, + Op.getOperand(1), Op.getOperand(2)); + // Re-narrow the resultant vector. 
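+  // NarrowVector recovers the low 64 bits of the widened result, so the
+  // insert is effectively performed at V128 width and then narrowed back to
+  // the original V64 type.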
+ return NarrowVector(Node, DAG); +} + +SDValue +AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, + SelectionDAG &DAG) const { + assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!"); + + // Check for non-constant lane. + if (!isa(Op.getOperand(1))) + return SDValue(); + + EVT VT = Op.getOperand(0).getValueType(); + + // Insertion/extraction are legal for V128 types. + if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 || + VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64) + return Op; + + if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 && + VT != MVT::v1i64 && VT != MVT::v2f32) + return SDValue(); + + // For V64 types, we perform extraction by expanding the value + // to a V128 type and perform the extraction on that. + SDLoc DL(Op); + SDValue WideVec = WidenVector(Op.getOperand(0), DAG); + EVT WideTy = WideVec.getValueType(); + + EVT ExtrTy = WideTy.getVectorElementType(); + if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8) + ExtrTy = MVT::i32; + + // For extractions, we just return the result directly. + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec, + Op.getOperand(1)); +} + +SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, + SelectionDAG &DAG) const { + EVT VT = Op.getOperand(0).getValueType(); + SDLoc dl(Op); + // Just in case... + if (!VT.isVector()) + return SDValue(); + + ConstantSDNode *Cst = dyn_cast(Op.getOperand(1)); + if (!Cst) + return SDValue(); + unsigned Val = Cst->getZExtValue(); + + unsigned Size = Op.getValueType().getSizeInBits(); + if (Val == 0) { + switch (Size) { + case 8: + return DAG.getTargetExtractSubreg(AArch64::bsub, dl, Op.getValueType(), + Op.getOperand(0)); + case 16: + return DAG.getTargetExtractSubreg(AArch64::hsub, dl, Op.getValueType(), + Op.getOperand(0)); + case 32: + return DAG.getTargetExtractSubreg(AArch64::ssub, dl, Op.getValueType(), + Op.getOperand(0)); + case 64: + return DAG.getTargetExtractSubreg(AArch64::dsub, dl, Op.getValueType(), + Op.getOperand(0)); + default: + llvm_unreachable("Unexpected vector type in extract_subvector!"); + } + } + // If this is extracting the upper 64-bits of a 128-bit vector, we match + // that directly. + if (Size == 64 && Val * VT.getVectorElementType().getSizeInBits() == 64) + return Op; + + return SDValue(); +} + +bool AArch64TargetLowering::isShuffleMaskLegal(const SmallVectorImpl &M, + EVT VT) const { + if (VT.getVectorNumElements() == 4 && + (VT.is128BitVector() || VT.is64BitVector())) { + unsigned PFIndexes[4]; + for (unsigned i = 0; i != 4; ++i) { + if (M[i] < 0) + PFIndexes[i] = 8; + else + PFIndexes[i] = M[i]; + } + + // Compute the index in the perfect shuffle table. + unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 + + PFIndexes[2] * 9 + PFIndexes[3]; + unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; + unsigned Cost = (PFEntry >> 30); + + if (Cost <= 4) + return true; + } + + bool DummyBool; + int DummyInt; + unsigned DummyUnsigned; + + return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isREVMask(M, VT, 64) || + isREVMask(M, VT, 32) || isREVMask(M, VT, 16) || + isEXTMask(M, VT, DummyBool, DummyUnsigned) || + // isTBLMask(M, VT) || // FIXME: Port TBL support from ARM. 
+ isTRNMask(M, VT, DummyUnsigned) || isUZPMask(M, VT, DummyUnsigned) || + isZIPMask(M, VT, DummyUnsigned) || + isTRN_v_undef_Mask(M, VT, DummyUnsigned) || + isUZP_v_undef_Mask(M, VT, DummyUnsigned) || + isZIP_v_undef_Mask(M, VT, DummyUnsigned) || + isINSMask(M, VT.getVectorNumElements(), DummyBool, DummyInt) || + isConcatMask(M, VT, VT.getSizeInBits() == 128)); +} + +/// getVShiftImm - Check if this is a valid build_vector for the immediate +/// operand of a vector shift operation, where all the elements of the +/// build_vector must have the same constant integer value. +static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) { + // Ignore bit_converts. + while (Op.getOpcode() == ISD::BITCAST) + Op = Op.getOperand(0); + BuildVectorSDNode *BVN = dyn_cast(Op.getNode()); + APInt SplatBits, SplatUndef; + unsigned SplatBitSize; + bool HasAnyUndefs; + if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, + HasAnyUndefs, ElementBits) || + SplatBitSize > ElementBits) + return false; + Cnt = SplatBits.getSExtValue(); + return true; +} + +/// isVShiftLImm - Check if this is a valid build_vector for the immediate +/// operand of a vector shift left operation. That value must be in the range: +/// 0 <= Value < ElementBits for a left shift; or +/// 0 <= Value <= ElementBits for a long left shift. +static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) { + assert(VT.isVector() && "vector shift count is not a vector type"); + unsigned ElementBits = VT.getVectorElementType().getSizeInBits(); + if (!getVShiftImm(Op, ElementBits, Cnt)) + return false; + return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits); +} + +/// isVShiftRImm - Check if this is a valid build_vector for the immediate +/// operand of a vector shift right operation. For a shift opcode, the value +/// is positive, but for an intrinsic the value count must be negative. The +/// absolute value must be in the range: +/// 1 <= |Value| <= ElementBits for a right shift; or +/// 1 <= |Value| <= ElementBits/2 for a narrow right shift. +static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic, + int64_t &Cnt) { + assert(VT.isVector() && "vector shift count is not a vector type"); + unsigned ElementBits = VT.getVectorElementType().getSizeInBits(); + if (!getVShiftImm(Op, ElementBits, Cnt)) + return false; + if (isIntrinsic) + Cnt = -Cnt; + return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits)); +} + +SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op, + SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + SDLoc DL(Op); + int64_t Cnt; + + if (!Op.getOperand(1).getValueType().isVector()) + return Op; + unsigned EltSize = VT.getVectorElementType().getSizeInBits(); + + switch (Op.getOpcode()) { + default: + llvm_unreachable("unexpected shift opcode"); + + case ISD::SHL: + if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) + return DAG.getNode(AArch64ISD::VSHL, SDLoc(Op), VT, Op.getOperand(0), + DAG.getConstant(Cnt, MVT::i32)); + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, + DAG.getConstant(Intrinsic::aarch64_neon_ushl, MVT::i32), + Op.getOperand(0), Op.getOperand(1)); + case ISD::SRA: + case ISD::SRL: + // Right shift immediate + if (isVShiftRImm(Op.getOperand(1), VT, false, false, Cnt) && + Cnt < EltSize) { + unsigned Opc = + (Op.getOpcode() == ISD::SRA) ? 
AArch64ISD::VASHR : AArch64ISD::VLSHR; + return DAG.getNode(Opc, SDLoc(Op), VT, Op.getOperand(0), + DAG.getConstant(Cnt, MVT::i32)); + } + + // Right shift register. Note, there is not a shift right register + // instruction, but the shift left register instruction takes a signed + // value, where negative numbers specify a right shift. + unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl + : Intrinsic::aarch64_neon_ushl; + // negate the shift amount + SDValue NegShift = DAG.getNode(AArch64ISD::NEG, DL, VT, Op.getOperand(1)); + SDValue NegShiftLeft = + DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, + DAG.getConstant(Opc, MVT::i32), Op.getOperand(0), NegShift); + return NegShiftLeft; + } + + return SDValue(); +} + +static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS, + AArch64CC::CondCode CC, bool NoNans, EVT VT, + SDLoc dl, SelectionDAG &DAG) { + EVT SrcVT = LHS.getValueType(); + + BuildVectorSDNode *BVN = dyn_cast(RHS.getNode()); + APInt CnstBits(VT.getSizeInBits(), 0); + APInt UndefBits(VT.getSizeInBits(), 0); + bool IsCnst = BVN && resolveBuildVector(BVN, CnstBits, UndefBits); + bool IsZero = IsCnst && (CnstBits == 0); + + if (SrcVT.getVectorElementType().isFloatingPoint()) { + switch (CC) { + default: + return SDValue(); + case AArch64CC::NE: { + SDValue Fcmeq; + if (IsZero) + Fcmeq = DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS); + else + Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS); + return DAG.getNode(AArch64ISD::NOT, dl, VT, Fcmeq); + } + case AArch64CC::EQ: + if (IsZero) + return DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS); + return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS); + case AArch64CC::GE: + if (IsZero) + return DAG.getNode(AArch64ISD::FCMGEz, dl, VT, LHS); + return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS); + case AArch64CC::GT: + if (IsZero) + return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS); + return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS); + case AArch64CC::LS: + if (IsZero) + return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS); + return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS); + case AArch64CC::LT: + if (!NoNans) + return SDValue(); + // If we ignore NaNs then we can use to the MI implementation. + // Fallthrough. 
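+    // (An FCMLT against a non-zero RHS is emitted below as FCMGT with the
+    // operands swapped.)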
+ case AArch64CC::MI: + if (IsZero) + return DAG.getNode(AArch64ISD::FCMLTz, dl, VT, LHS); + return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS); + } + } + + switch (CC) { + default: + return SDValue(); + case AArch64CC::NE: { + SDValue Cmeq; + if (IsZero) + Cmeq = DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS); + else + Cmeq = DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS); + return DAG.getNode(AArch64ISD::NOT, dl, VT, Cmeq); + } + case AArch64CC::EQ: + if (IsZero) + return DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS); + return DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS); + case AArch64CC::GE: + if (IsZero) + return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS); + return DAG.getNode(AArch64ISD::CMGE, dl, VT, LHS, RHS); + case AArch64CC::GT: + if (IsZero) + return DAG.getNode(AArch64ISD::CMGTz, dl, VT, LHS); + return DAG.getNode(AArch64ISD::CMGT, dl, VT, LHS, RHS); + case AArch64CC::LE: + if (IsZero) + return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS); + return DAG.getNode(AArch64ISD::CMGE, dl, VT, RHS, LHS); + case AArch64CC::LS: + return DAG.getNode(AArch64ISD::CMHS, dl, VT, RHS, LHS); + case AArch64CC::LO: + return DAG.getNode(AArch64ISD::CMHI, dl, VT, RHS, LHS); + case AArch64CC::LT: + if (IsZero) + return DAG.getNode(AArch64ISD::CMLTz, dl, VT, LHS); + return DAG.getNode(AArch64ISD::CMGT, dl, VT, RHS, LHS); + case AArch64CC::HI: + return DAG.getNode(AArch64ISD::CMHI, dl, VT, LHS, RHS); + case AArch64CC::HS: + return DAG.getNode(AArch64ISD::CMHS, dl, VT, LHS, RHS); + } +} + +SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op, + SelectionDAG &DAG) const { + ISD::CondCode CC = cast(Op.getOperand(2))->get(); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + SDLoc dl(Op); + + if (LHS.getValueType().getVectorElementType().isInteger()) { + assert(LHS.getValueType() == RHS.getValueType()); + AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC); + return EmitVectorComparison(LHS, RHS, AArch64CC, false, Op.getValueType(), + dl, DAG); + } + + assert(LHS.getValueType().getVectorElementType() == MVT::f32 || + LHS.getValueType().getVectorElementType() == MVT::f64); + + // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally + // clean. Some of them require two branches to implement. + AArch64CC::CondCode CC1, CC2; + bool ShouldInvert; + changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert); + + bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath; + SDValue Cmp = + EmitVectorComparison(LHS, RHS, CC1, NoNaNs, Op.getValueType(), dl, DAG); + if (!Cmp.getNode()) + return SDValue(); + + if (CC2 != AArch64CC::AL) { + SDValue Cmp2 = + EmitVectorComparison(LHS, RHS, CC2, NoNaNs, Op.getValueType(), dl, DAG); + if (!Cmp2.getNode()) + return SDValue(); + + Cmp = DAG.getNode(ISD::OR, dl, Cmp.getValueType(), Cmp, Cmp2); + } + + if (ShouldInvert) + return Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType()); + + return Cmp; +} + +/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as +/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment +/// specified in the intrinsic calls. 
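+/// For example, an aarch64_neon_ld2 of two v4i32 vectors is described
+/// conservatively as a single 32-byte read from its pointer operand.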
+bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, + const CallInst &I, + unsigned Intrinsic) const { + switch (Intrinsic) { + case Intrinsic::aarch64_neon_ld2: + case Intrinsic::aarch64_neon_ld3: + case Intrinsic::aarch64_neon_ld4: + case Intrinsic::aarch64_neon_ld1x2: + case Intrinsic::aarch64_neon_ld1x3: + case Intrinsic::aarch64_neon_ld1x4: + case Intrinsic::aarch64_neon_ld2lane: + case Intrinsic::aarch64_neon_ld3lane: + case Intrinsic::aarch64_neon_ld4lane: + case Intrinsic::aarch64_neon_ld2r: + case Intrinsic::aarch64_neon_ld3r: + case Intrinsic::aarch64_neon_ld4r: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + // Conservatively set memVT to the entire set of vectors loaded. + uint64_t NumElts = getDataLayout()->getTypeAllocSize(I.getType()) / 8; + Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); + Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); + Info.offset = 0; + Info.align = 0; + Info.vol = false; // volatile loads with NEON intrinsics not supported + Info.readMem = true; + Info.writeMem = false; + return true; + } + case Intrinsic::aarch64_neon_st2: + case Intrinsic::aarch64_neon_st3: + case Intrinsic::aarch64_neon_st4: + case Intrinsic::aarch64_neon_st1x2: + case Intrinsic::aarch64_neon_st1x3: + case Intrinsic::aarch64_neon_st1x4: + case Intrinsic::aarch64_neon_st2lane: + case Intrinsic::aarch64_neon_st3lane: + case Intrinsic::aarch64_neon_st4lane: { + Info.opc = ISD::INTRINSIC_VOID; + // Conservatively set memVT to the entire set of vectors stored. + unsigned NumElts = 0; + for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) { + Type *ArgTy = I.getArgOperand(ArgI)->getType(); + if (!ArgTy->isVectorTy()) + break; + NumElts += getDataLayout()->getTypeAllocSize(ArgTy) / 8; + } + Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); + Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); + Info.offset = 0; + Info.align = 0; + Info.vol = false; // volatile stores with NEON intrinsics not supported + Info.readMem = false; + Info.writeMem = true; + return true; + } + case Intrinsic::aarch64_ldaxr: + case Intrinsic::aarch64_ldxr: { + PointerType *PtrTy = cast(I.getArgOperand(0)->getType()); + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::getVT(PtrTy->getElementType()); + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType()); + Info.vol = true; + Info.readMem = true; + Info.writeMem = false; + return true; + } + case Intrinsic::aarch64_stlxr: + case Intrinsic::aarch64_stxr: { + PointerType *PtrTy = cast(I.getArgOperand(1)->getType()); + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::getVT(PtrTy->getElementType()); + Info.ptrVal = I.getArgOperand(1); + Info.offset = 0; + Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType()); + Info.vol = true; + Info.readMem = false; + Info.writeMem = true; + return true; + } + case Intrinsic::aarch64_ldaxp: + case Intrinsic::aarch64_ldxp: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::i128; + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Info.align = 16; + Info.vol = true; + Info.readMem = true; + Info.writeMem = false; + return true; + } + case Intrinsic::aarch64_stlxp: + case Intrinsic::aarch64_stxp: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::i128; + Info.ptrVal = I.getArgOperand(2); + Info.offset = 0; + Info.align = 16; + Info.vol = true; + Info.readMem = false; + Info.writeMem = true; + return true; + 
} + default: + break; + } + + return false; +} + +// Truncations from 64-bit GPR to 32-bit GPR is free. +bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { + if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) + return false; + unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); + unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); + if (NumBits1 <= NumBits2) + return false; + return true; +} +bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { + if (!VT1.isInteger() || !VT2.isInteger()) + return false; + unsigned NumBits1 = VT1.getSizeInBits(); + unsigned NumBits2 = VT2.getSizeInBits(); + if (NumBits1 <= NumBits2) + return false; + return true; +} + +// All 32-bit GPR operations implicitly zero the high-half of the corresponding +// 64-bit GPR. +bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const { + if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) + return false; + unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); + unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); + if (NumBits1 == 32 && NumBits2 == 64) + return true; + return false; +} +bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { + if (!VT1.isInteger() || !VT2.isInteger()) + return false; + unsigned NumBits1 = VT1.getSizeInBits(); + unsigned NumBits2 = VT2.getSizeInBits(); + if (NumBits1 == 32 && NumBits2 == 64) + return true; + return false; +} + +bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const { + EVT VT1 = Val.getValueType(); + if (isZExtFree(VT1, VT2)) { + return true; + } + + if (Val.getOpcode() != ISD::LOAD) + return false; + + // 8-, 16-, and 32-bit integer loads all implicitly zero-extend. + return (VT1.isSimple() && VT1.isInteger() && VT2.isSimple() && + VT2.isInteger() && VT1.getSizeInBits() <= 32); +} + +bool AArch64TargetLowering::hasPairedLoad(Type *LoadedType, + unsigned &RequiredAligment) const { + if (!LoadedType->isIntegerTy() && !LoadedType->isFloatTy()) + return false; + // Cyclone supports unaligned accesses. + RequiredAligment = 0; + unsigned NumBits = LoadedType->getPrimitiveSizeInBits(); + return NumBits == 32 || NumBits == 64; +} + +bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType, + unsigned &RequiredAligment) const { + if (!LoadedType.isSimple() || + (!LoadedType.isInteger() && !LoadedType.isFloatingPoint())) + return false; + // Cyclone supports unaligned accesses. + RequiredAligment = 0; + unsigned NumBits = LoadedType.getSizeInBits(); + return NumBits == 32 || NumBits == 64; +} + +static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign, + unsigned AlignCheck) { + return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) && + (DstAlign == 0 || DstAlign % AlignCheck == 0)); +} + +EVT AArch64TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, + unsigned SrcAlign, bool IsMemset, + bool ZeroMemset, + bool MemcpyStrSrc, + MachineFunction &MF) const { + // Don't use AdvSIMD to implement 16-byte memset. It would have taken one + // instruction to materialize the v2i64 zero and one store (with restrictive + // addressing mode). Just do two i64 store of zero-registers. + bool Fast; + const Function *F = MF.getFunction(); + if (Subtarget->hasFPARMv8() && !IsMemset && Size >= 16 && + !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, + Attribute::NoImplicitFloat) && + (memOpAlign(SrcAlign, DstAlign, 16) || + (allowsUnalignedMemoryAccesses(MVT::f128, 0, &Fast) && Fast))) + return MVT::f128; + + return Size >= 8 ? 
MVT::i64 : MVT::i32; +} + +// 12-bit optionally shifted immediates are legal for adds. +bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const { + if ((Immed >> 12) == 0 || ((Immed & 0xfff) == 0 && Immed >> 24 == 0)) + return true; + return false; +} + +// Integer comparisons are implemented with ADDS/SUBS, so the range of valid +// immediates is the same as for an add or a sub. +bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const { + if (Immed < 0) + Immed *= -1; + return isLegalAddImmediate(Immed); +} + +/// isLegalAddressingMode - Return true if the addressing mode represented +/// by AM is legal for this target, for a load/store of the specified type. +bool AArch64TargetLowering::isLegalAddressingMode(const AddrMode &AM, + Type *Ty) const { + // AArch64 has five basic addressing modes: + // reg + // reg + 9-bit signed offset + // reg + SIZE_IN_BYTES * 12-bit unsigned offset + // reg1 + reg2 + // reg + SIZE_IN_BYTES * reg + + // No global is ever allowed as a base. + if (AM.BaseGV) + return false; + + // No reg+reg+imm addressing. + if (AM.HasBaseReg && AM.BaseOffs && AM.Scale) + return false; + + // check reg + imm case: + // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12 + uint64_t NumBytes = 0; + if (Ty->isSized()) { + uint64_t NumBits = getDataLayout()->getTypeSizeInBits(Ty); + NumBytes = NumBits / 8; + if (!isPowerOf2_64(NumBits)) + NumBytes = 0; + } + + if (!AM.Scale) { + int64_t Offset = AM.BaseOffs; + + // 9-bit signed offset + if (Offset >= -(1LL << 9) && Offset <= (1LL << 9) - 1) + return true; + + // 12-bit unsigned offset + unsigned shift = Log2_64(NumBytes); + if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 && + // Must be a multiple of NumBytes (NumBytes is a power of 2) + (Offset >> shift) << shift == Offset) + return true; + return false; + } + + // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2 + + if (!AM.Scale || AM.Scale == 1 || + (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes)) + return true; + return false; +} + +int AArch64TargetLowering::getScalingFactorCost(const AddrMode &AM, + Type *Ty) const { + // Scaling factors are not free at all. + // Operands | Rt Latency + // ------------------------------------------- + // Rt, [Xn, Xm] | 4 + // ------------------------------------------- + // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5 + // Rt, [Xn, Wm, #imm] | + if (isLegalAddressingMode(AM, Ty)) + // Scale represents reg2 * scale, thus account for 1 if + // it is not equal to 0 or 1. + return AM.Scale != 0 && AM.Scale != 1; + return -1; +} + +bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { + VT = VT.getScalarType(); + + if (!VT.isSimple()) + return false; + + switch (VT.getSimpleVT().SimpleTy) { + case MVT::f32: + case MVT::f64: + return true; + default: + break; + } + + return false; +} + +const MCPhysReg * +AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const { + // LR is a callee-save register, but we must treat it as clobbered by any call + // site. Hence we include LR in the scratch registers, which are in turn added + // as implicit-defs for stackmaps and patchpoints. + static const MCPhysReg ScratchRegs[] = { + AArch64::X16, AArch64::X17, AArch64::LR, 0 + }; + return ScratchRegs; +} + +bool +AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N) const { + EVT VT = N->getValueType(0); + // If N is unsigned bit extraction: ((x >> C) & mask), then do not combine + // it with shift to let it be lowered to UBFX. 
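+  // E.g. ((x >> 3) & 0xff) is a contiguous 8-bit field starting at bit 3 and
+  // maps directly onto UBFX; letting the shift be commuted away would hide
+  // that pattern.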
+ if (N->getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) && + isa(N->getOperand(1))) { + uint64_t TruncMask = N->getConstantOperandVal(1); + if (isMask_64(TruncMask) && + N->getOperand(0).getOpcode() == ISD::SRL && + isa(N->getOperand(0)->getOperand(1))) + return false; + } + return true; +} + +bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, + Type *Ty) const { + assert(Ty->isIntegerTy()); + + unsigned BitSize = Ty->getPrimitiveSizeInBits(); + if (BitSize == 0) + return false; + + int64_t Val = Imm.getSExtValue(); + if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize)) + return true; + + if ((int64_t)Val < 0) + Val = ~Val; + if (BitSize == 32) + Val &= (1LL << 32) - 1; + + unsigned LZ = countLeadingZeros((uint64_t)Val); + unsigned Shift = (63 - LZ) / 16; + // MOVZ is free so return true for one or fewer MOVK. + return (Shift < 3) ? true : false; +} + +// Generate SUBS and CSEL for integer abs. +static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) { + EVT VT = N->getValueType(0); + + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDLoc DL(N); + + // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1) + // and change it to SUB and CSEL. + if (VT.isInteger() && N->getOpcode() == ISD::XOR && + N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 && + N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0)) + if (ConstantSDNode *Y1C = dyn_cast(N1.getOperand(1))) + if (Y1C->getAPIntValue() == VT.getSizeInBits() - 1) { + SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT), + N0.getOperand(0)); + // Generate SUBS & CSEL. + SDValue Cmp = + DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32), + N0.getOperand(0), DAG.getConstant(0, VT)); + return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0.getOperand(0), Neg, + DAG.getConstant(AArch64CC::PL, MVT::i32), + SDValue(Cmp.getNode(), 1)); + } + return SDValue(); +} + +// performXorCombine - Attempts to handle integer ABS. +static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const AArch64Subtarget *Subtarget) { + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + return performIntegerAbsCombine(N, DAG); +} + +static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const AArch64Subtarget *Subtarget) { + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + // Multiplication of a power of two plus/minus one can be done more + // cheaply as as shift+add/sub. For now, this is true unilaterally. If + // future CPUs have a cheaper MADD instruction, this may need to be + // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and + // 64-bit is 5 cycles, so this is always a win. + if (ConstantSDNode *C = dyn_cast(N->getOperand(1))) { + APInt Value = C->getAPIntValue(); + EVT VT = N->getValueType(0); + APInt VP1 = Value + 1; + if (VP1.isPowerOf2()) { + // Multiplying by one less than a power of two, replace with a shift + // and a subtract. + SDValue ShiftedVal = + DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0), + DAG.getConstant(VP1.logBase2(), MVT::i64)); + return DAG.getNode(ISD::SUB, SDLoc(N), VT, ShiftedVal, N->getOperand(0)); + } + APInt VM1 = Value - 1; + if (VM1.isPowerOf2()) { + // Multiplying by one more than a power of two, replace with a shift + // and an add. 
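+      // E.g. mul x, 9 becomes (x << 3) + x here, just as mul x, 7 above
+      // becomes (x << 3) - x.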
+ SDValue ShiftedVal = + DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0), + DAG.getConstant(VM1.logBase2(), MVT::i64)); + return DAG.getNode(ISD::ADD, SDLoc(N), VT, ShiftedVal, N->getOperand(0)); + } + } + return SDValue(); +} + +static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG) { + EVT VT = N->getValueType(0); + if (VT != MVT::f32 && VT != MVT::f64) + return SDValue(); + // Only optimize when the source and destination types have the same width. + if (VT.getSizeInBits() != N->getOperand(0).getValueType().getSizeInBits()) + return SDValue(); + + // If the result of an integer load is only used by an integer-to-float + // conversion, use a fp load instead and a AdvSIMD scalar {S|U}CVTF instead. + // This eliminates an "integer-to-vector-move UOP and improve throughput. + SDValue N0 = N->getOperand(0); + if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() && + // Do not change the width of a volatile load. + !cast(N0)->isVolatile()) { + LoadSDNode *LN0 = cast(N0); + SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(), + LN0->getPointerInfo(), LN0->isVolatile(), + LN0->isNonTemporal(), LN0->isInvariant(), + LN0->getAlignment()); + + // Make sure successors of the original load stay after it by updating them + // to use the new Chain. + DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1)); + + unsigned Opcode = + (N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF : AArch64ISD::UITOF; + return DAG.getNode(Opcode, SDLoc(N), VT, Load); + } + + return SDValue(); +} + +/// An EXTR instruction is made up of two shifts, ORed together. This helper +/// searches for and classifies those shifts. +static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount, + bool &FromHi) { + if (N.getOpcode() == ISD::SHL) + FromHi = false; + else if (N.getOpcode() == ISD::SRL) + FromHi = true; + else + return false; + + if (!isa(N.getOperand(1))) + return false; + + ShiftAmount = N->getConstantOperandVal(1); + Src = N->getOperand(0); + return true; +} + +/// EXTR instruction extracts a contiguous chunk of bits from two existing +/// registers viewed as a high/low pair. This function looks for the pattern: +/// (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) and replaces it with an +/// EXTR. Can't quite be done in TableGen because the two immediates aren't +/// independent. +static SDValue tryCombineToEXTR(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + SelectionDAG &DAG = DCI.DAG; + SDLoc DL(N); + EVT VT = N->getValueType(0); + + assert(N->getOpcode() == ISD::OR && "Unexpected root"); + + if (VT != MVT::i32 && VT != MVT::i64) + return SDValue(); + + SDValue LHS; + uint32_t ShiftLHS = 0; + bool LHSFromHi = 0; + if (!findEXTRHalf(N->getOperand(0), LHS, ShiftLHS, LHSFromHi)) + return SDValue(); + + SDValue RHS; + uint32_t ShiftRHS = 0; + bool RHSFromHi = 0; + if (!findEXTRHalf(N->getOperand(1), RHS, ShiftRHS, RHSFromHi)) + return SDValue(); + + // If they're both trying to come from the high part of the register, they're + // not really an EXTR. 
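+  // E.g. for i32, (or (shl A, #24), (srl B, #8)) is the 32-bit window
+  // starting at bit 8 of the A:B pair, which is exactly what EXTR computes.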
+ if (LHSFromHi == RHSFromHi) + return SDValue(); + + if (ShiftLHS + ShiftRHS != VT.getSizeInBits()) + return SDValue(); + + if (LHSFromHi) { + std::swap(LHS, RHS); + std::swap(ShiftLHS, ShiftRHS); + } + + return DAG.getNode(AArch64ISD::EXTR, DL, VT, LHS, RHS, + DAG.getConstant(ShiftRHS, MVT::i64)); +} + +static SDValue tryCombineToBSL(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + EVT VT = N->getValueType(0); + SelectionDAG &DAG = DCI.DAG; + SDLoc DL(N); + + if (!VT.isVector()) + return SDValue(); + + SDValue N0 = N->getOperand(0); + if (N0.getOpcode() != ISD::AND) + return SDValue(); + + SDValue N1 = N->getOperand(1); + if (N1.getOpcode() != ISD::AND) + return SDValue(); + + // We only have to look for constant vectors here since the general, variable + // case can be handled in TableGen. + unsigned Bits = VT.getVectorElementType().getSizeInBits(); + uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1); + for (int i = 1; i >= 0; --i) + for (int j = 1; j >= 0; --j) { + BuildVectorSDNode *BVN0 = dyn_cast(N0->getOperand(i)); + BuildVectorSDNode *BVN1 = dyn_cast(N1->getOperand(j)); + if (!BVN0 || !BVN1) + continue; + + bool FoundMatch = true; + for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) { + ConstantSDNode *CN0 = dyn_cast(BVN0->getOperand(k)); + ConstantSDNode *CN1 = dyn_cast(BVN1->getOperand(k)); + if (!CN0 || !CN1 || + CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) { + FoundMatch = false; + break; + } + } + + if (FoundMatch) + return DAG.getNode(AArch64ISD::BSL, DL, VT, SDValue(BVN0, 0), + N0->getOperand(1 - i), N1->getOperand(1 - j)); + } + + return SDValue(); +} + +static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, + const AArch64Subtarget *Subtarget) { + // Attempt to form an EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) + if (!EnableAArch64ExtrGeneration) + return SDValue(); + SelectionDAG &DAG = DCI.DAG; + EVT VT = N->getValueType(0); + + if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) + return SDValue(); + + SDValue Res = tryCombineToEXTR(N, DCI); + if (Res.getNode()) + return Res; + + Res = tryCombineToBSL(N, DCI); + if (Res.getNode()) + return Res; + + return SDValue(); +} + +static SDValue performBitcastCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { + // Wait 'til after everything is legalized to try this. That way we have + // legal vector types and such. + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + // Remove extraneous bitcasts around an extract_subvector. + // For example, + // (v4i16 (bitconvert + // (extract_subvector (v2i64 (bitconvert (v8i16 ...)), (i64 1))))) + // becomes + // (extract_subvector ((v8i16 ...), (i64 4))) + + // Only interested in 64-bit vectors as the ultimate result. + EVT VT = N->getValueType(0); + if (!VT.isVector()) + return SDValue(); + if (VT.getSimpleVT().getSizeInBits() != 64) + return SDValue(); + // Is the operand an extract_subvector starting at the beginning or halfway + // point of the vector? A low half may also come through as an + // EXTRACT_SUBREG, so look for that, too. 
+ SDValue Op0 = N->getOperand(0); + if (Op0->getOpcode() != ISD::EXTRACT_SUBVECTOR && + !(Op0->isMachineOpcode() && + Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG)) + return SDValue(); + uint64_t idx = cast(Op0->getOperand(1))->getZExtValue(); + if (Op0->getOpcode() == ISD::EXTRACT_SUBVECTOR) { + if (Op0->getValueType(0).getVectorNumElements() != idx && idx != 0) + return SDValue(); + } else if (Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG) { + if (idx != AArch64::dsub) + return SDValue(); + // The dsub reference is equivalent to a lane zero subvector reference. + idx = 0; + } + // Look through the bitcast of the input to the extract. + if (Op0->getOperand(0)->getOpcode() != ISD::BITCAST) + return SDValue(); + SDValue Source = Op0->getOperand(0)->getOperand(0); + // If the source type has twice the number of elements as our destination + // type, we know this is an extract of the high or low half of the vector. + EVT SVT = Source->getValueType(0); + if (SVT.getVectorNumElements() != VT.getVectorNumElements() * 2) + return SDValue(); + + DEBUG(dbgs() << "aarch64-lower: bitcast extract_subvector simplification\n"); + + // Create the simplified form to just extract the low or high half of the + // vector directly rather than bothering with the bitcasts. + SDLoc dl(N); + unsigned NumElements = VT.getVectorNumElements(); + if (idx) { + SDValue HalfIdx = DAG.getConstant(NumElements, MVT::i64); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Source, HalfIdx); + } else { + SDValue SubReg = DAG.getTargetConstant(AArch64::dsub, MVT::i32); + return SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, VT, + Source, SubReg), + 0); + } +} + +static SDValue performConcatVectorsCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { + // Wait 'til after everything is legalized to try this. That way we have + // legal vector types and such. + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + SDLoc dl(N); + EVT VT = N->getValueType(0); + + // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector + // splat. The indexed instructions are going to be expecting a DUPLANE64, so + // canonicalise to that. + if (N->getOperand(0) == N->getOperand(1) && VT.getVectorNumElements() == 2) { + assert(VT.getVectorElementType().getSizeInBits() == 64); + return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, + WidenVector(N->getOperand(0), DAG), + DAG.getConstant(0, MVT::i64)); + } + + // Canonicalise concat_vectors so that the right-hand vector has as few + // bit-casts as possible before its real operation. The primary matching + // destination for these operations will be the narrowing "2" instructions, + // which depend on the operation being performed on this right-hand vector. + // For example, + // (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS)))) + // becomes + // (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS)) + + SDValue Op1 = N->getOperand(1); + if (Op1->getOpcode() != ISD::BITCAST) + return SDValue(); + SDValue RHS = Op1->getOperand(0); + MVT RHSTy = RHS.getValueType().getSimpleVT(); + // If the RHS is not a vector, this is not the pattern we're looking for. 
+ if (!RHSTy.isVector()) + return SDValue(); + + DEBUG(dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n"); + + MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(), + RHSTy.getVectorNumElements() * 2); + return DAG.getNode( + ISD::BITCAST, dl, VT, + DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy, + DAG.getNode(ISD::BITCAST, dl, RHSTy, N->getOperand(0)), RHS)); +} + +static SDValue tryCombineFixedPointConvert(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { + // Wait 'til after everything is legalized to try this. That way we have + // legal vector types and such. + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + // Transform a scalar conversion of a value from a lane extract into a + // lane extract of a vector conversion. E.g., from foo1 to foo2: + // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); } + // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; } + // + // The second form interacts better with instruction selection and the + // register allocator to avoid cross-class register copies that aren't + // coalescable due to a lane reference. + + // Check the operand and see if it originates from a lane extract. + SDValue Op1 = N->getOperand(1); + if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { + // Yep, no additional predication needed. Perform the transform. + SDValue IID = N->getOperand(0); + SDValue Shift = N->getOperand(2); + SDValue Vec = Op1.getOperand(0); + SDValue Lane = Op1.getOperand(1); + EVT ResTy = N->getValueType(0); + EVT VecResTy; + SDLoc DL(N); + + // The vector width should be 128 bits by the time we get here, even + // if it started as 64 bits (the extract_vector handling will have + // done so). + assert(Vec.getValueType().getSizeInBits() == 128 && + "unexpected vector size on extract_vector_elt!"); + if (Vec.getValueType() == MVT::v4i32) + VecResTy = MVT::v4f32; + else if (Vec.getValueType() == MVT::v2i64) + VecResTy = MVT::v2f64; + else + assert(0 && "unexpected vector type!"); + + SDValue Convert = + DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane); + } + return SDValue(); +} + +// AArch64 high-vector "long" operations are formed by performing the non-high +// version on an extract_subvector of each operand which gets the high half: +// +// (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS)) +// +// However, there are cases which don't have an extract_high explicitly, but +// have another operation that can be made compatible with one for free. For +// example: +// +// (dupv64 scalar) --> (extract_high (dup128 scalar)) +// +// This routine does the actual conversion of such DUPs, once outer routines +// have determined that everything else is in order. +static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) { + // We can handle most types of duplicate, but the lane ones have an extra + // operand saying *which* lane, so we need to know. 
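The comment above is easier to see on a concrete widening operation (a sketch using assumed ACLE intrinsics, not taken from the patch): the second operand of a *_high_n intrinsic is a duplicated scalar rather than an explicit extract_high, and rewriting the 64-bit DUP as an extract_high of a 128-bit DUP lets the "2" form of the instruction match.

#include <arm_neon.h>

// Roughly "(widening multiply (extract_high a), (dup b))" at the DAG level.
// tryExtendDUPToExtractHigh widens the dup of b to 128 bits and takes its
// high half, so both operands end up with the extract_high shape that the
// SMULL2 pattern expects.
int32x4_t widening_mul(int16x8_t a, int16_t b) {
  return vmull_high_n_s16(a, b);
}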
+ bool IsDUPLANE; + switch (N.getOpcode()) { + case AArch64ISD::DUP: + IsDUPLANE = false; + break; + case AArch64ISD::DUPLANE8: + case AArch64ISD::DUPLANE16: + case AArch64ISD::DUPLANE32: + case AArch64ISD::DUPLANE64: + IsDUPLANE = true; + break; + default: + return SDValue(); + } + + MVT NarrowTy = N.getSimpleValueType(); + if (!NarrowTy.is64BitVector()) + return SDValue(); + + MVT ElementTy = NarrowTy.getVectorElementType(); + unsigned NumElems = NarrowTy.getVectorNumElements(); + MVT NewDUPVT = MVT::getVectorVT(ElementTy, NumElems * 2); + + SDValue NewDUP; + if (IsDUPLANE) + NewDUP = DAG.getNode(N.getOpcode(), SDLoc(N), NewDUPVT, N.getOperand(0), + N.getOperand(1)); + else + NewDUP = DAG.getNode(AArch64ISD::DUP, SDLoc(N), NewDUPVT, N.getOperand(0)); + + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N.getNode()), NarrowTy, + NewDUP, DAG.getConstant(NumElems, MVT::i64)); +} + +static bool isEssentiallyExtractSubvector(SDValue N) { + if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR) + return true; + + return N.getOpcode() == ISD::BITCAST && + N.getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR; +} + +/// \brief Helper structure to keep track of ISD::SET_CC operands. +struct GenericSetCCInfo { + const SDValue *Opnd0; + const SDValue *Opnd1; + ISD::CondCode CC; +}; + +/// \brief Helper structure to keep track of a SET_CC lowered into AArch64 code. +struct AArch64SetCCInfo { + const SDValue *Cmp; + AArch64CC::CondCode CC; +}; + +/// \brief Helper structure to keep track of SetCC information. +union SetCCInfo { + GenericSetCCInfo Generic; + AArch64SetCCInfo AArch64; +}; + +/// \brief Helper structure to be able to read SetCC information. If set to +/// true, IsAArch64 field, Info is a AArch64SetCCInfo, otherwise Info is a +/// GenericSetCCInfo. +struct SetCCInfoAndKind { + SetCCInfo Info; + bool IsAArch64; +}; + +/// \brief Check whether or not \p Op is a SET_CC operation, either a generic or +/// an +/// AArch64 lowered one. +/// \p SetCCInfo is filled accordingly. +/// \post SetCCInfo is meanginfull only when this function returns true. +/// \return True when Op is a kind of SET_CC operation. +static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) { + // If this is a setcc, this is straight forward. + if (Op.getOpcode() == ISD::SETCC) { + SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0); + SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1); + SetCCInfo.Info.Generic.CC = cast(Op.getOperand(2))->get(); + SetCCInfo.IsAArch64 = false; + return true; + } + // Otherwise, check if this is a matching csel instruction. + // In other words: + // - csel 1, 0, cc + // - csel 0, 1, !cc + if (Op.getOpcode() != AArch64ISD::CSEL) + return false; + // Set the information about the operands. + // TODO: we want the operands of the Cmp not the csel + SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3); + SetCCInfo.IsAArch64 = true; + SetCCInfo.Info.AArch64.CC = static_cast( + cast(Op.getOperand(2))->getZExtValue()); + + // Check that the operands matches the constraints: + // (1) Both operands must be constants. + // (2) One must be 1 and the other must be 0. + ConstantSDNode *TValue = dyn_cast(Op.getOperand(0)); + ConstantSDNode *FValue = dyn_cast(Op.getOperand(1)); + + // Check (1). + if (!TValue || !FValue) + return false; + + // Check (2). + if (!TValue->isOne()) { + // Update the comparison when we are interested in !cc. 
+ std::swap(TValue, FValue); + SetCCInfo.Info.AArch64.CC = + AArch64CC::getInvertedCondCode(SetCCInfo.Info.AArch64.CC); + } + return TValue->isOne() && FValue->isNullValue(); +} + +// Returns true if Op is setcc or zext of setcc. +static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) { + if (isSetCC(Op, Info)) + return true; + return ((Op.getOpcode() == ISD::ZERO_EXTEND) && + isSetCC(Op->getOperand(0), Info)); +} + +// The folding we want to perform is: +// (add x, [zext] (setcc cc ...) ) +// --> +// (csel x, (add x, 1), !cc ...) +// +// The latter will get matched to a CSINC instruction. +static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) { + assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!"); + SDValue LHS = Op->getOperand(0); + SDValue RHS = Op->getOperand(1); + SetCCInfoAndKind InfoAndKind; + + // If neither operand is a SET_CC, give up. + if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) { + std::swap(LHS, RHS); + if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) + return SDValue(); + } + + // FIXME: This could be generatized to work for FP comparisons. + EVT CmpVT = InfoAndKind.IsAArch64 + ? InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType() + : InfoAndKind.Info.Generic.Opnd0->getValueType(); + if (CmpVT != MVT::i32 && CmpVT != MVT::i64) + return SDValue(); + + SDValue CCVal; + SDValue Cmp; + SDLoc dl(Op); + if (InfoAndKind.IsAArch64) { + CCVal = DAG.getConstant( + AArch64CC::getInvertedCondCode(InfoAndKind.Info.AArch64.CC), MVT::i32); + Cmp = *InfoAndKind.Info.AArch64.Cmp; + } else + Cmp = getAArch64Cmp(*InfoAndKind.Info.Generic.Opnd0, + *InfoAndKind.Info.Generic.Opnd1, + ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, true), + CCVal, DAG, dl); + + EVT VT = Op->getValueType(0); + LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, VT)); + return DAG.getNode(AArch64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp); +} + +// The basic add/sub long vector instructions have variants with "2" on the end +// which act on the high-half of their inputs. They are normally matched by +// patterns like: +// +// (add (zeroext (extract_high LHS)), +// (zeroext (extract_high RHS))) +// -> uaddl2 vD, vN, vM +// +// However, if one of the extracts is something like a duplicate, this +// instruction can still be used profitably. This function puts the DAG into a +// more appropriate form for those patterns to trigger. +static SDValue performAddSubLongCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + MVT VT = N->getSimpleValueType(0); + if (!VT.is128BitVector()) { + if (N->getOpcode() == ISD::ADD) + return performSetccAddFolding(N, DAG); + return SDValue(); + } + + // Make sure both branches are extended in the same way. + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + if ((LHS.getOpcode() != ISD::ZERO_EXTEND && + LHS.getOpcode() != ISD::SIGN_EXTEND) || + LHS.getOpcode() != RHS.getOpcode()) + return SDValue(); + + unsigned ExtType = LHS.getOpcode(); + + // It's not worth doing if at least one of the inputs isn't already an + // extract, but we don't know which it'll be so we have to try both. 
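As a source-level illustration of the (add x, [zext] (setcc ...)) folding implemented by performSetccAddFolding above (sketch only): adding a comparison result to an integer becomes a compare plus CSINC instead of materialising a 0/1 value and adding it.

#include <cstdint>

// "x + (a < b)" is (add x, (zext (setcc lt a, b))). The fold rewrites it to
// (csel x, (add x, 1), ge, (cmp a, b)), which instruction selection emits as
// a CMP followed by CSINC; no separate CSET or ADD is needed.
int64_t add_flag(int64_t x, int64_t a, int64_t b) {
  return x + (a < b);
}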
+ if (isEssentiallyExtractSubvector(LHS.getOperand(0))) { + RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG); + if (!RHS.getNode()) + return SDValue(); + + RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS); + } else if (isEssentiallyExtractSubvector(RHS.getOperand(0))) { + LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG); + if (!LHS.getNode()) + return SDValue(); + + LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS); + } + + return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS); +} + +// Massage DAGs which we can use the high-half "long" operations on into +// something isel will recognize better. E.g. +// +// (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) --> +// (aarch64_neon_umull (extract_high (v2i64 vec))) +// (extract_high (v2i64 (dup128 scalar))))) +// +static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + SDValue LHS = N->getOperand(1); + SDValue RHS = N->getOperand(2); + assert(LHS.getValueType().is64BitVector() && + RHS.getValueType().is64BitVector() && + "unexpected shape for long operation"); + + // Either node could be a DUP, but it's not worth doing both of them (you'd + // just as well use the non-high version) so look for a corresponding extract + // operation on the other "wing". + if (isEssentiallyExtractSubvector(LHS)) { + RHS = tryExtendDUPToExtractHigh(RHS, DAG); + if (!RHS.getNode()) + return SDValue(); + } else if (isEssentiallyExtractSubvector(RHS)) { + LHS = tryExtendDUPToExtractHigh(LHS, DAG); + if (!LHS.getNode()) + return SDValue(); + } + + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0), + N->getOperand(0), LHS, RHS); +} + +static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) { + MVT ElemTy = N->getSimpleValueType(0).getScalarType(); + unsigned ElemBits = ElemTy.getSizeInBits(); + + int64_t ShiftAmount; + if (BuildVectorSDNode *BVN = dyn_cast(N->getOperand(2))) { + APInt SplatValue, SplatUndef; + unsigned SplatBitSize; + bool HasAnyUndefs; + if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, + HasAnyUndefs, ElemBits) || + SplatBitSize != ElemBits) + return SDValue(); + + ShiftAmount = SplatValue.getSExtValue(); + } else if (ConstantSDNode *CVN = dyn_cast(N->getOperand(2))) { + ShiftAmount = CVN->getSExtValue(); + } else + return SDValue(); + + unsigned Opcode; + bool IsRightShift; + switch (IID) { + default: + llvm_unreachable("Unknown shift intrinsic"); + case Intrinsic::aarch64_neon_sqshl: + Opcode = AArch64ISD::SQSHL_I; + IsRightShift = false; + break; + case Intrinsic::aarch64_neon_uqshl: + Opcode = AArch64ISD::UQSHL_I; + IsRightShift = false; + break; + case Intrinsic::aarch64_neon_srshl: + Opcode = AArch64ISD::SRSHR_I; + IsRightShift = true; + break; + case Intrinsic::aarch64_neon_urshl: + Opcode = AArch64ISD::URSHR_I; + IsRightShift = true; + break; + case Intrinsic::aarch64_neon_sqshlu: + Opcode = AArch64ISD::SQSHLU_I; + IsRightShift = false; + break; + } + + if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) + return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), N->getOperand(1), + DAG.getConstant(-ShiftAmount, MVT::i32)); + else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount <= ElemBits) + return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), N->getOperand(1), + DAG.getConstant(ShiftAmount, MVT::i32)); + + return SDValue(); +} + +// The CRC32[BH] instructions ignore the high bits of their data operand. 
Since +// the intrinsics must be legal and take an i32, this means there's almost +// certainly going to be a zext in the DAG which we can eliminate. +static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) { + SDValue AndN = N->getOperand(2); + if (AndN.getOpcode() != ISD::AND) + return SDValue(); + + ConstantSDNode *CMask = dyn_cast(AndN.getOperand(1)); + if (!CMask || CMask->getZExtValue() != Mask) + return SDValue(); + + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32, + N->getOperand(0), N->getOperand(1), AndN.getOperand(0)); +} + +static SDValue performIntrinsicCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + const AArch64Subtarget *Subtarget) { + SelectionDAG &DAG = DCI.DAG; + unsigned IID = getIntrinsicID(N); + switch (IID) { + default: + break; + case Intrinsic::aarch64_neon_vcvtfxs2fp: + case Intrinsic::aarch64_neon_vcvtfxu2fp: + return tryCombineFixedPointConvert(N, DCI, DAG); + break; + case Intrinsic::aarch64_neon_fmax: + return DAG.getNode(AArch64ISD::FMAX, SDLoc(N), N->getValueType(0), + N->getOperand(1), N->getOperand(2)); + case Intrinsic::aarch64_neon_fmin: + return DAG.getNode(AArch64ISD::FMIN, SDLoc(N), N->getValueType(0), + N->getOperand(1), N->getOperand(2)); + case Intrinsic::aarch64_neon_smull: + case Intrinsic::aarch64_neon_umull: + case Intrinsic::aarch64_neon_pmull: + case Intrinsic::aarch64_neon_sqdmull: + return tryCombineLongOpWithDup(IID, N, DCI, DAG); + case Intrinsic::aarch64_neon_sqshl: + case Intrinsic::aarch64_neon_uqshl: + case Intrinsic::aarch64_neon_sqshlu: + case Intrinsic::aarch64_neon_srshl: + case Intrinsic::aarch64_neon_urshl: + return tryCombineShiftImm(IID, N, DAG); + case Intrinsic::aarch64_crc32b: + case Intrinsic::aarch64_crc32cb: + return tryCombineCRC32(0xff, N, DAG); + case Intrinsic::aarch64_crc32h: + case Intrinsic::aarch64_crc32ch: + return tryCombineCRC32(0xffff, N, DAG); + } + return SDValue(); +} + +static SDValue performExtendCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { + // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then + // we can convert that DUP into another extract_high (of a bigger DUP), which + // helps the backend to decide that an sabdl2 would be useful, saving a real + // extract_high operation. + if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND && + N->getOperand(0).getOpcode() == ISD::INTRINSIC_WO_CHAIN) { + SDNode *ABDNode = N->getOperand(0).getNode(); + unsigned IID = getIntrinsicID(ABDNode); + if (IID == Intrinsic::aarch64_neon_sabd || + IID == Intrinsic::aarch64_neon_uabd) { + SDValue NewABD = tryCombineLongOpWithDup(IID, ABDNode, DCI, DAG); + if (!NewABD.getNode()) + return SDValue(); + + return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), + NewABD); + } + } + + // This is effectively a custom type legalization for AArch64. + // + // Type legalization will split an extend of a small, legal, type to a larger + // illegal type by first splitting the destination type, often creating + // illegal source types, which then get legalized in isel-confusing ways, + // leading to really terrible codegen. E.g., + // %result = v8i32 sext v8i8 %value + // becomes + // %losrc = extract_subreg %value, ... + // %hisrc = extract_subreg %value, ... + // %lo = v4i32 sext v4i8 %losrc + // %hi = v4i32 sext v4i8 %hisrc + // Things go rapidly downhill from there. 
+ // + // For AArch64, the [sz]ext vector instructions can only go up one element + // size, so we can, e.g., extend from i8 to i16, but to go from i8 to i32 + // take two instructions. + // + // This implies that the most efficient way to do the extend from v8i8 + // to two v4i32 values is to first extend the v8i8 to v8i16, then do + // the normal splitting to happen for the v8i16->v8i32. + + // This is pre-legalization to catch some cases where the default + // type legalization will create ill-tempered code. + if (!DCI.isBeforeLegalizeOps()) + return SDValue(); + + // We're only interested in cleaning things up for non-legal vector types + // here. If both the source and destination are legal, things will just + // work naturally without any fiddling. + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT ResVT = N->getValueType(0); + if (!ResVT.isVector() || TLI.isTypeLegal(ResVT)) + return SDValue(); + // If the vector type isn't a simple VT, it's beyond the scope of what + // we're worried about here. Let legalization do its thing and hope for + // the best. + if (!ResVT.isSimple()) + return SDValue(); + + SDValue Src = N->getOperand(0); + MVT SrcVT = Src->getValueType(0).getSimpleVT(); + // If the source VT is a 64-bit vector, we can play games and get the + // better results we want. + if (SrcVT.getSizeInBits() != 64) + return SDValue(); + + unsigned SrcEltSize = SrcVT.getVectorElementType().getSizeInBits(); + unsigned ElementCount = SrcVT.getVectorNumElements(); + SrcVT = MVT::getVectorVT(MVT::getIntegerVT(SrcEltSize * 2), ElementCount); + SDLoc DL(N); + Src = DAG.getNode(N->getOpcode(), DL, SrcVT, Src); + + // Now split the rest of the operation into two halves, each with a 64 + // bit source. + EVT LoVT, HiVT; + SDValue Lo, Hi; + unsigned NumElements = ResVT.getVectorNumElements(); + assert(!(NumElements & 1) && "Splitting vector, but not in half!"); + LoVT = HiVT = EVT::getVectorVT(*DAG.getContext(), + ResVT.getVectorElementType(), NumElements / 2); + + EVT InNVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(), + LoVT.getVectorNumElements()); + Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src, + DAG.getIntPtrConstant(0)); + Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src, + DAG.getIntPtrConstant(InNVT.getVectorNumElements())); + Lo = DAG.getNode(N->getOpcode(), DL, LoVT, Lo); + Hi = DAG.getNode(N->getOpcode(), DL, HiVT, Hi); + + // Now combine the parts back together so we still have a single result + // like the combiner expects. + return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi); +} + +/// Replace a splat of a scalar to a vector store by scalar stores of the scalar +/// value. The load store optimizer pass will merge them to store pair stores. +/// This has better performance than a splat of the scalar followed by a split +/// vector store. Even if the stores are not merged it is four stores vs a dup, +/// followed by an ext.b and two stores. +static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode *St) { + SDValue StVal = St->getValue(); + EVT VT = StVal.getValueType(); + + // Don't replace floating point stores, they possibly won't be transformed to + // stp because of the store pair suppress pass. + if (VT.isFloatingPoint()) + return SDValue(); + + // Check for insert vector elements. + if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT) + return SDValue(); + + // We can express a splat as store pair(s) for 2 or 4 elements. 
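For the splat case just described, a sketch of the scalar-store form the combine produces for a v4i32 splat (illustrative only):

#include <cstdint>

// Instead of building the splat in a vector register and storing it with a
// dup/ext/two-store sequence, the combine emits four 32-bit scalar stores at
// offsets 0, 4, 8 and 12; the load/store optimizer can later pair them into
// two STP instructions.
void store_splat4(int32_t *p, int32_t v) {
  p[0] = v;
  p[1] = v;
  p[2] = v;
  p[3] = v;
}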
+ unsigned NumVecElts = VT.getVectorNumElements(); + if (NumVecElts != 4 && NumVecElts != 2) + return SDValue(); + SDValue SplatVal = StVal.getOperand(1); + unsigned RemainInsertElts = NumVecElts - 1; + + // Check that this is a splat. + while (--RemainInsertElts) { + SDValue NextInsertElt = StVal.getOperand(0); + if (NextInsertElt.getOpcode() != ISD::INSERT_VECTOR_ELT) + return SDValue(); + if (NextInsertElt.getOperand(1) != SplatVal) + return SDValue(); + StVal = NextInsertElt; + } + unsigned OrigAlignment = St->getAlignment(); + unsigned EltOffset = NumVecElts == 4 ? 4 : 8; + unsigned Alignment = std::min(OrigAlignment, EltOffset); + + // Create scalar stores. This is at least as good as the code sequence for a + // split unaligned store wich is a dup.s, ext.b, and two stores. + // Most of the time the three stores should be replaced by store pair + // instructions (stp). + SDLoc DL(St); + SDValue BasePtr = St->getBasePtr(); + SDValue NewST1 = + DAG.getStore(St->getChain(), DL, SplatVal, BasePtr, St->getPointerInfo(), + St->isVolatile(), St->isNonTemporal(), St->getAlignment()); + + unsigned Offset = EltOffset; + while (--NumVecElts) { + SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, + DAG.getConstant(Offset, MVT::i64)); + NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr, + St->getPointerInfo(), St->isVolatile(), + St->isNonTemporal(), Alignment); + Offset += EltOffset; + } + return NewST1; +} + +static SDValue performSTORECombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG, + const AArch64Subtarget *Subtarget) { + if (!DCI.isBeforeLegalize()) + return SDValue(); + + StoreSDNode *S = cast(N); + if (S->isVolatile()) + return SDValue(); + + // Cyclone has bad performance on unaligned 16B stores when crossing line and + // page boundries. We want to split such stores. + if (!Subtarget->isCyclone()) + return SDValue(); + + // Don't split at Oz. + MachineFunction &MF = DAG.getMachineFunction(); + bool IsMinSize = MF.getFunction()->getAttributes().hasAttribute( + AttributeSet::FunctionIndex, Attribute::MinSize); + if (IsMinSize) + return SDValue(); + + SDValue StVal = S->getValue(); + EVT VT = StVal.getValueType(); + + // Don't split v2i64 vectors. Memcpy lowering produces those and splitting + // those up regresses performance on micro-benchmarks and olden/bh. + if (!VT.isVector() || VT.getVectorNumElements() < 2 || VT == MVT::v2i64) + return SDValue(); + + // Split unaligned 16B stores. They are terrible for performance. + // Don't split stores with alignment of 1 or 2. Code that uses clang vector + // extensions can use this to mark that it does not want splitting to happen + // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of + // eliminating alignment hazards is only 1 in 8 for alignment of 2. + if (VT.getSizeInBits() != 128 || S->getAlignment() >= 16 || + S->getAlignment() <= 2) + return SDValue(); + + // If we get a splat of a scalar convert this vector store to a store of + // scalars. They will be merged into store pairs thereby removing two + // instructions. + SDValue ReplacedSplat = replaceSplatVectorStore(DAG, S); + if (ReplacedSplat != SDValue()) + return ReplacedSplat; + + SDLoc DL(S); + unsigned NumElts = VT.getVectorNumElements() / 2; + // Split VT into two. 
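For the non-splat path that follows, the effect of the split is simply two independent 8-byte stores at offsets 0 and 8 (a sketch under the assumption of a 16-byte value stored with low alignment):

#include <cstdint>
#include <cstring>

// Source-level analogue of splitting one unaligned 16-byte store into two
// 8-byte halves (EXTRACT_SUBVECTOR at index 0 and at NumElts / 2), which
// reduces the chance of straddling a cache line or page boundary.
void store_split16(uint8_t *dst, const uint8_t (&val)[16]) {
  std::memcpy(dst, val, 8);
  std::memcpy(dst + 8, val + 8, 8);
}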
+ EVT HalfVT = + EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), NumElts); + SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal, + DAG.getIntPtrConstant(0)); + SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal, + DAG.getIntPtrConstant(NumElts)); + SDValue BasePtr = S->getBasePtr(); + SDValue NewST1 = + DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(), + S->isVolatile(), S->isNonTemporal(), S->getAlignment()); + SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, + DAG.getConstant(8, MVT::i64)); + return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr, + S->getPointerInfo(), S->isVolatile(), S->isNonTemporal(), + S->getAlignment()); +} + +/// Target-specific DAG combine function for post-increment LD1 (lane) and +/// post-increment LD1R. +static SDValue performPostLD1Combine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + bool IsLaneOp) { + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + EVT VT = N->getValueType(0); + + unsigned LoadIdx = IsLaneOp ? 1 : 0; + SDNode *LD = N->getOperand(LoadIdx).getNode(); + // If it is not LOAD, can not do such combine. + if (LD->getOpcode() != ISD::LOAD) + return SDValue(); + + LoadSDNode *LoadSDN = cast(LD); + EVT MemVT = LoadSDN->getMemoryVT(); + // Check if memory operand is the same type as the vector element. + if (MemVT != VT.getVectorElementType()) + return SDValue(); + + // Check if there are other uses. If so, do not combine as it will introduce + // an extra load. + for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE; + ++UI) { + if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result. + continue; + if (*UI != N) + return SDValue(); + } + + SDValue Addr = LD->getOperand(1); + SDValue Vector = N->getOperand(0); + // Search for a use of the address operand that is an increment. + for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE = + Addr.getNode()->use_end(); UI != UE; ++UI) { + SDNode *User = *UI; + if (User->getOpcode() != ISD::ADD + || UI.getUse().getResNo() != Addr.getResNo()) + continue; + + // Check that the add is independent of the load. Otherwise, folding it + // would create a cycle. + if (User->isPredecessorOf(LD) || LD->isPredecessorOf(User)) + continue; + // Also check that add is not used in the vector operand. This would also + // create a cycle. + if (User->isPredecessorOf(Vector.getNode())) + continue; + + // If the increment is a constant, it must match the memory ref size. + SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); + if (ConstantSDNode *CInc = dyn_cast(Inc.getNode())) { + uint32_t IncVal = CInc->getZExtValue(); + unsigned NumBytes = VT.getScalarSizeInBits() / 8; + if (IncVal != NumBytes) + continue; + Inc = DAG.getRegister(AArch64::XZR, MVT::i64); + } + + SmallVector Ops; + Ops.push_back(LD->getOperand(0)); // Chain + if (IsLaneOp) { + Ops.push_back(Vector); // The vector to be inserted + Ops.push_back(N->getOperand(2)); // The lane to be inserted in the vector + } + Ops.push_back(Addr); + Ops.push_back(Inc); + + EVT Tys[3] = { VT, MVT::i64, MVT::Other }; + SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, 3)); + unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost; + SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops, + MemVT, + LoadSDN->getMemOperand()); + + // Update the uses. 
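A source-level shape that can exercise the LD1R path of performPostLD1Combine (a sketch with invented names, assuming ACLE intrinsics): a scalar load duplicated into all lanes, where the only other use of the address is its increment by the element size.

#include <arm_neon.h>

// Each iteration contains (AArch64ISD::DUP (load p)) and (add p, 4). When the
// load has no other users and the increment matches the element size, the
// combine can form LD1DUPpost, i.e. "ld1r { v0.4s }, [x0], #4".
int32x4_t sum_dups(const int32_t *p, int n) {
  int32x4_t acc = vdupq_n_s32(0);
  for (int i = 0; i < n; ++i) {
    acc = vaddq_s32(acc, vdupq_n_s32(*p));
    p += 1; // 4-byte increment, matching the i32 element size
  }
  return acc;
}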
+ std::vector NewResults; + NewResults.push_back(SDValue(LD, 0)); // The result of load + NewResults.push_back(SDValue(UpdN.getNode(), 2)); // Chain + DCI.CombineTo(LD, NewResults); + DCI.CombineTo(N, SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result + DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register + + break; + } + return SDValue(); +} + +/// Target-specific DAG combine function for NEON load/store intrinsics +/// to merge base address updates. +static SDValue performNEONPostLDSTCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { + if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) + return SDValue(); + + unsigned AddrOpIdx = N->getNumOperands() - 1; + SDValue Addr = N->getOperand(AddrOpIdx); + + // Search for a use of the address operand that is an increment. + for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), + UE = Addr.getNode()->use_end(); UI != UE; ++UI) { + SDNode *User = *UI; + if (User->getOpcode() != ISD::ADD || + UI.getUse().getResNo() != Addr.getResNo()) + continue; + + // Check that the add is independent of the load/store. Otherwise, folding + // it would create a cycle. + if (User->isPredecessorOf(N) || N->isPredecessorOf(User)) + continue; + + // Find the new opcode for the updating load/store. + bool IsStore = false; + bool IsLaneOp = false; + bool IsDupOp = false; + unsigned NewOpc = 0; + unsigned NumVecs = 0; + unsigned IntNo = cast(N->getOperand(1))->getZExtValue(); + switch (IntNo) { + default: llvm_unreachable("unexpected intrinsic for Neon base update"); + case Intrinsic::aarch64_neon_ld2: NewOpc = AArch64ISD::LD2post; + NumVecs = 2; break; + case Intrinsic::aarch64_neon_ld3: NewOpc = AArch64ISD::LD3post; + NumVecs = 3; break; + case Intrinsic::aarch64_neon_ld4: NewOpc = AArch64ISD::LD4post; + NumVecs = 4; break; + case Intrinsic::aarch64_neon_st2: NewOpc = AArch64ISD::ST2post; + NumVecs = 2; IsStore = true; break; + case Intrinsic::aarch64_neon_st3: NewOpc = AArch64ISD::ST3post; + NumVecs = 3; IsStore = true; break; + case Intrinsic::aarch64_neon_st4: NewOpc = AArch64ISD::ST4post; + NumVecs = 4; IsStore = true; break; + case Intrinsic::aarch64_neon_ld1x2: NewOpc = AArch64ISD::LD1x2post; + NumVecs = 2; break; + case Intrinsic::aarch64_neon_ld1x3: NewOpc = AArch64ISD::LD1x3post; + NumVecs = 3; break; + case Intrinsic::aarch64_neon_ld1x4: NewOpc = AArch64ISD::LD1x4post; + NumVecs = 4; break; + case Intrinsic::aarch64_neon_st1x2: NewOpc = AArch64ISD::ST1x2post; + NumVecs = 2; IsStore = true; break; + case Intrinsic::aarch64_neon_st1x3: NewOpc = AArch64ISD::ST1x3post; + NumVecs = 3; IsStore = true; break; + case Intrinsic::aarch64_neon_st1x4: NewOpc = AArch64ISD::ST1x4post; + NumVecs = 4; IsStore = true; break; + case Intrinsic::aarch64_neon_ld2r: NewOpc = AArch64ISD::LD2DUPpost; + NumVecs = 2; IsDupOp = true; break; + case Intrinsic::aarch64_neon_ld3r: NewOpc = AArch64ISD::LD3DUPpost; + NumVecs = 3; IsDupOp = true; break; + case Intrinsic::aarch64_neon_ld4r: NewOpc = AArch64ISD::LD4DUPpost; + NumVecs = 4; IsDupOp = true; break; + case Intrinsic::aarch64_neon_ld2lane: NewOpc = AArch64ISD::LD2LANEpost; + NumVecs = 2; IsLaneOp = true; break; + case Intrinsic::aarch64_neon_ld3lane: NewOpc = AArch64ISD::LD3LANEpost; + NumVecs = 3; IsLaneOp = true; break; + case Intrinsic::aarch64_neon_ld4lane: NewOpc = AArch64ISD::LD4LANEpost; + NumVecs = 4; IsLaneOp = true; break; + case Intrinsic::aarch64_neon_st2lane: NewOpc = AArch64ISD::ST2LANEpost; + NumVecs = 2; IsStore = true; IsLaneOp = true; break; + case 
Intrinsic::aarch64_neon_st3lane: NewOpc = AArch64ISD::ST3LANEpost; + NumVecs = 3; IsStore = true; IsLaneOp = true; break; + case Intrinsic::aarch64_neon_st4lane: NewOpc = AArch64ISD::ST4LANEpost; + NumVecs = 4; IsStore = true; IsLaneOp = true; break; + } + + EVT VecTy; + if (IsStore) + VecTy = N->getOperand(2).getValueType(); + else + VecTy = N->getValueType(0); + + // If the increment is a constant, it must match the memory ref size. + SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); + if (ConstantSDNode *CInc = dyn_cast(Inc.getNode())) { + uint32_t IncVal = CInc->getZExtValue(); + unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8; + if (IsLaneOp || IsDupOp) + NumBytes /= VecTy.getVectorNumElements(); + if (IncVal != NumBytes) + continue; + Inc = DAG.getRegister(AArch64::XZR, MVT::i64); + } + SmallVector Ops; + Ops.push_back(N->getOperand(0)); // Incoming chain + // Load lane and store have vector list as input. + if (IsLaneOp || IsStore) + for (unsigned i = 2; i < AddrOpIdx; ++i) + Ops.push_back(N->getOperand(i)); + Ops.push_back(Addr); // Base register + Ops.push_back(Inc); + + // Return Types. + EVT Tys[6]; + unsigned NumResultVecs = (IsStore ? 0 : NumVecs); + unsigned n; + for (n = 0; n < NumResultVecs; ++n) + Tys[n] = VecTy; + Tys[n++] = MVT::i64; // Type of write back register + Tys[n] = MVT::Other; // Type of the chain + SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2)); + + MemIntrinsicSDNode *MemInt = cast(N); + SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops, + MemInt->getMemoryVT(), + MemInt->getMemOperand()); + + // Update the uses. + std::vector NewResults; + for (unsigned i = 0; i < NumResultVecs; ++i) { + NewResults.push_back(SDValue(UpdN.getNode(), i)); + } + NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); + DCI.CombineTo(N, NewResults); + DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs)); + + break; + } + return SDValue(); +} + +// Optimize compare with zero and branch. +static SDValue performBRCONDCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { + SDValue Chain = N->getOperand(0); + SDValue Dest = N->getOperand(1); + SDValue CCVal = N->getOperand(2); + SDValue Cmp = N->getOperand(3); + + assert(isa(CCVal) && "Expected a ConstantSDNode here!"); + unsigned CC = cast(CCVal)->getZExtValue(); + if (CC != AArch64CC::EQ && CC != AArch64CC::NE) + return SDValue(); + + unsigned CmpOpc = Cmp.getOpcode(); + if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS) + return SDValue(); + + // Only attempt folding if there is only one use of the flag and no use of the + // value. + if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1)) + return SDValue(); + + SDValue LHS = Cmp.getOperand(0); + SDValue RHS = Cmp.getOperand(1); + + assert(LHS.getValueType() == RHS.getValueType() && + "Expected the value type to be the same for both operands!"); + if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64) + return SDValue(); + + if (isa(LHS) && cast(LHS)->isNullValue()) + std::swap(LHS, RHS); + + if (!isa(RHS) || !cast(RHS)->isNullValue()) + return SDValue(); + + if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA || + LHS.getOpcode() == ISD::SRL) + return SDValue(); + + // Fold the compare into the branch instruction. 
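A minimal C-level case for the branch combine above (sketch only; hit() is a placeholder): branching on a comparison with zero, where the flags have no other user, folds the SUBS into the branch as CBZ or CBNZ.

#include <cstdint>

extern void hit();

// "(brcond (subs x, 0), eq)" with no other use of the flags or of the
// subtraction result becomes "cbz x0, label"; the explicit compare disappears.
void branch_on_zero(int64_t x) {
  if (x == 0)
    hit();
}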
+ SDValue BR; + if (CC == AArch64CC::EQ) + BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest); + else + BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest); + + // Do not add new nodes to DAG combiner worklist. + DCI.CombineTo(N, BR, false); + + return SDValue(); +} + +// vselect (v1i1 setcc) -> +// vselect (v1iXX setcc) (XX is the size of the compared operand type) +// FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as +// condition. If it can legalize "VSELECT v1i1" correctly, no need to combine +// such VSELECT. +static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) { + SDValue N0 = N->getOperand(0); + EVT CCVT = N0.getValueType(); + + if (N0.getOpcode() != ISD::SETCC || CCVT.getVectorNumElements() != 1 || + CCVT.getVectorElementType() != MVT::i1) + return SDValue(); + + EVT ResVT = N->getValueType(0); + EVT CmpVT = N0.getOperand(0).getValueType(); + // Only combine when the result type is of the same size as the compared + // operands. + if (ResVT.getSizeInBits() != CmpVT.getSizeInBits()) + return SDValue(); + + SDValue IfTrue = N->getOperand(1); + SDValue IfFalse = N->getOperand(2); + SDValue SetCC = + DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(), + N0.getOperand(0), N0.getOperand(1), + cast(N0.getOperand(2))->get()); + return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC, + IfTrue, IfFalse); +} + +/// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with +/// the compare-mask instructions rather than going via NZCV, even if LHS and +/// RHS are really scalar. This replaces any scalar setcc in the above pattern +/// with a vector one followed by a DUP shuffle on the result. +static SDValue performSelectCombine(SDNode *N, SelectionDAG &DAG) { + SDValue N0 = N->getOperand(0); + EVT ResVT = N->getValueType(0); + + if (!N->getOperand(1).getValueType().isVector()) + return SDValue(); + + if (N0.getOpcode() != ISD::SETCC || N0.getValueType() != MVT::i1) + return SDValue(); + + SDLoc DL(N0); + + EVT SrcVT = N0.getOperand(0).getValueType(); + SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, + ResVT.getSizeInBits() / SrcVT.getSizeInBits()); + EVT CCVT = SrcVT.changeVectorElementTypeToInteger(); + + // First perform a vector comparison, where lane 0 is the one we're interested + // in. + SDValue LHS = + DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0)); + SDValue RHS = + DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1)); + SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2)); + + // Now duplicate the comparison mask we want across all other lanes. 
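To make the select combine concrete (a sketch; names and types are illustrative): selecting between two vectors on a scalar floating-point comparison is handled with a vector compare whose lane 0 holds the real result, duplicated across the remaining lanes, rather than going through NZCV.

#include <arm_neon.h>

// "(select (setcc a, b), x, y)" with vector x and y: performSelectCombine
// rebuilds the setcc as a v2f64 compare in lane 0, duplicates the resulting
// mask across both lanes and selects with the compare-mask form (BSL).
float64x2_t pick(double a, double b, float64x2_t x, float64x2_t y) {
  return a < b ? x : y;
}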
+ SmallVector DUPMask(CCVT.getVectorNumElements(), 0); + SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask.data()); + Mask = DAG.getNode(ISD::BITCAST, DL, ResVT.changeVectorElementTypeToInteger(), + Mask); + + return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2)); +} + +SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + switch (N->getOpcode()) { + default: + break; + case ISD::ADD: + case ISD::SUB: + return performAddSubLongCombine(N, DCI, DAG); + case ISD::XOR: + return performXorCombine(N, DAG, DCI, Subtarget); + case ISD::MUL: + return performMulCombine(N, DAG, DCI, Subtarget); + case ISD::SINT_TO_FP: + case ISD::UINT_TO_FP: + return performIntToFpCombine(N, DAG); + case ISD::OR: + return performORCombine(N, DCI, Subtarget); + case ISD::INTRINSIC_WO_CHAIN: + return performIntrinsicCombine(N, DCI, Subtarget); + case ISD::ANY_EXTEND: + case ISD::ZERO_EXTEND: + case ISD::SIGN_EXTEND: + return performExtendCombine(N, DCI, DAG); + case ISD::BITCAST: + return performBitcastCombine(N, DCI, DAG); + case ISD::CONCAT_VECTORS: + return performConcatVectorsCombine(N, DCI, DAG); + case ISD::SELECT: + return performSelectCombine(N, DAG); + case ISD::VSELECT: + return performVSelectCombine(N, DCI.DAG); + case ISD::STORE: + return performSTORECombine(N, DCI, DAG, Subtarget); + case AArch64ISD::BRCOND: + return performBRCONDCombine(N, DCI, DAG); + case AArch64ISD::DUP: + return performPostLD1Combine(N, DCI, false); + case ISD::INSERT_VECTOR_ELT: + return performPostLD1Combine(N, DCI, true); + case ISD::INTRINSIC_VOID: + case ISD::INTRINSIC_W_CHAIN: + switch (cast(N->getOperand(1))->getZExtValue()) { + case Intrinsic::aarch64_neon_ld2: + case Intrinsic::aarch64_neon_ld3: + case Intrinsic::aarch64_neon_ld4: + case Intrinsic::aarch64_neon_ld1x2: + case Intrinsic::aarch64_neon_ld1x3: + case Intrinsic::aarch64_neon_ld1x4: + case Intrinsic::aarch64_neon_ld2lane: + case Intrinsic::aarch64_neon_ld3lane: + case Intrinsic::aarch64_neon_ld4lane: + case Intrinsic::aarch64_neon_ld2r: + case Intrinsic::aarch64_neon_ld3r: + case Intrinsic::aarch64_neon_ld4r: + case Intrinsic::aarch64_neon_st2: + case Intrinsic::aarch64_neon_st3: + case Intrinsic::aarch64_neon_st4: + case Intrinsic::aarch64_neon_st1x2: + case Intrinsic::aarch64_neon_st1x3: + case Intrinsic::aarch64_neon_st1x4: + case Intrinsic::aarch64_neon_st2lane: + case Intrinsic::aarch64_neon_st3lane: + case Intrinsic::aarch64_neon_st4lane: + return performNEONPostLDSTCombine(N, DCI, DAG); + default: + break; + } + } + return SDValue(); +} + +// Check if the return value is used as only a return value, as otherwise +// we can't perform a tail-call. In particular, we need to check for +// target ISD nodes that are returns and any other "odd" constructs +// that the generic analysis code won't necessarily catch. +bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N, + SDValue &Chain) const { + if (N->getNumValues() != 1) + return false; + if (!N->hasNUsesOfValue(1, 0)) + return false; + + SDValue TCChain = Chain; + SDNode *Copy = *N->use_begin(); + if (Copy->getOpcode() == ISD::CopyToReg) { + // If the copy has a glue operand, we conservatively assume it isn't safe to + // perform a tail call. 
+ if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() == + MVT::Glue) + return false; + TCChain = Copy->getOperand(0); + } else if (Copy->getOpcode() != ISD::FP_EXTEND) + return false; + + bool HasRet = false; + for (SDNode *Node : Copy->uses()) { + if (Node->getOpcode() != AArch64ISD::RET_FLAG) + return false; + HasRet = true; + } + + if (!HasRet) + return false; + + Chain = TCChain; + return true; +} + +// Return whether the an instruction can potentially be optimized to a tail +// call. This will cause the optimizers to attempt to move, or duplicate, +// return instructions to help enable tail call optimizations for this +// instruction. +bool AArch64TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { + if (!CI->isTailCall()) + return false; + + return true; +} + +bool AArch64TargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base, + SDValue &Offset, + ISD::MemIndexedMode &AM, + bool &IsInc, + SelectionDAG &DAG) const { + if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB) + return false; + + Base = Op->getOperand(0); + // All of the indexed addressing mode instructions take a signed + // 9 bit immediate offset. + if (ConstantSDNode *RHS = dyn_cast(Op->getOperand(1))) { + int64_t RHSC = (int64_t)RHS->getZExtValue(); + if (RHSC >= 256 || RHSC <= -256) + return false; + IsInc = (Op->getOpcode() == ISD::ADD); + Offset = Op->getOperand(1); + return true; + } + return false; +} + +bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, + SDValue &Offset, + ISD::MemIndexedMode &AM, + SelectionDAG &DAG) const { + EVT VT; + SDValue Ptr; + if (LoadSDNode *LD = dyn_cast(N)) { + VT = LD->getMemoryVT(); + Ptr = LD->getBasePtr(); + } else if (StoreSDNode *ST = dyn_cast(N)) { + VT = ST->getMemoryVT(); + Ptr = ST->getBasePtr(); + } else + return false; + + bool IsInc; + if (!getIndexedAddressParts(Ptr.getNode(), Base, Offset, AM, IsInc, DAG)) + return false; + AM = IsInc ? ISD::PRE_INC : ISD::PRE_DEC; + return true; +} + +bool AArch64TargetLowering::getPostIndexedAddressParts( + SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset, + ISD::MemIndexedMode &AM, SelectionDAG &DAG) const { + EVT VT; + SDValue Ptr; + if (LoadSDNode *LD = dyn_cast(N)) { + VT = LD->getMemoryVT(); + Ptr = LD->getBasePtr(); + } else if (StoreSDNode *ST = dyn_cast(N)) { + VT = ST->getMemoryVT(); + Ptr = ST->getBasePtr(); + } else + return false; + + bool IsInc; + if (!getIndexedAddressParts(Op, Base, Offset, AM, IsInc, DAG)) + return false; + // Post-indexing updates the base, so it's not a valid transform + // if that's not the same as the load's pointer. + if (Ptr != Base) + return false; + AM = IsInc ? ISD::POST_INC : ISD::POST_DEC; + return true; +} + +void AArch64TargetLowering::ReplaceNodeResults( + SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG) const { + switch (N->getOpcode()) { + default: + llvm_unreachable("Don't know how to custom expand this"); + case ISD::FP_TO_UINT: + case ISD::FP_TO_SINT: + assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion"); + // Let normal code take care of it by not adding anything to Results. 
+ return; + } +} + +bool AArch64TargetLowering::shouldExpandAtomicInIR(Instruction *Inst) const { + // Loads and stores less than 128-bits are already atomic; ones above that + // are doomed anyway, so defer to the default libcall and blame the OS when + // things go wrong: + if (StoreInst *SI = dyn_cast(Inst)) + return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128; + else if (LoadInst *LI = dyn_cast(Inst)) + return LI->getType()->getPrimitiveSizeInBits() == 128; + + // For the real atomic operations, we have ldxr/stxr up to 128 bits. + return Inst->getType()->getPrimitiveSizeInBits() <= 128; +} + +Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, + AtomicOrdering Ord) const { + Module *M = Builder.GetInsertBlock()->getParent()->getParent(); + Type *ValTy = cast(Addr->getType())->getElementType(); + bool IsAcquire = + Ord == Acquire || Ord == AcquireRelease || Ord == SequentiallyConsistent; + + // Since i128 isn't legal and intrinsics don't get type-lowered, the ldrexd + // intrinsic must return {i64, i64} and we have to recombine them into a + // single i128 here. + if (ValTy->getPrimitiveSizeInBits() == 128) { + Intrinsic::ID Int = + IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp; + Function *Ldxr = llvm::Intrinsic::getDeclaration(M, Int); + + Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); + Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi"); + + Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo"); + Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi"); + Lo = Builder.CreateZExt(Lo, ValTy, "lo64"); + Hi = Builder.CreateZExt(Hi, ValTy, "hi64"); + return Builder.CreateOr( + Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 64)), "val64"); + } + + Type *Tys[] = { Addr->getType() }; + Intrinsic::ID Int = + IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr; + Function *Ldxr = llvm::Intrinsic::getDeclaration(M, Int, Tys); + + return Builder.CreateTruncOrBitCast( + Builder.CreateCall(Ldxr, Addr), + cast(Addr->getType())->getElementType()); +} + +Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder, + Value *Val, Value *Addr, + AtomicOrdering Ord) const { + Module *M = Builder.GetInsertBlock()->getParent()->getParent(); + bool IsRelease = + Ord == Release || Ord == AcquireRelease || Ord == SequentiallyConsistent; + + // Since the intrinsics must have legal type, the i128 intrinsics take two + // parameters: "i64, i64". We must marshal Val into the appropriate form + // before the call. + if (Val->getType()->getPrimitiveSizeInBits() == 128) { + Intrinsic::ID Int = + IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp; + Function *Stxr = Intrinsic::getDeclaration(M, Int); + Type *Int64Ty = Type::getInt64Ty(M->getContext()); + + Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo"); + Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi"); + Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); + return Builder.CreateCall3(Stxr, Lo, Hi, Addr); + } + + Intrinsic::ID Int = + IsRelease ? 
Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr; + Type *Tys[] = { Addr->getType() }; + Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys); + + return Builder.CreateCall2( + Stxr, Builder.CreateZExtOrBitCast( + Val, Stxr->getFunctionType()->getParamType(0)), + Addr); +} diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h new file mode 100644 index 00000000000..de16c4d9d4b --- /dev/null +++ b/lib/Target/AArch64/AArch64ISelLowering.h @@ -0,0 +1,464 @@ +//==-- AArch64ISelLowering.h - AArch64 DAG Lowering Interface ----*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the interfaces that AArch64 uses to lower LLVM code into a +// selection DAG. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TARGET_AArch64_ISELLOWERING_H +#define LLVM_TARGET_AArch64_ISELLOWERING_H + +#include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/Target/TargetLowering.h" + +namespace llvm { + +namespace AArch64ISD { + +enum { + FIRST_NUMBER = ISD::BUILTIN_OP_END, + WrapperLarge, // 4-instruction MOVZ/MOVK sequence for 64-bit addresses. + CALL, // Function call. + + // Almost the same as a normal call node, except that a TLSDesc relocation is + // needed so the linker can relax it correctly if possible. + TLSDESC_CALL, + ADRP, // Page address of a TargetGlobalAddress operand. + ADDlow, // Add the low 12 bits of a TargetGlobalAddress operand. + LOADgot, // Load from automatically generated descriptor (e.g. Global + // Offset Table, TLS record). + RET_FLAG, // Return with a flag operand. Operand 0 is the chain operand. + BRCOND, // Conditional branch instruction; "b.cond". + CSEL, + FCSEL, // Conditional move instruction. + CSINV, // Conditional select invert. + CSNEG, // Conditional select negate. + CSINC, // Conditional select increment. + + // Pointer to the thread's local storage area. Materialised from TPIDR_EL0 on + // ELF. + THREAD_POINTER, + ADC, + SBC, // adc, sbc instructions + + // Arithmetic instructions which write flags. + ADDS, + SUBS, + ADCS, + SBCS, + ANDS, + + // Floating point comparison + FCMP, + + // Floating point max and min instructions. + FMAX, + FMIN, + + // Scalar extract + EXTR, + + // Scalar-to-vector duplication + DUP, + DUPLANE8, + DUPLANE16, + DUPLANE32, + DUPLANE64, + + // Vector immedate moves + MOVI, + MOVIshift, + MOVIedit, + MOVImsl, + FMOV, + MVNIshift, + MVNImsl, + + // Vector immediate ops + BICi, + ORRi, + + // Vector bit select: similar to ISD::VSELECT but not all bits within an + // element must be identical. 
+ BSL, + + // Vector arithmetic negation + NEG, + + // Vector shuffles + ZIP1, + ZIP2, + UZP1, + UZP2, + TRN1, + TRN2, + REV16, + REV32, + REV64, + EXT, + + // Vector shift by scalar + VSHL, + VLSHR, + VASHR, + + // Vector shift by scalar (again) + SQSHL_I, + UQSHL_I, + SQSHLU_I, + SRSHR_I, + URSHR_I, + + // Vector comparisons + CMEQ, + CMGE, + CMGT, + CMHI, + CMHS, + FCMEQ, + FCMGE, + FCMGT, + + // Vector zero comparisons + CMEQz, + CMGEz, + CMGTz, + CMLEz, + CMLTz, + FCMEQz, + FCMGEz, + FCMGTz, + FCMLEz, + FCMLTz, + + // Vector bitwise negation + NOT, + + // Vector bitwise selection + BIT, + + // Compare-and-branch + CBZ, + CBNZ, + TBZ, + TBNZ, + + // Tail calls + TC_RETURN, + + // Custom prefetch handling + PREFETCH, + + // {s|u}int to FP within a FP register. + SITOF, + UITOF, + + // NEON Load/Store with post-increment base updates + LD2post = ISD::FIRST_TARGET_MEMORY_OPCODE, + LD3post, + LD4post, + ST2post, + ST3post, + ST4post, + LD1x2post, + LD1x3post, + LD1x4post, + ST1x2post, + ST1x3post, + ST1x4post, + LD1DUPpost, + LD2DUPpost, + LD3DUPpost, + LD4DUPpost, + LD1LANEpost, + LD2LANEpost, + LD3LANEpost, + LD4LANEpost, + ST2LANEpost, + ST3LANEpost, + ST4LANEpost +}; + +} // end namespace AArch64ISD + +class AArch64Subtarget; +class AArch64TargetMachine; + +class AArch64TargetLowering : public TargetLowering { + bool RequireStrictAlign; + +public: + explicit AArch64TargetLowering(AArch64TargetMachine &TM); + + /// Selects the correct CCAssignFn for a the given CallingConvention + /// value. + CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const; + + /// computeKnownBitsForTargetNode - Determine which of the bits specified in + /// Mask are known to be either zero or one and return them in the + /// KnownZero/KnownOne bitsets. + void computeKnownBitsForTargetNode(const SDValue Op, APInt &KnownZero, + APInt &KnownOne, const SelectionDAG &DAG, + unsigned Depth = 0) const override; + + MVT getScalarShiftAmountTy(EVT LHSTy) const override; + + /// allowsUnalignedMemoryAccesses - Returns true if the target allows + /// unaligned memory accesses. of the specified type. + bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AddrSpace = 0, + bool *Fast = nullptr) const override { + if (RequireStrictAlign) + return false; + // FIXME: True for Cyclone, but not necessary others. + if (Fast) + *Fast = true; + return true; + } + + /// LowerOperation - Provide custom lowering hooks for some operations. + SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; + + const char *getTargetNodeName(unsigned Opcode) const override; + + SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; + + /// getFunctionAlignment - Return the Log2 alignment of this function. + unsigned getFunctionAlignment(const Function *F) const; + + /// getMaximalGlobalOffset - Returns the maximal possible offset which can + /// be used for loads / stores from the global. + unsigned getMaximalGlobalOffset() const override; + + /// Returns true if a cast between SrcAS and DestAS is a noop. + bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override { + // Addrspacecasts are always noops. + return true; + } + + /// createFastISel - This method returns a target specific FastISel object, + /// or null if the target does not support "fast" ISel. 
+ FastISel *createFastISel(FunctionLoweringInfo &funcInfo, + const TargetLibraryInfo *libInfo) const override; + + bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override; + + bool isFPImmLegal(const APFloat &Imm, EVT VT) const override; + + /// isShuffleMaskLegal - Return true if the given shuffle mask can be + /// codegen'd directly, or if it should be stack expanded. + bool isShuffleMaskLegal(const SmallVectorImpl &M, EVT VT) const override; + + /// getSetCCResultType - Return the ISD::SETCC ValueType + EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override; + + SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const; + + MachineBasicBlock *EmitF128CSEL(MachineInstr *MI, + MachineBasicBlock *BB) const; + + MachineBasicBlock * + EmitInstrWithCustomInserter(MachineInstr *MI, + MachineBasicBlock *MBB) const override; + + bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, + unsigned Intrinsic) const override; + + bool isTruncateFree(Type *Ty1, Type *Ty2) const override; + bool isTruncateFree(EVT VT1, EVT VT2) const override; + + bool isZExtFree(Type *Ty1, Type *Ty2) const override; + bool isZExtFree(EVT VT1, EVT VT2) const override; + bool isZExtFree(SDValue Val, EVT VT2) const override; + + bool hasPairedLoad(Type *LoadedType, + unsigned &RequiredAligment) const override; + bool hasPairedLoad(EVT LoadedType, unsigned &RequiredAligment) const override; + + bool isLegalAddImmediate(int64_t) const override; + bool isLegalICmpImmediate(int64_t) const override; + + EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, + bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, + MachineFunction &MF) const override; + + /// isLegalAddressingMode - Return true if the addressing mode represented + /// by AM is legal for this target, for a load/store of the specified type. + bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const override; + + /// \brief Return the cost of the scaling factor used in the addressing + /// mode represented by AM for this target, for a load/store + /// of the specified type. + /// If the AM is supported, the return value must be >= 0. + /// If the AM is not supported, it returns a negative value. + int getScalingFactorCost(const AddrMode &AM, Type *Ty) const override; + + /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster + /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be + /// expanded to FMAs when this method returns true, otherwise fmuladd is + /// expanded to fmul + fadd. + bool isFMAFasterThanFMulAndFAdd(EVT VT) const override; + + const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override; + + /// \brief Returns false if N is a bit extraction pattern of (X >> C) & Mask. + bool isDesirableToCommuteWithShift(const SDNode *N) const override; + + /// \brief Returns true if it is beneficial to convert a load of a constant + /// to just the constant itself. + bool shouldConvertConstantLoadToIntImm(const APInt &Imm, + Type *Ty) const override; + + Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr, + AtomicOrdering Ord) const override; + Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val, + Value *Addr, AtomicOrdering Ord) const override; + + bool shouldExpandAtomicInIR(Instruction *Inst) const override; + +private: + /// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can + /// make the right decision when generating code for different targets. 
+ const AArch64Subtarget *Subtarget; + + void addTypeForNEON(EVT VT, EVT PromotedBitwiseVT); + void addDRTypeForNEON(MVT VT); + void addQRTypeForNEON(MVT VT); + + SDValue + LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl &Ins, SDLoc DL, + SelectionDAG &DAG, + SmallVectorImpl &InVals) const override; + + SDValue LowerCall(CallLoweringInfo & /*CLI*/, + SmallVectorImpl &InVals) const override; + + SDValue LowerCallResult(SDValue Chain, SDValue InFlag, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl &Ins, SDLoc DL, + SelectionDAG &DAG, SmallVectorImpl &InVals, + bool isThisReturn, SDValue ThisVal) const; + + bool isEligibleForTailCallOptimization( + SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, + bool isCalleeStructRet, bool isCallerStructRet, + const SmallVectorImpl &Outs, + const SmallVectorImpl &OutVals, + const SmallVectorImpl &Ins, SelectionDAG &DAG) const; + + /// Finds the incoming stack arguments which overlap the given fixed stack + /// object and incorporates their load into the current chain. This prevents + /// an upcoming store from clobbering the stack argument before it's used. + SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, + MachineFrameInfo *MFI, int ClobberedFI) const; + + bool DoesCalleeRestoreStack(CallingConv::ID CallCC, bool TailCallOpt) const; + + bool IsTailCallConvention(CallingConv::ID CallCC) const; + + void saveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, SDLoc DL, + SDValue &Chain) const; + + bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, + bool isVarArg, + const SmallVectorImpl &Outs, + LLVMContext &Context) const override; + + SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl &Outs, + const SmallVectorImpl &OutVals, SDLoc DL, + SelectionDAG &DAG) const override; + + SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerDarwinGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerELFGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerELFTLSDescCall(SDValue SymAddr, SDValue DescAddr, SDLoc DL, + SelectionDAG &DAG) const; + SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerAAPCS_VASTART(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerDarwin_VASTART(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVACOPY(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerEXTRACT_SUBVECTOR(SDValue 
Op, SelectionDAG &DAG) const; + SDValue LowerVectorSRA_SRL_SHL(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerF128Call(SDValue Op, SelectionDAG &DAG, + RTLIB::Libcall Call) const; + SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVectorAND(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVectorOR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const; + + ConstraintType + getConstraintType(const std::string &Constraint) const override; + unsigned getRegisterByName(const char* RegName, EVT VT) const override; + + /// Examine constraint string and operand type and determine a weight value. + /// The operand object must already have been set up with the operand type. + ConstraintWeight + getSingleConstraintMatchWeight(AsmOperandInfo &info, + const char *constraint) const override; + + std::pair + getRegForInlineAsmConstraint(const std::string &Constraint, + MVT VT) const override; + void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint, + std::vector &Ops, + SelectionDAG &DAG) const override; + + bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override; + bool mayBeEmittedAsTailCall(CallInst *CI) const override; + bool getIndexedAddressParts(SDNode *Op, SDValue &Base, SDValue &Offset, + ISD::MemIndexedMode &AM, bool &IsInc, + SelectionDAG &DAG) const; + bool getPreIndexedAddressParts(SDNode *N, SDValue &Base, SDValue &Offset, + ISD::MemIndexedMode &AM, + SelectionDAG &DAG) const override; + bool getPostIndexedAddressParts(SDNode *N, SDNode *Op, SDValue &Base, + SDValue &Offset, ISD::MemIndexedMode &AM, + SelectionDAG &DAG) const override; + + void ReplaceNodeResults(SDNode *N, SmallVectorImpl &Results, + SelectionDAG &DAG) const override; +}; + +namespace AArch64 { +FastISel *createFastISel(FunctionLoweringInfo &funcInfo, + const TargetLibraryInfo *libInfo); +} // end namespace AArch64 + +} // end namespace llvm + +#endif // LLVM_TARGET_AArch64_ISELLOWERING_H diff --git a/lib/Target/AArch64/AArch64InstrAtomics.td b/lib/Target/AArch64/AArch64InstrAtomics.td new file mode 100644 index 00000000000..3b9e3c63059 --- /dev/null +++ b/lib/Target/AArch64/AArch64InstrAtomics.td @@ -0,0 +1,364 @@ +//=- AArch64InstrAtomics.td - AArch64 Atomic codegen support -*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// AArch64 Atomic operand code-gen constructs. 
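An illustrative aside, not part of the patch: the acquiring/relaxed load and releasing/relaxed store fragments defined in this file distinguish exactly the cases sketched below, so only genuinely ordered accesses use the LDAR/STLR forms while relaxed ones keep the full set of addressing modes.

#include <atomic>

// Acquire load: selected as an LDAR-family instruction (single
// base-register addressing mode).
int load_acquire(const std::atomic<int> &p) {
  return p.load(std::memory_order_acquire);
}

// Relaxed load: an ordinary LDR, so the register-offset/immediate-offset
// patterns (the ro_*/am_* operands below) may be folded into the access.
int load_relaxed(const std::atomic<int> *base, long i) {
  return base[i].load(std::memory_order_relaxed);
}

// Release store: STLR.  A relaxed store would be a plain STR.
void store_release(std::atomic<int> &p, int v) {
  p.store(v, std::memory_order_release);
}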
+// +//===----------------------------------------------------------------------===// + +//===---------------------------------- +// Atomic fences +//===---------------------------------- +def : Pat<(atomic_fence (i64 4), (imm)), (DMB (i32 0x9))>; +def : Pat<(atomic_fence (imm), (imm)), (DMB (i32 0xb))>; + +//===---------------------------------- +// Atomic loads +//===---------------------------------- + +// When they're actually atomic, only one addressing mode (GPR64sp) is +// supported, but when they're relaxed and anything can be used, all the +// standard modes would be valid and may give efficiency gains. + +// A atomic load operation that actually needs acquire semantics. +class acquiring_load + : PatFrag<(ops node:$ptr), (base node:$ptr), [{ + AtomicOrdering Ordering = cast(N)->getOrdering(); + assert(Ordering != AcquireRelease && "unexpected load ordering"); + return Ordering == Acquire || Ordering == SequentiallyConsistent; +}]>; + +// An atomic load operation that does not need either acquire or release +// semantics. +class relaxed_load + : PatFrag<(ops node:$ptr), (base node:$ptr), [{ + AtomicOrdering Ordering = cast(N)->getOrdering(); + return Ordering == Monotonic || Ordering == Unordered; +}]>; + +// 8-bit loads +def : Pat<(acquiring_load GPR64sp:$ptr), (LDARB GPR64sp:$ptr)>; +def : Pat<(relaxed_load (ro_Windexed8 GPR64sp:$Rn, GPR32:$Rm, + ro_Wextend8:$offset)), + (LDRBBroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend8:$offset)>; +def : Pat<(relaxed_load (ro_Xindexed8 GPR64sp:$Rn, GPR64:$Rm, + ro_Xextend8:$offset)), + (LDRBBroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend8:$offset)>; +def : Pat<(relaxed_load (am_indexed8 GPR64sp:$Rn, + uimm12s1:$offset)), + (LDRBBui GPR64sp:$Rn, uimm12s1:$offset)>; +def : Pat<(relaxed_load + (am_unscaled8 GPR64sp:$Rn, simm9:$offset)), + (LDURBBi GPR64sp:$Rn, simm9:$offset)>; + +// 16-bit loads +def : Pat<(acquiring_load GPR64sp:$ptr), (LDARH GPR64sp:$ptr)>; +def : Pat<(relaxed_load (ro_Windexed16 GPR64sp:$Rn, GPR32:$Rm, + ro_Wextend16:$extend)), + (LDRHHroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend16:$extend)>; +def : Pat<(relaxed_load (ro_Xindexed16 GPR64sp:$Rn, GPR64:$Rm, + ro_Xextend16:$extend)), + (LDRHHroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend16:$extend)>; +def : Pat<(relaxed_load (am_indexed16 GPR64sp:$Rn, + uimm12s2:$offset)), + (LDRHHui GPR64sp:$Rn, uimm12s2:$offset)>; +def : Pat<(relaxed_load + (am_unscaled16 GPR64sp:$Rn, simm9:$offset)), + (LDURHHi GPR64sp:$Rn, simm9:$offset)>; + +// 32-bit loads +def : Pat<(acquiring_load GPR64sp:$ptr), (LDARW GPR64sp:$ptr)>; +def : Pat<(relaxed_load (ro_Windexed32 GPR64sp:$Rn, GPR32:$Rm, + ro_Wextend32:$extend)), + (LDRWroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend32:$extend)>; +def : Pat<(relaxed_load (ro_Xindexed32 GPR64sp:$Rn, GPR64:$Rm, + ro_Xextend32:$extend)), + (LDRWroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend32:$extend)>; +def : Pat<(relaxed_load (am_indexed32 GPR64sp:$Rn, + uimm12s4:$offset)), + (LDRWui GPR64sp:$Rn, uimm12s4:$offset)>; +def : Pat<(relaxed_load + (am_unscaled32 GPR64sp:$Rn, simm9:$offset)), + (LDURWi GPR64sp:$Rn, simm9:$offset)>; + +// 64-bit loads +def : Pat<(acquiring_load GPR64sp:$ptr), (LDARX GPR64sp:$ptr)>; +def : Pat<(relaxed_load (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm, + ro_Wextend64:$extend)), + (LDRXroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend)>; +def : Pat<(relaxed_load (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm, + ro_Xextend64:$extend)), + (LDRXroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend)>; +def : Pat<(relaxed_load (am_indexed64 GPR64sp:$Rn, + uimm12s8:$offset)), + (LDRXui GPR64sp:$Rn, 
uimm12s8:$offset)>; +def : Pat<(relaxed_load + (am_unscaled64 GPR64sp:$Rn, simm9:$offset)), + (LDURXi GPR64sp:$Rn, simm9:$offset)>; + +//===---------------------------------- +// Atomic stores +//===---------------------------------- + +// When they're actually atomic, only one addressing mode (GPR64sp) is +// supported, but when they're relaxed and anything can be used, all the +// standard modes would be valid and may give efficiency gains. + +// A store operation that actually needs release semantics. +class releasing_store + : PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{ + AtomicOrdering Ordering = cast(N)->getOrdering(); + assert(Ordering != AcquireRelease && "unexpected store ordering"); + return Ordering == Release || Ordering == SequentiallyConsistent; +}]>; + +// An atomic store operation that doesn't actually need to be atomic on AArch64. +class relaxed_store + : PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{ + AtomicOrdering Ordering = cast(N)->getOrdering(); + return Ordering == Monotonic || Ordering == Unordered; +}]>; + +// 8-bit stores +def : Pat<(releasing_store GPR64sp:$ptr, GPR32:$val), + (STLRB GPR32:$val, GPR64sp:$ptr)>; +def : Pat<(relaxed_store + (ro_Windexed8 GPR64sp:$Rn, GPR32:$Rm, ro_Wextend8:$extend), + GPR32:$val), + (STRBBroW GPR32:$val, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend8:$extend)>; +def : Pat<(relaxed_store + (ro_Xindexed8 GPR64sp:$Rn, GPR64:$Rm, ro_Xextend8:$extend), + GPR32:$val), + (STRBBroX GPR32:$val, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend8:$extend)>; +def : Pat<(relaxed_store + (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset), GPR32:$val), + (STRBBui GPR32:$val, GPR64sp:$Rn, uimm12s1:$offset)>; +def : Pat<(relaxed_store + (am_unscaled8 GPR64sp:$Rn, simm9:$offset), GPR32:$val), + (STURBBi GPR32:$val, GPR64sp:$Rn, simm9:$offset)>; + +// 16-bit stores +def : Pat<(releasing_store GPR64sp:$ptr, GPR32:$val), + (STLRH GPR32:$val, GPR64sp:$ptr)>; +def : Pat<(relaxed_store (ro_Windexed16 GPR64sp:$Rn, GPR32:$Rm, + ro_Wextend16:$extend), + GPR32:$val), + (STRHHroW GPR32:$val, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend16:$extend)>; +def : Pat<(relaxed_store (ro_Xindexed16 GPR64sp:$Rn, GPR64:$Rm, + ro_Xextend16:$extend), + GPR32:$val), + (STRHHroX GPR32:$val, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend16:$extend)>; +def : Pat<(relaxed_store + (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset), GPR32:$val), + (STRHHui GPR32:$val, GPR64sp:$Rn, uimm12s2:$offset)>; +def : Pat<(relaxed_store + (am_unscaled16 GPR64sp:$Rn, simm9:$offset), GPR32:$val), + (STURHHi GPR32:$val, GPR64sp:$Rn, simm9:$offset)>; + +// 32-bit stores +def : Pat<(releasing_store GPR64sp:$ptr, GPR32:$val), + (STLRW GPR32:$val, GPR64sp:$ptr)>; +def : Pat<(relaxed_store (ro_Windexed32 GPR64sp:$Rn, GPR32:$Rm, + ro_Wextend32:$extend), + GPR32:$val), + (STRWroW GPR32:$val, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend32:$extend)>; +def : Pat<(relaxed_store (ro_Xindexed32 GPR64sp:$Rn, GPR64:$Rm, + ro_Xextend32:$extend), + GPR32:$val), + (STRWroX GPR32:$val, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend32:$extend)>; +def : Pat<(relaxed_store + (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset), GPR32:$val), + (STRWui GPR32:$val, GPR64sp:$Rn, uimm12s4:$offset)>; +def : Pat<(relaxed_store + (am_unscaled32 GPR64sp:$Rn, simm9:$offset), GPR32:$val), + (STURWi GPR32:$val, GPR64sp:$Rn, simm9:$offset)>; + +// 64-bit stores +def : Pat<(releasing_store GPR64sp:$ptr, GPR64:$val), + (STLRX GPR64:$val, GPR64sp:$ptr)>; +def : Pat<(relaxed_store (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm, + ro_Wextend16:$extend), + GPR64:$val), + (STRXroW 
GPR64:$val, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend)>; +def : Pat<(relaxed_store (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm, + ro_Xextend16:$extend), + GPR64:$val), + (STRXroX GPR64:$val, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend)>; +def : Pat<(relaxed_store + (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset), GPR64:$val), + (STRXui GPR64:$val, GPR64sp:$Rn, uimm12s8:$offset)>; +def : Pat<(relaxed_store + (am_unscaled64 GPR64sp:$Rn, simm9:$offset), GPR64:$val), + (STURXi GPR64:$val, GPR64sp:$Rn, simm9:$offset)>; + +//===---------------------------------- +// Low-level exclusive operations +//===---------------------------------- + +// Load-exclusives. + +def ldxr_1 : PatFrag<(ops node:$ptr), (int_aarch64_ldxr node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i8; +}]>; + +def ldxr_2 : PatFrag<(ops node:$ptr), (int_aarch64_ldxr node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i16; +}]>; + +def ldxr_4 : PatFrag<(ops node:$ptr), (int_aarch64_ldxr node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i32; +}]>; + +def ldxr_8 : PatFrag<(ops node:$ptr), (int_aarch64_ldxr node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i64; +}]>; + +def : Pat<(ldxr_1 GPR64sp:$addr), + (SUBREG_TO_REG (i64 0), (LDXRB GPR64sp:$addr), sub_32)>; +def : Pat<(ldxr_2 GPR64sp:$addr), + (SUBREG_TO_REG (i64 0), (LDXRH GPR64sp:$addr), sub_32)>; +def : Pat<(ldxr_4 GPR64sp:$addr), + (SUBREG_TO_REG (i64 0), (LDXRW GPR64sp:$addr), sub_32)>; +def : Pat<(ldxr_8 GPR64sp:$addr), (LDXRX GPR64sp:$addr)>; + +def : Pat<(and (ldxr_1 GPR64sp:$addr), 0xff), + (SUBREG_TO_REG (i64 0), (LDXRB GPR64sp:$addr), sub_32)>; +def : Pat<(and (ldxr_2 GPR64sp:$addr), 0xffff), + (SUBREG_TO_REG (i64 0), (LDXRH GPR64sp:$addr), sub_32)>; +def : Pat<(and (ldxr_4 GPR64sp:$addr), 0xffffffff), + (SUBREG_TO_REG (i64 0), (LDXRW GPR64sp:$addr), sub_32)>; + +// Load-exclusives. + +def ldaxr_1 : PatFrag<(ops node:$ptr), (int_aarch64_ldaxr node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i8; +}]>; + +def ldaxr_2 : PatFrag<(ops node:$ptr), (int_aarch64_ldaxr node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i16; +}]>; + +def ldaxr_4 : PatFrag<(ops node:$ptr), (int_aarch64_ldaxr node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i32; +}]>; + +def ldaxr_8 : PatFrag<(ops node:$ptr), (int_aarch64_ldaxr node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i64; +}]>; + +def : Pat<(ldaxr_1 GPR64sp:$addr), + (SUBREG_TO_REG (i64 0), (LDAXRB GPR64sp:$addr), sub_32)>; +def : Pat<(ldaxr_2 GPR64sp:$addr), + (SUBREG_TO_REG (i64 0), (LDAXRH GPR64sp:$addr), sub_32)>; +def : Pat<(ldaxr_4 GPR64sp:$addr), + (SUBREG_TO_REG (i64 0), (LDAXRW GPR64sp:$addr), sub_32)>; +def : Pat<(ldaxr_8 GPR64sp:$addr), (LDAXRX GPR64sp:$addr)>; + +def : Pat<(and (ldaxr_1 GPR64sp:$addr), 0xff), + (SUBREG_TO_REG (i64 0), (LDAXRB GPR64sp:$addr), sub_32)>; +def : Pat<(and (ldaxr_2 GPR64sp:$addr), 0xffff), + (SUBREG_TO_REG (i64 0), (LDAXRH GPR64sp:$addr), sub_32)>; +def : Pat<(and (ldaxr_4 GPR64sp:$addr), 0xffffffff), + (SUBREG_TO_REG (i64 0), (LDAXRW GPR64sp:$addr), sub_32)>; + +// Store-exclusives. 
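An illustrative aside, not part of the patch, and assuming Clang's __builtin_arm_ldrex/__builtin_arm_strex/__builtin_arm_clrex builtins (which lower to the int_aarch64_ldxr, int_aarch64_stxr and int_aarch64_clrex intrinsics matched in this file): a hand-rolled compare-and-swap built on the load-exclusive patterns above and the store-exclusive patterns that follow might look like:

// Hypothetical helper; each iteration emits an LDXR and, on success, an STXR.
static inline bool cas32(volatile int *p, int expected, int desired) {
  for (;;) {
    int old = __builtin_arm_ldrex(p);          // int_aarch64_ldxr -> LDXR
    if (old != expected) {
      __builtin_arm_clrex();                   // int_aarch64_clrex -> CLREX
      return false;
    }
    if (__builtin_arm_strex(desired, p) == 0)  // int_aarch64_stxr -> STXR
      return true;
  }
}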
+ +def stxr_1 : PatFrag<(ops node:$val, node:$ptr), + (int_aarch64_stxr node:$val, node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i8; +}]>; + +def stxr_2 : PatFrag<(ops node:$val, node:$ptr), + (int_aarch64_stxr node:$val, node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i16; +}]>; + +def stxr_4 : PatFrag<(ops node:$val, node:$ptr), + (int_aarch64_stxr node:$val, node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i32; +}]>; + +def stxr_8 : PatFrag<(ops node:$val, node:$ptr), + (int_aarch64_stxr node:$val, node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i64; +}]>; + + +def : Pat<(stxr_1 GPR64:$val, GPR64sp:$addr), + (STXRB (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; +def : Pat<(stxr_2 GPR64:$val, GPR64sp:$addr), + (STXRH (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; +def : Pat<(stxr_4 GPR64:$val, GPR64sp:$addr), + (STXRW (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; +def : Pat<(stxr_8 GPR64:$val, GPR64sp:$addr), + (STXRX GPR64:$val, GPR64sp:$addr)>; + +def : Pat<(stxr_1 (zext (and GPR32:$val, 0xff)), GPR64sp:$addr), + (STXRB GPR32:$val, GPR64sp:$addr)>; +def : Pat<(stxr_2 (zext (and GPR32:$val, 0xffff)), GPR64sp:$addr), + (STXRH GPR32:$val, GPR64sp:$addr)>; +def : Pat<(stxr_4 (zext GPR32:$val), GPR64sp:$addr), + (STXRW GPR32:$val, GPR64sp:$addr)>; + +def : Pat<(stxr_1 (and GPR64:$val, 0xff), GPR64sp:$addr), + (STXRB (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; +def : Pat<(stxr_2 (and GPR64:$val, 0xffff), GPR64sp:$addr), + (STXRH (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; +def : Pat<(stxr_4 (and GPR64:$val, 0xffffffff), GPR64sp:$addr), + (STXRW (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; + +// Store-release-exclusives. + +def stlxr_1 : PatFrag<(ops node:$val, node:$ptr), + (int_aarch64_stlxr node:$val, node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i8; +}]>; + +def stlxr_2 : PatFrag<(ops node:$val, node:$ptr), + (int_aarch64_stlxr node:$val, node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i16; +}]>; + +def stlxr_4 : PatFrag<(ops node:$val, node:$ptr), + (int_aarch64_stlxr node:$val, node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i32; +}]>; + +def stlxr_8 : PatFrag<(ops node:$val, node:$ptr), + (int_aarch64_stlxr node:$val, node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i64; +}]>; + + +def : Pat<(stlxr_1 GPR64:$val, GPR64sp:$addr), + (STLXRB (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; +def : Pat<(stlxr_2 GPR64:$val, GPR64sp:$addr), + (STLXRH (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; +def : Pat<(stlxr_4 GPR64:$val, GPR64sp:$addr), + (STLXRW (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; +def : Pat<(stlxr_8 GPR64:$val, GPR64sp:$addr), + (STLXRX GPR64:$val, GPR64sp:$addr)>; + +def : Pat<(stlxr_1 (zext (and GPR32:$val, 0xff)), GPR64sp:$addr), + (STLXRB GPR32:$val, GPR64sp:$addr)>; +def : Pat<(stlxr_2 (zext (and GPR32:$val, 0xffff)), GPR64sp:$addr), + (STLXRH GPR32:$val, GPR64sp:$addr)>; +def : Pat<(stlxr_4 (zext GPR32:$val), GPR64sp:$addr), + (STLXRW GPR32:$val, GPR64sp:$addr)>; + +def : Pat<(stlxr_1 (and GPR64:$val, 0xff), GPR64sp:$addr), + (STLXRB (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; +def : Pat<(stlxr_2 (and GPR64:$val, 0xffff), GPR64sp:$addr), + (STLXRH (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; +def : Pat<(stlxr_4 (and GPR64:$val, 0xffffffff), GPR64sp:$addr), + (STLXRW (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; + + +// And clear exclusive. 
+ +def : Pat<(int_aarch64_clrex), (CLREX 0xf)>; diff --git a/lib/Target/AArch64/AArch64InstrFormats.td b/lib/Target/AArch64/AArch64InstrFormats.td new file mode 100644 index 00000000000..d455d7e45e0 --- /dev/null +++ b/lib/Target/AArch64/AArch64InstrFormats.td @@ -0,0 +1,8574 @@ +//===- AArch64InstrFormats.td - AArch64 Instruction Formats --*- tblgen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Describe AArch64 instructions format here +// + +// Format specifies the encoding used by the instruction. This is part of the +// ad-hoc solution used to emit machine instruction encodings by our machine +// code emitter. +class Format val> { + bits<2> Value = val; +} + +def PseudoFrm : Format<0>; +def NormalFrm : Format<1>; // Do we need any others? + +// AArch64 Instruction Format +class AArch64Inst : Instruction { + field bits<32> Inst; // Instruction encoding. + // Mask of bits that cause an encoding to be UNPREDICTABLE. + // If a bit is set, then if the corresponding bit in the + // target encoding differs from its value in the "Inst" field, + // the instruction is UNPREDICTABLE (SoftFail in abstract parlance). + field bits<32> Unpredictable = 0; + // SoftFail is the generic name for this field, but we alias it so + // as to make it more obvious what it means in ARM-land. + field bits<32> SoftFail = Unpredictable; + let Namespace = "AArch64"; + Format F = f; + bits<2> Form = F.Value; + let Pattern = []; + let Constraints = cstr; +} + +// Pseudo instructions (don't have encoding information) +class Pseudo pattern, string cstr = ""> + : AArch64Inst { + dag OutOperandList = oops; + dag InOperandList = iops; + let Pattern = pattern; + let isCodeGenOnly = 1; +} + +// Real instructions (have encoding information) +class EncodedI pattern> : AArch64Inst { + let Pattern = pattern; + let Size = 4; +} + +// Normal instructions +class I pattern> + : EncodedI { + dag OutOperandList = oops; + dag InOperandList = iops; + let AsmString = !strconcat(asm, operands); +} + +class TriOpFrag : PatFrag<(ops node:$LHS, node:$MHS, node:$RHS), res>; +class BinOpFrag : PatFrag<(ops node:$LHS, node:$RHS), res>; +class UnOpFrag : PatFrag<(ops node:$LHS), res>; + +// Helper fragment for an extract of the high portion of a 128-bit vector. +def extract_high_v16i8 : + UnOpFrag<(extract_subvector (v16i8 node:$LHS), (i64 8))>; +def extract_high_v8i16 : + UnOpFrag<(extract_subvector (v8i16 node:$LHS), (i64 4))>; +def extract_high_v4i32 : + UnOpFrag<(extract_subvector (v4i32 node:$LHS), (i64 2))>; +def extract_high_v2i64 : + UnOpFrag<(extract_subvector (v2i64 node:$LHS), (i64 1))>; + +//===----------------------------------------------------------------------===// +// Asm Operand Classes. +// + +// Shifter operand for arithmetic shifted encodings. +def ShifterOperand : AsmOperandClass { + let Name = "Shifter"; +} + +// Shifter operand for mov immediate encodings. 
+def MovImm32ShifterOperand : AsmOperandClass { + let SuperClasses = [ShifterOperand]; + let Name = "MovImm32Shifter"; + let RenderMethod = "addShifterOperands"; + let DiagnosticType = "InvalidMovImm32Shift"; +} +def MovImm64ShifterOperand : AsmOperandClass { + let SuperClasses = [ShifterOperand]; + let Name = "MovImm64Shifter"; + let RenderMethod = "addShifterOperands"; + let DiagnosticType = "InvalidMovImm64Shift"; +} + +// Shifter operand for arithmetic register shifted encodings. +class ArithmeticShifterOperand : AsmOperandClass { + let SuperClasses = [ShifterOperand]; + let Name = "ArithmeticShifter" # width; + let PredicateMethod = "isArithmeticShifter<" # width # ">"; + let RenderMethod = "addShifterOperands"; + let DiagnosticType = "AddSubRegShift" # width; +} + +def ArithmeticShifterOperand32 : ArithmeticShifterOperand<32>; +def ArithmeticShifterOperand64 : ArithmeticShifterOperand<64>; + +// Shifter operand for logical register shifted encodings. +class LogicalShifterOperand : AsmOperandClass { + let SuperClasses = [ShifterOperand]; + let Name = "LogicalShifter" # width; + let PredicateMethod = "isLogicalShifter<" # width # ">"; + let RenderMethod = "addShifterOperands"; + let DiagnosticType = "AddSubRegShift" # width; +} + +def LogicalShifterOperand32 : LogicalShifterOperand<32>; +def LogicalShifterOperand64 : LogicalShifterOperand<64>; + +// Shifter operand for logical vector 128/64-bit shifted encodings. +def LogicalVecShifterOperand : AsmOperandClass { + let SuperClasses = [ShifterOperand]; + let Name = "LogicalVecShifter"; + let RenderMethod = "addShifterOperands"; +} +def LogicalVecHalfWordShifterOperand : AsmOperandClass { + let SuperClasses = [LogicalVecShifterOperand]; + let Name = "LogicalVecHalfWordShifter"; + let RenderMethod = "addShifterOperands"; +} + +// The "MSL" shifter on the vector MOVI instruction. +def MoveVecShifterOperand : AsmOperandClass { + let SuperClasses = [ShifterOperand]; + let Name = "MoveVecShifter"; + let RenderMethod = "addShifterOperands"; +} + +// Extend operand for arithmetic encodings. +def ExtendOperand : AsmOperandClass { + let Name = "Extend"; + let DiagnosticType = "AddSubRegExtendLarge"; +} +def ExtendOperand64 : AsmOperandClass { + let SuperClasses = [ExtendOperand]; + let Name = "Extend64"; + let DiagnosticType = "AddSubRegExtendSmall"; +} +// 'extend' that's a lsl of a 64-bit register. +def ExtendOperandLSL64 : AsmOperandClass { + let SuperClasses = [ExtendOperand]; + let Name = "ExtendLSL64"; + let RenderMethod = "addExtend64Operands"; + let DiagnosticType = "AddSubRegExtendLarge"; +} + +// 8-bit floating-point immediate encodings. +def FPImmOperand : AsmOperandClass { + let Name = "FPImm"; + let ParserMethod = "tryParseFPImm"; + let DiagnosticType = "InvalidFPImm"; +} + +def CondCode : AsmOperandClass { + let Name = "CondCode"; + let DiagnosticType = "InvalidCondCode"; +} + +// A 32-bit register pasrsed as 64-bit +def GPR32as64Operand : AsmOperandClass { + let Name = "GPR32as64"; +} +def GPR32as64 : RegisterOperand { + let ParserMatchClass = GPR32as64Operand; +} + +// 8-bit immediate for AdvSIMD where 64-bit values of the form: +// aaaaaaaa bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh +// are encoded as the eight bit value 'abcdefgh'. +def SIMDImmType10Operand : AsmOperandClass { let Name = "SIMDImmType10"; } + + +//===----------------------------------------------------------------------===// +// Operand Definitions. +// + +// ADR[P] instruction labels. 
+def AdrpOperand : AsmOperandClass { + let Name = "AdrpLabel"; + let ParserMethod = "tryParseAdrpLabel"; + let DiagnosticType = "InvalidLabel"; +} +def adrplabel : Operand { + let EncoderMethod = "getAdrLabelOpValue"; + let PrintMethod = "printAdrpLabel"; + let ParserMatchClass = AdrpOperand; +} + +def AdrOperand : AsmOperandClass { + let Name = "AdrLabel"; + let ParserMethod = "tryParseAdrLabel"; + let DiagnosticType = "InvalidLabel"; +} +def adrlabel : Operand { + let EncoderMethod = "getAdrLabelOpValue"; + let ParserMatchClass = AdrOperand; +} + +// simm9 predicate - True if the immediate is in the range [-256, 255]. +def SImm9Operand : AsmOperandClass { + let Name = "SImm9"; + let DiagnosticType = "InvalidMemoryIndexedSImm9"; +} +def simm9 : Operand, ImmLeaf= -256 && Imm < 256; }]> { + let ParserMatchClass = SImm9Operand; +} + +// simm7sN predicate - True if the immediate is a multiple of N in the range +// [-64 * N, 63 * N]. +class SImm7Scaled : AsmOperandClass { + let Name = "SImm7s" # Scale; + let DiagnosticType = "InvalidMemoryIndexed" # Scale # "SImm7"; +} + +def SImm7s4Operand : SImm7Scaled<4>; +def SImm7s8Operand : SImm7Scaled<8>; +def SImm7s16Operand : SImm7Scaled<16>; + +def simm7s4 : Operand { + let ParserMatchClass = SImm7s4Operand; + let PrintMethod = "printImmScale<4>"; +} + +def simm7s8 : Operand { + let ParserMatchClass = SImm7s8Operand; + let PrintMethod = "printImmScale<8>"; +} + +def simm7s16 : Operand { + let ParserMatchClass = SImm7s16Operand; + let PrintMethod = "printImmScale<16>"; +} + +class AsmImmRange : AsmOperandClass { + let Name = "Imm" # Low # "_" # High; + let DiagnosticType = "InvalidImm" # Low # "_" # High; +} + +def Imm1_8Operand : AsmImmRange<1, 8>; +def Imm1_16Operand : AsmImmRange<1, 16>; +def Imm1_32Operand : AsmImmRange<1, 32>; +def Imm1_64Operand : AsmImmRange<1, 64>; + +def MovZSymbolG3AsmOperand : AsmOperandClass { + let Name = "MovZSymbolG3"; + let RenderMethod = "addImmOperands"; +} + +def movz_symbol_g3 : Operand { + let ParserMatchClass = MovZSymbolG3AsmOperand; +} + +def MovZSymbolG2AsmOperand : AsmOperandClass { + let Name = "MovZSymbolG2"; + let RenderMethod = "addImmOperands"; +} + +def movz_symbol_g2 : Operand { + let ParserMatchClass = MovZSymbolG2AsmOperand; +} + +def MovZSymbolG1AsmOperand : AsmOperandClass { + let Name = "MovZSymbolG1"; + let RenderMethod = "addImmOperands"; +} + +def movz_symbol_g1 : Operand { + let ParserMatchClass = MovZSymbolG1AsmOperand; +} + +def MovZSymbolG0AsmOperand : AsmOperandClass { + let Name = "MovZSymbolG0"; + let RenderMethod = "addImmOperands"; +} + +def movz_symbol_g0 : Operand { + let ParserMatchClass = MovZSymbolG0AsmOperand; +} + +def MovKSymbolG3AsmOperand : AsmOperandClass { + let Name = "MovKSymbolG3"; + let RenderMethod = "addImmOperands"; +} + +def movk_symbol_g3 : Operand { + let ParserMatchClass = MovKSymbolG3AsmOperand; +} + +def MovKSymbolG2AsmOperand : AsmOperandClass { + let Name = "MovKSymbolG2"; + let RenderMethod = "addImmOperands"; +} + +def movk_symbol_g2 : Operand { + let ParserMatchClass = MovKSymbolG2AsmOperand; +} + +def MovKSymbolG1AsmOperand : AsmOperandClass { + let Name = "MovKSymbolG1"; + let RenderMethod = "addImmOperands"; +} + +def movk_symbol_g1 : Operand { + let ParserMatchClass = MovKSymbolG1AsmOperand; +} + +def MovKSymbolG0AsmOperand : AsmOperandClass { + let Name = "MovKSymbolG0"; + let RenderMethod = "addImmOperands"; +} + +def movk_symbol_g0 : Operand { + let ParserMatchClass = MovKSymbolG0AsmOperand; +} + +class fixedpoint_i32 + : Operand, + ComplexPattern", 
[fpimm, ld]> { + let EncoderMethod = "getFixedPointScaleOpValue"; + let DecoderMethod = "DecodeFixedPointScaleImm32"; + let ParserMatchClass = Imm1_32Operand; +} + +class fixedpoint_i64 + : Operand, + ComplexPattern", [fpimm, ld]> { + let EncoderMethod = "getFixedPointScaleOpValue"; + let DecoderMethod = "DecodeFixedPointScaleImm64"; + let ParserMatchClass = Imm1_64Operand; +} + +def fixedpoint_f32_i32 : fixedpoint_i32; +def fixedpoint_f64_i32 : fixedpoint_i32; + +def fixedpoint_f32_i64 : fixedpoint_i64; +def fixedpoint_f64_i64 : fixedpoint_i64; + +def vecshiftR8 : Operand, ImmLeaf 0) && (((uint32_t)Imm) < 9); +}]> { + let EncoderMethod = "getVecShiftR8OpValue"; + let DecoderMethod = "DecodeVecShiftR8Imm"; + let ParserMatchClass = Imm1_8Operand; +} +def vecshiftR16 : Operand, ImmLeaf 0) && (((uint32_t)Imm) < 17); +}]> { + let EncoderMethod = "getVecShiftR16OpValue"; + let DecoderMethod = "DecodeVecShiftR16Imm"; + let ParserMatchClass = Imm1_16Operand; +} +def vecshiftR16Narrow : Operand, ImmLeaf 0) && (((uint32_t)Imm) < 9); +}]> { + let EncoderMethod = "getVecShiftR16OpValue"; + let DecoderMethod = "DecodeVecShiftR16ImmNarrow"; + let ParserMatchClass = Imm1_8Operand; +} +def vecshiftR32 : Operand, ImmLeaf 0) && (((uint32_t)Imm) < 33); +}]> { + let EncoderMethod = "getVecShiftR32OpValue"; + let DecoderMethod = "DecodeVecShiftR32Imm"; + let ParserMatchClass = Imm1_32Operand; +} +def vecshiftR32Narrow : Operand, ImmLeaf 0) && (((uint32_t)Imm) < 17); +}]> { + let EncoderMethod = "getVecShiftR32OpValue"; + let DecoderMethod = "DecodeVecShiftR32ImmNarrow"; + let ParserMatchClass = Imm1_16Operand; +} +def vecshiftR64 : Operand, ImmLeaf 0) && (((uint32_t)Imm) < 65); +}]> { + let EncoderMethod = "getVecShiftR64OpValue"; + let DecoderMethod = "DecodeVecShiftR64Imm"; + let ParserMatchClass = Imm1_64Operand; +} +def vecshiftR64Narrow : Operand, ImmLeaf 0) && (((uint32_t)Imm) < 33); +}]> { + let EncoderMethod = "getVecShiftR64OpValue"; + let DecoderMethod = "DecodeVecShiftR64ImmNarrow"; + let ParserMatchClass = Imm1_32Operand; +} + +def Imm0_7Operand : AsmImmRange<0, 7>; +def Imm0_15Operand : AsmImmRange<0, 15>; +def Imm0_31Operand : AsmImmRange<0, 31>; +def Imm0_63Operand : AsmImmRange<0, 63>; + +def vecshiftL8 : Operand, ImmLeaf { + let EncoderMethod = "getVecShiftL8OpValue"; + let DecoderMethod = "DecodeVecShiftL8Imm"; + let ParserMatchClass = Imm0_7Operand; +} +def vecshiftL16 : Operand, ImmLeaf { + let EncoderMethod = "getVecShiftL16OpValue"; + let DecoderMethod = "DecodeVecShiftL16Imm"; + let ParserMatchClass = Imm0_15Operand; +} +def vecshiftL32 : Operand, ImmLeaf { + let EncoderMethod = "getVecShiftL32OpValue"; + let DecoderMethod = "DecodeVecShiftL32Imm"; + let ParserMatchClass = Imm0_31Operand; +} +def vecshiftL64 : Operand, ImmLeaf { + let EncoderMethod = "getVecShiftL64OpValue"; + let DecoderMethod = "DecodeVecShiftL64Imm"; + let ParserMatchClass = Imm0_63Operand; +} + + +// Crazy immediate formats used by 32-bit and 64-bit logical immediate +// instructions for splatting repeating bit patterns across the immediate. 
+def logical_imm32_XFORM : SDNodeXFormgetZExtValue(), 32); + return CurDAG->getTargetConstant(enc, MVT::i32); +}]>; +def logical_imm64_XFORM : SDNodeXFormgetZExtValue(), 64); + return CurDAG->getTargetConstant(enc, MVT::i32); +}]>; + +def LogicalImm32Operand : AsmOperandClass { + let Name = "LogicalImm32"; + let DiagnosticType = "LogicalSecondSource"; +} +def LogicalImm64Operand : AsmOperandClass { + let Name = "LogicalImm64"; + let DiagnosticType = "LogicalSecondSource"; +} +def logical_imm32 : Operand, PatLeaf<(imm), [{ + return AArch64_AM::isLogicalImmediate(N->getZExtValue(), 32); +}], logical_imm32_XFORM> { + let PrintMethod = "printLogicalImm32"; + let ParserMatchClass = LogicalImm32Operand; +} +def logical_imm64 : Operand, PatLeaf<(imm), [{ + return AArch64_AM::isLogicalImmediate(N->getZExtValue(), 64); +}], logical_imm64_XFORM> { + let PrintMethod = "printLogicalImm64"; + let ParserMatchClass = LogicalImm64Operand; +} + +// imm0_65535 predicate - True if the immediate is in the range [0,65535]. +def Imm0_65535Operand : AsmImmRange<0, 65535>; +def imm0_65535 : Operand, ImmLeaf { + let ParserMatchClass = Imm0_65535Operand; + let PrintMethod = "printHexImm"; +} + +// imm0_255 predicate - True if the immediate is in the range [0,255]. +def Imm0_255Operand : AsmOperandClass { let Name = "Imm0_255"; } +def imm0_255 : Operand, ImmLeaf { + let ParserMatchClass = Imm0_255Operand; + let PrintMethod = "printHexImm"; +} + +// imm0_127 predicate - True if the immediate is in the range [0,127] +def Imm0_127Operand : AsmImmRange<0, 127>; +def imm0_127 : Operand, ImmLeaf { + let ParserMatchClass = Imm0_127Operand; + let PrintMethod = "printHexImm"; +} + +// NOTE: These imm0_N operands have to be of type i64 because i64 is the size +// for all shift-amounts. 
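A concrete illustration, not part of the patch, of what the isLogicalImmediate checks above accept: a constant is only encodable as a logical immediate if it is a replicated, rotated run of contiguous ones, so some masks fold straight into the instruction while others must be materialized first.

#include <cstdint>

// Encodable: 0x00ff00ff00ff00ff is the 16-bit element 0x00ff replicated
// across the register, a contiguous run of ones, so this is a single AND
// with a logical immediate.
uint64_t mask_low_bytes(uint64_t x) { return x & 0x00ff00ff00ff00ffULL; }

// Not encodable: 0x123456789abcdef0 is not a rotated run of ones, so the
// constant is expected to be built in a register (e.g. MOVZ/MOVK) first.
uint64_t mask_arbitrary(uint64_t x) { return x & 0x123456789abcdef0ULL; }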
+ +// imm0_63 predicate - True if the immediate is in the range [0,63] +def imm0_63 : Operand, ImmLeaf { + let ParserMatchClass = Imm0_63Operand; +} + +// imm0_31 predicate - True if the immediate is in the range [0,31] +def imm0_31 : Operand, ImmLeaf { + let ParserMatchClass = Imm0_31Operand; +} + +// imm0_15 predicate - True if the immediate is in the range [0,15] +def imm0_15 : Operand, ImmLeaf { + let ParserMatchClass = Imm0_15Operand; +} + +// imm0_7 predicate - True if the immediate is in the range [0,7] +def imm0_7 : Operand, ImmLeaf { + let ParserMatchClass = Imm0_7Operand; +} + +// An arithmetic shifter operand: +// {7-6} - shift type: 00 = lsl, 01 = lsr, 10 = asr +// {5-0} - imm6 +class arith_shift : Operand { + let PrintMethod = "printShifter"; + let ParserMatchClass = !cast( + "ArithmeticShifterOperand" # width); +} + +def arith_shift32 : arith_shift; +def arith_shift64 : arith_shift; + +class arith_shifted_reg + : Operand, + ComplexPattern { + let PrintMethod = "printShiftedRegister"; + let MIOperandInfo = (ops regclass, !cast("arith_shift" # width)); +} + +def arith_shifted_reg32 : arith_shifted_reg; +def arith_shifted_reg64 : arith_shifted_reg; + +// An arithmetic shifter operand: +// {7-6} - shift type: 00 = lsl, 01 = lsr, 10 = asr, 11 = ror +// {5-0} - imm6 +class logical_shift : Operand { + let PrintMethod = "printShifter"; + let ParserMatchClass = !cast( + "LogicalShifterOperand" # width); +} + +def logical_shift32 : logical_shift<32>; +def logical_shift64 : logical_shift<64>; + +class logical_shifted_reg + : Operand, + ComplexPattern { + let PrintMethod = "printShiftedRegister"; + let MIOperandInfo = (ops regclass, shiftop); +} + +def logical_shifted_reg32 : logical_shifted_reg; +def logical_shifted_reg64 : logical_shifted_reg; + +// A logical vector shifter operand: +// {7-6} - shift type: 00 = lsl +// {5-0} - imm6: #0, #8, #16, or #24 +def logical_vec_shift : Operand { + let PrintMethod = "printShifter"; + let EncoderMethod = "getVecShifterOpValue"; + let ParserMatchClass = LogicalVecShifterOperand; +} + +// A logical vector half-word shifter operand: +// {7-6} - shift type: 00 = lsl +// {5-0} - imm6: #0 or #8 +def logical_vec_hw_shift : Operand { + let PrintMethod = "printShifter"; + let EncoderMethod = "getVecShifterOpValue"; + let ParserMatchClass = LogicalVecHalfWordShifterOperand; +} + +// A vector move shifter operand: +// {0} - imm1: #8 or #16 +def move_vec_shift : Operand { + let PrintMethod = "printShifter"; + let EncoderMethod = "getMoveVecShifterOpValue"; + let ParserMatchClass = MoveVecShifterOperand; +} + +def AddSubImmOperand : AsmOperandClass { + let Name = "AddSubImm"; + let ParserMethod = "tryParseAddSubImm"; + let DiagnosticType = "AddSubSecondSource"; +} +// An ADD/SUB immediate shifter operand: +// second operand: +// {7-6} - shift type: 00 = lsl +// {5-0} - imm6: #0 or #12 +class addsub_shifted_imm + : Operand, ComplexPattern { + let PrintMethod = "printAddSubImm"; + let EncoderMethod = "getAddSubImmOpValue"; + let ParserMatchClass = AddSubImmOperand; + let MIOperandInfo = (ops i32imm, i32imm); +} + +def addsub_shifted_imm32 : addsub_shifted_imm; +def addsub_shifted_imm64 : addsub_shifted_imm; + +class neg_addsub_shifted_imm + : Operand, ComplexPattern { + let PrintMethod = "printAddSubImm"; + let EncoderMethod = "getAddSubImmOpValue"; + let ParserMatchClass = AddSubImmOperand; + let MIOperandInfo = (ops i32imm, i32imm); +} + +def neg_addsub_shifted_imm32 : neg_addsub_shifted_imm; +def neg_addsub_shifted_imm64 : neg_addsub_shifted_imm; + +// An 
extend operand: +// {5-3} - extend type +// {2-0} - imm3 +def arith_extend : Operand { + let PrintMethod = "printArithExtend"; + let ParserMatchClass = ExtendOperand; +} +def arith_extend64 : Operand { + let PrintMethod = "printArithExtend"; + let ParserMatchClass = ExtendOperand64; +} + +// 'extend' that's a lsl of a 64-bit register. +def arith_extendlsl64 : Operand { + let PrintMethod = "printArithExtend"; + let ParserMatchClass = ExtendOperandLSL64; +} + +class arith_extended_reg32 : Operand, + ComplexPattern { + let PrintMethod = "printExtendedRegister"; + let MIOperandInfo = (ops GPR32, arith_extend); +} + +class arith_extended_reg32to64 : Operand, + ComplexPattern { + let PrintMethod = "printExtendedRegister"; + let MIOperandInfo = (ops GPR32, arith_extend64); +} + +// Floating-point immediate. +def fpimm32 : Operand, + PatLeaf<(f32 fpimm), [{ + return AArch64_AM::getFP32Imm(N->getValueAPF()) != -1; + }], SDNodeXFormgetValueAPF(); + uint32_t enc = AArch64_AM::getFP32Imm(InVal); + return CurDAG->getTargetConstant(enc, MVT::i32); + }]>> { + let ParserMatchClass = FPImmOperand; + let PrintMethod = "printFPImmOperand"; +} +def fpimm64 : Operand, + PatLeaf<(f64 fpimm), [{ + return AArch64_AM::getFP64Imm(N->getValueAPF()) != -1; + }], SDNodeXFormgetValueAPF(); + uint32_t enc = AArch64_AM::getFP64Imm(InVal); + return CurDAG->getTargetConstant(enc, MVT::i32); + }]>> { + let ParserMatchClass = FPImmOperand; + let PrintMethod = "printFPImmOperand"; +} + +def fpimm8 : Operand { + let ParserMatchClass = FPImmOperand; + let PrintMethod = "printFPImmOperand"; +} + +def fpimm0 : PatLeaf<(fpimm), [{ + return N->isExactlyValue(+0.0); +}]>; + +// Vector lane operands +class AsmVectorIndex : AsmOperandClass { + let Name = "VectorIndex" # Suffix; + let DiagnosticType = "InvalidIndex" # Suffix; +} +def VectorIndex1Operand : AsmVectorIndex<"1">; +def VectorIndexBOperand : AsmVectorIndex<"B">; +def VectorIndexHOperand : AsmVectorIndex<"H">; +def VectorIndexSOperand : AsmVectorIndex<"S">; +def VectorIndexDOperand : AsmVectorIndex<"D">; + +def VectorIndex1 : Operand, ImmLeaf { + let ParserMatchClass = VectorIndex1Operand; + let PrintMethod = "printVectorIndex"; + let MIOperandInfo = (ops i64imm); +} +def VectorIndexB : Operand, ImmLeaf { + let ParserMatchClass = VectorIndexBOperand; + let PrintMethod = "printVectorIndex"; + let MIOperandInfo = (ops i64imm); +} +def VectorIndexH : Operand, ImmLeaf { + let ParserMatchClass = VectorIndexHOperand; + let PrintMethod = "printVectorIndex"; + let MIOperandInfo = (ops i64imm); +} +def VectorIndexS : Operand, ImmLeaf { + let ParserMatchClass = VectorIndexSOperand; + let PrintMethod = "printVectorIndex"; + let MIOperandInfo = (ops i64imm); +} +def VectorIndexD : Operand, ImmLeaf { + let ParserMatchClass = VectorIndexDOperand; + let PrintMethod = "printVectorIndex"; + let MIOperandInfo = (ops i64imm); +} + +// 8-bit immediate for AdvSIMD where 64-bit values of the form: +// aaaaaaaa bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh +// are encoded as the eight bit value 'abcdefgh'. 
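A worked example, not part of the patch, of the 'abcdefgh' encoding described above: each of the eight immediate bits is replicated into a full byte of the 64-bit value.

#include <cstdint>

// Decode an AdvSIMD "type 10" immediate: bit i of 'imm' becomes byte i
// (0x00 or 0xff) of the result, so 0b10110001 -> 0xff00ffff000000ff.
uint64_t expandType10(uint8_t imm) {
  uint64_t result = 0;
  for (int i = 0; i < 8; ++i)
    if (imm & (1u << i))
      result |= 0xffULL << (8 * i);
  return result;
}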
+def simdimmtype10 : Operand, + PatLeaf<(f64 fpimm), [{ + return AArch64_AM::isAdvSIMDModImmType10(N->getValueAPF() + .bitcastToAPInt() + .getZExtValue()); + }], SDNodeXFormgetValueAPF(); + uint32_t enc = AArch64_AM::encodeAdvSIMDModImmType10(N->getValueAPF() + .bitcastToAPInt() + .getZExtValue()); + return CurDAG->getTargetConstant(enc, MVT::i32); + }]>> { + let ParserMatchClass = SIMDImmType10Operand; + let PrintMethod = "printSIMDType10Operand"; +} + + +//--- +// System management +//--- + +// Base encoding for system instruction operands. +let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in +class BaseSystemI + : I { + let Inst{31-22} = 0b1101010100; + let Inst{21} = L; +} + +// System instructions which do not have an Rt register. +class SimpleSystemI + : BaseSystemI { + let Inst{4-0} = 0b11111; +} + +// System instructions which have an Rt register. +class RtSystemI + : BaseSystemI, + Sched<[WriteSys]> { + bits<5> Rt; + let Inst{4-0} = Rt; +} + +// Hint instructions that take both a CRm and a 3-bit immediate. +class HintI + : SimpleSystemI<0, (ins imm0_127:$imm), mnemonic#" $imm", "">, + Sched<[WriteHint]> { + bits <7> imm; + let Inst{20-12} = 0b000110010; + let Inst{11-5} = imm; +} + +// System instructions taking a single literal operand which encodes into +// CRm. op2 differentiates the opcodes. +def BarrierAsmOperand : AsmOperandClass { + let Name = "Barrier"; + let ParserMethod = "tryParseBarrierOperand"; +} +def barrier_op : Operand { + let PrintMethod = "printBarrierOption"; + let ParserMatchClass = BarrierAsmOperand; +} +class CRmSystemI opc, string asm> + : SimpleSystemI<0, (ins crmtype:$CRm), asm, "\t$CRm">, + Sched<[WriteBarrier]> { + bits<4> CRm; + let Inst{20-12} = 0b000110011; + let Inst{11-8} = CRm; + let Inst{7-5} = opc; +} + +// MRS/MSR system instructions. These have different operand classes because +// a different subset of registers can be accessed through each instruction. +def MRSSystemRegisterOperand : AsmOperandClass { + let Name = "MRSSystemRegister"; + let ParserMethod = "tryParseSysReg"; + let DiagnosticType = "MRS"; +} +// concatenation of 1, op0, op1, CRn, CRm, op2. 16-bit immediate. +def mrs_sysreg_op : Operand { + let ParserMatchClass = MRSSystemRegisterOperand; + let DecoderMethod = "DecodeMRSSystemRegister"; + let PrintMethod = "printMRSSystemRegister"; +} + +def MSRSystemRegisterOperand : AsmOperandClass { + let Name = "MSRSystemRegister"; + let ParserMethod = "tryParseSysReg"; + let DiagnosticType = "MSR"; +} +def msr_sysreg_op : Operand { + let ParserMatchClass = MSRSystemRegisterOperand; + let DecoderMethod = "DecodeMSRSystemRegister"; + let PrintMethod = "printMSRSystemRegister"; +} + +class MRSI : RtSystemI<1, (outs GPR64:$Rt), (ins mrs_sysreg_op:$systemreg), + "mrs", "\t$Rt, $systemreg"> { + bits<15> systemreg; + let Inst{20} = 1; + let Inst{19-5} = systemreg; +} + +// FIXME: Some of these def NZCV, others don't. Best way to model that? +// Explicitly modeling each of the system register as a register class +// would do it, but feels like overkill at this point. 
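For orientation, an aside that is not part of the patch: the MRS form defined above is what reads of EL0-accessible system registers turn into, for example the thread-pointer register via inline assembly.

#include <cstdint>

// Read TPIDR_EL0 with an MRS instruction; the system register operand is
// what mrs_sysreg_op above parses and prints.  Illustrative inline asm only.
static inline uint64_t read_thread_pointer() {
  uint64_t tp;
  asm volatile("mrs %0, TPIDR_EL0" : "=r"(tp));
  return tp;
}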
+class MSRI : RtSystemI<0, (outs), (ins msr_sysreg_op:$systemreg, GPR64:$Rt), + "msr", "\t$systemreg, $Rt"> { + bits<15> systemreg; + let Inst{20} = 1; + let Inst{19-5} = systemreg; +} + +def SystemPStateFieldOperand : AsmOperandClass { + let Name = "SystemPStateField"; + let ParserMethod = "tryParseSysReg"; +} +def pstatefield_op : Operand { + let ParserMatchClass = SystemPStateFieldOperand; + let PrintMethod = "printSystemPStateField"; +} + +let Defs = [NZCV] in +class MSRpstateI + : SimpleSystemI<0, (ins pstatefield_op:$pstate_field, imm0_15:$imm), + "msr", "\t$pstate_field, $imm">, + Sched<[WriteSys]> { + bits<6> pstatefield; + bits<4> imm; + let Inst{20-19} = 0b00; + let Inst{18-16} = pstatefield{5-3}; + let Inst{15-12} = 0b0100; + let Inst{11-8} = imm; + let Inst{7-5} = pstatefield{2-0}; + + let DecoderMethod = "DecodeSystemPStateInstruction"; +} + +// SYS and SYSL generic system instructions. +def SysCRAsmOperand : AsmOperandClass { + let Name = "SysCR"; + let ParserMethod = "tryParseSysCROperand"; +} + +def sys_cr_op : Operand { + let PrintMethod = "printSysCROperand"; + let ParserMatchClass = SysCRAsmOperand; +} + +class SystemXtI + : RtSystemI { + bits<3> op1; + bits<4> Cn; + bits<4> Cm; + bits<3> op2; + let Inst{20-19} = 0b01; + let Inst{18-16} = op1; + let Inst{15-12} = Cn; + let Inst{11-8} = Cm; + let Inst{7-5} = op2; +} + +class SystemLXtI + : RtSystemI { + bits<3> op1; + bits<4> Cn; + bits<4> Cm; + bits<3> op2; + let Inst{20-19} = 0b01; + let Inst{18-16} = op1; + let Inst{15-12} = Cn; + let Inst{11-8} = Cm; + let Inst{7-5} = op2; +} + + +// Branch (register) instructions: +// +// case opc of +// 0001 blr +// 0000 br +// 0101 dret +// 0100 eret +// 0010 ret +// otherwise UNDEFINED +class BaseBranchReg opc, dag oops, dag iops, string asm, + string operands, list pattern> + : I, Sched<[WriteBrReg]> { + let Inst{31-25} = 0b1101011; + let Inst{24-21} = opc; + let Inst{20-16} = 0b11111; + let Inst{15-10} = 0b000000; + let Inst{4-0} = 0b00000; +} + +class BranchReg opc, string asm, list pattern> + : BaseBranchReg { + bits<5> Rn; + let Inst{9-5} = Rn; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 1, isReturn = 1 in +class SpecialReturn opc, string asm> + : BaseBranchReg { + let Inst{9-5} = 0b11111; +} + +//--- +// Conditional branch instruction. +//--- + +// Condition code. +// 4-bit immediate. Pretty-printed as +def ccode : Operand { + let PrintMethod = "printCondCode"; + let ParserMatchClass = CondCode; +} +def inv_ccode : Operand { + let PrintMethod = "printInverseCondCode"; + let ParserMatchClass = CondCode; +} + +// Conditional branch target. 19-bit immediate. The low two bits of the target +// offset are implied zero and so are not part of the immediate. +def PCRelLabel19Operand : AsmOperandClass { + let Name = "PCRelLabel19"; + let DiagnosticType = "InvalidLabel"; +} +def am_brcond : Operand { + let EncoderMethod = "getCondBranchTargetOpValue"; + let DecoderMethod = "DecodePCRelLabel19"; + let PrintMethod = "printAlignedLabel"; + let ParserMatchClass = PCRelLabel19Operand; +} + +class BranchCond : I<(outs), (ins ccode:$cond, am_brcond:$target), + "b", ".$cond\t$target", "", + [(AArch64brcond bb:$target, imm:$cond, NZCV)]>, + Sched<[WriteBr]> { + let isBranch = 1; + let isTerminator = 1; + let Uses = [NZCV]; + + bits<4> cond; + bits<19> target; + let Inst{31-24} = 0b01010100; + let Inst{23-5} = target; + let Inst{4} = 0; + let Inst{3-0} = cond; +} + +//--- +// Compare-and-branch instructions. 
+//--- +class BaseCmpBranch + : I<(outs), (ins regtype:$Rt, am_brcond:$target), + asm, "\t$Rt, $target", "", + [(node regtype:$Rt, bb:$target)]>, + Sched<[WriteBr]> { + let isBranch = 1; + let isTerminator = 1; + + bits<5> Rt; + bits<19> target; + let Inst{30-25} = 0b011010; + let Inst{24} = op; + let Inst{23-5} = target; + let Inst{4-0} = Rt; +} + +multiclass CmpBranch { + def W : BaseCmpBranch { + let Inst{31} = 0; + } + def X : BaseCmpBranch { + let Inst{31} = 1; + } +} + +//--- +// Test-bit-and-branch instructions. +//--- +// Test-and-branch target. 14-bit sign-extended immediate. The low two bits of +// the target offset are implied zero and so are not part of the immediate. +def BranchTarget14Operand : AsmOperandClass { + let Name = "BranchTarget14"; +} +def am_tbrcond : Operand { + let EncoderMethod = "getTestBranchTargetOpValue"; + let PrintMethod = "printAlignedLabel"; + let ParserMatchClass = BranchTarget14Operand; +} + +// AsmOperand classes to emit (or not) special diagnostics +def TBZImm0_31Operand : AsmOperandClass { + let Name = "TBZImm0_31"; + let PredicateMethod = "isImm0_31"; + let RenderMethod = "addImm0_31Operands"; +} +def TBZImm32_63Operand : AsmOperandClass { + let Name = "Imm32_63"; + let DiagnosticType = "InvalidImm0_63"; +} + +class tbz_imm0_31 : Operand, ImmLeaf { + let ParserMatchClass = matcher; +} + +def tbz_imm0_31_diag : tbz_imm0_31; +def tbz_imm0_31_nodiag : tbz_imm0_31; + +def tbz_imm32_63 : Operand, ImmLeaf 31) && (((uint32_t)Imm) < 64); +}]> { + let ParserMatchClass = TBZImm32_63Operand; +} + +class BaseTestBranch + : I<(outs), (ins regtype:$Rt, immtype:$bit_off, am_tbrcond:$target), + asm, "\t$Rt, $bit_off, $target", "", + [(node regtype:$Rt, immtype:$bit_off, bb:$target)]>, + Sched<[WriteBr]> { + let isBranch = 1; + let isTerminator = 1; + + bits<5> Rt; + bits<6> bit_off; + bits<14> target; + + let Inst{30-25} = 0b011011; + let Inst{24} = op; + let Inst{23-19} = bit_off{4-0}; + let Inst{18-5} = target; + let Inst{4-0} = Rt; + + let DecoderMethod = "DecodeTestAndBranch"; +} + +multiclass TestBranch { + def W : BaseTestBranch { + let Inst{31} = 0; + } + + def X : BaseTestBranch { + let Inst{31} = 1; + } + + // Alias X-reg with 0-31 imm to W-Reg. + def : InstAlias(NAME#"W") GPR32as64:$Rd, + tbz_imm0_31_nodiag:$imm, am_tbrcond:$target), 0>; + def : Pat<(node GPR64:$Rn, tbz_imm0_31_diag:$imm, bb:$target), + (!cast(NAME#"W") (EXTRACT_SUBREG GPR64:$Rn, sub_32), + tbz_imm0_31_diag:$imm, bb:$target)>; +} + +//--- +// Unconditional branch (immediate) instructions. +//--- +def BranchTarget26Operand : AsmOperandClass { + let Name = "BranchTarget26"; + let DiagnosticType = "InvalidLabel"; +} +def am_b_target : Operand { + let EncoderMethod = "getBranchTargetOpValue"; + let PrintMethod = "printAlignedLabel"; + let ParserMatchClass = BranchTarget26Operand; +} +def am_bl_target : Operand { + let EncoderMethod = "getBranchTargetOpValue"; + let PrintMethod = "printAlignedLabel"; + let ParserMatchClass = BranchTarget26Operand; +} + +class BImm pattern> + : I<(outs), iops, asm, "\t$addr", "", pattern>, Sched<[WriteBr]> { + bits<26> addr; + let Inst{31} = op; + let Inst{30-26} = 0b00101; + let Inst{25-0} = addr; + + let DecoderMethod = "DecodeUnconditionalBranch"; +} + +class BranchImm pattern> + : BImm; +class CallImm pattern> + : BImm; + +//--- +// Basic one-operand data processing instructions. 
+//--- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseOneOperandData opc, RegisterClass regtype, string asm, + SDPatternOperator node> + : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, "\t$Rd, $Rn", "", + [(set regtype:$Rd, (node regtype:$Rn))]>, + Sched<[WriteI, ReadI]> { + bits<5> Rd; + bits<5> Rn; + + let Inst{30-13} = 0b101101011000000000; + let Inst{12-10} = opc; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +multiclass OneOperandData opc, string asm, + SDPatternOperator node = null_frag> { + def Wr : BaseOneOperandData { + let Inst{31} = 0; + } + + def Xr : BaseOneOperandData { + let Inst{31} = 1; + } +} + +class OneWRegData opc, string asm, SDPatternOperator node> + : BaseOneOperandData { + let Inst{31} = 0; +} + +class OneXRegData opc, string asm, SDPatternOperator node> + : BaseOneOperandData { + let Inst{31} = 1; +} + +//--- +// Basic two-operand data processing instructions. +//--- +class BaseBaseAddSubCarry pattern> + : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), + asm, "\t$Rd, $Rn, $Rm", "", pattern>, + Sched<[WriteI, ReadI, ReadI]> { + let Uses = [NZCV]; + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + let Inst{30} = isSub; + let Inst{28-21} = 0b11010000; + let Inst{20-16} = Rm; + let Inst{15-10} = 0; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +class BaseAddSubCarry + : BaseBaseAddSubCarry; + +class BaseAddSubCarrySetFlags + : BaseBaseAddSubCarry { + let Defs = [NZCV]; +} + +multiclass AddSubCarry { + def Wr : BaseAddSubCarry { + let Inst{31} = 0; + let Inst{29} = 0; + } + def Xr : BaseAddSubCarry { + let Inst{31} = 1; + let Inst{29} = 0; + } + + // Sets flags. + def SWr : BaseAddSubCarrySetFlags { + let Inst{31} = 0; + let Inst{29} = 1; + } + def SXr : BaseAddSubCarrySetFlags { + let Inst{31} = 1; + let Inst{29} = 1; + } +} + +class BaseTwoOperand opc, RegisterClass regtype, string asm, + SDPatternOperator OpNode> + : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), + asm, "\t$Rd, $Rn, $Rm", "", + [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm))]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + let Inst{30-21} = 0b0011010110; + let Inst{20-16} = Rm; + let Inst{15-14} = 0b00; + let Inst{13-10} = opc; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +class BaseDiv + : BaseTwoOperand<{0,0,1,?}, regtype, asm, OpNode> { + let Inst{10} = isSigned; +} + +multiclass Div { + def Wr : BaseDiv, + Sched<[WriteID32, ReadID, ReadID]> { + let Inst{31} = 0; + } + def Xr : BaseDiv, + Sched<[WriteID64, ReadID, ReadID]> { + let Inst{31} = 1; + } +} + +class BaseShift shift_type, RegisterClass regtype, string asm, + SDPatternOperator OpNode = null_frag> + : BaseTwoOperand<{1,0,?,?}, regtype, asm, OpNode>, + Sched<[WriteIS, ReadI]> { + let Inst{11-10} = shift_type; +} + +multiclass Shift shift_type, string asm, SDNode OpNode> { + def Wr : BaseShift { + let Inst{31} = 0; + } + + def Xr : BaseShift { + let Inst{31} = 1; + } + + def : Pat<(i32 (OpNode GPR32:$Rn, i64:$Rm)), + (!cast(NAME # "Wr") GPR32:$Rn, + (EXTRACT_SUBREG i64:$Rm, sub_32))>; + + def : Pat<(i32 (OpNode GPR32:$Rn, (i64 (zext GPR32:$Rm)))), + (!cast(NAME # "Wr") GPR32:$Rn, GPR32:$Rm)>; + + def : Pat<(i32 (OpNode GPR32:$Rn, (i64 (anyext GPR32:$Rm)))), + (!cast(NAME # "Wr") GPR32:$Rn, GPR32:$Rm)>; + + def : Pat<(i32 (OpNode GPR32:$Rn, (i64 (sext GPR32:$Rm)))), + (!cast(NAME # "Wr") GPR32:$Rn, GPR32:$Rm)>; +} + +class ShiftAlias + : InstAlias; + +class BaseMulAccum opc, RegisterClass multype, + RegisterClass addtype, string asm, + list 
pattern> + : I<(outs addtype:$Rd), (ins multype:$Rn, multype:$Rm, addtype:$Ra), + asm, "\t$Rd, $Rn, $Rm, $Ra", "", pattern> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + bits<5> Ra; + let Inst{30-24} = 0b0011011; + let Inst{23-21} = opc; + let Inst{20-16} = Rm; + let Inst{15} = isSub; + let Inst{14-10} = Ra; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass MulAccum { + def Wrrr : BaseMulAccum, + Sched<[WriteIM32, ReadIMA, ReadIM, ReadIM]> { + let Inst{31} = 0; + } + + def Xrrr : BaseMulAccum, + Sched<[WriteIM64, ReadIMA, ReadIM, ReadIM]> { + let Inst{31} = 1; + } +} + +class WideMulAccum opc, string asm, + SDNode AccNode, SDNode ExtNode> + : BaseMulAccum, + Sched<[WriteIM32, ReadIMA, ReadIM, ReadIM]> { + let Inst{31} = 1; +} + +class MulHi opc, string asm, SDNode OpNode> + : I<(outs GPR64:$Rd), (ins GPR64:$Rn, GPR64:$Rm), + asm, "\t$Rd, $Rn, $Rm", "", + [(set GPR64:$Rd, (OpNode GPR64:$Rn, GPR64:$Rm))]>, + Sched<[WriteIM64, ReadIM, ReadIM]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + let Inst{31-24} = 0b10011011; + let Inst{23-21} = opc; + let Inst{20-16} = Rm; + let Inst{15} = 0; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; + + // The Ra field of SMULH and UMULH is unused: it should be assembled as 31 + // (i.e. all bits 1) but is ignored by the processor. + let PostEncoderMethod = "fixMulHigh"; +} + +class MulAccumWAlias + : InstAlias; +class MulAccumXAlias + : InstAlias; +class WideMulAccumAlias + : InstAlias; + +class BaseCRC32 sz, bit C, RegisterClass StreamReg, + SDPatternOperator OpNode, string asm> + : I<(outs GPR32:$Rd), (ins GPR32:$Rn, StreamReg:$Rm), + asm, "\t$Rd, $Rn, $Rm", "", + [(set GPR32:$Rd, (OpNode GPR32:$Rn, StreamReg:$Rm))]>, + Sched<[WriteISReg, ReadI, ReadISReg]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + + let Inst{31} = sf; + let Inst{30-21} = 0b0011010110; + let Inst{20-16} = Rm; + let Inst{15-13} = 0b010; + let Inst{12} = C; + let Inst{11-10} = sz; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; + let Predicates = [HasCRC]; +} + +//--- +// Address generation. +//--- + +class ADRI pattern> + : I<(outs GPR64:$Xd), (ins adr:$label), asm, "\t$Xd, $label", "", + pattern>, + Sched<[WriteI]> { + bits<5> Xd; + bits<21> label; + let Inst{31} = page; + let Inst{30-29} = label{1-0}; + let Inst{28-24} = 0b10000; + let Inst{23-5} = label{20-2}; + let Inst{4-0} = Xd; + + let DecoderMethod = "DecodeAdrInstruction"; +} + +//--- +// Move immediate. 
+//--- + +def movimm32_imm : Operand { + let ParserMatchClass = Imm0_65535Operand; + let EncoderMethod = "getMoveWideImmOpValue"; + let PrintMethod = "printHexImm"; +} +def movimm32_shift : Operand { + let PrintMethod = "printShifter"; + let ParserMatchClass = MovImm32ShifterOperand; +} +def movimm64_shift : Operand { + let PrintMethod = "printShifter"; + let ParserMatchClass = MovImm64ShifterOperand; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseMoveImmediate opc, RegisterClass regtype, Operand shifter, + string asm> + : I<(outs regtype:$Rd), (ins movimm32_imm:$imm, shifter:$shift), + asm, "\t$Rd, $imm$shift", "", []>, + Sched<[WriteImm]> { + bits<5> Rd; + bits<16> imm; + bits<6> shift; + let Inst{30-29} = opc; + let Inst{28-23} = 0b100101; + let Inst{22-21} = shift{5-4}; + let Inst{20-5} = imm; + let Inst{4-0} = Rd; + + let DecoderMethod = "DecodeMoveImmInstruction"; +} + +multiclass MoveImmediate opc, string asm> { + def Wi : BaseMoveImmediate { + let Inst{31} = 0; + } + + def Xi : BaseMoveImmediate { + let Inst{31} = 1; + } +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseInsertImmediate opc, RegisterClass regtype, Operand shifter, + string asm> + : I<(outs regtype:$Rd), + (ins regtype:$src, movimm32_imm:$imm, shifter:$shift), + asm, "\t$Rd, $imm$shift", "$src = $Rd", []>, + Sched<[WriteI, ReadI]> { + bits<5> Rd; + bits<16> imm; + bits<6> shift; + let Inst{30-29} = opc; + let Inst{28-23} = 0b100101; + let Inst{22-21} = shift{5-4}; + let Inst{20-5} = imm; + let Inst{4-0} = Rd; + + let DecoderMethod = "DecodeMoveImmInstruction"; +} + +multiclass InsertImmediate opc, string asm> { + def Wi : BaseInsertImmediate { + let Inst{31} = 0; + } + + def Xi : BaseInsertImmediate { + let Inst{31} = 1; + } +} + +//--- +// Add/Subtract +//--- + +class BaseAddSubImm + : I<(outs dstRegtype:$Rd), (ins srcRegtype:$Rn, immtype:$imm), + asm, "\t$Rd, $Rn, $imm", "", + [(set dstRegtype:$Rd, (OpNode srcRegtype:$Rn, immtype:$imm))]>, + Sched<[WriteI, ReadI]> { + bits<5> Rd; + bits<5> Rn; + bits<14> imm; + let Inst{30} = isSub; + let Inst{29} = setFlags; + let Inst{28-24} = 0b10001; + let Inst{23-22} = imm{13-12}; // '00' => lsl #0, '01' => lsl #12 + let Inst{21-10} = imm{11-0}; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; + let DecoderMethod = "DecodeBaseAddSubImm"; +} + +class BaseAddSubRegPseudo + : Pseudo<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), + [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm))]>, + Sched<[WriteI, ReadI, ReadI]>; + +class BaseAddSubSReg + : I<(outs regtype:$Rd), (ins regtype:$Rn, shifted_regtype:$Rm), + asm, "\t$Rd, $Rn, $Rm", "", + [(set regtype:$Rd, (OpNode regtype:$Rn, shifted_regtype:$Rm))]>, + Sched<[WriteISReg, ReadI, ReadISReg]> { + // The operands are in order to match the 'addr' MI operands, so we + // don't need an encoder method and by-name matching. Just use the default + // in-order handling. Since we're using by-order, make sure the names + // do not match. 
+ bits<5> dst; + bits<5> src1; + bits<5> src2; + bits<8> shift; + let Inst{30} = isSub; + let Inst{29} = setFlags; + let Inst{28-24} = 0b01011; + let Inst{23-22} = shift{7-6}; + let Inst{21} = 0; + let Inst{20-16} = src2; + let Inst{15-10} = shift{5-0}; + let Inst{9-5} = src1; + let Inst{4-0} = dst; + + let DecoderMethod = "DecodeThreeAddrSRegInstruction"; +} + +class BaseAddSubEReg + : I<(outs dstRegtype:$R1), + (ins src1Regtype:$R2, src2Regtype:$R3), + asm, "\t$R1, $R2, $R3", "", + [(set dstRegtype:$R1, (OpNode src1Regtype:$R2, src2Regtype:$R3))]>, + Sched<[WriteIEReg, ReadI, ReadIEReg]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + bits<6> ext; + let Inst{30} = isSub; + let Inst{29} = setFlags; + let Inst{28-24} = 0b01011; + let Inst{23-21} = 0b001; + let Inst{20-16} = Rm; + let Inst{15-13} = ext{5-3}; + let Inst{12-10} = ext{2-0}; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; + + let DecoderMethod = "DecodeAddSubERegInstruction"; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseAddSubEReg64 + : I<(outs dstRegtype:$Rd), + (ins src1Regtype:$Rn, src2Regtype:$Rm, ext_op:$ext), + asm, "\t$Rd, $Rn, $Rm$ext", "", []>, + Sched<[WriteIEReg, ReadI, ReadIEReg]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + bits<6> ext; + let Inst{30} = isSub; + let Inst{29} = setFlags; + let Inst{28-24} = 0b01011; + let Inst{23-21} = 0b001; + let Inst{20-16} = Rm; + let Inst{15} = ext{5}; + let Inst{12-10} = ext{2-0}; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; + + let DecoderMethod = "DecodeAddSubERegInstruction"; +} + +// Aliases for register+register add/subtract. +class AddSubRegAlias + : InstAlias; + +multiclass AddSub { + let hasSideEffects = 0 in { + // Add/Subtract immediate + def Wri : BaseAddSubImm { + let Inst{31} = 0; + } + def Xri : BaseAddSubImm { + let Inst{31} = 1; + } + + // Add/Subtract register - Only used for CodeGen + def Wrr : BaseAddSubRegPseudo; + def Xrr : BaseAddSubRegPseudo; + + // Add/Subtract shifted register + def Wrs : BaseAddSubSReg { + let Inst{31} = 0; + } + def Xrs : BaseAddSubSReg { + let Inst{31} = 1; + } + } + + // Add/Subtract extended register + let AddedComplexity = 1, hasSideEffects = 0 in { + def Wrx : BaseAddSubEReg, mnemonic, OpNode> { + let Inst{31} = 0; + } + def Xrx : BaseAddSubEReg, mnemonic, OpNode> { + let Inst{31} = 1; + } + } + + def Xrx64 : BaseAddSubEReg64 { + // UXTX and SXTX only. + let Inst{14-13} = 0b11; + let Inst{31} = 1; + } + + // Register/register aliases with no shift when SP is not used. + def : AddSubRegAlias(NAME#"Wrs"), + GPR32, GPR32, GPR32, 0>; + def : AddSubRegAlias(NAME#"Xrs"), + GPR64, GPR64, GPR64, 0>; + + // Register/register aliases with no shift when either the destination or + // first source register is SP. 
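+  // In the shifted-register encoding, register number 31 means WZR/XZR, so
+  // SP is only reachable through the extended-register forms; e.g.
+  // "add sp, x0, x1" is accepted via the Xrx64 form below with an implicit
+  // UXTX #0 (the 16/24 operands encode UXTW #0 / UXTX #0, as the comments
+  // note).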
+ def : AddSubRegAlias(NAME#"Wrx"), + GPR32sponly, GPR32sp, GPR32, 16>; // UXTW #0 + def : AddSubRegAlias(NAME#"Wrx"), + GPR32sp, GPR32sponly, GPR32, 16>; // UXTW #0 + def : AddSubRegAlias(NAME#"Xrx64"), + GPR64sponly, GPR64sp, GPR64, 24>; // UXTX #0 + def : AddSubRegAlias(NAME#"Xrx64"), + GPR64sp, GPR64sponly, GPR64, 24>; // UXTX #0 +} + +multiclass AddSubS { + let isCompare = 1, Defs = [NZCV] in { + // Add/Subtract immediate + def Wri : BaseAddSubImm { + let Inst{31} = 0; + } + def Xri : BaseAddSubImm { + let Inst{31} = 1; + } + + // Add/Subtract register + def Wrr : BaseAddSubRegPseudo; + def Xrr : BaseAddSubRegPseudo; + + // Add/Subtract shifted register + def Wrs : BaseAddSubSReg { + let Inst{31} = 0; + } + def Xrs : BaseAddSubSReg { + let Inst{31} = 1; + } + + // Add/Subtract extended register + let AddedComplexity = 1 in { + def Wrx : BaseAddSubEReg, mnemonic, OpNode> { + let Inst{31} = 0; + } + def Xrx : BaseAddSubEReg, mnemonic, OpNode> { + let Inst{31} = 1; + } + } + + def Xrx64 : BaseAddSubEReg64 { + // UXTX and SXTX only. + let Inst{14-13} = 0b11; + let Inst{31} = 1; + } + } // Defs = [NZCV] + + // Compare aliases + def : InstAlias(NAME#"Wri") + WZR, GPR32sp:$src, addsub_shifted_imm32:$imm), 5>; + def : InstAlias(NAME#"Xri") + XZR, GPR64sp:$src, addsub_shifted_imm64:$imm), 5>; + def : InstAlias(NAME#"Wrx") + WZR, GPR32sp:$src1, GPR32:$src2, arith_extend:$sh), 4>; + def : InstAlias(NAME#"Xrx") + XZR, GPR64sp:$src1, GPR32:$src2, arith_extend:$sh), 4>; + def : InstAlias(NAME#"Xrx64") + XZR, GPR64sp:$src1, GPR64:$src2, arith_extendlsl64:$sh), 4>; + def : InstAlias(NAME#"Wrs") + WZR, GPR32:$src1, GPR32:$src2, arith_shift32:$sh), 4>; + def : InstAlias(NAME#"Xrs") + XZR, GPR64:$src1, GPR64:$src2, arith_shift64:$sh), 4>; + + // Compare shorthands + def : InstAlias(NAME#"Wrs") + WZR, GPR32:$src1, GPR32:$src2, 0), 5>; + def : InstAlias(NAME#"Xrs") + XZR, GPR64:$src1, GPR64:$src2, 0), 5>; + + // Register/register aliases with no shift when SP is not used. + def : AddSubRegAlias(NAME#"Wrs"), + GPR32, GPR32, GPR32, 0>; + def : AddSubRegAlias(NAME#"Xrs"), + GPR64, GPR64, GPR64, 0>; + + // Register/register aliases with no shift when the first source register + // is SP. + def : AddSubRegAlias(NAME#"Wrx"), + GPR32, GPR32sponly, GPR32, 16>; // UXTW #0 + def : AddSubRegAlias(NAME#"Xrx64"), + GPR64, GPR64sponly, GPR64, 24>; // UXTX #0 +} + +//--- +// Extract +//--- +def SDTA64EXTR : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, + SDTCisPtrTy<3>]>; +def AArch64Extr : SDNode<"AArch64ISD::EXTR", SDTA64EXTR>; + +class BaseExtractImm patterns> + : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, imm_type:$imm), + asm, "\t$Rd, $Rn, $Rm, $imm", "", patterns>, + Sched<[WriteExtr, ReadExtrHi]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + bits<6> imm; + + let Inst{30-23} = 0b00100111; + let Inst{21} = 0; + let Inst{20-16} = Rm; + let Inst{15-10} = imm; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass ExtractImm { + def Wrri : BaseExtractImm { + let Inst{31} = 0; + let Inst{22} = 0; + // imm<5> must be zero. 
+ let imm{5} = 0; + } + def Xrri : BaseExtractImm { + + let Inst{31} = 1; + let Inst{22} = 1; + } +} + +//--- +// Bitfield +//--- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseBitfieldImm opc, + RegisterClass regtype, Operand imm_type, string asm> + : I<(outs regtype:$Rd), (ins regtype:$Rn, imm_type:$immr, imm_type:$imms), + asm, "\t$Rd, $Rn, $immr, $imms", "", []>, + Sched<[WriteIS, ReadI]> { + bits<5> Rd; + bits<5> Rn; + bits<6> immr; + bits<6> imms; + + let Inst{30-29} = opc; + let Inst{28-23} = 0b100110; + let Inst{21-16} = immr; + let Inst{15-10} = imms; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass BitfieldImm opc, string asm> { + def Wri : BaseBitfieldImm { + let Inst{31} = 0; + let Inst{22} = 0; + // imms<5> and immr<5> must be zero, else ReservedValue(). + let Inst{21} = 0; + let Inst{15} = 0; + } + def Xri : BaseBitfieldImm { + let Inst{31} = 1; + let Inst{22} = 1; + } +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseBitfieldImmWith2RegArgs opc, + RegisterClass regtype, Operand imm_type, string asm> + : I<(outs regtype:$Rd), (ins regtype:$src, regtype:$Rn, imm_type:$immr, + imm_type:$imms), + asm, "\t$Rd, $Rn, $immr, $imms", "$src = $Rd", []>, + Sched<[WriteIS, ReadI]> { + bits<5> Rd; + bits<5> Rn; + bits<6> immr; + bits<6> imms; + + let Inst{30-29} = opc; + let Inst{28-23} = 0b100110; + let Inst{21-16} = immr; + let Inst{15-10} = imms; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass BitfieldImmWith2RegArgs opc, string asm> { + def Wri : BaseBitfieldImmWith2RegArgs { + let Inst{31} = 0; + let Inst{22} = 0; + // imms<5> and immr<5> must be zero, else ReservedValue(). + let Inst{21} = 0; + let Inst{15} = 0; + } + def Xri : BaseBitfieldImmWith2RegArgs { + let Inst{31} = 1; + let Inst{22} = 1; + } +} + +//--- +// Logical +//--- + +// Logical (immediate) +class BaseLogicalImm opc, RegisterClass dregtype, + RegisterClass sregtype, Operand imm_type, string asm, + list pattern> + : I<(outs dregtype:$Rd), (ins sregtype:$Rn, imm_type:$imm), + asm, "\t$Rd, $Rn, $imm", "", pattern>, + Sched<[WriteI, ReadI]> { + bits<5> Rd; + bits<5> Rn; + bits<13> imm; + let Inst{30-29} = opc; + let Inst{28-23} = 0b100100; + let Inst{22} = imm{12}; + let Inst{21-16} = imm{11-6}; + let Inst{15-10} = imm{5-0}; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; + + let DecoderMethod = "DecodeLogicalImmInstruction"; +} + +// Logical (shifted register) +class BaseLogicalSReg opc, bit N, RegisterClass regtype, + logical_shifted_reg shifted_regtype, string asm, + list pattern> + : I<(outs regtype:$Rd), (ins regtype:$Rn, shifted_regtype:$Rm), + asm, "\t$Rd, $Rn, $Rm", "", pattern>, + Sched<[WriteISReg, ReadI, ReadISReg]> { + // The operands are in order to match the 'addr' MI operands, so we + // don't need an encoder method and by-name matching. Just use the default + // in-order handling. Since we're using by-order, make sure the names + // do not match. + bits<5> dst; + bits<5> src1; + bits<5> src2; + bits<8> shift; + let Inst{30-29} = opc; + let Inst{28-24} = 0b01010; + let Inst{23-22} = shift{7-6}; + let Inst{21} = N; + let Inst{20-16} = src2; + let Inst{15-10} = shift{5-0}; + let Inst{9-5} = src1; + let Inst{4-0} = dst; + + let DecoderMethod = "DecodeThreeAddrSRegInstruction"; +} + +// Aliases for register+register logical instructions. 
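+// These play the same role as AddSubRegAlias above: e.g. "and w0, w1, w2"
+// is parsed and printed as the Wrs (shifted-register) form with a shift
+// amount of 0.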
+class LogicalRegAlias + : InstAlias; + +let AddedComplexity = 6 in +multiclass LogicalImm opc, string mnemonic, SDNode OpNode> { + def Wri : BaseLogicalImm { + let Inst{31} = 0; + let Inst{22} = 0; // 64-bit version has an additional bit of immediate. + } + def Xri : BaseLogicalImm { + let Inst{31} = 1; + } +} + +multiclass LogicalImmS opc, string mnemonic, SDNode OpNode> { + let isCompare = 1, Defs = [NZCV] in { + def Wri : BaseLogicalImm { + let Inst{31} = 0; + let Inst{22} = 0; // 64-bit version has an additional bit of immediate. + } + def Xri : BaseLogicalImm { + let Inst{31} = 1; + } + } // end Defs = [NZCV] +} + +class BaseLogicalRegPseudo + : Pseudo<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), + [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm))]>, + Sched<[WriteI, ReadI, ReadI]>; + +// Split from LogicalImm as not all instructions have both. +multiclass LogicalReg opc, bit N, string mnemonic, + SDPatternOperator OpNode> { + def Wrr : BaseLogicalRegPseudo; + def Xrr : BaseLogicalRegPseudo; + + def Wrs : BaseLogicalSReg { + let Inst{31} = 0; + } + def Xrs : BaseLogicalSReg { + let Inst{31} = 1; + } + + def : LogicalRegAlias(NAME#"Wrs"), GPR32>; + def : LogicalRegAlias(NAME#"Xrs"), GPR64>; +} + +// Split from LogicalReg to allow setting NZCV Defs +multiclass LogicalRegS opc, bit N, string mnemonic, + SDPatternOperator OpNode = null_frag> { + let Defs = [NZCV], mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { + def Wrr : BaseLogicalRegPseudo; + def Xrr : BaseLogicalRegPseudo; + + def Wrs : BaseLogicalSReg { + let Inst{31} = 0; + } + def Xrs : BaseLogicalSReg { + let Inst{31} = 1; + } + } // Defs = [NZCV] + + def : LogicalRegAlias(NAME#"Wrs"), GPR32>; + def : LogicalRegAlias(NAME#"Xrs"), GPR64>; +} + +//--- +// Conditionally set flags +//--- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseCondSetFlagsImm + : I<(outs), (ins regtype:$Rn, imm0_31:$imm, imm0_15:$nzcv, ccode:$cond), + asm, "\t$Rn, $imm, $nzcv, $cond", "", []>, + Sched<[WriteI, ReadI]> { + let Uses = [NZCV]; + let Defs = [NZCV]; + + bits<5> Rn; + bits<5> imm; + bits<4> nzcv; + bits<4> cond; + + let Inst{30} = op; + let Inst{29-21} = 0b111010010; + let Inst{20-16} = imm; + let Inst{15-12} = cond; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4} = 0b0; + let Inst{3-0} = nzcv; +} + +multiclass CondSetFlagsImm { + def Wi : BaseCondSetFlagsImm { + let Inst{31} = 0; + } + def Xi : BaseCondSetFlagsImm { + let Inst{31} = 1; + } +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseCondSetFlagsReg + : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm0_15:$nzcv, ccode:$cond), + asm, "\t$Rn, $Rm, $nzcv, $cond", "", []>, + Sched<[WriteI, ReadI, ReadI]> { + let Uses = [NZCV]; + let Defs = [NZCV]; + + bits<5> Rn; + bits<5> Rm; + bits<4> nzcv; + bits<4> cond; + + let Inst{30} = op; + let Inst{29-21} = 0b111010010; + let Inst{20-16} = Rm; + let Inst{15-12} = cond; + let Inst{11-10} = 0b00; + let Inst{9-5} = Rn; + let Inst{4} = 0b0; + let Inst{3-0} = nzcv; +} + +multiclass CondSetFlagsReg { + def Wr : BaseCondSetFlagsReg { + let Inst{31} = 0; + } + def Xr : BaseCondSetFlagsReg { + let Inst{31} = 1; + } +} + +//--- +// Conditional select +//--- + +class BaseCondSelect op2, RegisterClass regtype, string asm> + : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, ccode:$cond), + asm, "\t$Rd, $Rn, $Rm, $cond", "", + [(set regtype:$Rd, + (AArch64csel regtype:$Rn, regtype:$Rm, (i32 imm:$cond), NZCV))]>, + Sched<[WriteI, ReadI, ReadI]> { + let Uses = [NZCV]; + + bits<5> Rd; + bits<5> Rn; + 
bits<5> Rm; + bits<4> cond; + + let Inst{30} = op; + let Inst{29-21} = 0b011010100; + let Inst{20-16} = Rm; + let Inst{15-12} = cond; + let Inst{11-10} = op2; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass CondSelect op2, string asm> { + def Wr : BaseCondSelect { + let Inst{31} = 0; + } + def Xr : BaseCondSelect { + let Inst{31} = 1; + } +} + +class BaseCondSelectOp op2, RegisterClass regtype, string asm, + PatFrag frag> + : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, ccode:$cond), + asm, "\t$Rd, $Rn, $Rm, $cond", "", + [(set regtype:$Rd, + (AArch64csel regtype:$Rn, (frag regtype:$Rm), + (i32 imm:$cond), NZCV))]>, + Sched<[WriteI, ReadI, ReadI]> { + let Uses = [NZCV]; + + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + bits<4> cond; + + let Inst{30} = op; + let Inst{29-21} = 0b011010100; + let Inst{20-16} = Rm; + let Inst{15-12} = cond; + let Inst{11-10} = op2; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +def inv_cond_XFORM : SDNodeXForm(N->getZExtValue()); + return CurDAG->getTargetConstant(AArch64CC::getInvertedCondCode(CC), MVT::i32); +}]>; + +multiclass CondSelectOp op2, string asm, PatFrag frag> { + def Wr : BaseCondSelectOp { + let Inst{31} = 0; + } + def Xr : BaseCondSelectOp { + let Inst{31} = 1; + } + + def : Pat<(AArch64csel (frag GPR32:$Rm), GPR32:$Rn, (i32 imm:$cond), NZCV), + (!cast(NAME # Wr) GPR32:$Rn, GPR32:$Rm, + (inv_cond_XFORM imm:$cond))>; + + def : Pat<(AArch64csel (frag GPR64:$Rm), GPR64:$Rn, (i32 imm:$cond), NZCV), + (!cast(NAME # Xr) GPR64:$Rn, GPR64:$Rm, + (inv_cond_XFORM imm:$cond))>; +} + +//--- +// Special Mask Value +//--- +def maski8_or_more : Operand, + ImmLeaf { +} +def maski16_or_more : Operand, + ImmLeaf { +} + + +//--- +// Load/store +//--- + +// (unsigned immediate) +// Indexed for 8-bit registers. offset is in range [0,4095]. 
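+// The wider accesses below follow the same pattern with the byte offset
+// scaled by the access size, e.g. "ldr x0, [x1, #16]" uses uimm12s8 and
+// encodes 2 in the offset field; byte offsets that are not a multiple of
+// the access size are handled by the unscaled (LDUR/STUR) forms further
+// down in this file.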
+def am_indexed8 : ComplexPattern; +def am_indexed16 : ComplexPattern; +def am_indexed32 : ComplexPattern; +def am_indexed64 : ComplexPattern; +def am_indexed128 : ComplexPattern; + +class UImm12OffsetOperand : AsmOperandClass { + let Name = "UImm12Offset" # Scale; + let RenderMethod = "addUImm12OffsetOperands<" # Scale # ">"; + let PredicateMethod = "isUImm12Offset<" # Scale # ">"; + let DiagnosticType = "InvalidMemoryIndexed" # Scale; +} + +def UImm12OffsetScale1Operand : UImm12OffsetOperand<1>; +def UImm12OffsetScale2Operand : UImm12OffsetOperand<2>; +def UImm12OffsetScale4Operand : UImm12OffsetOperand<4>; +def UImm12OffsetScale8Operand : UImm12OffsetOperand<8>; +def UImm12OffsetScale16Operand : UImm12OffsetOperand<16>; + +class uimm12_scaled : Operand { + let ParserMatchClass + = !cast("UImm12OffsetScale" # Scale # "Operand"); + let EncoderMethod + = "getLdStUImm12OpValue"; + let PrintMethod = "printUImm12Offset<" # Scale # ">"; +} + +def uimm12s1 : uimm12_scaled<1>; +def uimm12s2 : uimm12_scaled<2>; +def uimm12s4 : uimm12_scaled<4>; +def uimm12s8 : uimm12_scaled<8>; +def uimm12s16 : uimm12_scaled<16>; + +class BaseLoadStoreUI sz, bit V, bits<2> opc, dag oops, dag iops, + string asm, list pattern> + : I { + bits<5> Rt; + + bits<5> Rn; + bits<12> offset; + + let Inst{31-30} = sz; + let Inst{29-27} = 0b111; + let Inst{26} = V; + let Inst{25-24} = 0b01; + let Inst{23-22} = opc; + let Inst{21-10} = offset; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; + + let DecoderMethod = "DecodeUnsignedLdStInstruction"; +} + +multiclass LoadUI sz, bit V, bits<2> opc, RegisterClass regtype, + Operand indextype, string asm, list pattern> { + let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in + def ui : BaseLoadStoreUI, + Sched<[WriteLD]>; + + def : InstAlias(NAME # "ui") regtype:$Rt, GPR64sp:$Rn, 0)>; +} + +multiclass StoreUI sz, bit V, bits<2> opc, RegisterClass regtype, + Operand indextype, string asm, list pattern> { + let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in + def ui : BaseLoadStoreUI, + Sched<[WriteST]>; + + def : InstAlias(NAME # "ui") regtype:$Rt, GPR64sp:$Rn, 0)>; +} + +def PrefetchOperand : AsmOperandClass { + let Name = "Prefetch"; + let ParserMethod = "tryParsePrefetch"; +} +def prfop : Operand { + let PrintMethod = "printPrefetchOp"; + let ParserMatchClass = PrefetchOperand; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in +class PrefetchUI sz, bit V, bits<2> opc, string asm, list pat> + : BaseLoadStoreUI, + Sched<[WriteLD]>; + +//--- +// Load literal +//--- + +// Load literal address: 19-bit immediate. The low two bits of the target +// offset are implied zero and so are not part of the immediate. 
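+// That gives a PC-relative range of +/-1MiB (19 signed bits scaled by 4);
+// e.g. "ldr x0, lbl" can only reach labels within 1MiB of the instruction.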
+def am_ldrlit : Operand { + let EncoderMethod = "getLoadLiteralOpValue"; + let DecoderMethod = "DecodePCRelLabel19"; + let PrintMethod = "printAlignedLabel"; + let ParserMatchClass = PCRelLabel19Operand; +} + +let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in +class LoadLiteral opc, bit V, RegisterClass regtype, string asm> + : I<(outs regtype:$Rt), (ins am_ldrlit:$label), + asm, "\t$Rt, $label", "", []>, + Sched<[WriteLD]> { + bits<5> Rt; + bits<19> label; + let Inst{31-30} = opc; + let Inst{29-27} = 0b011; + let Inst{26} = V; + let Inst{25-24} = 0b00; + let Inst{23-5} = label; + let Inst{4-0} = Rt; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in +class PrefetchLiteral opc, bit V, string asm, list pat> + : I<(outs), (ins prfop:$Rt, am_ldrlit:$label), + asm, "\t$Rt, $label", "", pat>, + Sched<[WriteLD]> { + bits<5> Rt; + bits<19> label; + let Inst{31-30} = opc; + let Inst{29-27} = 0b011; + let Inst{26} = V; + let Inst{25-24} = 0b00; + let Inst{23-5} = label; + let Inst{4-0} = Rt; +} + +//--- +// Load/store register offset +//--- + +def ro_Xindexed8 : ComplexPattern", []>; +def ro_Xindexed16 : ComplexPattern", []>; +def ro_Xindexed32 : ComplexPattern", []>; +def ro_Xindexed64 : ComplexPattern", []>; +def ro_Xindexed128 : ComplexPattern", []>; + +def ro_Windexed8 : ComplexPattern", []>; +def ro_Windexed16 : ComplexPattern", []>; +def ro_Windexed32 : ComplexPattern", []>; +def ro_Windexed64 : ComplexPattern", []>; +def ro_Windexed128 : ComplexPattern", []>; + +class MemExtendOperand : AsmOperandClass { + let Name = "Mem" # Reg # "Extend" # Width; + let PredicateMethod = "isMem" # Reg # "Extend<" # Width # ">"; + let RenderMethod = "addMemExtendOperands"; + let DiagnosticType = "InvalidMemory" # Reg # "Extend" # Width; +} + +def MemWExtend8Operand : MemExtendOperand<"W", 8> { + // The address "[x0, x1, lsl #0]" actually maps to the variant which performs + // the trivial shift. + let RenderMethod = "addMemExtend8Operands"; +} +def MemWExtend16Operand : MemExtendOperand<"W", 16>; +def MemWExtend32Operand : MemExtendOperand<"W", 32>; +def MemWExtend64Operand : MemExtendOperand<"W", 64>; +def MemWExtend128Operand : MemExtendOperand<"W", 128>; + +def MemXExtend8Operand : MemExtendOperand<"X", 8> { + // The address "[x0, x1, lsl #0]" actually maps to the variant which performs + // the trivial shift. + let RenderMethod = "addMemExtend8Operands"; +} +def MemXExtend16Operand : MemExtendOperand<"X", 16>; +def MemXExtend32Operand : MemExtendOperand<"X", 32>; +def MemXExtend64Operand : MemExtendOperand<"X", 64>; +def MemXExtend128Operand : MemExtendOperand<"X", 128>; + +class ro_extend + : Operand { + let ParserMatchClass = ParserClass; + let PrintMethod = "printMemExtend<'" # Reg # "', " # Width # ">"; + let DecoderMethod = "DecodeMemExtend"; + let EncoderMethod = "getMemExtendOpValue"; + let MIOperandInfo = (ops i32imm:$signed, i32imm:$doshift); +} + +def ro_Wextend8 : ro_extend; +def ro_Wextend16 : ro_extend; +def ro_Wextend32 : ro_extend; +def ro_Wextend64 : ro_extend; +def ro_Wextend128 : ro_extend; + +def ro_Xextend8 : ro_extend; +def ro_Xextend16 : ro_extend; +def ro_Xextend32 : ro_extend; +def ro_Xextend64 : ro_extend; +def ro_Xextend128 : ro_extend; + +class ROAddrMode { + // CodeGen-level pattern covering the entire addressing mode. + ComplexPattern Wpat = windex; + ComplexPattern Xpat = xindex; + + // Asm-level Operand covering the valid "uxtw #3" style syntax. 
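+  // When an explicit shift amount is written it must be zero or log2 of the
+  // access size, e.g. "ldr x0, [x1, x2, lsl #3]" or "ldr x0, [x1, w2, uxtw #3]"
+  // for an 8-byte load.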
+ Operand Wext = wextend; + Operand Xext = xextend; +} + +def ro8 : ROAddrMode; +def ro16 : ROAddrMode; +def ro32 : ROAddrMode; +def ro64 : ROAddrMode; +def ro128 : ROAddrMode; + +class LoadStore8RO sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, dag ins, dag outs, list pat> + : I { + bits<5> Rt; + bits<5> Rn; + bits<5> Rm; + bits<2> extend; + let Inst{31-30} = sz; + let Inst{29-27} = 0b111; + let Inst{26} = V; + let Inst{25-24} = 0b00; + let Inst{23-22} = opc; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15} = extend{1}; // sign extend Rm? + let Inst{14} = 1; + let Inst{12} = extend{0}; // do shift? + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; +} + +class ROInstAlias + : InstAlias; + +multiclass Load8RO sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, ValueType Ty, SDPatternOperator loadop> { + let AddedComplexity = 10 in + def roW : LoadStore8RO, + Sched<[WriteLDIdx, ReadAdrBase]> { + let Inst{13} = 0b0; + } + + let AddedComplexity = 10 in + def roX : LoadStore8RO, + Sched<[WriteLDIdx, ReadAdrBase]> { + let Inst{13} = 0b1; + } + + def : ROInstAlias(NAME # "roX")>; +} + +multiclass Store8RO sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, ValueType Ty, SDPatternOperator storeop> { + let AddedComplexity = 10 in + def roW : LoadStore8RO, + Sched<[WriteSTIdx, ReadAdrBase]> { + let Inst{13} = 0b0; + } + + let AddedComplexity = 10 in + def roX : LoadStore8RO, + Sched<[WriteSTIdx, ReadAdrBase]> { + let Inst{13} = 0b1; + } + + def : ROInstAlias(NAME # "roX")>; +} + +class LoadStore16RO sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, dag ins, dag outs, list pat> + : I { + bits<5> Rt; + bits<5> Rn; + bits<5> Rm; + bits<2> extend; + let Inst{31-30} = sz; + let Inst{29-27} = 0b111; + let Inst{26} = V; + let Inst{25-24} = 0b00; + let Inst{23-22} = opc; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15} = extend{1}; // sign extend Rm? + let Inst{14} = 1; + let Inst{12} = extend{0}; // do shift? + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; +} + +multiclass Load16RO sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, ValueType Ty, SDPatternOperator loadop> { + let AddedComplexity = 10 in + def roW : LoadStore16RO, + Sched<[WriteLDIdx, ReadAdrBase]> { + let Inst{13} = 0b0; + } + + let AddedComplexity = 10 in + def roX : LoadStore16RO, + Sched<[WriteLDIdx, ReadAdrBase]> { + let Inst{13} = 0b1; + } + + def : ROInstAlias(NAME # "roX")>; +} + +multiclass Store16RO sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, ValueType Ty, SDPatternOperator storeop> { + let AddedComplexity = 10 in + def roW : LoadStore16RO, + Sched<[WriteSTIdx, ReadAdrBase]> { + let Inst{13} = 0b0; + } + + let AddedComplexity = 10 in + def roX : LoadStore16RO, + Sched<[WriteSTIdx, ReadAdrBase]> { + let Inst{13} = 0b1; + } + + def : ROInstAlias(NAME # "roX")>; +} + +class LoadStore32RO sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, dag ins, dag outs, list pat> + : I { + bits<5> Rt; + bits<5> Rn; + bits<5> Rm; + bits<2> extend; + let Inst{31-30} = sz; + let Inst{29-27} = 0b111; + let Inst{26} = V; + let Inst{25-24} = 0b00; + let Inst{23-22} = opc; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15} = extend{1}; // sign extend Rm? + let Inst{14} = 1; + let Inst{12} = extend{0}; // do shift? 
+ let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; +} + +multiclass Load32RO sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, ValueType Ty, SDPatternOperator loadop> { + let AddedComplexity = 10 in + def roW : LoadStore32RO, + Sched<[WriteLDIdx, ReadAdrBase]> { + let Inst{13} = 0b0; + } + + let AddedComplexity = 10 in + def roX : LoadStore32RO, + Sched<[WriteLDIdx, ReadAdrBase]> { + let Inst{13} = 0b1; + } + + def : ROInstAlias(NAME # "roX")>; +} + +multiclass Store32RO sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, ValueType Ty, SDPatternOperator storeop> { + let AddedComplexity = 10 in + def roW : LoadStore32RO, + Sched<[WriteSTIdx, ReadAdrBase]> { + let Inst{13} = 0b0; + } + + let AddedComplexity = 10 in + def roX : LoadStore32RO, + Sched<[WriteSTIdx, ReadAdrBase]> { + let Inst{13} = 0b1; + } + + def : ROInstAlias(NAME # "roX")>; +} + +class LoadStore64RO sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, dag ins, dag outs, list pat> + : I { + bits<5> Rt; + bits<5> Rn; + bits<5> Rm; + bits<2> extend; + let Inst{31-30} = sz; + let Inst{29-27} = 0b111; + let Inst{26} = V; + let Inst{25-24} = 0b00; + let Inst{23-22} = opc; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15} = extend{1}; // sign extend Rm? + let Inst{14} = 1; + let Inst{12} = extend{0}; // do shift? + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; +} + +multiclass Load64RO sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, ValueType Ty, SDPatternOperator loadop> { + let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in + def roW : LoadStore64RO, + Sched<[WriteLDIdx, ReadAdrBase]> { + let Inst{13} = 0b0; + } + + let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in + def roX : LoadStore64RO, + Sched<[WriteLDIdx, ReadAdrBase]> { + let Inst{13} = 0b1; + } + + def : ROInstAlias(NAME # "roX")>; +} + +multiclass Store64RO sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, ValueType Ty, SDPatternOperator storeop> { + let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in + def roW : LoadStore64RO, + Sched<[WriteSTIdx, ReadAdrBase]> { + let Inst{13} = 0b0; + } + + let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in + def roX : LoadStore64RO, + Sched<[WriteSTIdx, ReadAdrBase]> { + let Inst{13} = 0b1; + } + + def : ROInstAlias(NAME # "roX")>; +} + +class LoadStore128RO sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, dag ins, dag outs, list pat> + : I { + bits<5> Rt; + bits<5> Rn; + bits<5> Rm; + bits<2> extend; + let Inst{31-30} = sz; + let Inst{29-27} = 0b111; + let Inst{26} = V; + let Inst{25-24} = 0b00; + let Inst{23-22} = opc; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15} = extend{1}; // sign extend Rm? + let Inst{14} = 1; + let Inst{12} = extend{0}; // do shift? 
+ let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; +} + +multiclass Load128RO sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, ValueType Ty, SDPatternOperator loadop> { + let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in + def roW : LoadStore128RO, + Sched<[WriteLDIdx, ReadAdrBase]> { + let Inst{13} = 0b0; + } + + let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in + def roX : LoadStore128RO, + Sched<[WriteLDIdx, ReadAdrBase]> { + let Inst{13} = 0b1; + } + + def : ROInstAlias(NAME # "roX")>; +} + +multiclass Store128RO sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, ValueType Ty, SDPatternOperator storeop> { + let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in + def roW : LoadStore128RO, + Sched<[WriteSTIdx, ReadAdrBase]> { + let Inst{13} = 0b0; + } + + let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in + def roX : LoadStore128RO, + Sched<[WriteSTIdx, ReadAdrBase]> { + let Inst{13} = 0b1; + } + + def : ROInstAlias(NAME # "roX")>; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in +class BasePrefetchRO sz, bit V, bits<2> opc, dag outs, dag ins, + string asm, list pat> + : I, + Sched<[WriteLD]> { + bits<5> Rt; + bits<5> Rn; + bits<5> Rm; + bits<2> extend; + let Inst{31-30} = sz; + let Inst{29-27} = 0b111; + let Inst{26} = V; + let Inst{25-24} = 0b00; + let Inst{23-22} = opc; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15} = extend{1}; // sign extend Rm? + let Inst{14} = 1; + let Inst{12} = extend{0}; // do shift? + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; +} + +multiclass PrefetchRO sz, bit V, bits<2> opc, string asm> { + def roW : BasePrefetchRO { + let Inst{13} = 0b0; + } + + def roX : BasePrefetchRO { + let Inst{13} = 0b1; + } + + def : InstAlias<"prfm $Rt, [$Rn, $Rm]", + (!cast(NAME # "roX") prfop:$Rt, + GPR64sp:$Rn, GPR64:$Rm, 0, 0)>; +} + +//--- +// Load/store unscaled immediate +//--- + +def am_unscaled8 : ComplexPattern; +def am_unscaled16 : ComplexPattern; +def am_unscaled32 : ComplexPattern; +def am_unscaled64 : ComplexPattern; +def am_unscaled128 :ComplexPattern; + +class BaseLoadStoreUnscale sz, bit V, bits<2> opc, dag oops, dag iops, + string asm, list pattern> + : I { + bits<5> Rt; + bits<5> Rn; + bits<9> offset; + let Inst{31-30} = sz; + let Inst{29-27} = 0b111; + let Inst{26} = V; + let Inst{25-24} = 0b00; + let Inst{23-22} = opc; + let Inst{21} = 0; + let Inst{20-12} = offset; + let Inst{11-10} = 0b00; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; + + let DecoderMethod = "DecodeSignedLdStInstruction"; +} + +multiclass LoadUnscaled sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, list pattern> { + let AddedComplexity = 1 in // try this before LoadUI + def i : BaseLoadStoreUnscale, + Sched<[WriteLD]>; + + def : InstAlias(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>; +} + +multiclass StoreUnscaled sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, list pattern> { + let AddedComplexity = 1 in // try this before StoreUI + def i : BaseLoadStoreUnscale, + Sched<[WriteST]>; + + def : InstAlias(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>; +} + +multiclass PrefetchUnscaled sz, bit V, bits<2> opc, string asm, + list pat> { + let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in + def i : BaseLoadStoreUnscale, + Sched<[WriteLD]>; + + def : InstAlias(NAME # "i") prfop:$Rt, GPR64sp:$Rn, 0)>; +} + +//--- +// Load/store unscaled immediate, unprivileged +//--- + +class BaseLoadStoreUnprivileged 
sz, bit V, bits<2> opc, + dag oops, dag iops, string asm> + : I { + bits<5> Rt; + bits<5> Rn; + bits<9> offset; + let Inst{31-30} = sz; + let Inst{29-27} = 0b111; + let Inst{26} = V; + let Inst{25-24} = 0b00; + let Inst{23-22} = opc; + let Inst{21} = 0; + let Inst{20-12} = offset; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; + + let DecoderMethod = "DecodeSignedLdStInstruction"; +} + +multiclass LoadUnprivileged sz, bit V, bits<2> opc, + RegisterClass regtype, string asm> { + let mayStore = 0, mayLoad = 1, hasSideEffects = 0 in + def i : BaseLoadStoreUnprivileged, + Sched<[WriteLD]>; + + def : InstAlias(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>; +} + +multiclass StoreUnprivileged sz, bit V, bits<2> opc, + RegisterClass regtype, string asm> { + let mayStore = 1, mayLoad = 0, hasSideEffects = 0 in + def i : BaseLoadStoreUnprivileged, + Sched<[WriteST]>; + + def : InstAlias(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>; +} + +//--- +// Load/store pre-indexed +//--- + +class BaseLoadStorePreIdx sz, bit V, bits<2> opc, dag oops, dag iops, + string asm, string cstr, list pat> + : I { + bits<5> Rt; + bits<5> Rn; + bits<9> offset; + let Inst{31-30} = sz; + let Inst{29-27} = 0b111; + let Inst{26} = V; + let Inst{25-24} = 0; + let Inst{23-22} = opc; + let Inst{21} = 0; + let Inst{20-12} = offset; + let Inst{11-10} = 0b11; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; + + let DecoderMethod = "DecodeSignedLdStInstruction"; +} + +let hasSideEffects = 0 in { +let mayStore = 0, mayLoad = 1 in +class LoadPreIdx sz, bit V, bits<2> opc, RegisterClass regtype, + string asm> + : BaseLoadStorePreIdx, + Sched<[WriteLD, WriteAdr]>; + +let mayStore = 1, mayLoad = 0 in +class StorePreIdx sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, SDPatternOperator storeop, ValueType Ty> + : BaseLoadStorePreIdx, + Sched<[WriteAdr, WriteST]>; +} // hasSideEffects = 0 + +//--- +// Load/store post-indexed +//--- + +// (pre-index) load/stores. 
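+// The post-indexed forms write the updated base register back after the
+// access ("ldr x0, [x1], #8"), distinguished from the pre-indexed forms
+// above ("ldr x0, [x1, #8]!") only by Inst{11-10} (0b01 here vs 0b11).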
+class BaseLoadStorePostIdx sz, bit V, bits<2> opc, dag oops, dag iops, + string asm, string cstr, list pat> + : I { + bits<5> Rt; + bits<5> Rn; + bits<9> offset; + let Inst{31-30} = sz; + let Inst{29-27} = 0b111; + let Inst{26} = V; + let Inst{25-24} = 0b00; + let Inst{23-22} = opc; + let Inst{21} = 0b0; + let Inst{20-12} = offset; + let Inst{11-10} = 0b01; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; + + let DecoderMethod = "DecodeSignedLdStInstruction"; +} + +let hasSideEffects = 0 in { +let mayStore = 0, mayLoad = 1 in +class LoadPostIdx sz, bit V, bits<2> opc, RegisterClass regtype, + string asm> + : BaseLoadStorePostIdx, + Sched<[WriteLD, WriteI]>; + +let mayStore = 1, mayLoad = 0 in +class StorePostIdx sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, SDPatternOperator storeop, ValueType Ty> + : BaseLoadStorePostIdx, + Sched<[WriteAdr, WriteST, ReadAdrBase]>; +} // hasSideEffects = 0 + + +//--- +// Load/store pair +//--- + +// (indexed, offset) + +class BaseLoadStorePairOffset opc, bit V, bit L, dag oops, dag iops, + string asm> + : I { + bits<5> Rt; + bits<5> Rt2; + bits<5> Rn; + bits<7> offset; + let Inst{31-30} = opc; + let Inst{29-27} = 0b101; + let Inst{26} = V; + let Inst{25-23} = 0b010; + let Inst{22} = L; + let Inst{21-15} = offset; + let Inst{14-10} = Rt2; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; + + let DecoderMethod = "DecodePairLdStInstruction"; +} + +multiclass LoadPairOffset opc, bit V, RegisterClass regtype, + Operand indextype, string asm> { + let hasSideEffects = 0, mayStore = 0, mayLoad = 1 in + def i : BaseLoadStorePairOffset, + Sched<[WriteLD, WriteLDHi]>; + + def : InstAlias(NAME # "i") regtype:$Rt, regtype:$Rt2, + GPR64sp:$Rn, 0)>; +} + + +multiclass StorePairOffset opc, bit V, RegisterClass regtype, + Operand indextype, string asm> { + let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in + def i : BaseLoadStorePairOffset, + Sched<[WriteSTP]>; + + def : InstAlias(NAME # "i") regtype:$Rt, regtype:$Rt2, + GPR64sp:$Rn, 0)>; +} + +// (pre-indexed) +class BaseLoadStorePairPreIdx opc, bit V, bit L, dag oops, dag iops, + string asm> + : I { + bits<5> Rt; + bits<5> Rt2; + bits<5> Rn; + bits<7> offset; + let Inst{31-30} = opc; + let Inst{29-27} = 0b101; + let Inst{26} = V; + let Inst{25-23} = 0b011; + let Inst{22} = L; + let Inst{21-15} = offset; + let Inst{14-10} = Rt2; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; + + let DecoderMethod = "DecodePairLdStInstruction"; +} + +let hasSideEffects = 0 in { +let mayStore = 0, mayLoad = 1 in +class LoadPairPreIdx opc, bit V, RegisterClass regtype, + Operand indextype, string asm> + : BaseLoadStorePairPreIdx, + Sched<[WriteLD, WriteLDHi, WriteAdr]>; + +let mayStore = 1, mayLoad = 0 in +class StorePairPreIdx opc, bit V, RegisterClass regtype, + Operand indextype, string asm> + : BaseLoadStorePairPreIdx, + Sched<[WriteAdr, WriteSTP]>; +} // hasSideEffects = 0 + +// (post-indexed) + +class BaseLoadStorePairPostIdx opc, bit V, bit L, dag oops, dag iops, + string asm> + : I { + bits<5> Rt; + bits<5> Rt2; + bits<5> Rn; + bits<7> offset; + let Inst{31-30} = opc; + let Inst{29-27} = 0b101; + let Inst{26} = V; + let Inst{25-23} = 0b001; + let Inst{22} = L; + let Inst{21-15} = offset; + let Inst{14-10} = Rt2; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; + + let DecoderMethod = "DecodePairLdStInstruction"; +} + +let hasSideEffects = 0 in { +let mayStore = 0, mayLoad = 1 in +class LoadPairPostIdx opc, bit V, RegisterClass regtype, + Operand idxtype, string asm> + : BaseLoadStorePairPostIdx, + Sched<[WriteLD, WriteLDHi, 
WriteAdr]>; + +let mayStore = 1, mayLoad = 0 in +class StorePairPostIdx opc, bit V, RegisterClass regtype, + Operand idxtype, string asm> + : BaseLoadStorePairPostIdx, + Sched<[WriteAdr, WriteSTP]>; +} // hasSideEffects = 0 + +// (no-allocate) + +class BaseLoadStorePairNoAlloc opc, bit V, bit L, dag oops, dag iops, + string asm> + : I { + bits<5> Rt; + bits<5> Rt2; + bits<5> Rn; + bits<7> offset; + let Inst{31-30} = opc; + let Inst{29-27} = 0b101; + let Inst{26} = V; + let Inst{25-23} = 0b000; + let Inst{22} = L; + let Inst{21-15} = offset; + let Inst{14-10} = Rt2; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; + + let DecoderMethod = "DecodePairLdStInstruction"; +} + +multiclass LoadPairNoAlloc opc, bit V, RegisterClass regtype, + Operand indextype, string asm> { + let hasSideEffects = 0, mayStore = 0, mayLoad = 1 in + def i : BaseLoadStorePairNoAlloc, + Sched<[WriteLD, WriteLDHi]>; + + + def : InstAlias(NAME # "i") regtype:$Rt, regtype:$Rt2, + GPR64sp:$Rn, 0)>; +} + +multiclass StorePairNoAlloc opc, bit V, RegisterClass regtype, + Operand indextype, string asm> { + let hasSideEffects = 0, mayStore = 1, mayLoad = 0 in + def i : BaseLoadStorePairNoAlloc, + Sched<[WriteSTP]>; + + def : InstAlias(NAME # "i") regtype:$Rt, regtype:$Rt2, + GPR64sp:$Rn, 0)>; +} + +//--- +// Load/store exclusive +//--- + +// True exclusive operations write to and/or read from the system's exclusive +// monitors, which as far as a compiler is concerned can be modelled as a +// random shared memory address. Hence LoadExclusive mayStore. +// +// Since these instructions have the undefined register bits set to 1 in +// their canonical form, we need a post encoder method to set those bits +// to 1 when encoding these instructions. We do this using the +// fixLoadStoreExclusive function. This function has template parameters: +// +// fixLoadStoreExclusive +// +// hasRs indicates that the instruction uses the Rs field, so we won't set +// it to 1 (and the same for Rt2). We don't need template parameters for +// the other register fields since Rt and Rn are always used. +// +let hasSideEffects = 1, mayLoad = 1, mayStore = 1 in +class BaseLoadStoreExclusive sz, bit o2, bit L, bit o1, bit o0, + dag oops, dag iops, string asm, string operands> + : I { + let Inst{31-30} = sz; + let Inst{29-24} = 0b001000; + let Inst{23} = o2; + let Inst{22} = L; + let Inst{21} = o1; + let Inst{15} = o0; + + let DecoderMethod = "DecodeExclusiveLdStInstruction"; +} + +// Neither Rs nor Rt2 operands. +class LoadStoreExclusiveSimple sz, bit o2, bit L, bit o1, bit o0, + dag oops, dag iops, string asm, string operands> + : BaseLoadStoreExclusive { + bits<5> Rt; + bits<5> Rn; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; + + let PostEncoderMethod = "fixLoadStoreExclusive<0,0>"; +} + +// Simple load acquires don't set the exclusive monitor +let mayLoad = 1, mayStore = 0 in +class LoadAcquire sz, bit o2, bit L, bit o1, bit o0, + RegisterClass regtype, string asm> + : LoadStoreExclusiveSimple, + Sched<[WriteLD]>; + +class LoadExclusive sz, bit o2, bit L, bit o1, bit o0, + RegisterClass regtype, string asm> + : LoadStoreExclusiveSimple, + Sched<[WriteLD]>; + +class LoadExclusivePair sz, bit o2, bit L, bit o1, bit o0, + RegisterClass regtype, string asm> + : BaseLoadStoreExclusive, + Sched<[WriteLD, WriteLDHi]> { + bits<5> Rt; + bits<5> Rt2; + bits<5> Rn; + let Inst{14-10} = Rt2; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; + + let PostEncoderMethod = "fixLoadStoreExclusive<0,1>"; +} + +// Simple store release operations do not check the exclusive monitor. 
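+// e.g. "stlr w0, [x1]" only provides release ordering, whereas the
+// store-exclusives below ("stlxr w2, w0, [x1]") also write a status
+// register (0 on success), which is why the StoreExclusive class
+// earlyclobbers $Ws.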
+let mayLoad = 0, mayStore = 1 in +class StoreRelease sz, bit o2, bit L, bit o1, bit o0, + RegisterClass regtype, string asm> + : LoadStoreExclusiveSimple, + Sched<[WriteST]>; + +let mayLoad = 1, mayStore = 1 in +class StoreExclusive sz, bit o2, bit L, bit o1, bit o0, + RegisterClass regtype, string asm> + : BaseLoadStoreExclusive, + Sched<[WriteSTX]> { + bits<5> Ws; + bits<5> Rt; + bits<5> Rn; + let Inst{20-16} = Ws; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; + + let Constraints = "@earlyclobber $Ws"; + let PostEncoderMethod = "fixLoadStoreExclusive<1,0>"; +} + +class StoreExclusivePair sz, bit o2, bit L, bit o1, bit o0, + RegisterClass regtype, string asm> + : BaseLoadStoreExclusive, + Sched<[WriteSTX]> { + bits<5> Ws; + bits<5> Rt; + bits<5> Rt2; + bits<5> Rn; + let Inst{20-16} = Ws; + let Inst{14-10} = Rt2; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; + + let Constraints = "@earlyclobber $Ws"; +} + +//--- +// Exception generation +//--- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in +class ExceptionGeneration op1, bits<2> ll, string asm> + : I<(outs), (ins imm0_65535:$imm), asm, "\t$imm", "", []>, + Sched<[WriteSys]> { + bits<16> imm; + let Inst{31-24} = 0b11010100; + let Inst{23-21} = op1; + let Inst{20-5} = imm; + let Inst{4-2} = 0b000; + let Inst{1-0} = ll; +} + +let Predicates = [HasFPARMv8] in { + +//--- +// Floating point to integer conversion +//--- + +class BaseFPToIntegerUnscaled type, bits<2> rmode, bits<3> opcode, + RegisterClass srcType, RegisterClass dstType, + string asm, list pattern> + : I<(outs dstType:$Rd), (ins srcType:$Rn), + asm, "\t$Rd, $Rn", "", pattern>, + Sched<[WriteFCvt]> { + bits<5> Rd; + bits<5> Rn; + let Inst{30-29} = 0b00; + let Inst{28-24} = 0b11110; + let Inst{23-22} = type; + let Inst{21} = 1; + let Inst{20-19} = rmode; + let Inst{18-16} = opcode; + let Inst{15-10} = 0; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseFPToInteger type, bits<2> rmode, bits<3> opcode, + RegisterClass srcType, RegisterClass dstType, + Operand immType, string asm, list pattern> + : I<(outs dstType:$Rd), (ins srcType:$Rn, immType:$scale), + asm, "\t$Rd, $Rn, $scale", "", pattern>, + Sched<[WriteFCvt]> { + bits<5> Rd; + bits<5> Rn; + bits<6> scale; + let Inst{30-29} = 0b00; + let Inst{28-24} = 0b11110; + let Inst{23-22} = type; + let Inst{21} = 0; + let Inst{20-19} = rmode; + let Inst{18-16} = opcode; + let Inst{15-10} = scale; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass FPToIntegerUnscaled rmode, bits<3> opcode, string asm, + SDPatternOperator OpN> { + // Unscaled single-precision to 32-bit + def UWSr : BaseFPToIntegerUnscaled<0b00, rmode, opcode, FPR32, GPR32, asm, + [(set GPR32:$Rd, (OpN FPR32:$Rn))]> { + let Inst{31} = 0; // 32-bit GPR flag + } + + // Unscaled single-precision to 64-bit + def UXSr : BaseFPToIntegerUnscaled<0b00, rmode, opcode, FPR32, GPR64, asm, + [(set GPR64:$Rd, (OpN FPR32:$Rn))]> { + let Inst{31} = 1; // 64-bit GPR flag + } + + // Unscaled double-precision to 32-bit + def UWDr : BaseFPToIntegerUnscaled<0b01, rmode, opcode, FPR64, GPR32, asm, + [(set GPR32:$Rd, (OpN (f64 FPR64:$Rn)))]> { + let Inst{31} = 0; // 32-bit GPR flag + } + + // Unscaled double-precision to 64-bit + def UXDr : BaseFPToIntegerUnscaled<0b01, rmode, opcode, FPR64, GPR64, asm, + [(set GPR64:$Rd, (OpN (f64 FPR64:$Rn)))]> { + let Inst{31} = 1; // 64-bit GPR flag + } +} + +multiclass FPToIntegerScaled rmode, bits<3> opcode, string asm, + SDPatternOperator OpN> { + // Scaled single-precision to 
32-bit + def SWSri : BaseFPToInteger<0b00, rmode, opcode, FPR32, GPR32, + fixedpoint_f32_i32, asm, + [(set GPR32:$Rd, (OpN (fmul FPR32:$Rn, + fixedpoint_f32_i32:$scale)))]> { + let Inst{31} = 0; // 32-bit GPR flag + let scale{5} = 1; + } + + // Scaled single-precision to 64-bit + def SXSri : BaseFPToInteger<0b00, rmode, opcode, FPR32, GPR64, + fixedpoint_f32_i64, asm, + [(set GPR64:$Rd, (OpN (fmul FPR32:$Rn, + fixedpoint_f32_i64:$scale)))]> { + let Inst{31} = 1; // 64-bit GPR flag + } + + // Scaled double-precision to 32-bit + def SWDri : BaseFPToInteger<0b01, rmode, opcode, FPR64, GPR32, + fixedpoint_f64_i32, asm, + [(set GPR32:$Rd, (OpN (fmul FPR64:$Rn, + fixedpoint_f64_i32:$scale)))]> { + let Inst{31} = 0; // 32-bit GPR flag + let scale{5} = 1; + } + + // Scaled double-precision to 64-bit + def SXDri : BaseFPToInteger<0b01, rmode, opcode, FPR64, GPR64, + fixedpoint_f64_i64, asm, + [(set GPR64:$Rd, (OpN (fmul FPR64:$Rn, + fixedpoint_f64_i64:$scale)))]> { + let Inst{31} = 1; // 64-bit GPR flag + } +} + +//--- +// Integer to floating point conversion +//--- + +let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in +class BaseIntegerToFP pattern> + : I<(outs dstType:$Rd), (ins srcType:$Rn, immType:$scale), + asm, "\t$Rd, $Rn, $scale", "", pattern>, + Sched<[WriteFCvt]> { + bits<5> Rd; + bits<5> Rn; + bits<6> scale; + let Inst{30-23} = 0b00111100; + let Inst{21-17} = 0b00001; + let Inst{16} = isUnsigned; + let Inst{15-10} = scale; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +class BaseIntegerToFPUnscaled + : I<(outs dstType:$Rd), (ins srcType:$Rn), + asm, "\t$Rd, $Rn", "", [(set (dvt dstType:$Rd), (node srcType:$Rn))]>, + Sched<[WriteFCvt]> { + bits<5> Rd; + bits<5> Rn; + bits<6> scale; + let Inst{30-23} = 0b00111100; + let Inst{21-17} = 0b10001; + let Inst{16} = isUnsigned; + let Inst{15-10} = 0b000000; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass IntegerToFP { + // Unscaled + def UWSri: BaseIntegerToFPUnscaled { + let Inst{31} = 0; // 32-bit GPR flag + let Inst{22} = 0; // 32-bit FPR flag + } + + def UWDri: BaseIntegerToFPUnscaled { + let Inst{31} = 0; // 32-bit GPR flag + let Inst{22} = 1; // 64-bit FPR flag + } + + def UXSri: BaseIntegerToFPUnscaled { + let Inst{31} = 1; // 64-bit GPR flag + let Inst{22} = 0; // 32-bit FPR flag + } + + def UXDri: BaseIntegerToFPUnscaled { + let Inst{31} = 1; // 64-bit GPR flag + let Inst{22} = 1; // 64-bit FPR flag + } + + // Scaled + def SWSri: BaseIntegerToFP { + let Inst{31} = 0; // 32-bit GPR flag + let Inst{22} = 0; // 32-bit FPR flag + let scale{5} = 1; + } + + def SWDri: BaseIntegerToFP { + let Inst{31} = 0; // 32-bit GPR flag + let Inst{22} = 1; // 64-bit FPR flag + let scale{5} = 1; + } + + def SXSri: BaseIntegerToFP { + let Inst{31} = 1; // 64-bit GPR flag + let Inst{22} = 0; // 32-bit FPR flag + } + + def SXDri: BaseIntegerToFP { + let Inst{31} = 1; // 64-bit GPR flag + let Inst{22} = 1; // 64-bit FPR flag + } +} + +//--- +// Unscaled integer <-> floating point conversion (i.e. FMOV) +//--- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseUnscaledConversion rmode, bits<3> opcode, + RegisterClass srcType, RegisterClass dstType, + string asm> + : I<(outs dstType:$Rd), (ins srcType:$Rn), asm, "\t$Rd, $Rn", "", + // We use COPY_TO_REGCLASS for these bitconvert operations. + // copyPhysReg() expands the resultant COPY instructions after + // regalloc is done. This gives greater freedom for the allocator + // and related passes (coalescing, copy propagation, et. al.) to + // be more effective. 
+ [/*(set (dvt dstType:$Rd), (bitconvert (svt srcType:$Rn)))*/]>, + Sched<[WriteFCopy]> { + bits<5> Rd; + bits<5> Rn; + let Inst{30-23} = 0b00111100; + let Inst{21} = 1; + let Inst{20-19} = rmode; + let Inst{18-16} = opcode; + let Inst{15-10} = 0b000000; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseUnscaledConversionToHigh rmode, bits<3> opcode, + RegisterClass srcType, RegisterOperand dstType, string asm, + string kind> + : I<(outs dstType:$Rd), (ins srcType:$Rn, VectorIndex1:$idx), asm, + "{\t$Rd"#kind#"$idx, $Rn|"#kind#"\t$Rd$idx, $Rn}", "", []>, + Sched<[WriteFCopy]> { + bits<5> Rd; + bits<5> Rn; + let Inst{30-23} = 0b00111101; + let Inst{21} = 1; + let Inst{20-19} = rmode; + let Inst{18-16} = opcode; + let Inst{15-10} = 0b000000; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; + + let DecoderMethod = "DecodeFMOVLaneInstruction"; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseUnscaledConversionFromHigh rmode, bits<3> opcode, + RegisterOperand srcType, RegisterClass dstType, string asm, + string kind> + : I<(outs dstType:$Rd), (ins srcType:$Rn, VectorIndex1:$idx), asm, + "{\t$Rd, $Rn"#kind#"$idx|"#kind#"\t$Rd, $Rn$idx}", "", []>, + Sched<[WriteFCopy]> { + bits<5> Rd; + bits<5> Rn; + let Inst{30-23} = 0b00111101; + let Inst{21} = 1; + let Inst{20-19} = rmode; + let Inst{18-16} = opcode; + let Inst{15-10} = 0b000000; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; + + let DecoderMethod = "DecodeFMOVLaneInstruction"; +} + + + +multiclass UnscaledConversion { + def WSr : BaseUnscaledConversion<0b00, 0b111, GPR32, FPR32, asm> { + let Inst{31} = 0; // 32-bit GPR flag + let Inst{22} = 0; // 32-bit FPR flag + } + + def XDr : BaseUnscaledConversion<0b00, 0b111, GPR64, FPR64, asm> { + let Inst{31} = 1; // 64-bit GPR flag + let Inst{22} = 1; // 64-bit FPR flag + } + + def SWr : BaseUnscaledConversion<0b00, 0b110, FPR32, GPR32, asm> { + let Inst{31} = 0; // 32-bit GPR flag + let Inst{22} = 0; // 32-bit FPR flag + } + + def DXr : BaseUnscaledConversion<0b00, 0b110, FPR64, GPR64, asm> { + let Inst{31} = 1; // 64-bit GPR flag + let Inst{22} = 1; // 64-bit FPR flag + } + + def XDHighr : BaseUnscaledConversionToHigh<0b01, 0b111, GPR64, V128, + asm, ".d"> { + let Inst{31} = 1; + let Inst{22} = 0; + } + + def DXHighr : BaseUnscaledConversionFromHigh<0b01, 0b110, V128, GPR64, + asm, ".d"> { + let Inst{31} = 1; + let Inst{22} = 0; + } +} + +//--- +// Floating point conversion +//--- + +class BaseFPConversion type, bits<2> opcode, RegisterClass dstType, + RegisterClass srcType, string asm, list pattern> + : I<(outs dstType:$Rd), (ins srcType:$Rn), asm, "\t$Rd, $Rn", "", pattern>, + Sched<[WriteFCvt]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31-24} = 0b00011110; + let Inst{23-22} = type; + let Inst{21-17} = 0b10001; + let Inst{16-15} = opcode; + let Inst{14-10} = 0b10000; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass FPConversion { + // Double-precision to Half-precision + def HDr : BaseFPConversion<0b01, 0b11, FPR16, FPR64, asm, + [(set FPR16:$Rd, (fround FPR64:$Rn))]>; + + // Double-precision to Single-precision + def SDr : BaseFPConversion<0b01, 0b00, FPR32, FPR64, asm, + [(set FPR32:$Rd, (fround FPR64:$Rn))]>; + + // Half-precision to Double-precision + def DHr : BaseFPConversion<0b11, 0b01, FPR64, FPR16, asm, + [(set FPR64:$Rd, (fextend FPR16:$Rn))]>; + + // Half-precision to Single-precision + def SHr : BaseFPConversion<0b11, 0b00, FPR32, FPR16, asm, + [(set FPR32:$Rd, (fextend FPR16:$Rn))]>; + + // 
Single-precision to Double-precision + def DSr : BaseFPConversion<0b00, 0b01, FPR64, FPR32, asm, + [(set FPR64:$Rd, (fextend FPR32:$Rn))]>; + + // Single-precision to Half-precision + def HSr : BaseFPConversion<0b00, 0b11, FPR16, FPR32, asm, + [(set FPR16:$Rd, (fround FPR32:$Rn))]>; +} + +//--- +// Single operand floating point data processing +//--- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSingleOperandFPData opcode, RegisterClass regtype, + ValueType vt, string asm, SDPatternOperator node> + : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, "\t$Rd, $Rn", "", + [(set (vt regtype:$Rd), (node (vt regtype:$Rn)))]>, + Sched<[WriteF]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31-23} = 0b000111100; + let Inst{21-19} = 0b100; + let Inst{18-15} = opcode; + let Inst{14-10} = 0b10000; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass SingleOperandFPData opcode, string asm, + SDPatternOperator node = null_frag> { + def Sr : BaseSingleOperandFPData { + let Inst{22} = 0; // 32-bit size flag + } + + def Dr : BaseSingleOperandFPData { + let Inst{22} = 1; // 64-bit size flag + } +} + +//--- +// Two operand floating point data processing +//--- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseTwoOperandFPData opcode, RegisterClass regtype, + string asm, list pat> + : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), + asm, "\t$Rd, $Rn, $Rm", "", pat>, + Sched<[WriteF]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + let Inst{31-23} = 0b000111100; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15-12} = opcode; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass TwoOperandFPData opcode, string asm, + SDPatternOperator node = null_frag> { + def Srr : BaseTwoOperandFPData { + let Inst{22} = 0; // 32-bit size flag + } + + def Drr : BaseTwoOperandFPData { + let Inst{22} = 1; // 64-bit size flag + } +} + +multiclass TwoOperandFPDataNeg opcode, string asm, SDNode node> { + def Srr : BaseTwoOperandFPData { + let Inst{22} = 0; // 32-bit size flag + } + + def Drr : BaseTwoOperandFPData { + let Inst{22} = 1; // 64-bit size flag + } +} + + +//--- +// Three operand floating point data processing +//--- + +class BaseThreeOperandFPData pat> + : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, regtype: $Ra), + asm, "\t$Rd, $Rn, $Rm, $Ra", "", pat>, + Sched<[WriteFMul]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + bits<5> Ra; + let Inst{31-23} = 0b000111110; + let Inst{21} = isNegated; + let Inst{20-16} = Rm; + let Inst{15} = isSub; + let Inst{14-10} = Ra; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass ThreeOperandFPData { + def Srrr : BaseThreeOperandFPData { + let Inst{22} = 0; // 32-bit size flag + } + + def Drrr : BaseThreeOperandFPData { + let Inst{22} = 1; // 64-bit size flag + } +} + +//--- +// Floating point data comparisons +//--- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseOneOperandFPComparison pat> + : I<(outs), (ins regtype:$Rn), asm, "\t$Rn, #0.0", "", pat>, + Sched<[WriteFCmp]> { + bits<5> Rn; + let Inst{31-23} = 0b000111100; + let Inst{21} = 1; + + let Inst{15-10} = 0b001000; + let Inst{9-5} = Rn; + let Inst{4} = signalAllNans; + let Inst{3-0} = 0b1000; + + // Rm should be 0b00000 canonically, but we need to accept any value. 
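+  // The post-encoder below canonicalises those bits on output, in the same
+  // spirit as fixMulHigh for SMULH/UMULH earlier in this file.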
+ let PostEncoderMethod = "fixOneOperandFPComparison"; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseTwoOperandFPComparison pat> + : I<(outs), (ins regtype:$Rn, regtype:$Rm), asm, "\t$Rn, $Rm", "", pat>, + Sched<[WriteFCmp]> { + bits<5> Rm; + bits<5> Rn; + let Inst{31-23} = 0b000111100; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15-10} = 0b001000; + let Inst{9-5} = Rn; + let Inst{4} = signalAllNans; + let Inst{3-0} = 0b0000; +} + +multiclass FPComparison { + let Defs = [NZCV] in { + def Srr : BaseTwoOperandFPComparison { + let Inst{22} = 0; + } + + def Sri : BaseOneOperandFPComparison { + let Inst{22} = 0; + } + + def Drr : BaseTwoOperandFPComparison { + let Inst{22} = 1; + } + + def Dri : BaseOneOperandFPComparison { + let Inst{22} = 1; + } + } // Defs = [NZCV] +} + +//--- +// Floating point conditional comparisons +//--- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseFPCondComparison + : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm0_15:$nzcv, ccode:$cond), + asm, "\t$Rn, $Rm, $nzcv, $cond", "", []>, + Sched<[WriteFCmp]> { + bits<5> Rn; + bits<5> Rm; + bits<4> nzcv; + bits<4> cond; + + let Inst{31-23} = 0b000111100; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15-12} = cond; + let Inst{11-10} = 0b01; + let Inst{9-5} = Rn; + let Inst{4} = signalAllNans; + let Inst{3-0} = nzcv; +} + +multiclass FPCondComparison { + let Defs = [NZCV], Uses = [NZCV] in { + def Srr : BaseFPCondComparison { + let Inst{22} = 0; + } + + def Drr : BaseFPCondComparison { + let Inst{22} = 1; + } + } // Defs = [NZCV], Uses = [NZCV] +} + +//--- +// Floating point conditional select +//--- + +class BaseFPCondSelect + : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, ccode:$cond), + asm, "\t$Rd, $Rn, $Rm, $cond", "", + [(set regtype:$Rd, + (AArch64csel (vt regtype:$Rn), regtype:$Rm, + (i32 imm:$cond), NZCV))]>, + Sched<[WriteF]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + bits<4> cond; + + let Inst{31-23} = 0b000111100; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15-12} = cond; + let Inst{11-10} = 0b11; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass FPCondSelect { + let Uses = [NZCV] in { + def Srrr : BaseFPCondSelect { + let Inst{22} = 0; + } + + def Drrr : BaseFPCondSelect { + let Inst{22} = 1; + } + } // Uses = [NZCV] +} + +//--- +// Floating move immediate +//--- + +class BaseFPMoveImmediate + : I<(outs regtype:$Rd), (ins fpimmtype:$imm), asm, "\t$Rd, $imm", "", + [(set regtype:$Rd, fpimmtype:$imm)]>, + Sched<[WriteFImm]> { + bits<5> Rd; + bits<8> imm; + let Inst{31-23} = 0b000111100; + let Inst{21} = 1; + let Inst{20-13} = imm; + let Inst{12-5} = 0b10000000; + let Inst{4-0} = Rd; +} + +multiclass FPMoveImmediate { + def Si : BaseFPMoveImmediate { + let Inst{22} = 0; + } + + def Di : BaseFPMoveImmediate { + let Inst{22} = 1; + } +} +} // end of 'let Predicates = [HasFPARMv8]' + +//---------------------------------------------------------------------------- +// AdvSIMD +//---------------------------------------------------------------------------- + +let Predicates = [HasNEON] in { + +//---------------------------------------------------------------------------- +// AdvSIMD three register vector instructions +//---------------------------------------------------------------------------- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDThreeSameVector size, bits<5> opcode, + RegisterOperand regtype, string asm, string kind, + list pattern> + : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), 
asm, + "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind # + "|" # kind # "\t$Rd, $Rn, $Rm|}", "", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; + let Inst{28-24} = 0b01110; + let Inst{23-22} = size; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15-11} = opcode; + let Inst{10} = 1; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDThreeSameVectorTied size, bits<5> opcode, + RegisterOperand regtype, string asm, string kind, + list pattern> + : I<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn, regtype:$Rm), asm, + "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind # + "|" # kind # "\t$Rd, $Rn, $Rm}", "$Rd = $dst", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; + let Inst{28-24} = 0b01110; + let Inst{23-22} = size; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15-11} = opcode; + let Inst{10} = 1; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +// All operand sizes distinguished in the encoding. +multiclass SIMDThreeSameVector opc, string asm, + SDPatternOperator OpNode> { + def v8i8 : BaseSIMDThreeSameVector<0, U, 0b00, opc, V64, + asm, ".8b", + [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>; + def v16i8 : BaseSIMDThreeSameVector<1, U, 0b00, opc, V128, + asm, ".16b", + [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>; + def v4i16 : BaseSIMDThreeSameVector<0, U, 0b01, opc, V64, + asm, ".4h", + [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>; + def v8i16 : BaseSIMDThreeSameVector<1, U, 0b01, opc, V128, + asm, ".8h", + [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>; + def v2i32 : BaseSIMDThreeSameVector<0, U, 0b10, opc, V64, + asm, ".2s", + [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>; + def v4i32 : BaseSIMDThreeSameVector<1, U, 0b10, opc, V128, + asm, ".4s", + [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>; + def v2i64 : BaseSIMDThreeSameVector<1, U, 0b11, opc, V128, + asm, ".2d", + [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn), (v2i64 V128:$Rm)))]>; +} + +// As above, but D sized elements unsupported. 
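+// (e.g. mul, which has no .2d form. Illustrative instantiations with assumed
+// opcode values (the real definitions live in AArch64InstrInfo.td):
+//   defm ADD : SIMDThreeSameVector<0, 0b10000, "add", add>;
+//   defm MUL : SIMDThreeSameVectorBHS<0, 0b10011, "mul", mul>; )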
+multiclass SIMDThreeSameVectorBHS opc, string asm, + SDPatternOperator OpNode> { + def v8i8 : BaseSIMDThreeSameVector<0, U, 0b00, opc, V64, + asm, ".8b", + [(set V64:$Rd, (v8i8 (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm))))]>; + def v16i8 : BaseSIMDThreeSameVector<1, U, 0b00, opc, V128, + asm, ".16b", + [(set V128:$Rd, (v16i8 (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm))))]>; + def v4i16 : BaseSIMDThreeSameVector<0, U, 0b01, opc, V64, + asm, ".4h", + [(set V64:$Rd, (v4i16 (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm))))]>; + def v8i16 : BaseSIMDThreeSameVector<1, U, 0b01, opc, V128, + asm, ".8h", + [(set V128:$Rd, (v8i16 (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm))))]>; + def v2i32 : BaseSIMDThreeSameVector<0, U, 0b10, opc, V64, + asm, ".2s", + [(set V64:$Rd, (v2i32 (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm))))]>; + def v4i32 : BaseSIMDThreeSameVector<1, U, 0b10, opc, V128, + asm, ".4s", + [(set V128:$Rd, (v4i32 (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm))))]>; +} + +multiclass SIMDThreeSameVectorBHSTied opc, string asm, + SDPatternOperator OpNode> { + def v8i8 : BaseSIMDThreeSameVectorTied<0, U, 0b00, opc, V64, + asm, ".8b", + [(set (v8i8 V64:$dst), + (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>; + def v16i8 : BaseSIMDThreeSameVectorTied<1, U, 0b00, opc, V128, + asm, ".16b", + [(set (v16i8 V128:$dst), + (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>; + def v4i16 : BaseSIMDThreeSameVectorTied<0, U, 0b01, opc, V64, + asm, ".4h", + [(set (v4i16 V64:$dst), + (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>; + def v8i16 : BaseSIMDThreeSameVectorTied<1, U, 0b01, opc, V128, + asm, ".8h", + [(set (v8i16 V128:$dst), + (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>; + def v2i32 : BaseSIMDThreeSameVectorTied<0, U, 0b10, opc, V64, + asm, ".2s", + [(set (v2i32 V64:$dst), + (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>; + def v4i32 : BaseSIMDThreeSameVectorTied<1, U, 0b10, opc, V128, + asm, ".4s", + [(set (v4i32 V128:$dst), + (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>; +} + +// As above, but only B sized elements supported. +multiclass SIMDThreeSameVectorB opc, string asm, + SDPatternOperator OpNode> { + def v8i8 : BaseSIMDThreeSameVector<0, U, 0b00, opc, V64, + asm, ".8b", + [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>; + def v16i8 : BaseSIMDThreeSameVector<1, U, 0b00, opc, V128, + asm, ".16b", + [(set (v16i8 V128:$Rd), + (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>; +} + +// As above, but only S and D sized floating point elements supported. 
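+// (An illustrative instantiation, with the opcode bits being an assumption:
+//   defm FADD : SIMDThreeSameVectorFP<0, 0, 0b11010, "fadd", fadd>; )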
+multiclass SIMDThreeSameVectorFP opc, + string asm, SDPatternOperator OpNode> { + def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0}, opc, V64, + asm, ".2s", + [(set (v2f32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>; + def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0}, opc, V128, + asm, ".4s", + [(set (v4f32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>; + def v2f64 : BaseSIMDThreeSameVector<1, U, {S,1}, opc, V128, + asm, ".2d", + [(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>; +} + +multiclass SIMDThreeSameVectorFPCmp opc, + string asm, + SDPatternOperator OpNode> { + def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0}, opc, V64, + asm, ".2s", + [(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>; + def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0}, opc, V128, + asm, ".4s", + [(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>; + def v2f64 : BaseSIMDThreeSameVector<1, U, {S,1}, opc, V128, + asm, ".2d", + [(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>; +} + +multiclass SIMDThreeSameVectorFPTied opc, + string asm, SDPatternOperator OpNode> { + def v2f32 : BaseSIMDThreeSameVectorTied<0, U, {S,0}, opc, V64, + asm, ".2s", + [(set (v2f32 V64:$dst), + (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>; + def v4f32 : BaseSIMDThreeSameVectorTied<1, U, {S,0}, opc, V128, + asm, ".4s", + [(set (v4f32 V128:$dst), + (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>; + def v2f64 : BaseSIMDThreeSameVectorTied<1, U, {S,1}, opc, V128, + asm, ".2d", + [(set (v2f64 V128:$dst), + (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>; +} + +// As above, but D and B sized elements unsupported. +multiclass SIMDThreeSameVectorHS opc, string asm, + SDPatternOperator OpNode> { + def v4i16 : BaseSIMDThreeSameVector<0, U, 0b01, opc, V64, + asm, ".4h", + [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>; + def v8i16 : BaseSIMDThreeSameVector<1, U, 0b01, opc, V128, + asm, ".8h", + [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>; + def v2i32 : BaseSIMDThreeSameVector<0, U, 0b10, opc, V64, + asm, ".2s", + [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>; + def v4i32 : BaseSIMDThreeSameVector<1, U, 0b10, opc, V128, + asm, ".4s", + [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>; +} + +// Logical three vector ops share opcode bits, and only use B sized elements. 
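+// They are distinguished by the U bit and the size field instead; illustrative
+// instantiations, with assumed field values, would be:
+//   defm AND : SIMDLogicalThreeVector<0, 0b00, "and", and>;
+//   defm ORR : SIMDLogicalThreeVector<0, 0b10, "orr", or>;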
+multiclass SIMDLogicalThreeVector size, string asm, + SDPatternOperator OpNode = null_frag> { + def v8i8 : BaseSIMDThreeSameVector<0, U, size, 0b00011, V64, + asm, ".8b", + [(set (v8i8 V64:$Rd), (OpNode V64:$Rn, V64:$Rm))]>; + def v16i8 : BaseSIMDThreeSameVector<1, U, size, 0b00011, V128, + asm, ".16b", + [(set (v16i8 V128:$Rd), (OpNode V128:$Rn, V128:$Rm))]>; + + def : Pat<(v4i16 (OpNode V64:$LHS, V64:$RHS)), + (!cast(NAME#"v8i8") V64:$LHS, V64:$RHS)>; + def : Pat<(v2i32 (OpNode V64:$LHS, V64:$RHS)), + (!cast(NAME#"v8i8") V64:$LHS, V64:$RHS)>; + def : Pat<(v1i64 (OpNode V64:$LHS, V64:$RHS)), + (!cast(NAME#"v8i8") V64:$LHS, V64:$RHS)>; + + def : Pat<(v8i16 (OpNode V128:$LHS, V128:$RHS)), + (!cast(NAME#"v16i8") V128:$LHS, V128:$RHS)>; + def : Pat<(v4i32 (OpNode V128:$LHS, V128:$RHS)), + (!cast(NAME#"v16i8") V128:$LHS, V128:$RHS)>; + def : Pat<(v2i64 (OpNode V128:$LHS, V128:$RHS)), + (!cast(NAME#"v16i8") V128:$LHS, V128:$RHS)>; +} + +multiclass SIMDLogicalThreeVectorTied size, + string asm, SDPatternOperator OpNode> { + def v8i8 : BaseSIMDThreeSameVectorTied<0, U, size, 0b00011, V64, + asm, ".8b", + [(set (v8i8 V64:$dst), + (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>; + def v16i8 : BaseSIMDThreeSameVectorTied<1, U, size, 0b00011, V128, + asm, ".16b", + [(set (v16i8 V128:$dst), + (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn), + (v16i8 V128:$Rm)))]>; + + def : Pat<(v4i16 (OpNode (v4i16 V64:$LHS), (v4i16 V64:$MHS), + (v4i16 V64:$RHS))), + (!cast(NAME#"v8i8") + V64:$LHS, V64:$MHS, V64:$RHS)>; + def : Pat<(v2i32 (OpNode (v2i32 V64:$LHS), (v2i32 V64:$MHS), + (v2i32 V64:$RHS))), + (!cast(NAME#"v8i8") + V64:$LHS, V64:$MHS, V64:$RHS)>; + def : Pat<(v1i64 (OpNode (v1i64 V64:$LHS), (v1i64 V64:$MHS), + (v1i64 V64:$RHS))), + (!cast(NAME#"v8i8") + V64:$LHS, V64:$MHS, V64:$RHS)>; + + def : Pat<(v8i16 (OpNode (v8i16 V128:$LHS), (v8i16 V128:$MHS), + (v8i16 V128:$RHS))), + (!cast(NAME#"v16i8") + V128:$LHS, V128:$MHS, V128:$RHS)>; + def : Pat<(v4i32 (OpNode (v4i32 V128:$LHS), (v4i32 V128:$MHS), + (v4i32 V128:$RHS))), + (!cast(NAME#"v16i8") + V128:$LHS, V128:$MHS, V128:$RHS)>; + def : Pat<(v2i64 (OpNode (v2i64 V128:$LHS), (v2i64 V128:$MHS), + (v2i64 V128:$RHS))), + (!cast(NAME#"v16i8") + V128:$LHS, V128:$MHS, V128:$RHS)>; +} + + +//---------------------------------------------------------------------------- +// AdvSIMD two register vector instructions. 
+//---------------------------------------------------------------------------- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDTwoSameVector size, bits<5> opcode, + RegisterOperand regtype, string asm, string dstkind, + string srckind, list pattern> + : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, + "{\t$Rd" # dstkind # ", $Rn" # srckind # + "|" # dstkind # "\t$Rd, $Rn}", "", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; + let Inst{28-24} = 0b01110; + let Inst{23-22} = size; + let Inst{21-17} = 0b10000; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDTwoSameVectorTied size, bits<5> opcode, + RegisterOperand regtype, string asm, string dstkind, + string srckind, list pattern> + : I<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn), asm, + "{\t$Rd" # dstkind # ", $Rn" # srckind # + "|" # dstkind # "\t$Rd, $Rn}", "$Rd = $dst", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; + let Inst{28-24} = 0b01110; + let Inst{23-22} = size; + let Inst{21-17} = 0b10000; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +// Supports B, H, and S element sizes. +multiclass SIMDTwoVectorBHS opc, string asm, + SDPatternOperator OpNode> { + def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64, + asm, ".8b", ".8b", + [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>; + def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128, + asm, ".16b", ".16b", + [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>; + def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64, + asm, ".4h", ".4h", + [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>; + def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128, + asm, ".8h", ".8h", + [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>; + def v2i32 : BaseSIMDTwoSameVector<0, U, 0b10, opc, V64, + asm, ".2s", ".2s", + [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; + def v4i32 : BaseSIMDTwoSameVector<1, U, 0b10, opc, V128, + asm, ".4s", ".4s", + [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; +} + +class BaseSIMDVectorLShiftLongBySize size, + RegisterOperand regtype, string asm, string dstkind, + string srckind, string amount> + : I<(outs V128:$Rd), (ins regtype:$Rn), asm, + "{\t$Rd" # dstkind # ", $Rn" # srckind # ", #" # amount # + "|" # dstkind # "\t$Rd, $Rn, #" # amount # "}", "", []>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29-24} = 0b101110; + let Inst{23-22} = size; + let Inst{21-10} = 0b100001001110; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass SIMDVectorLShiftLongBySizeBHS { + let neverHasSideEffects = 1 in { + def v8i8 : BaseSIMDVectorLShiftLongBySize<0, 0b00, V64, + "shll", ".8h", ".8b", "8">; + def v16i8 : BaseSIMDVectorLShiftLongBySize<1, 0b00, V128, + "shll2", ".8h", ".16b", "8">; + def v4i16 : BaseSIMDVectorLShiftLongBySize<0, 0b01, V64, + "shll", ".4s", ".4h", "16">; + def v8i16 : BaseSIMDVectorLShiftLongBySize<1, 0b01, V128, + "shll2", ".4s", ".8h", "16">; + def v2i32 : BaseSIMDVectorLShiftLongBySize<0, 0b10, V64, + "shll", ".2d", ".2s", "32">; + def v4i32 : BaseSIMDVectorLShiftLongBySize<1, 0b10, V128, + "shll2", ".2d", ".4s", "32">; + } +} + +// Supports all element sizes. 
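+// (An illustrative, assumed instantiation for a widening pairwise form:
+//   defm SADDLP : SIMDLongTwoVector<0, 0b00010, "saddlp",
+//                                   int_aarch64_neon_saddlp>; )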
+multiclass SIMDLongTwoVector opc, string asm, + SDPatternOperator OpNode> { + def v8i8_v4i16 : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64, + asm, ".4h", ".8b", + [(set (v4i16 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>; + def v16i8_v8i16 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128, + asm, ".8h", ".16b", + [(set (v8i16 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>; + def v4i16_v2i32 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64, + asm, ".2s", ".4h", + [(set (v2i32 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>; + def v8i16_v4i32 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128, + asm, ".4s", ".8h", + [(set (v4i32 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>; + def v2i32_v1i64 : BaseSIMDTwoSameVector<0, U, 0b10, opc, V64, + asm, ".1d", ".2s", + [(set (v1i64 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; + def v4i32_v2i64 : BaseSIMDTwoSameVector<1, U, 0b10, opc, V128, + asm, ".2d", ".4s", + [(set (v2i64 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; +} + +multiclass SIMDLongTwoVectorTied opc, string asm, + SDPatternOperator OpNode> { + def v8i8_v4i16 : BaseSIMDTwoSameVectorTied<0, U, 0b00, opc, V64, + asm, ".4h", ".8b", + [(set (v4i16 V64:$dst), (OpNode (v4i16 V64:$Rd), + (v8i8 V64:$Rn)))]>; + def v16i8_v8i16 : BaseSIMDTwoSameVectorTied<1, U, 0b00, opc, V128, + asm, ".8h", ".16b", + [(set (v8i16 V128:$dst), (OpNode (v8i16 V128:$Rd), + (v16i8 V128:$Rn)))]>; + def v4i16_v2i32 : BaseSIMDTwoSameVectorTied<0, U, 0b01, opc, V64, + asm, ".2s", ".4h", + [(set (v2i32 V64:$dst), (OpNode (v2i32 V64:$Rd), + (v4i16 V64:$Rn)))]>; + def v8i16_v4i32 : BaseSIMDTwoSameVectorTied<1, U, 0b01, opc, V128, + asm, ".4s", ".8h", + [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd), + (v8i16 V128:$Rn)))]>; + def v2i32_v1i64 : BaseSIMDTwoSameVectorTied<0, U, 0b10, opc, V64, + asm, ".1d", ".2s", + [(set (v1i64 V64:$dst), (OpNode (v1i64 V64:$Rd), + (v2i32 V64:$Rn)))]>; + def v4i32_v2i64 : BaseSIMDTwoSameVectorTied<1, U, 0b10, opc, V128, + asm, ".2d", ".4s", + [(set (v2i64 V128:$dst), (OpNode (v2i64 V128:$Rd), + (v4i32 V128:$Rn)))]>; +} + +// Supports all element sizes, except 1xD. 
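+// (These are accumulating forms, so $Rd is both a source and the destination;
+// an illustrative, assumed instantiation:
+//   defm SUQADD : SIMDTwoVectorBHSDTied<0, 0b00011, "suqadd",
+//                                       int_aarch64_neon_suqadd>; )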
+multiclass SIMDTwoVectorBHSDTied opc, string asm, + SDPatternOperator OpNode> { + def v8i8 : BaseSIMDTwoSameVectorTied<0, U, 0b00, opc, V64, + asm, ".8b", ".8b", + [(set (v8i8 V64:$dst), (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn)))]>; + def v16i8 : BaseSIMDTwoSameVectorTied<1, U, 0b00, opc, V128, + asm, ".16b", ".16b", + [(set (v16i8 V128:$dst), (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn)))]>; + def v4i16 : BaseSIMDTwoSameVectorTied<0, U, 0b01, opc, V64, + asm, ".4h", ".4h", + [(set (v4i16 V64:$dst), (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn)))]>; + def v8i16 : BaseSIMDTwoSameVectorTied<1, U, 0b01, opc, V128, + asm, ".8h", ".8h", + [(set (v8i16 V128:$dst), (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn)))]>; + def v2i32 : BaseSIMDTwoSameVectorTied<0, U, 0b10, opc, V64, + asm, ".2s", ".2s", + [(set (v2i32 V64:$dst), (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn)))]>; + def v4i32 : BaseSIMDTwoSameVectorTied<1, U, 0b10, opc, V128, + asm, ".4s", ".4s", + [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn)))]>; + def v2i64 : BaseSIMDTwoSameVectorTied<1, U, 0b11, opc, V128, + asm, ".2d", ".2d", + [(set (v2i64 V128:$dst), (OpNode (v2i64 V128:$Rd), (v2i64 V128:$Rn)))]>; +} + +multiclass SIMDTwoVectorBHSD opc, string asm, + SDPatternOperator OpNode = null_frag> { + def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64, + asm, ".8b", ".8b", + [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>; + def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128, + asm, ".16b", ".16b", + [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>; + def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64, + asm, ".4h", ".4h", + [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>; + def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128, + asm, ".8h", ".8h", + [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>; + def v2i32 : BaseSIMDTwoSameVector<0, U, 0b10, opc, V64, + asm, ".2s", ".2s", + [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; + def v4i32 : BaseSIMDTwoSameVector<1, U, 0b10, opc, V128, + asm, ".4s", ".4s", + [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; + def v2i64 : BaseSIMDTwoSameVector<1, U, 0b11, opc, V128, + asm, ".2d", ".2d", + [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn)))]>; +} + + +// Supports only B element sizes. +multiclass SIMDTwoVectorB size, bits<5> opc, string asm, + SDPatternOperator OpNode> { + def v8i8 : BaseSIMDTwoSameVector<0, U, size, opc, V64, + asm, ".8b", ".8b", + [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>; + def v16i8 : BaseSIMDTwoSameVector<1, U, size, opc, V128, + asm, ".16b", ".16b", + [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>; + +} + +// Supports only B and H element sizes. +multiclass SIMDTwoVectorBH opc, string asm, + SDPatternOperator OpNode> { + def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64, + asm, ".8b", ".8b", + [(set (v8i8 V64:$Rd), (OpNode V64:$Rn))]>; + def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128, + asm, ".16b", ".16b", + [(set (v16i8 V128:$Rd), (OpNode V128:$Rn))]>; + def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64, + asm, ".4h", ".4h", + [(set (v4i16 V64:$Rd), (OpNode V64:$Rn))]>; + def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128, + asm, ".8h", ".8h", + [(set (v8i16 V128:$Rd), (OpNode V128:$Rn))]>; +} + +// Supports only S and D element sizes, uses high bit of the size field +// as an extra opcode bit. 
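+// e.g. (illustrative; the field values are assumptions):
+//   defm FABS : SIMDTwoVectorFP<0, 1, 0b01111, "fabs", fabs>;
+//   defm FNEG : SIMDTwoVectorFP<1, 1, 0b01111, "fneg", fneg>;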
+multiclass SIMDTwoVectorFP opc, string asm, + SDPatternOperator OpNode> { + def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64, + asm, ".2s", ".2s", + [(set (v2f32 V64:$Rd), (OpNode (v2f32 V64:$Rn)))]>; + def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128, + asm, ".4s", ".4s", + [(set (v4f32 V128:$Rd), (OpNode (v4f32 V128:$Rn)))]>; + def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, V128, + asm, ".2d", ".2d", + [(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn)))]>; +} + +// Supports only S element size. +multiclass SIMDTwoVectorS opc, string asm, + SDPatternOperator OpNode> { + def v2i32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64, + asm, ".2s", ".2s", + [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; + def v4i32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128, + asm, ".4s", ".4s", + [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; +} + + +multiclass SIMDTwoVectorFPToInt opc, string asm, + SDPatternOperator OpNode> { + def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64, + asm, ".2s", ".2s", + [(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn)))]>; + def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128, + asm, ".4s", ".4s", + [(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn)))]>; + def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, V128, + asm, ".2d", ".2d", + [(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn)))]>; +} + +multiclass SIMDTwoVectorIntToFP opc, string asm, + SDPatternOperator OpNode> { + def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64, + asm, ".2s", ".2s", + [(set (v2f32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; + def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128, + asm, ".4s", ".4s", + [(set (v4f32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; + def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, V128, + asm, ".2d", ".2d", + [(set (v2f64 V128:$Rd), (OpNode (v2i64 V128:$Rn)))]>; +} + + +class BaseSIMDMixedTwoVector size, bits<5> opcode, + RegisterOperand inreg, RegisterOperand outreg, + string asm, string outkind, string inkind, + list pattern> + : I<(outs outreg:$Rd), (ins inreg:$Rn), asm, + "{\t$Rd" # outkind # ", $Rn" # inkind # + "|" # outkind # "\t$Rd, $Rn}", "", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; + let Inst{28-24} = 0b01110; + let Inst{23-22} = size; + let Inst{21-17} = 0b10000; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +class BaseSIMDMixedTwoVectorTied size, bits<5> opcode, + RegisterOperand inreg, RegisterOperand outreg, + string asm, string outkind, string inkind, + list pattern> + : I<(outs outreg:$dst), (ins outreg:$Rd, inreg:$Rn), asm, + "{\t$Rd" # outkind # ", $Rn" # inkind # + "|" # outkind # "\t$Rd, $Rn}", "$Rd = $dst", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; + let Inst{28-24} = 0b01110; + let Inst{23-22} = size; + let Inst{21-17} = 0b10000; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass SIMDMixedTwoVector opc, string asm, + SDPatternOperator OpNode> { + def v8i8 : BaseSIMDMixedTwoVector<0, U, 0b00, opc, V128, V64, + asm, ".8b", ".8h", + [(set (v8i8 V64:$Rd), (OpNode (v8i16 V128:$Rn)))]>; + def v16i8 : BaseSIMDMixedTwoVectorTied<1, U, 0b00, opc, V128, V128, + asm#"2", ".16b", ".8h", []>; + def v4i16 : BaseSIMDMixedTwoVector<0, U, 0b01, opc, V128, V64, + asm, ".4h", ".4s", + [(set (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn)))]>; + def v8i16 : 
BaseSIMDMixedTwoVectorTied<1, U, 0b01, opc, V128, V128, + asm#"2", ".8h", ".4s", []>; + def v2i32 : BaseSIMDMixedTwoVector<0, U, 0b10, opc, V128, V64, + asm, ".2s", ".2d", + [(set (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn)))]>; + def v4i32 : BaseSIMDMixedTwoVectorTied<1, U, 0b10, opc, V128, V128, + asm#"2", ".4s", ".2d", []>; + + def : Pat<(concat_vectors (v8i8 V64:$Rd), (OpNode (v8i16 V128:$Rn))), + (!cast(NAME # "v16i8") + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; + def : Pat<(concat_vectors (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn))), + (!cast(NAME # "v8i16") + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; + def : Pat<(concat_vectors (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn))), + (!cast(NAME # "v4i32") + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; +} + +class BaseSIMDCmpTwoVector size, bits<5> opcode, + RegisterOperand regtype, + string asm, string kind, string zero, + ValueType dty, ValueType sty, SDNode OpNode> + : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, + "{\t$Rd" # kind # ", $Rn" # kind # ", #" # zero # + "|" # kind # "\t$Rd, $Rn, #" # zero # "}", "", + [(set (dty regtype:$Rd), (OpNode (sty regtype:$Rn)))]>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; + let Inst{28-24} = 0b01110; + let Inst{23-22} = size; + let Inst{21-17} = 0b10000; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +// Comparisons support all element sizes, except 1xD. +multiclass SIMDCmpTwoVector opc, string asm, + SDNode OpNode> { + def v8i8rz : BaseSIMDCmpTwoVector<0, U, 0b00, opc, V64, + asm, ".8b", "0", + v8i8, v8i8, OpNode>; + def v16i8rz : BaseSIMDCmpTwoVector<1, U, 0b00, opc, V128, + asm, ".16b", "0", + v16i8, v16i8, OpNode>; + def v4i16rz : BaseSIMDCmpTwoVector<0, U, 0b01, opc, V64, + asm, ".4h", "0", + v4i16, v4i16, OpNode>; + def v8i16rz : BaseSIMDCmpTwoVector<1, U, 0b01, opc, V128, + asm, ".8h", "0", + v8i16, v8i16, OpNode>; + def v2i32rz : BaseSIMDCmpTwoVector<0, U, 0b10, opc, V64, + asm, ".2s", "0", + v2i32, v2i32, OpNode>; + def v4i32rz : BaseSIMDCmpTwoVector<1, U, 0b10, opc, V128, + asm, ".4s", "0", + v4i32, v4i32, OpNode>; + def v2i64rz : BaseSIMDCmpTwoVector<1, U, 0b11, opc, V128, + asm, ".2d", "0", + v2i64, v2i64, OpNode>; +} + +// FP Comparisons support only S and D element sizes. 
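+// (These are the compare-against-zero forms; an illustrative instantiation,
+// with assumed opcode bits and node name:
+//   defm FCMEQ : SIMDFPCmpTwoVector<0, 1, 0b01101, "fcmeq", AArch64fcmeqz>; )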
+multiclass SIMDFPCmpTwoVector opc, + string asm, SDNode OpNode> { + + def v2i32rz : BaseSIMDCmpTwoVector<0, U, {S,0}, opc, V64, + asm, ".2s", "0.0", + v2i32, v2f32, OpNode>; + def v4i32rz : BaseSIMDCmpTwoVector<1, U, {S,0}, opc, V128, + asm, ".4s", "0.0", + v4i32, v4f32, OpNode>; + def v2i64rz : BaseSIMDCmpTwoVector<1, U, {S,1}, opc, V128, + asm, ".2d", "0.0", + v2i64, v2f64, OpNode>; + + def : InstAlias(NAME # v2i32rz) V64:$Vd, V64:$Vn), 0>; + def : InstAlias(NAME # v4i32rz) V128:$Vd, V128:$Vn), 0>; + def : InstAlias(NAME # v2i64rz) V128:$Vd, V128:$Vn), 0>; + def : InstAlias(NAME # v2i32rz) V64:$Vd, V64:$Vn), 0>; + def : InstAlias(NAME # v4i32rz) V128:$Vd, V128:$Vn), 0>; + def : InstAlias(NAME # v2i64rz) V128:$Vd, V128:$Vn), 0>; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDFPCvtTwoVector size, bits<5> opcode, + RegisterOperand outtype, RegisterOperand intype, + string asm, string VdTy, string VnTy, + list pattern> + : I<(outs outtype:$Rd), (ins intype:$Rn), asm, + !strconcat("\t$Rd", VdTy, ", $Rn", VnTy), "", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; + let Inst{28-24} = 0b01110; + let Inst{23-22} = size; + let Inst{21-17} = 0b10000; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +class BaseSIMDFPCvtTwoVectorTied size, bits<5> opcode, + RegisterOperand outtype, RegisterOperand intype, + string asm, string VdTy, string VnTy, + list pattern> + : I<(outs outtype:$dst), (ins outtype:$Rd, intype:$Rn), asm, + !strconcat("\t$Rd", VdTy, ", $Rn", VnTy), "$Rd = $dst", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; + let Inst{28-24} = 0b01110; + let Inst{23-22} = size; + let Inst{21-17} = 0b10000; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass SIMDFPWidenTwoVector opc, string asm> { + def v4i16 : BaseSIMDFPCvtTwoVector<0, U, {S,0}, opc, V128, V64, + asm, ".4s", ".4h", []>; + def v8i16 : BaseSIMDFPCvtTwoVector<1, U, {S,0}, opc, V128, V128, + asm#"2", ".4s", ".8h", []>; + def v2i32 : BaseSIMDFPCvtTwoVector<0, U, {S,1}, opc, V128, V64, + asm, ".2d", ".2s", []>; + def v4i32 : BaseSIMDFPCvtTwoVector<1, U, {S,1}, opc, V128, V128, + asm#"2", ".2d", ".4s", []>; +} + +multiclass SIMDFPNarrowTwoVector opc, string asm> { + def v4i16 : BaseSIMDFPCvtTwoVector<0, U, {S,0}, opc, V64, V128, + asm, ".4h", ".4s", []>; + def v8i16 : BaseSIMDFPCvtTwoVectorTied<1, U, {S,0}, opc, V128, V128, + asm#"2", ".8h", ".4s", []>; + def v2i32 : BaseSIMDFPCvtTwoVector<0, U, {S,1}, opc, V64, V128, + asm, ".2s", ".2d", []>; + def v4i32 : BaseSIMDFPCvtTwoVectorTied<1, U, {S,1}, opc, V128, V128, + asm#"2", ".4s", ".2d", []>; +} + +multiclass SIMDFPInexactCvtTwoVector opc, string asm, + Intrinsic OpNode> { + def v2f32 : BaseSIMDFPCvtTwoVector<0, U, {S,1}, opc, V64, V128, + asm, ".2s", ".2d", + [(set (v2f32 V64:$Rd), (OpNode (v2f64 V128:$Rn)))]>; + def v4f32 : BaseSIMDFPCvtTwoVectorTied<1, U, {S,1}, opc, V128, V128, + asm#"2", ".4s", ".2d", []>; + + def : Pat<(concat_vectors (v2f32 V64:$Rd), (OpNode (v2f64 V128:$Rn))), + (!cast(NAME # "v4f32") + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; +} + +//---------------------------------------------------------------------------- +// AdvSIMD three register different-size vector instructions. 
+//---------------------------------------------------------------------------- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDDifferentThreeVector size, bits<4> opcode, + RegisterOperand outtype, RegisterOperand intype1, + RegisterOperand intype2, string asm, + string outkind, string inkind1, string inkind2, + list pattern> + : I<(outs outtype:$Rd), (ins intype1:$Rn, intype2:$Rm), asm, + "{\t$Rd" # outkind # ", $Rn" # inkind1 # ", $Rm" # inkind2 # + "|" # outkind # "\t$Rd, $Rn, $Rm}", "", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + let Inst{31} = 0; + let Inst{30} = size{0}; + let Inst{29} = U; + let Inst{28-24} = 0b01110; + let Inst{23-22} = size{2-1}; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15-12} = opcode; + let Inst{11-10} = 0b00; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDDifferentThreeVectorTied size, bits<4> opcode, + RegisterOperand outtype, RegisterOperand intype1, + RegisterOperand intype2, string asm, + string outkind, string inkind1, string inkind2, + list pattern> + : I<(outs outtype:$dst), (ins outtype:$Rd, intype1:$Rn, intype2:$Rm), asm, + "{\t$Rd" # outkind # ", $Rn" # inkind1 # ", $Rm" # inkind2 # + "|" # outkind # "\t$Rd, $Rn, $Rm}", "$Rd = $dst", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + let Inst{31} = 0; + let Inst{30} = size{0}; + let Inst{29} = U; + let Inst{28-24} = 0b01110; + let Inst{23-22} = size{2-1}; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15-12} = opcode; + let Inst{11-10} = 0b00; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +// FIXME: TableGen doesn't know how to deal with expanded types that also +// change the element count (in this case, placing the results in +// the high elements of the result register rather than the low +// elements). Until that's fixed, we can't code-gen those. +multiclass SIMDNarrowThreeVectorBHS opc, string asm, + Intrinsic IntOp> { + def v8i16_v8i8 : BaseSIMDDifferentThreeVector; + def v8i16_v16i8 : BaseSIMDDifferentThreeVectorTied; + def v4i32_v4i16 : BaseSIMDDifferentThreeVector; + def v4i32_v8i16 : BaseSIMDDifferentThreeVectorTied; + def v2i64_v2i32 : BaseSIMDDifferentThreeVector; + def v2i64_v4i32 : BaseSIMDDifferentThreeVectorTied; + + + // Patterns for the '2' variants involve INSERT_SUBREG, which you can't put in + // a version attached to an instruction. 
+ def : Pat<(concat_vectors (v8i8 V64:$Rd), (IntOp (v8i16 V128:$Rn), + (v8i16 V128:$Rm))), + (!cast(NAME # "v8i16_v16i8") + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), + V128:$Rn, V128:$Rm)>; + def : Pat<(concat_vectors (v4i16 V64:$Rd), (IntOp (v4i32 V128:$Rn), + (v4i32 V128:$Rm))), + (!cast(NAME # "v4i32_v8i16") + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), + V128:$Rn, V128:$Rm)>; + def : Pat<(concat_vectors (v2i32 V64:$Rd), (IntOp (v2i64 V128:$Rn), + (v2i64 V128:$Rm))), + (!cast(NAME # "v2i64_v4i32") + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), + V128:$Rn, V128:$Rm)>; +} + +multiclass SIMDDifferentThreeVectorBD opc, string asm, + Intrinsic IntOp> { + def v8i8 : BaseSIMDDifferentThreeVector; + def v16i8 : BaseSIMDDifferentThreeVector; + let Predicates = [HasCrypto] in { + def v1i64 : BaseSIMDDifferentThreeVector; + def v2i64 : BaseSIMDDifferentThreeVector; + } + + def : Pat<(v8i16 (IntOp (v8i8 (extract_high_v16i8 V128:$Rn)), + (v8i8 (extract_high_v16i8 V128:$Rm)))), + (!cast(NAME#"v16i8") V128:$Rn, V128:$Rm)>; +} + +multiclass SIMDLongThreeVectorHS opc, string asm, + SDPatternOperator OpNode> { + def v4i16_v4i32 : BaseSIMDDifferentThreeVector; + def v8i16_v4i32 : BaseSIMDDifferentThreeVector; + def v2i32_v2i64 : BaseSIMDDifferentThreeVector; + def v4i32_v2i64 : BaseSIMDDifferentThreeVector; +} + +multiclass SIMDLongThreeVectorBHSabdl opc, string asm, + SDPatternOperator OpNode = null_frag> { + def v8i8_v8i16 : BaseSIMDDifferentThreeVector; + def v16i8_v8i16 : BaseSIMDDifferentThreeVector; + def v4i16_v4i32 : BaseSIMDDifferentThreeVector; + def v8i16_v4i32 : BaseSIMDDifferentThreeVector; + def v2i32_v2i64 : BaseSIMDDifferentThreeVector; + def v4i32_v2i64 : BaseSIMDDifferentThreeVector; +} + +multiclass SIMDLongThreeVectorTiedBHSabal opc, + string asm, + SDPatternOperator OpNode> { + def v8i8_v8i16 : BaseSIMDDifferentThreeVectorTied; + def v16i8_v8i16 : BaseSIMDDifferentThreeVectorTied; + def v4i16_v4i32 : BaseSIMDDifferentThreeVectorTied; + def v8i16_v4i32 : BaseSIMDDifferentThreeVectorTied; + def v2i32_v2i64 : BaseSIMDDifferentThreeVectorTied; + def v4i32_v2i64 : BaseSIMDDifferentThreeVectorTied; +} + +multiclass SIMDLongThreeVectorBHS opc, string asm, + SDPatternOperator OpNode = null_frag> { + def v8i8_v8i16 : BaseSIMDDifferentThreeVector; + def v16i8_v8i16 : BaseSIMDDifferentThreeVector; + def v4i16_v4i32 : BaseSIMDDifferentThreeVector; + def v8i16_v4i32 : BaseSIMDDifferentThreeVector; + def v2i32_v2i64 : BaseSIMDDifferentThreeVector; + def v4i32_v2i64 : BaseSIMDDifferentThreeVector; +} + +multiclass SIMDLongThreeVectorTiedBHS opc, + string asm, + SDPatternOperator OpNode> { + def v8i8_v8i16 : BaseSIMDDifferentThreeVectorTied; + def v16i8_v8i16 : BaseSIMDDifferentThreeVectorTied; + def v4i16_v4i32 : BaseSIMDDifferentThreeVectorTied; + def v8i16_v4i32 : BaseSIMDDifferentThreeVectorTied; + def v2i32_v2i64 : BaseSIMDDifferentThreeVectorTied; + def v4i32_v2i64 : BaseSIMDDifferentThreeVectorTied; +} + +multiclass SIMDLongThreeVectorSQDMLXTiedHS opc, string asm, + SDPatternOperator Accum> { + def v4i16_v4i32 : BaseSIMDDifferentThreeVectorTied; + def v8i16_v4i32 : BaseSIMDDifferentThreeVectorTied; + def v2i32_v2i64 : BaseSIMDDifferentThreeVectorTied; + def v4i32_v2i64 : BaseSIMDDifferentThreeVectorTied; +} + +multiclass SIMDWideThreeVectorBHS opc, string asm, + SDPatternOperator OpNode> { + def v8i8_v8i16 : BaseSIMDDifferentThreeVector; + def v16i8_v8i16 : BaseSIMDDifferentThreeVector; + def v4i16_v4i32 : BaseSIMDDifferentThreeVector; + def v8i16_v4i32 : 
BaseSIMDDifferentThreeVector; + def v2i32_v2i64 : BaseSIMDDifferentThreeVector; + def v4i32_v2i64 : BaseSIMDDifferentThreeVector; +} + +//---------------------------------------------------------------------------- +// AdvSIMD bitwise extract from vector +//---------------------------------------------------------------------------- + +class BaseSIMDBitwiseExtract + : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, i32imm:$imm), asm, + "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind # ", $imm" # + "|" # kind # "\t$Rd, $Rn, $Rm, $imm}", "", + [(set (vty regtype:$Rd), + (AArch64ext regtype:$Rn, regtype:$Rm, (i32 imm:$imm)))]>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + bits<4> imm; + let Inst{31} = 0; + let Inst{30} = size; + let Inst{29-21} = 0b101110000; + let Inst{20-16} = Rm; + let Inst{15} = 0; + let Inst{14-11} = imm; + let Inst{10} = 0; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + + +multiclass SIMDBitwiseExtract { + def v8i8 : BaseSIMDBitwiseExtract<0, V64, v8i8, asm, ".8b"> { + let imm{3} = 0; + } + def v16i8 : BaseSIMDBitwiseExtract<1, V128, v16i8, asm, ".16b">; +} + +//---------------------------------------------------------------------------- +// AdvSIMD zip vector +//---------------------------------------------------------------------------- + +class BaseSIMDZipVector size, bits<3> opc, RegisterOperand regtype, + string asm, string kind, SDNode OpNode, ValueType valty> + : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm, + "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind # + "|" # kind # "\t$Rd, $Rn, $Rm}", "", + [(set (valty regtype:$Rd), (OpNode regtype:$Rn, regtype:$Rm))]>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + let Inst{31} = 0; + let Inst{30} = size{0}; + let Inst{29-24} = 0b001110; + let Inst{23-22} = size{2-1}; + let Inst{21} = 0; + let Inst{20-16} = Rm; + let Inst{15} = 0; + let Inst{14-12} = opc; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass SIMDZipVectoropc, string asm, + SDNode OpNode> { + def v8i8 : BaseSIMDZipVector<0b000, opc, V64, + asm, ".8b", OpNode, v8i8>; + def v16i8 : BaseSIMDZipVector<0b001, opc, V128, + asm, ".16b", OpNode, v16i8>; + def v4i16 : BaseSIMDZipVector<0b010, opc, V64, + asm, ".4h", OpNode, v4i16>; + def v8i16 : BaseSIMDZipVector<0b011, opc, V128, + asm, ".8h", OpNode, v8i16>; + def v2i32 : BaseSIMDZipVector<0b100, opc, V64, + asm, ".2s", OpNode, v2i32>; + def v4i32 : BaseSIMDZipVector<0b101, opc, V128, + asm, ".4s", OpNode, v4i32>; + def v2i64 : BaseSIMDZipVector<0b111, opc, V128, + asm, ".2d", OpNode, v2i64>; + + def : Pat<(v2f32 (OpNode V64:$Rn, V64:$Rm)), + (!cast(NAME#"v2i32") V64:$Rn, V64:$Rm)>; + def : Pat<(v4f32 (OpNode V128:$Rn, V128:$Rm)), + (!cast(NAME#"v4i32") V128:$Rn, V128:$Rm)>; + def : Pat<(v2f64 (OpNode V128:$Rn, V128:$Rm)), + (!cast(NAME#"v2i64") V128:$Rn, V128:$Rm)>; +} + +//---------------------------------------------------------------------------- +// AdvSIMD three register scalar instructions +//---------------------------------------------------------------------------- + +let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in +class BaseSIMDThreeScalar size, bits<5> opcode, + RegisterClass regtype, string asm, + list pattern> + : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm, + "\t$Rd, $Rn, $Rm", "", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + let Inst{31-30} = 0b01; + let Inst{29} = U; + let Inst{28-24} = 0b11110; + let Inst{23-22} = size; + let Inst{21} = 1; + let 
Inst{20-16} = Rm; + let Inst{15-11} = opcode; + let Inst{10} = 1; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass SIMDThreeScalarD opc, string asm, + SDPatternOperator OpNode> { + def v1i64 : BaseSIMDThreeScalar; +} + +multiclass SIMDThreeScalarBHSD opc, string asm, + SDPatternOperator OpNode> { + def v1i64 : BaseSIMDThreeScalar; + def v1i32 : BaseSIMDThreeScalar; + def v1i16 : BaseSIMDThreeScalar; + def v1i8 : BaseSIMDThreeScalar; + + def : Pat<(i64 (OpNode (i64 FPR64:$Rn), (i64 FPR64:$Rm))), + (!cast(NAME#"v1i64") FPR64:$Rn, FPR64:$Rm)>; + def : Pat<(i32 (OpNode (i32 FPR32:$Rn), (i32 FPR32:$Rm))), + (!cast(NAME#"v1i32") FPR32:$Rn, FPR32:$Rm)>; +} + +multiclass SIMDThreeScalarHS opc, string asm, + SDPatternOperator OpNode> { + def v1i32 : BaseSIMDThreeScalar; + def v1i16 : BaseSIMDThreeScalar; +} + +multiclass SIMDThreeScalarSD opc, string asm, + SDPatternOperator OpNode = null_frag> { + let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { + def #NAME#64 : BaseSIMDThreeScalar; + def #NAME#32 : BaseSIMDThreeScalar; + } + + def : Pat<(v1f64 (OpNode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), + (!cast(NAME # "64") FPR64:$Rn, FPR64:$Rm)>; +} + +multiclass SIMDThreeScalarFPCmp opc, string asm, + SDPatternOperator OpNode = null_frag> { + let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { + def #NAME#64 : BaseSIMDThreeScalar; + def #NAME#32 : BaseSIMDThreeScalar; + } + + def : Pat<(v1i64 (OpNode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), + (!cast(NAME # "64") FPR64:$Rn, FPR64:$Rm)>; +} + +class BaseSIMDThreeScalarMixed size, bits<5> opcode, + dag oops, dag iops, string asm, string cstr, list pat> + : I, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + let Inst{31-30} = 0b01; + let Inst{29} = U; + let Inst{28-24} = 0b11110; + let Inst{23-22} = size; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15-11} = opcode; + let Inst{10} = 0; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +multiclass SIMDThreeScalarMixedHS opc, string asm, + SDPatternOperator OpNode = null_frag> { + def i16 : BaseSIMDThreeScalarMixed; + def i32 : BaseSIMDThreeScalarMixed; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +multiclass SIMDThreeScalarMixedTiedHS opc, string asm, + SDPatternOperator OpNode = null_frag> { + def i16 : BaseSIMDThreeScalarMixed; + def i32 : BaseSIMDThreeScalarMixed; +} + +//---------------------------------------------------------------------------- +// AdvSIMD two register scalar instructions +//---------------------------------------------------------------------------- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDTwoScalar size, bits<5> opcode, + RegisterClass regtype, RegisterClass regtype2, + string asm, list pat> + : I<(outs regtype:$Rd), (ins regtype2:$Rn), asm, + "\t$Rd, $Rn", "", pat>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31-30} = 0b01; + let Inst{29} = U; + let Inst{28-24} = 0b11110; + let Inst{23-22} = size; + let Inst{21-17} = 0b10000; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDTwoScalarTied size, bits<5> opcode, + RegisterClass regtype, RegisterClass regtype2, + string asm, list pat> + : I<(outs regtype:$dst), (ins regtype:$Rd, regtype2:$Rn), asm, + "\t$Rd, $Rn", "$Rd = $dst", pat>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31-30} = 0b01; + let Inst{29} = U; + let Inst{28-24} = 0b11110; + let 
Inst{23-22} = size; + let Inst{21-17} = 0b10000; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDCmpTwoScalar size, bits<5> opcode, + RegisterClass regtype, string asm, string zero> + : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, + "\t$Rd, $Rn, #" # zero, "", []>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31-30} = 0b01; + let Inst{29} = U; + let Inst{28-24} = 0b11110; + let Inst{23-22} = size; + let Inst{21-17} = 0b10000; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +class SIMDInexactCvtTwoScalar opcode, string asm> + : I<(outs FPR32:$Rd), (ins FPR64:$Rn), asm, "\t$Rd, $Rn", "", + [(set (f32 FPR32:$Rd), (int_aarch64_sisd_fcvtxn (f64 FPR64:$Rn)))]>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31-17} = 0b011111100110000; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass SIMDCmpTwoScalarD opc, string asm, + SDPatternOperator OpNode> { + def v1i64rz : BaseSIMDCmpTwoScalar; + + def : Pat<(v1i64 (OpNode FPR64:$Rn)), + (!cast(NAME # v1i64rz) FPR64:$Rn)>; +} + +multiclass SIMDCmpTwoScalarSD opc, string asm, + SDPatternOperator OpNode> { + def v1i64rz : BaseSIMDCmpTwoScalar; + def v1i32rz : BaseSIMDCmpTwoScalar; + + def : InstAlias(NAME # v1i64rz) FPR64:$Rd, FPR64:$Rn), 0>; + def : InstAlias(NAME # v1i32rz) FPR32:$Rd, FPR32:$Rn), 0>; + + def : Pat<(v1i64 (OpNode (v1f64 FPR64:$Rn))), + (!cast(NAME # v1i64rz) FPR64:$Rn)>; +} + +multiclass SIMDTwoScalarD opc, string asm, + SDPatternOperator OpNode = null_frag> { + def v1i64 : BaseSIMDTwoScalar; + + def : Pat<(i64 (OpNode (i64 FPR64:$Rn))), + (!cast(NAME # "v1i64") FPR64:$Rn)>; +} + +multiclass SIMDTwoScalarSD opc, string asm> { + def v1i64 : BaseSIMDTwoScalar; + def v1i32 : BaseSIMDTwoScalar; +} + +multiclass SIMDTwoScalarCVTSD opc, string asm, + SDPatternOperator OpNode> { + def v1i64 : BaseSIMDTwoScalar; + def v1i32 : BaseSIMDTwoScalar; +} + +multiclass SIMDTwoScalarBHSD opc, string asm, + SDPatternOperator OpNode = null_frag> { + let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { + def v1i64 : BaseSIMDTwoScalar; + def v1i32 : BaseSIMDTwoScalar; + def v1i16 : BaseSIMDTwoScalar; + def v1i8 : BaseSIMDTwoScalar; + } + + def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rn))), + (!cast(NAME # v1i64) FPR64:$Rn)>; +} + +multiclass SIMDTwoScalarBHSDTied opc, string asm, + Intrinsic OpNode> { + let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { + def v1i64 : BaseSIMDTwoScalarTied; + def v1i32 : BaseSIMDTwoScalarTied; + def v1i16 : BaseSIMDTwoScalarTied; + def v1i8 : BaseSIMDTwoScalarTied; + } + + def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn))), + (!cast(NAME # v1i64) FPR64:$Rd, FPR64:$Rn)>; +} + + + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +multiclass SIMDTwoScalarMixedBHS opc, string asm, + SDPatternOperator OpNode = null_frag> { + def v1i32 : BaseSIMDTwoScalar; + def v1i16 : BaseSIMDTwoScalar; + def v1i8 : BaseSIMDTwoScalar; +} + +//---------------------------------------------------------------------------- +// AdvSIMD scalar pairwise instructions +//---------------------------------------------------------------------------- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDPairwiseScalar size, bits<5> opcode, + RegisterOperand regtype, RegisterOperand vectype, + string asm, string kind> + : I<(outs regtype:$Rd), (ins 
vectype:$Rn), asm, + "{\t$Rd, $Rn" # kind # "|" # kind # "\t$Rd, $Rn}", "", []>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31-30} = 0b01; + let Inst{29} = U; + let Inst{28-24} = 0b11110; + let Inst{23-22} = size; + let Inst{21-17} = 0b11000; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass SIMDPairwiseScalarD opc, string asm> { + def v2i64p : BaseSIMDPairwiseScalar; +} + +multiclass SIMDPairwiseScalarSD opc, string asm> { + def v2i32p : BaseSIMDPairwiseScalar; + def v2i64p : BaseSIMDPairwiseScalar; +} + +//---------------------------------------------------------------------------- +// AdvSIMD across lanes instructions +//---------------------------------------------------------------------------- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDAcrossLanes size, bits<5> opcode, + RegisterClass regtype, RegisterOperand vectype, + string asm, string kind, list pattern> + : I<(outs regtype:$Rd), (ins vectype:$Rn), asm, + "{\t$Rd, $Rn" # kind # "|" # kind # "\t$Rd, $Rn}", "", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; + let Inst{28-24} = 0b01110; + let Inst{23-22} = size; + let Inst{21-17} = 0b11000; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass SIMDAcrossLanesBHS opcode, + string asm> { + def v8i8v : BaseSIMDAcrossLanes<0, U, 0b00, opcode, FPR8, V64, + asm, ".8b", []>; + def v16i8v : BaseSIMDAcrossLanes<1, U, 0b00, opcode, FPR8, V128, + asm, ".16b", []>; + def v4i16v : BaseSIMDAcrossLanes<0, U, 0b01, opcode, FPR16, V64, + asm, ".4h", []>; + def v8i16v : BaseSIMDAcrossLanes<1, U, 0b01, opcode, FPR16, V128, + asm, ".8h", []>; + def v4i32v : BaseSIMDAcrossLanes<1, U, 0b10, opcode, FPR32, V128, + asm, ".4s", []>; +} + +multiclass SIMDAcrossLanesHSD opcode, string asm> { + def v8i8v : BaseSIMDAcrossLanes<0, U, 0b00, opcode, FPR16, V64, + asm, ".8b", []>; + def v16i8v : BaseSIMDAcrossLanes<1, U, 0b00, opcode, FPR16, V128, + asm, ".16b", []>; + def v4i16v : BaseSIMDAcrossLanes<0, U, 0b01, opcode, FPR32, V64, + asm, ".4h", []>; + def v8i16v : BaseSIMDAcrossLanes<1, U, 0b01, opcode, FPR32, V128, + asm, ".8h", []>; + def v4i32v : BaseSIMDAcrossLanes<1, U, 0b10, opcode, FPR64, V128, + asm, ".4s", []>; +} + +multiclass SIMDAcrossLanesS opcode, bit sz1, string asm, + Intrinsic intOp> { + def v4i32v : BaseSIMDAcrossLanes<1, 1, {sz1, 0}, opcode, FPR32, V128, + asm, ".4s", + [(set FPR32:$Rd, (intOp (v4f32 V128:$Rn)))]>; +} + +//---------------------------------------------------------------------------- +// AdvSIMD INS/DUP instructions +//---------------------------------------------------------------------------- + +// FIXME: There has got to be a better way to factor these. ugh. 
+ +class BaseSIMDInsDup pattern> + : I, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = op; + let Inst{28-21} = 0b01110000; + let Inst{15} = 0; + let Inst{10} = 1; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +class SIMDDupFromMain imm5, string size, ValueType vectype, + RegisterOperand vecreg, RegisterClass regtype> + : BaseSIMDInsDup { + let Inst{20-16} = imm5; + let Inst{14-11} = 0b0001; +} + +class SIMDDupFromElement + : BaseSIMDInsDup { + let Inst{14-11} = 0b0000; +} + +class SIMDDup64FromElement + : SIMDDupFromElement<1, ".2d", ".d", v2i64, v2i64, V128, + VectorIndexD, i64, AArch64duplane64> { + bits<1> idx; + let Inst{20} = idx; + let Inst{19-16} = 0b1000; +} + +class SIMDDup32FromElement + : SIMDDupFromElement { + bits<2> idx; + let Inst{20-19} = idx; + let Inst{18-16} = 0b100; +} + +class SIMDDup16FromElement + : SIMDDupFromElement { + bits<3> idx; + let Inst{20-18} = idx; + let Inst{17-16} = 0b10; +} + +class SIMDDup8FromElement + : SIMDDupFromElement { + bits<4> idx; + let Inst{20-17} = idx; + let Inst{16} = 1; +} + +class BaseSIMDMov imm4, RegisterClass regtype, + Operand idxtype, string asm, list pattern> + : BaseSIMDInsDup { + let Inst{14-11} = imm4; +} + +class SIMDSMov + : BaseSIMDMov; +class SIMDUMov + : BaseSIMDMov; + +class SIMDMovAlias + : InstAlias; + +multiclass SMov { + def vi8to32 : SIMDSMov<0, ".b", GPR32, VectorIndexB> { + bits<4> idx; + let Inst{20-17} = idx; + let Inst{16} = 1; + } + def vi8to64 : SIMDSMov<1, ".b", GPR64, VectorIndexB> { + bits<4> idx; + let Inst{20-17} = idx; + let Inst{16} = 1; + } + def vi16to32 : SIMDSMov<0, ".h", GPR32, VectorIndexH> { + bits<3> idx; + let Inst{20-18} = idx; + let Inst{17-16} = 0b10; + } + def vi16to64 : SIMDSMov<1, ".h", GPR64, VectorIndexH> { + bits<3> idx; + let Inst{20-18} = idx; + let Inst{17-16} = 0b10; + } + def vi32to64 : SIMDSMov<1, ".s", GPR64, VectorIndexS> { + bits<2> idx; + let Inst{20-19} = idx; + let Inst{18-16} = 0b100; + } +} + +multiclass UMov { + def vi8 : SIMDUMov<0, ".b", v16i8, GPR32, VectorIndexB> { + bits<4> idx; + let Inst{20-17} = idx; + let Inst{16} = 1; + } + def vi16 : SIMDUMov<0, ".h", v8i16, GPR32, VectorIndexH> { + bits<3> idx; + let Inst{20-18} = idx; + let Inst{17-16} = 0b10; + } + def vi32 : SIMDUMov<0, ".s", v4i32, GPR32, VectorIndexS> { + bits<2> idx; + let Inst{20-19} = idx; + let Inst{18-16} = 0b100; + } + def vi64 : SIMDUMov<1, ".d", v2i64, GPR64, VectorIndexD> { + bits<1> idx; + let Inst{20} = idx; + let Inst{19-16} = 0b1000; + } + def : SIMDMovAlias<"mov", ".s", + !cast(NAME#"vi32"), + GPR32, VectorIndexS>; + def : SIMDMovAlias<"mov", ".d", + !cast(NAME#"vi64"), + GPR64, VectorIndexD>; +} + +class SIMDInsFromMain + : BaseSIMDInsDup<1, 0, (outs V128:$dst), + (ins V128:$Rd, idxtype:$idx, regtype:$Rn), "ins", + "{\t$Rd" # size # "$idx, $Rn" # + "|" # size # "\t$Rd$idx, $Rn}", + "$Rd = $dst", + [(set V128:$dst, + (vector_insert (vectype V128:$Rd), regtype:$Rn, idxtype:$idx))]> { + let Inst{14-11} = 0b0011; +} + +class SIMDInsFromElement + : BaseSIMDInsDup<1, 1, (outs V128:$dst), + (ins V128:$Rd, idxtype:$idx, V128:$Rn, idxtype:$idx2), "ins", + "{\t$Rd" # size # "$idx, $Rn" # size # "$idx2" # + "|" # size # "\t$Rd$idx, $Rn$idx2}", + "$Rd = $dst", + [(set V128:$dst, + (vector_insert + (vectype V128:$Rd), + (elttype (vector_extract (vectype V128:$Rn), idxtype:$idx2)), + idxtype:$idx))]>; + +class SIMDInsMainMovAlias + : InstAlias<"mov" # "{\t$dst" # size # "$idx, $src" # + "|" # size #"\t$dst$idx, $src}", + (inst V128:$dst, 
idxtype:$idx, regtype:$src)>; +class SIMDInsElementMovAlias + : InstAlias<"mov" # "{\t$dst" # size # "$idx, $src" # size # "$idx2" # + # "|" # size #" $dst$idx, $src$idx2}", + (inst V128:$dst, idxtype:$idx, V128:$src, idxtype:$idx2)>; + + +multiclass SIMDIns { + def vi8gpr : SIMDInsFromMain<".b", v16i8, GPR32, VectorIndexB> { + bits<4> idx; + let Inst{20-17} = idx; + let Inst{16} = 1; + } + def vi16gpr : SIMDInsFromMain<".h", v8i16, GPR32, VectorIndexH> { + bits<3> idx; + let Inst{20-18} = idx; + let Inst{17-16} = 0b10; + } + def vi32gpr : SIMDInsFromMain<".s", v4i32, GPR32, VectorIndexS> { + bits<2> idx; + let Inst{20-19} = idx; + let Inst{18-16} = 0b100; + } + def vi64gpr : SIMDInsFromMain<".d", v2i64, GPR64, VectorIndexD> { + bits<1> idx; + let Inst{20} = idx; + let Inst{19-16} = 0b1000; + } + + def vi8lane : SIMDInsFromElement<".b", v16i8, i32, VectorIndexB> { + bits<4> idx; + bits<4> idx2; + let Inst{20-17} = idx; + let Inst{16} = 1; + let Inst{14-11} = idx2; + } + def vi16lane : SIMDInsFromElement<".h", v8i16, i32, VectorIndexH> { + bits<3> idx; + bits<3> idx2; + let Inst{20-18} = idx; + let Inst{17-16} = 0b10; + let Inst{14-12} = idx2; + let Inst{11} = 0; + } + def vi32lane : SIMDInsFromElement<".s", v4i32, i32, VectorIndexS> { + bits<2> idx; + bits<2> idx2; + let Inst{20-19} = idx; + let Inst{18-16} = 0b100; + let Inst{14-13} = idx2; + let Inst{12-11} = 0; + } + def vi64lane : SIMDInsFromElement<".d", v2i64, i64, VectorIndexD> { + bits<1> idx; + bits<1> idx2; + let Inst{20} = idx; + let Inst{19-16} = 0b1000; + let Inst{14} = idx2; + let Inst{13-11} = 0; + } + + // For all forms of the INS instruction, the "mov" mnemonic is the + // preferred alias. Why they didn't just call the instruction "mov" in + // the first place is a very good question indeed... 
+ def : SIMDInsMainMovAlias<".b", !cast(NAME#"vi8gpr"), + GPR32, VectorIndexB>; + def : SIMDInsMainMovAlias<".h", !cast(NAME#"vi16gpr"), + GPR32, VectorIndexH>; + def : SIMDInsMainMovAlias<".s", !cast(NAME#"vi32gpr"), + GPR32, VectorIndexS>; + def : SIMDInsMainMovAlias<".d", !cast(NAME#"vi64gpr"), + GPR64, VectorIndexD>; + + def : SIMDInsElementMovAlias<".b", !cast(NAME#"vi8lane"), + VectorIndexB>; + def : SIMDInsElementMovAlias<".h", !cast(NAME#"vi16lane"), + VectorIndexH>; + def : SIMDInsElementMovAlias<".s", !cast(NAME#"vi32lane"), + VectorIndexS>; + def : SIMDInsElementMovAlias<".d", !cast(NAME#"vi64lane"), + VectorIndexD>; +} + +//---------------------------------------------------------------------------- +// AdvSIMD TBL/TBX +//---------------------------------------------------------------------------- + +let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in +class BaseSIMDTableLookup len, bit op, RegisterOperand vectype, + RegisterOperand listtype, string asm, string kind> + : I<(outs vectype:$Vd), (ins listtype:$Vn, vectype:$Vm), asm, + "\t$Vd" # kind # ", $Vn, $Vm" # kind, "", []>, + Sched<[WriteV]> { + bits<5> Vd; + bits<5> Vn; + bits<5> Vm; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29-21} = 0b001110000; + let Inst{20-16} = Vm; + let Inst{15} = 0; + let Inst{14-13} = len; + let Inst{12} = op; + let Inst{11-10} = 0b00; + let Inst{9-5} = Vn; + let Inst{4-0} = Vd; +} + +let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in +class BaseSIMDTableLookupTied len, bit op, RegisterOperand vectype, + RegisterOperand listtype, string asm, string kind> + : I<(outs vectype:$dst), (ins vectype:$Vd, listtype:$Vn, vectype:$Vm), asm, + "\t$Vd" # kind # ", $Vn, $Vm" # kind, "$Vd = $dst", []>, + Sched<[WriteV]> { + bits<5> Vd; + bits<5> Vn; + bits<5> Vm; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29-21} = 0b001110000; + let Inst{20-16} = Vm; + let Inst{15} = 0; + let Inst{14-13} = len; + let Inst{12} = op; + let Inst{11-10} = 0b00; + let Inst{9-5} = Vn; + let Inst{4-0} = Vd; +} + +class SIMDTableLookupAlias + : InstAlias; + +multiclass SIMDTableLookup { + def v8i8One : BaseSIMDTableLookup<0, 0b00, op, V64, VecListOne16b, + asm, ".8b">; + def v8i8Two : BaseSIMDTableLookup<0, 0b01, op, V64, VecListTwo16b, + asm, ".8b">; + def v8i8Three : BaseSIMDTableLookup<0, 0b10, op, V64, VecListThree16b, + asm, ".8b">; + def v8i8Four : BaseSIMDTableLookup<0, 0b11, op, V64, VecListFour16b, + asm, ".8b">; + def v16i8One : BaseSIMDTableLookup<1, 0b00, op, V128, VecListOne16b, + asm, ".16b">; + def v16i8Two : BaseSIMDTableLookup<1, 0b01, op, V128, VecListTwo16b, + asm, ".16b">; + def v16i8Three: BaseSIMDTableLookup<1, 0b10, op, V128, VecListThree16b, + asm, ".16b">; + def v16i8Four : BaseSIMDTableLookup<1, 0b11, op, V128, VecListFour16b, + asm, ".16b">; + + def : SIMDTableLookupAlias(NAME#"v8i8One"), + V64, VecListOne128>; + def : SIMDTableLookupAlias(NAME#"v8i8Two"), + V64, VecListTwo128>; + def : SIMDTableLookupAlias(NAME#"v8i8Three"), + V64, VecListThree128>; + def : SIMDTableLookupAlias(NAME#"v8i8Four"), + V64, VecListFour128>; + def : SIMDTableLookupAlias(NAME#"v16i8One"), + V128, VecListOne128>; + def : SIMDTableLookupAlias(NAME#"v16i8Two"), + V128, VecListTwo128>; + def : SIMDTableLookupAlias(NAME#"v16i8Three"), + V128, VecListThree128>; + def : SIMDTableLookupAlias(NAME#"v16i8Four"), + V128, VecListFour128>; +} + +multiclass SIMDTableLookupTied { + def v8i8One : BaseSIMDTableLookupTied<0, 0b00, op, V64, VecListOne16b, + asm, ".8b">; + def v8i8Two : BaseSIMDTableLookupTied<0, 0b01, op, V64, 
VecListTwo16b, + asm, ".8b">; + def v8i8Three : BaseSIMDTableLookupTied<0, 0b10, op, V64, VecListThree16b, + asm, ".8b">; + def v8i8Four : BaseSIMDTableLookupTied<0, 0b11, op, V64, VecListFour16b, + asm, ".8b">; + def v16i8One : BaseSIMDTableLookupTied<1, 0b00, op, V128, VecListOne16b, + asm, ".16b">; + def v16i8Two : BaseSIMDTableLookupTied<1, 0b01, op, V128, VecListTwo16b, + asm, ".16b">; + def v16i8Three: BaseSIMDTableLookupTied<1, 0b10, op, V128, VecListThree16b, + asm, ".16b">; + def v16i8Four : BaseSIMDTableLookupTied<1, 0b11, op, V128, VecListFour16b, + asm, ".16b">; + + def : SIMDTableLookupAlias(NAME#"v8i8One"), + V64, VecListOne128>; + def : SIMDTableLookupAlias(NAME#"v8i8Two"), + V64, VecListTwo128>; + def : SIMDTableLookupAlias(NAME#"v8i8Three"), + V64, VecListThree128>; + def : SIMDTableLookupAlias(NAME#"v8i8Four"), + V64, VecListFour128>; + def : SIMDTableLookupAlias(NAME#"v16i8One"), + V128, VecListOne128>; + def : SIMDTableLookupAlias(NAME#"v16i8Two"), + V128, VecListTwo128>; + def : SIMDTableLookupAlias(NAME#"v16i8Three"), + V128, VecListThree128>; + def : SIMDTableLookupAlias(NAME#"v16i8Four"), + V128, VecListFour128>; +} + + +//---------------------------------------------------------------------------- +// AdvSIMD scalar CPY +//---------------------------------------------------------------------------- +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDScalarCPY + : I<(outs regtype:$dst), (ins vectype:$src, idxtype:$idx), "mov", + "{\t$dst, $src" # kind # "$idx" # + "|\t$dst, $src$idx}", "", []>, + Sched<[WriteV]> { + bits<5> dst; + bits<5> src; + let Inst{31-21} = 0b01011110000; + let Inst{15-10} = 0b000001; + let Inst{9-5} = src; + let Inst{4-0} = dst; +} + +class SIMDScalarCPYAlias + : InstAlias; + + +multiclass SIMDScalarCPY { + def i8 : BaseSIMDScalarCPY { + bits<4> idx; + let Inst{20-17} = idx; + let Inst{16} = 1; + } + def i16 : BaseSIMDScalarCPY { + bits<3> idx; + let Inst{20-18} = idx; + let Inst{17-16} = 0b10; + } + def i32 : BaseSIMDScalarCPY { + bits<2> idx; + let Inst{20-19} = idx; + let Inst{18-16} = 0b100; + } + def i64 : BaseSIMDScalarCPY { + bits<1> idx; + let Inst{20} = idx; + let Inst{19-16} = 0b1000; + } + + def : Pat<(v1i64 (scalar_to_vector (i64 (vector_extract (v2i64 V128:$src), + VectorIndexD:$idx)))), + (!cast(NAME # i64) V128:$src, VectorIndexD:$idx)>; + + // 'DUP' mnemonic aliases. 
+ def : SIMDScalarCPYAlias<"dup", ".b", + !cast(NAME#"i8"), + FPR8, V128, VectorIndexB>; + def : SIMDScalarCPYAlias<"dup", ".h", + !cast(NAME#"i16"), + FPR16, V128, VectorIndexH>; + def : SIMDScalarCPYAlias<"dup", ".s", + !cast(NAME#"i32"), + FPR32, V128, VectorIndexS>; + def : SIMDScalarCPYAlias<"dup", ".d", + !cast(NAME#"i64"), + FPR64, V128, VectorIndexD>; +} + +//---------------------------------------------------------------------------- +// AdvSIMD modified immediate instructions +//---------------------------------------------------------------------------- + +class BaseSIMDModifiedImm pattern> + : I, + Sched<[WriteV]> { + bits<5> Rd; + bits<8> imm8; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = op; + let Inst{28-19} = 0b0111100000; + let Inst{18-16} = imm8{7-5}; + let Inst{11-10} = 0b01; + let Inst{9-5} = imm8{4-0}; + let Inst{4-0} = Rd; +} + +class BaseSIMDModifiedImmVector pattern> + : BaseSIMDModifiedImm { + let DecoderMethod = "DecodeModImmInstruction"; +} + +class BaseSIMDModifiedImmVectorTied pattern> + : BaseSIMDModifiedImm { + let DecoderMethod = "DecodeModImmTiedInstruction"; +} + +class BaseSIMDModifiedImmVectorShift b15_b12, + RegisterOperand vectype, string asm, + string kind, list pattern> + : BaseSIMDModifiedImmVector { + bits<2> shift; + let Inst{15} = b15_b12{1}; + let Inst{14-13} = shift; + let Inst{12} = b15_b12{0}; +} + +class BaseSIMDModifiedImmVectorShiftTied b15_b12, + RegisterOperand vectype, string asm, + string kind, list pattern> + : BaseSIMDModifiedImmVectorTied { + bits<2> shift; + let Inst{15} = b15_b12{1}; + let Inst{14-13} = shift; + let Inst{12} = b15_b12{0}; +} + + +class BaseSIMDModifiedImmVectorShiftHalf b15_b12, + RegisterOperand vectype, string asm, + string kind, list pattern> + : BaseSIMDModifiedImmVector { + bits<2> shift; + let Inst{15} = b15_b12{1}; + let Inst{14} = 0; + let Inst{13} = shift{0}; + let Inst{12} = b15_b12{0}; +} + +class BaseSIMDModifiedImmVectorShiftHalfTied b15_b12, + RegisterOperand vectype, string asm, + string kind, list pattern> + : BaseSIMDModifiedImmVectorTied { + bits<2> shift; + let Inst{15} = b15_b12{1}; + let Inst{14} = 0; + let Inst{13} = shift{0}; + let Inst{12} = b15_b12{0}; +} + +multiclass SIMDModifiedImmVectorShift hw_cmode, bits<2> w_cmode, + string asm> { + def v4i16 : BaseSIMDModifiedImmVectorShiftHalf<0, op, hw_cmode, V64, + asm, ".4h", []>; + def v8i16 : BaseSIMDModifiedImmVectorShiftHalf<1, op, hw_cmode, V128, + asm, ".8h", []>; + + def v2i32 : BaseSIMDModifiedImmVectorShift<0, op, w_cmode, V64, + asm, ".2s", []>; + def v4i32 : BaseSIMDModifiedImmVectorShift<1, op, w_cmode, V128, + asm, ".4s", []>; +} + +multiclass SIMDModifiedImmVectorShiftTied hw_cmode, + bits<2> w_cmode, string asm, + SDNode OpNode> { + def v4i16 : BaseSIMDModifiedImmVectorShiftHalfTied<0, op, hw_cmode, V64, + asm, ".4h", + [(set (v4i16 V64:$dst), (OpNode V64:$Rd, + imm0_255:$imm8, + (i32 imm:$shift)))]>; + def v8i16 : BaseSIMDModifiedImmVectorShiftHalfTied<1, op, hw_cmode, V128, + asm, ".8h", + [(set (v8i16 V128:$dst), (OpNode V128:$Rd, + imm0_255:$imm8, + (i32 imm:$shift)))]>; + + def v2i32 : BaseSIMDModifiedImmVectorShiftTied<0, op, w_cmode, V64, + asm, ".2s", + [(set (v2i32 V64:$dst), (OpNode V64:$Rd, + imm0_255:$imm8, + (i32 imm:$shift)))]>; + def v4i32 : BaseSIMDModifiedImmVectorShiftTied<1, op, w_cmode, V128, + asm, ".4s", + [(set (v4i32 V128:$dst), (OpNode V128:$Rd, + imm0_255:$imm8, + (i32 imm:$shift)))]>; +} + +class SIMDModifiedImmMoveMSL cmode, + RegisterOperand vectype, string asm, + string kind, list 
pattern> + : BaseSIMDModifiedImmVector { + bits<1> shift; + let Inst{15-13} = cmode{3-1}; + let Inst{12} = shift; +} + +class SIMDModifiedImmVectorNoShift cmode, + RegisterOperand vectype, + Operand imm_type, string asm, + string kind, list pattern> + : BaseSIMDModifiedImmVector { + let Inst{15-12} = cmode; +} + +class SIMDModifiedImmScalarNoShift cmode, string asm, + list pattern> + : BaseSIMDModifiedImm { + let Inst{15-12} = cmode; + let DecoderMethod = "DecodeModImmInstruction"; +} + +//---------------------------------------------------------------------------- +// AdvSIMD indexed element +//---------------------------------------------------------------------------- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDIndexed size, bits<4> opc, + RegisterOperand dst_reg, RegisterOperand lhs_reg, + RegisterOperand rhs_reg, Operand vec_idx, string asm, + string apple_kind, string dst_kind, string lhs_kind, + string rhs_kind, list pattern> + : I<(outs dst_reg:$Rd), (ins lhs_reg:$Rn, rhs_reg:$Rm, vec_idx:$idx), + asm, + "{\t$Rd" # dst_kind # ", $Rn" # lhs_kind # ", $Rm" # rhs_kind # "$idx" # + "|" # apple_kind # "\t$Rd, $Rn, $Rm$idx}", "", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; + let Inst{28} = Scalar; + let Inst{27-24} = 0b1111; + let Inst{23-22} = size; + // Bit 21 must be set by the derived class. + let Inst{20-16} = Rm; + let Inst{15-12} = opc; + // Bit 11 must be set by the derived class. + let Inst{10} = 0; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDIndexedTied size, bits<4> opc, + RegisterOperand dst_reg, RegisterOperand lhs_reg, + RegisterOperand rhs_reg, Operand vec_idx, string asm, + string apple_kind, string dst_kind, string lhs_kind, + string rhs_kind, list pattern> + : I<(outs dst_reg:$dst), + (ins dst_reg:$Rd, lhs_reg:$Rn, rhs_reg:$Rm, vec_idx:$idx), asm, + "{\t$Rd" # dst_kind # ", $Rn" # lhs_kind # ", $Rm" # rhs_kind # "$idx" # + "|" # apple_kind # "\t$Rd, $Rn, $Rm$idx}", "$Rd = $dst", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; + let Inst{28} = Scalar; + let Inst{27-24} = 0b1111; + let Inst{23-22} = size; + // Bit 21 must be set by the derived class. + let Inst{20-16} = Rm; + let Inst{15-12} = opc; + // Bit 11 must be set by the derived class. 
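  // (Illustrative note, not from the patch: the derived defs below route the
  //  lane index into those bits, e.g. a ".s" index uses idx{0} for bit 21 and
  //  idx{1} for bit 11.)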
+ let Inst{10} = 0; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass SIMDFPIndexedSD opc, string asm, + SDPatternOperator OpNode> { + def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc, + V64, V64, + V128, VectorIndexS, + asm, ".2s", ".2s", ".2s", ".s", + [(set (v2f32 V64:$Rd), + (OpNode (v2f32 V64:$Rn), + (v2f32 (AArch64duplane32 (v4f32 V128:$Rm), VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc, + V128, V128, + V128, VectorIndexS, + asm, ".4s", ".4s", ".4s", ".s", + [(set (v4f32 V128:$Rd), + (OpNode (v4f32 V128:$Rn), + (v4f32 (AArch64duplane32 (v4f32 V128:$Rm), VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v2i64_indexed : BaseSIMDIndexed<1, U, 0, 0b11, opc, + V128, V128, + V128, VectorIndexD, + asm, ".2d", ".2d", ".2d", ".d", + [(set (v2f64 V128:$Rd), + (OpNode (v2f64 V128:$Rn), + (v2f64 (AArch64duplane64 (v2f64 V128:$Rm), VectorIndexD:$idx))))]> { + bits<1> idx; + let Inst{11} = idx{0}; + let Inst{21} = 0; + } + + def v1i32_indexed : BaseSIMDIndexed<1, U, 1, 0b10, opc, + FPR32Op, FPR32Op, V128, VectorIndexS, + asm, ".s", "", "", ".s", + [(set (f32 FPR32Op:$Rd), + (OpNode (f32 FPR32Op:$Rn), + (f32 (vector_extract (v4f32 V128:$Rm), + VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v1i64_indexed : BaseSIMDIndexed<1, U, 1, 0b11, opc, + FPR64Op, FPR64Op, V128, VectorIndexD, + asm, ".d", "", "", ".d", + [(set (f64 FPR64Op:$Rd), + (OpNode (f64 FPR64Op:$Rn), + (f64 (vector_extract (v2f64 V128:$Rm), + VectorIndexD:$idx))))]> { + bits<1> idx; + let Inst{11} = idx{0}; + let Inst{21} = 0; + } +} + +multiclass SIMDFPIndexedSDTiedPatterns { + // 2 variants for the .2s version: DUPLANE from 128-bit and DUP scalar. + def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), + (AArch64duplane32 (v4f32 V128:$Rm), + VectorIndexS:$idx))), + (!cast(INST # v2i32_indexed) + V64:$Rd, V64:$Rn, V128:$Rm, VectorIndexS:$idx)>; + def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), + (AArch64dup (f32 FPR32Op:$Rm)))), + (!cast(INST # "v2i32_indexed") V64:$Rd, V64:$Rn, + (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>; + + + // 2 variants for the .4s version: DUPLANE from 128-bit and DUP scalar. + def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), + (AArch64duplane32 (v4f32 V128:$Rm), + VectorIndexS:$idx))), + (!cast(INST # "v4i32_indexed") + V128:$Rd, V128:$Rn, V128:$Rm, VectorIndexS:$idx)>; + def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), + (AArch64dup (f32 FPR32Op:$Rm)))), + (!cast(INST # "v4i32_indexed") V128:$Rd, V128:$Rn, + (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>; + + // 2 variants for the .2d version: DUPLANE from 128-bit and DUP scalar. 
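  // E.g. (illustrative, not from the patch) with fma as OpNode the DUPLANE
  // form covers "fmla v0.2d, v1.2d, v2.d[1]", while the DUP-scalar form folds
  // a broadcast FPR64 into lane 0 of the same indexed instruction.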
+ def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), + (AArch64duplane64 (v2f64 V128:$Rm), + VectorIndexD:$idx))), + (!cast(INST # "v2i64_indexed") + V128:$Rd, V128:$Rn, V128:$Rm, VectorIndexS:$idx)>; + def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), + (AArch64dup (f64 FPR64Op:$Rm)))), + (!cast(INST # "v2i64_indexed") V128:$Rd, V128:$Rn, + (SUBREG_TO_REG (i32 0), FPR64Op:$Rm, dsub), (i64 0))>; + + // 2 variants for 32-bit scalar version: extract from .2s or from .4s + def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn), + (vector_extract (v4f32 V128:$Rm), VectorIndexS:$idx))), + (!cast(INST # "v1i32_indexed") FPR32:$Rd, FPR32:$Rn, + V128:$Rm, VectorIndexS:$idx)>; + def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn), + (vector_extract (v2f32 V64:$Rm), VectorIndexS:$idx))), + (!cast(INST # "v1i32_indexed") FPR32:$Rd, FPR32:$Rn, + (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), VectorIndexS:$idx)>; + + // 1 variant for 64-bit scalar version: extract from .1d or from .2d + def : Pat<(f64 (OpNode (f64 FPR64:$Rd), (f64 FPR64:$Rn), + (vector_extract (v2f64 V128:$Rm), VectorIndexD:$idx))), + (!cast(INST # "v1i64_indexed") FPR64:$Rd, FPR64:$Rn, + V128:$Rm, VectorIndexD:$idx)>; +} + +multiclass SIMDFPIndexedSDTied opc, string asm> { + def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc, V64, V64, + V128, VectorIndexS, + asm, ".2s", ".2s", ".2s", ".s", []> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc, + V128, V128, + V128, VectorIndexS, + asm, ".4s", ".4s", ".4s", ".s", []> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v2i64_indexed : BaseSIMDIndexedTied<1, U, 0, 0b11, opc, + V128, V128, + V128, VectorIndexD, + asm, ".2d", ".2d", ".2d", ".d", []> { + bits<1> idx; + let Inst{11} = idx{0}; + let Inst{21} = 0; + } + + + def v1i32_indexed : BaseSIMDIndexedTied<1, U, 1, 0b10, opc, + FPR32Op, FPR32Op, V128, VectorIndexS, + asm, ".s", "", "", ".s", []> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v1i64_indexed : BaseSIMDIndexedTied<1, U, 1, 0b11, opc, + FPR64Op, FPR64Op, V128, VectorIndexD, + asm, ".d", "", "", ".d", []> { + bits<1> idx; + let Inst{11} = idx{0}; + let Inst{21} = 0; + } +} + +multiclass SIMDIndexedHS opc, string asm, + SDPatternOperator OpNode> { + def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc, V64, V64, + V128_lo, VectorIndexH, + asm, ".4h", ".4h", ".4h", ".h", + [(set (v4i16 V64:$Rd), + (OpNode (v4i16 V64:$Rn), + (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc, + V128, V128, + V128_lo, VectorIndexH, + asm, ".8h", ".8h", ".8h", ".h", + [(set (v8i16 V128:$Rd), + (OpNode (v8i16 V128:$Rn), + (v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc, + V64, V64, + V128, VectorIndexS, + asm, ".2s", ".2s", ".2s", ".s", + [(set (v2i32 V64:$Rd), + (OpNode (v2i32 V64:$Rn), + (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc, + V128, V128, + V128, VectorIndexS, + asm, ".4s", ".4s", ".4s", ".s", + [(set (v4i32 V128:$Rd), + 
(OpNode (v4i32 V128:$Rn), + (v4i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v1i16_indexed : BaseSIMDIndexed<1, U, 1, 0b01, opc, + FPR16Op, FPR16Op, V128_lo, VectorIndexH, + asm, ".h", "", "", ".h", []> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v1i32_indexed : BaseSIMDIndexed<1, U, 1, 0b10, opc, + FPR32Op, FPR32Op, V128, VectorIndexS, + asm, ".s", "", "", ".s", + [(set (i32 FPR32Op:$Rd), + (OpNode FPR32Op:$Rn, + (i32 (vector_extract (v4i32 V128:$Rm), + VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } +} + +multiclass SIMDVectorIndexedHS opc, string asm, + SDPatternOperator OpNode> { + def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc, + V64, V64, + V128_lo, VectorIndexH, + asm, ".4h", ".4h", ".4h", ".h", + [(set (v4i16 V64:$Rd), + (OpNode (v4i16 V64:$Rn), + (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc, + V128, V128, + V128_lo, VectorIndexH, + asm, ".8h", ".8h", ".8h", ".h", + [(set (v8i16 V128:$Rd), + (OpNode (v8i16 V128:$Rn), + (v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc, + V64, V64, + V128, VectorIndexS, + asm, ".2s", ".2s", ".2s", ".s", + [(set (v2i32 V64:$Rd), + (OpNode (v2i32 V64:$Rn), + (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc, + V128, V128, + V128, VectorIndexS, + asm, ".4s", ".4s", ".4s", ".s", + [(set (v4i32 V128:$Rd), + (OpNode (v4i32 V128:$Rn), + (v4i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } +} + +multiclass SIMDVectorIndexedHSTied opc, string asm, + SDPatternOperator OpNode> { + def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc, V64, V64, + V128_lo, VectorIndexH, + asm, ".4h", ".4h", ".4h", ".h", + [(set (v4i16 V64:$dst), + (OpNode (v4i16 V64:$Rd),(v4i16 V64:$Rn), + (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc, + V128, V128, + V128_lo, VectorIndexH, + asm, ".8h", ".8h", ".8h", ".h", + [(set (v8i16 V128:$dst), + (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn), + (v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc, + V64, V64, + V128, VectorIndexS, + asm, ".2s", ".2s", ".2s", ".s", + [(set (v2i32 V64:$dst), + (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn), + (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc, + V128, V128, + V128, VectorIndexS, + asm, ".4s", ".4s", ".4s", ".s", + [(set (v4i32 V128:$dst), + (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn), + (v4i32 (AArch64duplane32 
(v4i32 V128:$Rm), VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } +} + +multiclass SIMDIndexedLongSD opc, string asm, + SDPatternOperator OpNode> { + def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc, + V128, V64, + V128_lo, VectorIndexH, + asm, ".4s", ".4s", ".4h", ".h", + [(set (v4i32 V128:$Rd), + (OpNode (v4i16 V64:$Rn), + (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc, + V128, V128, + V128_lo, VectorIndexH, + asm#"2", ".4s", ".4s", ".8h", ".h", + [(set (v4i32 V128:$Rd), + (OpNode (extract_high_v8i16 V128:$Rn), + (extract_high_v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), + VectorIndexH:$idx))))]> { + + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc, + V128, V64, + V128, VectorIndexS, + asm, ".2d", ".2d", ".2s", ".s", + [(set (v2i64 V128:$Rd), + (OpNode (v2i32 V64:$Rn), + (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc, + V128, V128, + V128, VectorIndexS, + asm#"2", ".2d", ".2d", ".4s", ".s", + [(set (v2i64 V128:$Rd), + (OpNode (extract_high_v4i32 V128:$Rn), + (extract_high_v4i32 (AArch64duplane32 (v4i32 V128:$Rm), + VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v1i32_indexed : BaseSIMDIndexed<1, U, 1, 0b01, opc, + FPR32Op, FPR16Op, V128_lo, VectorIndexH, + asm, ".h", "", "", ".h", []> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v1i64_indexed : BaseSIMDIndexed<1, U, 1, 0b10, opc, + FPR64Op, FPR32Op, V128, VectorIndexS, + asm, ".s", "", "", ".s", []> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } +} + +multiclass SIMDIndexedLongSQDMLXSDTied opc, string asm, + SDPatternOperator Accum> { + def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc, + V128, V64, + V128_lo, VectorIndexH, + asm, ".4s", ".4s", ".4h", ".h", + [(set (v4i32 V128:$dst), + (Accum (v4i32 V128:$Rd), + (v4i32 (int_aarch64_neon_sqdmull + (v4i16 V64:$Rn), + (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), + VectorIndexH:$idx))))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + // FIXME: it would be nice to use the scalar (v1i32) instruction here, but an + // intermediate EXTRACT_SUBREG would be untyped. 
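  // (Illustrative, editor's note: the pattern below catches an i32 accumulate
  //  of lane 0 of a lane-indexed sqdmull, selects the v4i16 vector form, and
  //  extracts the scalar result again with EXTRACT_SUBREG.)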
+ def : Pat<(i32 (Accum (i32 FPR32Op:$Rd), + (i32 (vector_extract (v4i32 + (int_aarch64_neon_sqdmull (v4i16 V64:$Rn), + (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), + VectorIndexH:$idx)))), + (i64 0))))), + (EXTRACT_SUBREG + (!cast(NAME # v4i16_indexed) + (SUBREG_TO_REG (i32 0), FPR32Op:$Rd, ssub), V64:$Rn, + V128_lo:$Rm, VectorIndexH:$idx), + ssub)>; + + def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc, + V128, V128, + V128_lo, VectorIndexH, + asm#"2", ".4s", ".4s", ".8h", ".h", + [(set (v4i32 V128:$dst), + (Accum (v4i32 V128:$Rd), + (v4i32 (int_aarch64_neon_sqdmull + (extract_high_v8i16 V128:$Rn), + (extract_high_v8i16 + (AArch64duplane16 (v8i16 V128_lo:$Rm), + VectorIndexH:$idx))))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc, + V128, V64, + V128, VectorIndexS, + asm, ".2d", ".2d", ".2s", ".s", + [(set (v2i64 V128:$dst), + (Accum (v2i64 V128:$Rd), + (v2i64 (int_aarch64_neon_sqdmull + (v2i32 V64:$Rn), + (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), + VectorIndexS:$idx))))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc, + V128, V128, + V128, VectorIndexS, + asm#"2", ".2d", ".2d", ".4s", ".s", + [(set (v2i64 V128:$dst), + (Accum (v2i64 V128:$Rd), + (v2i64 (int_aarch64_neon_sqdmull + (extract_high_v4i32 V128:$Rn), + (extract_high_v4i32 + (AArch64duplane32 (v4i32 V128:$Rm), + VectorIndexS:$idx))))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v1i32_indexed : BaseSIMDIndexedTied<1, U, 1, 0b01, opc, + FPR32Op, FPR16Op, V128_lo, VectorIndexH, + asm, ".h", "", "", ".h", []> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + + def v1i64_indexed : BaseSIMDIndexedTied<1, U, 1, 0b10, opc, + FPR64Op, FPR32Op, V128, VectorIndexS, + asm, ".s", "", "", ".s", + [(set (i64 FPR64Op:$dst), + (Accum (i64 FPR64Op:$Rd), + (i64 (int_aarch64_neon_sqdmulls_scalar + (i32 FPR32Op:$Rn), + (i32 (vector_extract (v4i32 V128:$Rm), + VectorIndexS:$idx))))))]> { + + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } +} + +multiclass SIMDVectorIndexedLongSD opc, string asm, + SDPatternOperator OpNode> { + let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { + def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc, + V128, V64, + V128_lo, VectorIndexH, + asm, ".4s", ".4s", ".4h", ".h", + [(set (v4i32 V128:$Rd), + (OpNode (v4i16 V64:$Rn), + (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc, + V128, V128, + V128_lo, VectorIndexH, + asm#"2", ".4s", ".4s", ".8h", ".h", + [(set (v4i32 V128:$Rd), + (OpNode (extract_high_v8i16 V128:$Rn), + (extract_high_v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), + VectorIndexH:$idx))))]> { + + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc, + V128, V64, + V128, VectorIndexS, + asm, ".2d", ".2d", ".2s", ".s", + [(set (v2i64 V128:$Rd), + (OpNode (v2i32 V64:$Rn), + (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc, + V128, V128, + V128, VectorIndexS, + asm#"2", 
".2d", ".2d", ".4s", ".s", + [(set (v2i64 V128:$Rd), + (OpNode (extract_high_v4i32 V128:$Rn), + (extract_high_v4i32 (AArch64duplane32 (v4i32 V128:$Rm), + VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + } +} + +multiclass SIMDVectorIndexedLongSDTied opc, string asm, + SDPatternOperator OpNode> { + let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { + def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc, + V128, V64, + V128_lo, VectorIndexH, + asm, ".4s", ".4s", ".4h", ".h", + [(set (v4i32 V128:$dst), + (OpNode (v4i32 V128:$Rd), (v4i16 V64:$Rn), + (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc, + V128, V128, + V128_lo, VectorIndexH, + asm#"2", ".4s", ".4s", ".8h", ".h", + [(set (v4i32 V128:$dst), + (OpNode (v4i32 V128:$Rd), + (extract_high_v8i16 V128:$Rn), + (extract_high_v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), + VectorIndexH:$idx))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc, + V128, V64, + V128, VectorIndexS, + asm, ".2d", ".2d", ".2s", ".s", + [(set (v2i64 V128:$dst), + (OpNode (v2i64 V128:$Rd), (v2i32 V64:$Rn), + (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc, + V128, V128, + V128, VectorIndexS, + asm#"2", ".2d", ".2d", ".4s", ".s", + [(set (v2i64 V128:$dst), + (OpNode (v2i64 V128:$Rd), + (extract_high_v4i32 V128:$Rn), + (extract_high_v4i32 (AArch64duplane32 (v4i32 V128:$Rm), + VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + } +} + +//---------------------------------------------------------------------------- +// AdvSIMD scalar shift by immediate +//---------------------------------------------------------------------------- + +let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in +class BaseSIMDScalarShift opc, bits<7> fixed_imm, + RegisterClass regtype1, RegisterClass regtype2, + Operand immtype, string asm, list pattern> + : I<(outs regtype1:$Rd), (ins regtype2:$Rn, immtype:$imm), + asm, "\t$Rd, $Rn, $imm", "", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + bits<7> imm; + let Inst{31-30} = 0b01; + let Inst{29} = U; + let Inst{28-23} = 0b111110; + let Inst{22-16} = fixed_imm; + let Inst{15-11} = opc; + let Inst{10} = 1; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in +class BaseSIMDScalarShiftTied opc, bits<7> fixed_imm, + RegisterClass regtype1, RegisterClass regtype2, + Operand immtype, string asm, list pattern> + : I<(outs regtype1:$dst), (ins regtype1:$Rd, regtype2:$Rn, immtype:$imm), + asm, "\t$Rd, $Rn, $imm", "$Rd = $dst", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + bits<7> imm; + let Inst{31-30} = 0b01; + let Inst{29} = U; + let Inst{28-23} = 0b111110; + let Inst{22-16} = fixed_imm; + let Inst{15-11} = opc; + let Inst{10} = 1; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + + +multiclass SIMDScalarRShiftSD opc, string asm> { + def s : BaseSIMDScalarShift { + let Inst{20-16} = imm{4-0}; + } + + def d : BaseSIMDScalarShift { + let Inst{21-16} = imm{5-0}; + } +} + +multiclass SIMDScalarRShiftD opc, string asm, + SDPatternOperator OpNode> { + 
def d : BaseSIMDScalarShift { + let Inst{21-16} = imm{5-0}; + } + + def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rn), (i32 vecshiftR64:$imm))), + (!cast(NAME # "d") FPR64:$Rn, vecshiftR64:$imm)>; +} + +multiclass SIMDScalarRShiftDTied opc, string asm, + SDPatternOperator OpNode = null_frag> { + def d : BaseSIMDScalarShiftTied { + let Inst{21-16} = imm{5-0}; + } + + def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn), + (i32 vecshiftR64:$imm))), + (!cast(NAME # "d") FPR64:$Rd, FPR64:$Rn, + vecshiftR64:$imm)>; +} + +multiclass SIMDScalarLShiftD opc, string asm, + SDPatternOperator OpNode> { + def d : BaseSIMDScalarShift { + let Inst{21-16} = imm{5-0}; + } +} + +let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in +multiclass SIMDScalarLShiftDTied opc, string asm> { + def d : BaseSIMDScalarShiftTied { + let Inst{21-16} = imm{5-0}; + } +} + +let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in +multiclass SIMDScalarRShiftBHS opc, string asm, + SDPatternOperator OpNode = null_frag> { + def b : BaseSIMDScalarShift { + let Inst{18-16} = imm{2-0}; + } + + def h : BaseSIMDScalarShift { + let Inst{19-16} = imm{3-0}; + } + + def s : BaseSIMDScalarShift { + let Inst{20-16} = imm{4-0}; + } +} + +multiclass SIMDScalarLShiftBHSD opc, string asm, + SDPatternOperator OpNode> { + def b : BaseSIMDScalarShift { + let Inst{18-16} = imm{2-0}; + } + + def h : BaseSIMDScalarShift { + let Inst{19-16} = imm{3-0}; + } + + def s : BaseSIMDScalarShift { + let Inst{20-16} = imm{4-0}; + } + + def d : BaseSIMDScalarShift { + let Inst{21-16} = imm{5-0}; + } + + def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rn), (i32 vecshiftL64:$imm))), + (!cast(NAME # "d") FPR64:$Rn, vecshiftL64:$imm)>; +} + +multiclass SIMDScalarRShiftBHSD opc, string asm> { + def b : BaseSIMDScalarShift { + let Inst{18-16} = imm{2-0}; + } + + def h : BaseSIMDScalarShift { + let Inst{19-16} = imm{3-0}; + } + + def s : BaseSIMDScalarShift { + let Inst{20-16} = imm{4-0}; + } + + def d : BaseSIMDScalarShift { + let Inst{21-16} = imm{5-0}; + } +} + +//---------------------------------------------------------------------------- +// AdvSIMD vector x indexed element +//---------------------------------------------------------------------------- + +let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in +class BaseSIMDVectorShift opc, bits<7> fixed_imm, + RegisterOperand dst_reg, RegisterOperand src_reg, + Operand immtype, + string asm, string dst_kind, string src_kind, + list pattern> + : I<(outs dst_reg:$Rd), (ins src_reg:$Rn, immtype:$imm), + asm, "{\t$Rd" # dst_kind # ", $Rn" # src_kind # ", $imm" # + "|" # dst_kind # "\t$Rd, $Rn, $imm}", "", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; + let Inst{28-23} = 0b011110; + let Inst{22-16} = fixed_imm; + let Inst{15-11} = opc; + let Inst{10} = 1; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in +class BaseSIMDVectorShiftTied opc, bits<7> fixed_imm, + RegisterOperand vectype1, RegisterOperand vectype2, + Operand immtype, + string asm, string dst_kind, string src_kind, + list pattern> + : I<(outs vectype1:$dst), (ins vectype1:$Rd, vectype2:$Rn, immtype:$imm), + asm, "{\t$Rd" # dst_kind # ", $Rn" # src_kind # ", $imm" # + "|" # dst_kind # "\t$Rd, $Rn, $imm}", "$Rd = $dst", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; + let Inst{28-23} = 0b011110; + let Inst{22-16} = fixed_imm; + let Inst{15-11} = opc; + let Inst{10} = 1; + 
let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass SIMDVectorRShiftSD opc, string asm, + Intrinsic OpNode> { + def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?}, + V64, V64, vecshiftR32, + asm, ".2s", ".2s", + [(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (i32 imm:$imm)))]> { + bits<5> imm; + let Inst{20-16} = imm; + } + + def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?}, + V128, V128, vecshiftR32, + asm, ".4s", ".4s", + [(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (i32 imm:$imm)))]> { + bits<5> imm; + let Inst{20-16} = imm; + } + + def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?}, + V128, V128, vecshiftR64, + asm, ".2d", ".2d", + [(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (i32 imm:$imm)))]> { + bits<6> imm; + let Inst{21-16} = imm; + } +} + +multiclass SIMDVectorRShiftSDToFP opc, string asm, + Intrinsic OpNode> { + def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?}, + V64, V64, vecshiftR32, + asm, ".2s", ".2s", + [(set (v2f32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (i32 imm:$imm)))]> { + bits<5> imm; + let Inst{20-16} = imm; + } + + def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?}, + V128, V128, vecshiftR32, + asm, ".4s", ".4s", + [(set (v4f32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (i32 imm:$imm)))]> { + bits<5> imm; + let Inst{20-16} = imm; + } + + def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?}, + V128, V128, vecshiftR64, + asm, ".2d", ".2d", + [(set (v2f64 V128:$Rd), (OpNode (v2i64 V128:$Rn), (i32 imm:$imm)))]> { + bits<6> imm; + let Inst{21-16} = imm; + } +} + +multiclass SIMDVectorRShiftNarrowBHS opc, string asm, + SDPatternOperator OpNode> { + def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?}, + V64, V128, vecshiftR16Narrow, + asm, ".8b", ".8h", + [(set (v8i8 V64:$Rd), (OpNode (v8i16 V128:$Rn), vecshiftR16Narrow:$imm))]> { + bits<3> imm; + let Inst{18-16} = imm; + } + + def v16i8_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,0,1,?,?,?}, + V128, V128, vecshiftR16Narrow, + asm#"2", ".16b", ".8h", []> { + bits<3> imm; + let Inst{18-16} = imm; + let hasSideEffects = 0; + } + + def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?}, + V64, V128, vecshiftR32Narrow, + asm, ".4h", ".4s", + [(set (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn), vecshiftR32Narrow:$imm))]> { + bits<4> imm; + let Inst{19-16} = imm; + } + + def v8i16_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,1,?,?,?,?}, + V128, V128, vecshiftR32Narrow, + asm#"2", ".8h", ".4s", []> { + bits<4> imm; + let Inst{19-16} = imm; + let hasSideEffects = 0; + } + + def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?}, + V64, V128, vecshiftR64Narrow, + asm, ".2s", ".2d", + [(set (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn), vecshiftR64Narrow:$imm))]> { + bits<5> imm; + let Inst{20-16} = imm; + } + + def v4i32_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,1,?,?,?,?,?}, + V128, V128, vecshiftR64Narrow, + asm#"2", ".4s", ".2d", []> { + bits<5> imm; + let Inst{20-16} = imm; + let hasSideEffects = 0; + } + + // TableGen doesn't like patters w/ INSERT_SUBREG on the instructions + // themselves, so put them here instead. + + // Patterns involving what's effectively an insert high and a normal + // intrinsic, represented by CONCAT_VECTORS. 
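  // E.g. (illustrative, not from the patch) "shrn2 v0.16b, v1.8h, #4" narrows
  // into the high half of v0; the concat_vectors operand is the low half that
  // is left untouched.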
+ def : Pat<(concat_vectors (v8i8 V64:$Rd),(OpNode (v8i16 V128:$Rn), + vecshiftR16Narrow:$imm)), + (!cast(NAME # "v16i8_shift") + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), + V128:$Rn, vecshiftR16Narrow:$imm)>; + def : Pat<(concat_vectors (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn), + vecshiftR32Narrow:$imm)), + (!cast(NAME # "v8i16_shift") + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), + V128:$Rn, vecshiftR32Narrow:$imm)>; + def : Pat<(concat_vectors (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn), + vecshiftR64Narrow:$imm)), + (!cast(NAME # "v4i32_shift") + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), + V128:$Rn, vecshiftR64Narrow:$imm)>; +} + +multiclass SIMDVectorLShiftBHSD opc, string asm, + SDPatternOperator OpNode> { + def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?}, + V64, V64, vecshiftL8, + asm, ".8b", ".8b", + [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), + (i32 vecshiftL8:$imm)))]> { + bits<3> imm; + let Inst{18-16} = imm; + } + + def v16i8_shift : BaseSIMDVectorShift<1, U, opc, {0,0,0,1,?,?,?}, + V128, V128, vecshiftL8, + asm, ".16b", ".16b", + [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn), + (i32 vecshiftL8:$imm)))]> { + bits<3> imm; + let Inst{18-16} = imm; + } + + def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?}, + V64, V64, vecshiftL16, + asm, ".4h", ".4h", + [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), + (i32 vecshiftL16:$imm)))]> { + bits<4> imm; + let Inst{19-16} = imm; + } + + def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?}, + V128, V128, vecshiftL16, + asm, ".8h", ".8h", + [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), + (i32 vecshiftL16:$imm)))]> { + bits<4> imm; + let Inst{19-16} = imm; + } + + def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?}, + V64, V64, vecshiftL32, + asm, ".2s", ".2s", + [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), + (i32 vecshiftL32:$imm)))]> { + bits<5> imm; + let Inst{20-16} = imm; + } + + def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?}, + V128, V128, vecshiftL32, + asm, ".4s", ".4s", + [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), + (i32 vecshiftL32:$imm)))]> { + bits<5> imm; + let Inst{20-16} = imm; + } + + def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?}, + V128, V128, vecshiftL64, + asm, ".2d", ".2d", + [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn), + (i32 vecshiftL64:$imm)))]> { + bits<6> imm; + let Inst{21-16} = imm; + } +} + +multiclass SIMDVectorRShiftBHSD opc, string asm, + SDPatternOperator OpNode> { + def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?}, + V64, V64, vecshiftR8, + asm, ".8b", ".8b", + [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), + (i32 vecshiftR8:$imm)))]> { + bits<3> imm; + let Inst{18-16} = imm; + } + + def v16i8_shift : BaseSIMDVectorShift<1, U, opc, {0,0,0,1,?,?,?}, + V128, V128, vecshiftR8, + asm, ".16b", ".16b", + [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn), + (i32 vecshiftR8:$imm)))]> { + bits<3> imm; + let Inst{18-16} = imm; + } + + def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?}, + V64, V64, vecshiftR16, + asm, ".4h", ".4h", + [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), + (i32 vecshiftR16:$imm)))]> { + bits<4> imm; + let Inst{19-16} = imm; + } + + def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?}, + V128, V128, vecshiftR16, + asm, ".8h", ".8h", + [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), + (i32 vecshiftR16:$imm)))]> { + bits<4> imm; + let Inst{19-16} = imm; + } + + def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?}, + 
V64, V64, vecshiftR32, + asm, ".2s", ".2s", + [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), + (i32 vecshiftR32:$imm)))]> { + bits<5> imm; + let Inst{20-16} = imm; + } + + def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?}, + V128, V128, vecshiftR32, + asm, ".4s", ".4s", + [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), + (i32 vecshiftR32:$imm)))]> { + bits<5> imm; + let Inst{20-16} = imm; + } + + def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?}, + V128, V128, vecshiftR64, + asm, ".2d", ".2d", + [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn), + (i32 vecshiftR64:$imm)))]> { + bits<6> imm; + let Inst{21-16} = imm; + } +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +multiclass SIMDVectorRShiftBHSDTied opc, string asm, + SDPatternOperator OpNode = null_frag> { + def v8i8_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,0,1,?,?,?}, + V64, V64, vecshiftR8, asm, ".8b", ".8b", + [(set (v8i8 V64:$dst), + (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), + (i32 vecshiftR8:$imm)))]> { + bits<3> imm; + let Inst{18-16} = imm; + } + + def v16i8_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,0,1,?,?,?}, + V128, V128, vecshiftR8, asm, ".16b", ".16b", + [(set (v16i8 V128:$dst), + (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn), + (i32 vecshiftR8:$imm)))]> { + bits<3> imm; + let Inst{18-16} = imm; + } + + def v4i16_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,1,?,?,?,?}, + V64, V64, vecshiftR16, asm, ".4h", ".4h", + [(set (v4i16 V64:$dst), + (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn), + (i32 vecshiftR16:$imm)))]> { + bits<4> imm; + let Inst{19-16} = imm; + } + + def v8i16_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,1,?,?,?,?}, + V128, V128, vecshiftR16, asm, ".8h", ".8h", + [(set (v8i16 V128:$dst), + (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn), + (i32 vecshiftR16:$imm)))]> { + bits<4> imm; + let Inst{19-16} = imm; + } + + def v2i32_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,1,?,?,?,?,?}, + V64, V64, vecshiftR32, asm, ".2s", ".2s", + [(set (v2i32 V64:$dst), + (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn), + (i32 vecshiftR32:$imm)))]> { + bits<5> imm; + let Inst{20-16} = imm; + } + + def v4i32_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,1,?,?,?,?,?}, + V128, V128, vecshiftR32, asm, ".4s", ".4s", + [(set (v4i32 V128:$dst), + (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn), + (i32 vecshiftR32:$imm)))]> { + bits<5> imm; + let Inst{20-16} = imm; + } + + def v2i64_shift : BaseSIMDVectorShiftTied<1, U, opc, {1,?,?,?,?,?,?}, + V128, V128, vecshiftR64, + asm, ".2d", ".2d", [(set (v2i64 V128:$dst), + (OpNode (v2i64 V128:$Rd), (v2i64 V128:$Rn), + (i32 vecshiftR64:$imm)))]> { + bits<6> imm; + let Inst{21-16} = imm; + } +} + +multiclass SIMDVectorLShiftBHSDTied opc, string asm, + SDPatternOperator OpNode = null_frag> { + def v8i8_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,0,1,?,?,?}, + V64, V64, vecshiftL8, + asm, ".8b", ".8b", + [(set (v8i8 V64:$dst), + (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), + (i32 vecshiftL8:$imm)))]> { + bits<3> imm; + let Inst{18-16} = imm; + } + + def v16i8_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,0,1,?,?,?}, + V128, V128, vecshiftL8, + asm, ".16b", ".16b", + [(set (v16i8 V128:$dst), + (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn), + (i32 vecshiftL8:$imm)))]> { + bits<3> imm; + let Inst{18-16} = imm; + } + + def v4i16_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,1,?,?,?,?}, + V64, V64, vecshiftL16, + asm, ".4h", ".4h", + [(set (v4i16 V64:$dst), + (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn), + (i32 vecshiftL16:$imm)))]> { + bits<4> imm; + let Inst{19-16} = 
imm; + } + + def v8i16_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,1,?,?,?,?}, + V128, V128, vecshiftL16, + asm, ".8h", ".8h", + [(set (v8i16 V128:$dst), + (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn), + (i32 vecshiftL16:$imm)))]> { + bits<4> imm; + let Inst{19-16} = imm; + } + + def v2i32_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,1,?,?,?,?,?}, + V64, V64, vecshiftL32, + asm, ".2s", ".2s", + [(set (v2i32 V64:$dst), + (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn), + (i32 vecshiftL32:$imm)))]> { + bits<5> imm; + let Inst{20-16} = imm; + } + + def v4i32_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,1,?,?,?,?,?}, + V128, V128, vecshiftL32, + asm, ".4s", ".4s", + [(set (v4i32 V128:$dst), + (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn), + (i32 vecshiftL32:$imm)))]> { + bits<5> imm; + let Inst{20-16} = imm; + } + + def v2i64_shift : BaseSIMDVectorShiftTied<1, U, opc, {1,?,?,?,?,?,?}, + V128, V128, vecshiftL64, + asm, ".2d", ".2d", + [(set (v2i64 V128:$dst), + (OpNode (v2i64 V128:$Rd), (v2i64 V128:$Rn), + (i32 vecshiftL64:$imm)))]> { + bits<6> imm; + let Inst{21-16} = imm; + } +} + +multiclass SIMDVectorLShiftLongBHSD opc, string asm, + SDPatternOperator OpNode> { + def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?}, + V128, V64, vecshiftL8, asm, ".8h", ".8b", + [(set (v8i16 V128:$Rd), (OpNode (v8i8 V64:$Rn), vecshiftL8:$imm))]> { + bits<3> imm; + let Inst{18-16} = imm; + } + + def v16i8_shift : BaseSIMDVectorShift<1, U, opc, {0,0,0,1,?,?,?}, + V128, V128, vecshiftL8, + asm#"2", ".8h", ".16b", + [(set (v8i16 V128:$Rd), + (OpNode (extract_high_v16i8 V128:$Rn), vecshiftL8:$imm))]> { + bits<3> imm; + let Inst{18-16} = imm; + } + + def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?}, + V128, V64, vecshiftL16, asm, ".4s", ".4h", + [(set (v4i32 V128:$Rd), (OpNode (v4i16 V64:$Rn), vecshiftL16:$imm))]> { + bits<4> imm; + let Inst{19-16} = imm; + } + + def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?}, + V128, V128, vecshiftL16, + asm#"2", ".4s", ".8h", + [(set (v4i32 V128:$Rd), + (OpNode (extract_high_v8i16 V128:$Rn), vecshiftL16:$imm))]> { + + bits<4> imm; + let Inst{19-16} = imm; + } + + def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?}, + V128, V64, vecshiftL32, asm, ".2d", ".2s", + [(set (v2i64 V128:$Rd), (OpNode (v2i32 V64:$Rn), vecshiftL32:$imm))]> { + bits<5> imm; + let Inst{20-16} = imm; + } + + def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?}, + V128, V128, vecshiftL32, + asm#"2", ".2d", ".4s", + [(set (v2i64 V128:$Rd), + (OpNode (extract_high_v4i32 V128:$Rn), vecshiftL32:$imm))]> { + bits<5> imm; + let Inst{20-16} = imm; + } +} + + +//--- +// Vector load/store +//--- +// SIMD ldX/stX no-index memory references don't allow the optional +// ", #0" constant and handle post-indexing explicitly, so we use +// a more specialized parse method for them. Otherwise, it's the same as +// the general GPR64sp handling. 
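// E.g. (illustrative, editor's examples rather than text from the patch)
//   ld1 { v0.16b }, [x0]        // no-offset form; "[x0, #0]" is rejected
//   ld1 { v0.16b }, [x0], #16   // post-indexed by immediate (encoded via XZR)
//   ld1 { v0.16b }, [x0], x1    // post-indexed by register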
+ +class BaseSIMDLdSt opcode, bits<2> size, + string asm, dag oops, dag iops, list pattern> + : I { + bits<5> Vt; + bits<5> Rn; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29-23} = 0b0011000; + let Inst{22} = L; + let Inst{21-16} = 0b000000; + let Inst{15-12} = opcode; + let Inst{11-10} = size; + let Inst{9-5} = Rn; + let Inst{4-0} = Vt; +} + +class BaseSIMDLdStPost opcode, bits<2> size, + string asm, dag oops, dag iops> + : I { + bits<5> Vt; + bits<5> Rn; + bits<5> Xm; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29-23} = 0b0011001; + let Inst{22} = L; + let Inst{21} = 0; + let Inst{20-16} = Xm; + let Inst{15-12} = opcode; + let Inst{11-10} = size; + let Inst{9-5} = Rn; + let Inst{4-0} = Vt; +} + +// The immediate form of AdvSIMD post-indexed addressing is encoded with +// register post-index addressing from the zero register. +multiclass SIMDLdStAliases { + // E.g. "ld1 { v0.8b, v1.8b }, [x1], #16" + // "ld1\t$Vt, [$Rn], #16" + // may get mapped to + // (LD1Twov8b_POST VecListTwo8b:$Vt, GPR64sp:$Rn, XZR) + def : InstAlias(NAME # Count # "v" # layout # "_POST") + GPR64sp:$Rn, + !cast("VecList" # Count # layout):$Vt, + XZR), 1>; + + // E.g. "ld1.8b { v0, v1 }, [x1], #16" + // "ld1.8b\t$Vt, [$Rn], #16" + // may get mapped to + // (LD1Twov8b_POST VecListTwo64:$Vt, GPR64sp:$Rn, XZR) + def : InstAlias(NAME # Count # "v" # layout # "_POST") + GPR64sp:$Rn, + !cast("VecList" # Count # Size):$Vt, + XZR), 0>; + + // E.g. "ld1.8b { v0, v1 }, [x1]" + // "ld1\t$Vt, [$Rn]" + // may get mapped to + // (LD1Twov8b VecListTwo64:$Vt, GPR64sp:$Rn) + def : InstAlias(NAME # Count # "v" # layout) + !cast("VecList" # Count # Size):$Vt, + GPR64sp:$Rn), 0>; + + // E.g. "ld1.8b { v0, v1 }, [x1], x2" + // "ld1\t$Vt, [$Rn], $Xm" + // may get mapped to + // (LD1Twov8b_POST VecListTwo64:$Vt, GPR64sp:$Rn, GPR64pi8:$Xm) + def : InstAlias(NAME # Count # "v" # layout # "_POST") + GPR64sp:$Rn, + !cast("VecList" # Count # Size):$Vt, + !cast("GPR64pi" # Offset):$Xm), 0>; +} + +multiclass BaseSIMDLdN opcode> { + let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in { + def v16b: BaseSIMDLdSt<1, 1, opcode, 0b00, asm, + (outs !cast(veclist # "16b"):$Vt), + (ins GPR64sp:$Rn), []>; + def v8h : BaseSIMDLdSt<1, 1, opcode, 0b01, asm, + (outs !cast(veclist # "8h"):$Vt), + (ins GPR64sp:$Rn), []>; + def v4s : BaseSIMDLdSt<1, 1, opcode, 0b10, asm, + (outs !cast(veclist # "4s"):$Vt), + (ins GPR64sp:$Rn), []>; + def v2d : BaseSIMDLdSt<1, 1, opcode, 0b11, asm, + (outs !cast(veclist # "2d"):$Vt), + (ins GPR64sp:$Rn), []>; + def v8b : BaseSIMDLdSt<0, 1, opcode, 0b00, asm, + (outs !cast(veclist # "8b"):$Vt), + (ins GPR64sp:$Rn), []>; + def v4h : BaseSIMDLdSt<0, 1, opcode, 0b01, asm, + (outs !cast(veclist # "4h"):$Vt), + (ins GPR64sp:$Rn), []>; + def v2s : BaseSIMDLdSt<0, 1, opcode, 0b10, asm, + (outs !cast(veclist # "2s"):$Vt), + (ins GPR64sp:$Rn), []>; + + + def v16b_POST: BaseSIMDLdStPost<1, 1, opcode, 0b00, asm, + (outs GPR64sp:$wback, + !cast(veclist # "16b"):$Vt), + (ins GPR64sp:$Rn, + !cast("GPR64pi" # Offset128):$Xm)>; + def v8h_POST : BaseSIMDLdStPost<1, 1, opcode, 0b01, asm, + (outs GPR64sp:$wback, + !cast(veclist # "8h"):$Vt), + (ins GPR64sp:$Rn, + !cast("GPR64pi" # Offset128):$Xm)>; + def v4s_POST : BaseSIMDLdStPost<1, 1, opcode, 0b10, asm, + (outs GPR64sp:$wback, + !cast(veclist # "4s"):$Vt), + (ins GPR64sp:$Rn, + !cast("GPR64pi" # Offset128):$Xm)>; + def v2d_POST : BaseSIMDLdStPost<1, 1, opcode, 0b11, asm, + (outs GPR64sp:$wback, + !cast(veclist # "2d"):$Vt), + (ins GPR64sp:$Rn, + !cast("GPR64pi" # Offset128):$Xm)>; + 
def v8b_POST : BaseSIMDLdStPost<0, 1, opcode, 0b00, asm, + (outs GPR64sp:$wback, + !cast(veclist # "8b"):$Vt), + (ins GPR64sp:$Rn, + !cast("GPR64pi" # Offset64):$Xm)>; + def v4h_POST : BaseSIMDLdStPost<0, 1, opcode, 0b01, asm, + (outs GPR64sp:$wback, + !cast(veclist # "4h"):$Vt), + (ins GPR64sp:$Rn, + !cast("GPR64pi" # Offset64):$Xm)>; + def v2s_POST : BaseSIMDLdStPost<0, 1, opcode, 0b10, asm, + (outs GPR64sp:$wback, + !cast(veclist # "2s"):$Vt), + (ins GPR64sp:$Rn, + !cast("GPR64pi" # Offset64):$Xm)>; + } + + defm : SIMDLdStAliases; + defm : SIMDLdStAliases; + defm : SIMDLdStAliases; + defm : SIMDLdStAliases; + defm : SIMDLdStAliases; + defm : SIMDLdStAliases; + defm : SIMDLdStAliases; +} + +// Only ld1/st1 has a v1d version. +multiclass BaseSIMDStN opcode> { + let hasSideEffects = 0, mayStore = 1, mayLoad = 0 in { + def v16b : BaseSIMDLdSt<1, 0, opcode, 0b00, asm, (outs), + (ins !cast(veclist # "16b"):$Vt, + GPR64sp:$Rn), []>; + def v8h : BaseSIMDLdSt<1, 0, opcode, 0b01, asm, (outs), + (ins !cast(veclist # "8h"):$Vt, + GPR64sp:$Rn), []>; + def v4s : BaseSIMDLdSt<1, 0, opcode, 0b10, asm, (outs), + (ins !cast(veclist # "4s"):$Vt, + GPR64sp:$Rn), []>; + def v2d : BaseSIMDLdSt<1, 0, opcode, 0b11, asm, (outs), + (ins !cast(veclist # "2d"):$Vt, + GPR64sp:$Rn), []>; + def v8b : BaseSIMDLdSt<0, 0, opcode, 0b00, asm, (outs), + (ins !cast(veclist # "8b"):$Vt, + GPR64sp:$Rn), []>; + def v4h : BaseSIMDLdSt<0, 0, opcode, 0b01, asm, (outs), + (ins !cast(veclist # "4h"):$Vt, + GPR64sp:$Rn), []>; + def v2s : BaseSIMDLdSt<0, 0, opcode, 0b10, asm, (outs), + (ins !cast(veclist # "2s"):$Vt, + GPR64sp:$Rn), []>; + + def v16b_POST : BaseSIMDLdStPost<1, 0, opcode, 0b00, asm, + (outs GPR64sp:$wback), + (ins !cast(veclist # "16b"):$Vt, + GPR64sp:$Rn, + !cast("GPR64pi" # Offset128):$Xm)>; + def v8h_POST : BaseSIMDLdStPost<1, 0, opcode, 0b01, asm, + (outs GPR64sp:$wback), + (ins !cast(veclist # "8h"):$Vt, + GPR64sp:$Rn, + !cast("GPR64pi" # Offset128):$Xm)>; + def v4s_POST : BaseSIMDLdStPost<1, 0, opcode, 0b10, asm, + (outs GPR64sp:$wback), + (ins !cast(veclist # "4s"):$Vt, + GPR64sp:$Rn, + !cast("GPR64pi" # Offset128):$Xm)>; + def v2d_POST : BaseSIMDLdStPost<1, 0, opcode, 0b11, asm, + (outs GPR64sp:$wback), + (ins !cast(veclist # "2d"):$Vt, + GPR64sp:$Rn, + !cast("GPR64pi" # Offset128):$Xm)>; + def v8b_POST : BaseSIMDLdStPost<0, 0, opcode, 0b00, asm, + (outs GPR64sp:$wback), + (ins !cast(veclist # "8b"):$Vt, + GPR64sp:$Rn, + !cast("GPR64pi" # Offset64):$Xm)>; + def v4h_POST : BaseSIMDLdStPost<0, 0, opcode, 0b01, asm, + (outs GPR64sp:$wback), + (ins !cast(veclist # "4h"):$Vt, + GPR64sp:$Rn, + !cast("GPR64pi" # Offset64):$Xm)>; + def v2s_POST : BaseSIMDLdStPost<0, 0, opcode, 0b10, asm, + (outs GPR64sp:$wback), + (ins !cast(veclist # "2s"):$Vt, + GPR64sp:$Rn, + !cast("GPR64pi" # Offset64):$Xm)>; + } + + defm : SIMDLdStAliases; + defm : SIMDLdStAliases; + defm : SIMDLdStAliases; + defm : SIMDLdStAliases; + defm : SIMDLdStAliases; + defm : SIMDLdStAliases; + defm : SIMDLdStAliases; +} + +multiclass BaseSIMDLd1 opcode> + : BaseSIMDLdN { + + // LD1 instructions have extra "1d" variants. 
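  // E.g. (illustrative, not from the patch) "ld1 { v0.1d }, [x0]" is valid,
  // but ld2/ld3/ld4 have no ".1d" arrangement.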
+ let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in { + def v1d : BaseSIMDLdSt<0, 1, opcode, 0b11, asm, + (outs !cast(veclist # "1d"):$Vt), + (ins GPR64sp:$Rn), []>; + + def v1d_POST : BaseSIMDLdStPost<0, 1, opcode, 0b11, asm, + (outs GPR64sp:$wback, + !cast(veclist # "1d"):$Vt), + (ins GPR64sp:$Rn, + !cast("GPR64pi" # Offset64):$Xm)>; + } + + defm : SIMDLdStAliases; +} + +multiclass BaseSIMDSt1 opcode> + : BaseSIMDStN { + + // ST1 instructions have extra "1d" variants. + let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in { + def v1d : BaseSIMDLdSt<0, 0, opcode, 0b11, asm, (outs), + (ins !cast(veclist # "1d"):$Vt, + GPR64sp:$Rn), []>; + + def v1d_POST : BaseSIMDLdStPost<0, 0, opcode, 0b11, asm, + (outs GPR64sp:$wback), + (ins !cast(veclist # "1d"):$Vt, + GPR64sp:$Rn, + !cast("GPR64pi" # Offset64):$Xm)>; + } + + defm : SIMDLdStAliases; +} + +multiclass SIMDLd1Multiple { + defm One : BaseSIMDLd1<"One", asm, "VecListOne", 16, 8, 0b0111>; + defm Two : BaseSIMDLd1<"Two", asm, "VecListTwo", 32, 16, 0b1010>; + defm Three : BaseSIMDLd1<"Three", asm, "VecListThree", 48, 24, 0b0110>; + defm Four : BaseSIMDLd1<"Four", asm, "VecListFour", 64, 32, 0b0010>; +} + +multiclass SIMDSt1Multiple { + defm One : BaseSIMDSt1<"One", asm, "VecListOne", 16, 8, 0b0111>; + defm Two : BaseSIMDSt1<"Two", asm, "VecListTwo", 32, 16, 0b1010>; + defm Three : BaseSIMDSt1<"Three", asm, "VecListThree", 48, 24, 0b0110>; + defm Four : BaseSIMDSt1<"Four", asm, "VecListFour", 64, 32, 0b0010>; +} + +multiclass SIMDLd2Multiple { + defm Two : BaseSIMDLdN<"Two", asm, "VecListTwo", 32, 16, 0b1000>; +} + +multiclass SIMDSt2Multiple { + defm Two : BaseSIMDStN<"Two", asm, "VecListTwo", 32, 16, 0b1000>; +} + +multiclass SIMDLd3Multiple { + defm Three : BaseSIMDLdN<"Three", asm, "VecListThree", 48, 24, 0b0100>; +} + +multiclass SIMDSt3Multiple { + defm Three : BaseSIMDStN<"Three", asm, "VecListThree", 48, 24, 0b0100>; +} + +multiclass SIMDLd4Multiple { + defm Four : BaseSIMDLdN<"Four", asm, "VecListFour", 64, 32, 0b0000>; +} + +multiclass SIMDSt4Multiple { + defm Four : BaseSIMDStN<"Four", asm, "VecListFour", 64, 32, 0b0000>; +} + +//--- +// AdvSIMD Load/store single-element +//--- + +class BaseSIMDLdStSingle opcode, + string asm, string operands, string cst, + dag oops, dag iops, list pattern> + : I { + bits<5> Vt; + bits<5> Rn; + let Inst{31} = 0; + let Inst{29-24} = 0b001101; + let Inst{22} = L; + let Inst{21} = R; + let Inst{15-13} = opcode; + let Inst{9-5} = Rn; + let Inst{4-0} = Vt; +} + +class BaseSIMDLdStSingleTied opcode, + string asm, string operands, string cst, + dag oops, dag iops, list pattern> + : I { + bits<5> Vt; + bits<5> Rn; + let Inst{31} = 0; + let Inst{29-24} = 0b001101; + let Inst{22} = L; + let Inst{21} = R; + let Inst{15-13} = opcode; + let Inst{9-5} = Rn; + let Inst{4-0} = Vt; +} + + +let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDLdR opcode, bit S, bits<2> size, string asm, + Operand listtype> + : BaseSIMDLdStSingle<1, R, opcode, asm, "\t$Vt, [$Rn]", "", + (outs listtype:$Vt), (ins GPR64sp:$Rn), + []> { + let Inst{30} = Q; + let Inst{23} = 0; + let Inst{20-16} = 0b00000; + let Inst{12} = S; + let Inst{11-10} = size; +} +let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDLdRPost opcode, bit S, bits<2> size, + string asm, Operand listtype, Operand GPR64pi> + : BaseSIMDLdStSingle<1, R, opcode, asm, "\t$Vt, [$Rn], $Xm", + "$Rn = $wback", + (outs GPR64sp:$wback, listtype:$Vt), + (ins GPR64sp:$Rn, GPR64pi:$Xm), []> { + bits<5> Xm; + let Inst{30} = Q; + let Inst{23} = 1; + let 
Inst{20-16} = Xm; + let Inst{12} = S; + let Inst{11-10} = size; +} + +multiclass SIMDLdrAliases { + // E.g. "ld1r { v0.8b }, [x1], #1" + // "ld1r.8b\t$Vt, [$Rn], #1" + // may get mapped to + // (LD1Rv8b_POST VecListOne8b:$Vt, GPR64sp:$Rn, XZR) + def : InstAlias(NAME # "v" # layout # "_POST") + GPR64sp:$Rn, + !cast("VecList" # Count # layout):$Vt, + XZR), 1>; + + // E.g. "ld1r.8b { v0 }, [x1], #1" + // "ld1r.8b\t$Vt, [$Rn], #1" + // may get mapped to + // (LD1Rv8b_POST VecListOne64:$Vt, GPR64sp:$Rn, XZR) + def : InstAlias(NAME # "v" # layout # "_POST") + GPR64sp:$Rn, + !cast("VecList" # Count # Size):$Vt, + XZR), 0>; + + // E.g. "ld1r.8b { v0 }, [x1]" + // "ld1r.8b\t$Vt, [$Rn]" + // may get mapped to + // (LD1Rv8b VecListOne64:$Vt, GPR64sp:$Rn) + def : InstAlias(NAME # "v" # layout) + !cast("VecList" # Count # Size):$Vt, + GPR64sp:$Rn), 0>; + + // E.g. "ld1r.8b { v0 }, [x1], x2" + // "ld1r.8b\t$Vt, [$Rn], $Xm" + // may get mapped to + // (LD1Rv8b_POST VecListOne64:$Vt, GPR64sp:$Rn, GPR64pi1:$Xm) + def : InstAlias(NAME # "v" # layout # "_POST") + GPR64sp:$Rn, + !cast("VecList" # Count # Size):$Vt, + !cast("GPR64pi" # Offset):$Xm), 0>; +} + +multiclass SIMDLdR opcode, bit S, string asm, string Count, + int Offset1, int Offset2, int Offset4, int Offset8> { + def v8b : BaseSIMDLdR<0, R, opcode, S, 0b00, asm, + !cast("VecList" # Count # "8b")>; + def v16b: BaseSIMDLdR<1, R, opcode, S, 0b00, asm, + !cast("VecList" # Count #"16b")>; + def v4h : BaseSIMDLdR<0, R, opcode, S, 0b01, asm, + !cast("VecList" # Count #"4h")>; + def v8h : BaseSIMDLdR<1, R, opcode, S, 0b01, asm, + !cast("VecList" # Count #"8h")>; + def v2s : BaseSIMDLdR<0, R, opcode, S, 0b10, asm, + !cast("VecList" # Count #"2s")>; + def v4s : BaseSIMDLdR<1, R, opcode, S, 0b10, asm, + !cast("VecList" # Count #"4s")>; + def v1d : BaseSIMDLdR<0, R, opcode, S, 0b11, asm, + !cast("VecList" # Count #"1d")>; + def v2d : BaseSIMDLdR<1, R, opcode, S, 0b11, asm, + !cast("VecList" # Count #"2d")>; + + def v8b_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b00, asm, + !cast("VecList" # Count # "8b"), + !cast("GPR64pi" # Offset1)>; + def v16b_POST: BaseSIMDLdRPost<1, R, opcode, S, 0b00, asm, + !cast("VecList" # Count # "16b"), + !cast("GPR64pi" # Offset1)>; + def v4h_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b01, asm, + !cast("VecList" # Count # "4h"), + !cast("GPR64pi" # Offset2)>; + def v8h_POST : BaseSIMDLdRPost<1, R, opcode, S, 0b01, asm, + !cast("VecList" # Count # "8h"), + !cast("GPR64pi" # Offset2)>; + def v2s_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b10, asm, + !cast("VecList" # Count # "2s"), + !cast("GPR64pi" # Offset4)>; + def v4s_POST : BaseSIMDLdRPost<1, R, opcode, S, 0b10, asm, + !cast("VecList" # Count # "4s"), + !cast("GPR64pi" # Offset4)>; + def v1d_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b11, asm, + !cast("VecList" # Count # "1d"), + !cast("GPR64pi" # Offset8)>; + def v2d_POST : BaseSIMDLdRPost<1, R, opcode, S, 0b11, asm, + !cast("VecList" # Count # "2d"), + !cast("GPR64pi" # Offset8)>; + + defm : SIMDLdrAliases; + defm : SIMDLdrAliases; + defm : SIMDLdrAliases; + defm : SIMDLdrAliases; + defm : SIMDLdrAliases; + defm : SIMDLdrAliases; + defm : SIMDLdrAliases; + defm : SIMDLdrAliases; +} + +class SIMDLdStSingleB opcode, string asm, + dag oops, dag iops, list pattern> + : BaseSIMDLdStSingle { + // idx encoded in Q:S:size fields. 
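  // (Illustrative note, not from the patch: for a byte lane such as
  //  "st1 { v0.b }[13], [x0]" the 4-bit index is split across Q (idx{3}),
  //  S (idx{2}) and size (idx{1-0}).)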
+ bits<4> idx; + let Inst{30} = idx{3}; + let Inst{23} = 0; + let Inst{20-16} = 0b00000; + let Inst{12} = idx{2}; + let Inst{11-10} = idx{1-0}; +} +class SIMDLdStSingleBTied opcode, string asm, + dag oops, dag iops, list pattern> + : BaseSIMDLdStSingleTied { + // idx encoded in Q:S:size fields. + bits<4> idx; + let Inst{30} = idx{3}; + let Inst{23} = 0; + let Inst{20-16} = 0b00000; + let Inst{12} = idx{2}; + let Inst{11-10} = idx{1-0}; +} +class SIMDLdStSingleBPost opcode, string asm, + dag oops, dag iops> + : BaseSIMDLdStSingle { + // idx encoded in Q:S:size fields. + bits<4> idx; + bits<5> Xm; + let Inst{30} = idx{3}; + let Inst{23} = 1; + let Inst{20-16} = Xm; + let Inst{12} = idx{2}; + let Inst{11-10} = idx{1-0}; +} +class SIMDLdStSingleBTiedPost opcode, string asm, + dag oops, dag iops> + : BaseSIMDLdStSingleTied { + // idx encoded in Q:S:size fields. + bits<4> idx; + bits<5> Xm; + let Inst{30} = idx{3}; + let Inst{23} = 1; + let Inst{20-16} = Xm; + let Inst{12} = idx{2}; + let Inst{11-10} = idx{1-0}; +} + +class SIMDLdStSingleH opcode, bit size, string asm, + dag oops, dag iops, list pattern> + : BaseSIMDLdStSingle { + // idx encoded in Q:S:size<1> fields. + bits<3> idx; + let Inst{30} = idx{2}; + let Inst{23} = 0; + let Inst{20-16} = 0b00000; + let Inst{12} = idx{1}; + let Inst{11} = idx{0}; + let Inst{10} = size; +} +class SIMDLdStSingleHTied opcode, bit size, string asm, + dag oops, dag iops, list pattern> + : BaseSIMDLdStSingleTied { + // idx encoded in Q:S:size<1> fields. + bits<3> idx; + let Inst{30} = idx{2}; + let Inst{23} = 0; + let Inst{20-16} = 0b00000; + let Inst{12} = idx{1}; + let Inst{11} = idx{0}; + let Inst{10} = size; +} + +class SIMDLdStSingleHPost opcode, bit size, string asm, + dag oops, dag iops> + : BaseSIMDLdStSingle { + // idx encoded in Q:S:size<1> fields. + bits<3> idx; + bits<5> Xm; + let Inst{30} = idx{2}; + let Inst{23} = 1; + let Inst{20-16} = Xm; + let Inst{12} = idx{1}; + let Inst{11} = idx{0}; + let Inst{10} = size; +} +class SIMDLdStSingleHTiedPost opcode, bit size, string asm, + dag oops, dag iops> + : BaseSIMDLdStSingleTied { + // idx encoded in Q:S:size<1> fields. + bits<3> idx; + bits<5> Xm; + let Inst{30} = idx{2}; + let Inst{23} = 1; + let Inst{20-16} = Xm; + let Inst{12} = idx{1}; + let Inst{11} = idx{0}; + let Inst{10} = size; +} +class SIMDLdStSingleS opcode, bits<2> size, string asm, + dag oops, dag iops, list pattern> + : BaseSIMDLdStSingle { + // idx encoded in Q:S fields. + bits<2> idx; + let Inst{30} = idx{1}; + let Inst{23} = 0; + let Inst{20-16} = 0b00000; + let Inst{12} = idx{0}; + let Inst{11-10} = size; +} +class SIMDLdStSingleSTied opcode, bits<2> size, string asm, + dag oops, dag iops, list pattern> + : BaseSIMDLdStSingleTied { + // idx encoded in Q:S fields. + bits<2> idx; + let Inst{30} = idx{1}; + let Inst{23} = 0; + let Inst{20-16} = 0b00000; + let Inst{12} = idx{0}; + let Inst{11-10} = size; +} +class SIMDLdStSingleSPost opcode, bits<2> size, + string asm, dag oops, dag iops> + : BaseSIMDLdStSingle { + // idx encoded in Q:S fields. + bits<2> idx; + bits<5> Xm; + let Inst{30} = idx{1}; + let Inst{23} = 1; + let Inst{20-16} = Xm; + let Inst{12} = idx{0}; + let Inst{11-10} = size; +} +class SIMDLdStSingleSTiedPost opcode, bits<2> size, + string asm, dag oops, dag iops> + : BaseSIMDLdStSingleTied { + // idx encoded in Q:S fields. 
+ bits<2> idx; + bits<5> Xm; + let Inst{30} = idx{1}; + let Inst{23} = 1; + let Inst{20-16} = Xm; + let Inst{12} = idx{0}; + let Inst{11-10} = size; +} +class SIMDLdStSingleD opcode, bits<2> size, string asm, + dag oops, dag iops, list pattern> + : BaseSIMDLdStSingle { + // idx encoded in Q field. + bits<1> idx; + let Inst{30} = idx; + let Inst{23} = 0; + let Inst{20-16} = 0b00000; + let Inst{12} = 0; + let Inst{11-10} = size; +} +class SIMDLdStSingleDTied opcode, bits<2> size, string asm, + dag oops, dag iops, list pattern> + : BaseSIMDLdStSingleTied { + // idx encoded in Q field. + bits<1> idx; + let Inst{30} = idx; + let Inst{23} = 0; + let Inst{20-16} = 0b00000; + let Inst{12} = 0; + let Inst{11-10} = size; +} +class SIMDLdStSingleDPost opcode, bits<2> size, + string asm, dag oops, dag iops> + : BaseSIMDLdStSingle { + // idx encoded in Q field. + bits<1> idx; + bits<5> Xm; + let Inst{30} = idx; + let Inst{23} = 1; + let Inst{20-16} = Xm; + let Inst{12} = 0; + let Inst{11-10} = size; +} +class SIMDLdStSingleDTiedPost opcode, bits<2> size, + string asm, dag oops, dag iops> + : BaseSIMDLdStSingleTied { + // idx encoded in Q field. + bits<1> idx; + bits<5> Xm; + let Inst{30} = idx; + let Inst{23} = 1; + let Inst{20-16} = Xm; + let Inst{12} = 0; + let Inst{11-10} = size; +} + +let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in +multiclass SIMDLdSingleBTied opcode, string asm, + RegisterOperand listtype, + RegisterOperand GPR64pi> { + def i8 : SIMDLdStSingleBTied<1, R, opcode, asm, + (outs listtype:$dst), + (ins listtype:$Vt, VectorIndexB:$idx, + GPR64sp:$Rn), []>; + + def i8_POST : SIMDLdStSingleBTiedPost<1, R, opcode, asm, + (outs GPR64sp:$wback, listtype:$dst), + (ins listtype:$Vt, VectorIndexB:$idx, + GPR64sp:$Rn, GPR64pi:$Xm)>; +} +let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in +multiclass SIMDLdSingleHTied opcode, bit size, string asm, + RegisterOperand listtype, + RegisterOperand GPR64pi> { + def i16 : SIMDLdStSingleHTied<1, R, opcode, size, asm, + (outs listtype:$dst), + (ins listtype:$Vt, VectorIndexH:$idx, + GPR64sp:$Rn), []>; + + def i16_POST : SIMDLdStSingleHTiedPost<1, R, opcode, size, asm, + (outs GPR64sp:$wback, listtype:$dst), + (ins listtype:$Vt, VectorIndexH:$idx, + GPR64sp:$Rn, GPR64pi:$Xm)>; +} +let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in +multiclass SIMDLdSingleSTied opcode, bits<2> size,string asm, + RegisterOperand listtype, + RegisterOperand GPR64pi> { + def i32 : SIMDLdStSingleSTied<1, R, opcode, size, asm, + (outs listtype:$dst), + (ins listtype:$Vt, VectorIndexS:$idx, + GPR64sp:$Rn), []>; + + def i32_POST : SIMDLdStSingleSTiedPost<1, R, opcode, size, asm, + (outs GPR64sp:$wback, listtype:$dst), + (ins listtype:$Vt, VectorIndexS:$idx, + GPR64sp:$Rn, GPR64pi:$Xm)>; +} +let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in +multiclass SIMDLdSingleDTied opcode, bits<2> size, string asm, + RegisterOperand listtype, RegisterOperand GPR64pi> { + def i64 : SIMDLdStSingleDTied<1, R, opcode, size, asm, + (outs listtype:$dst), + (ins listtype:$Vt, VectorIndexD:$idx, + GPR64sp:$Rn), []>; + + def i64_POST : SIMDLdStSingleDTiedPost<1, R, opcode, size, asm, + (outs GPR64sp:$wback, listtype:$dst), + (ins listtype:$Vt, VectorIndexD:$idx, + GPR64sp:$Rn, GPR64pi:$Xm)>; +} +let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in +multiclass SIMDStSingleB opcode, string asm, + RegisterOperand listtype, RegisterOperand GPR64pi> { + def i8 : SIMDLdStSingleB<0, R, opcode, asm, + (outs), (ins listtype:$Vt, VectorIndexB:$idx, + GPR64sp:$Rn), []>; + + def i8_POST : 
SIMDLdStSingleBPost<0, R, opcode, asm, + (outs GPR64sp:$wback), + (ins listtype:$Vt, VectorIndexB:$idx, + GPR64sp:$Rn, GPR64pi:$Xm)>; +} +let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in +multiclass SIMDStSingleH opcode, bit size, string asm, + RegisterOperand listtype, RegisterOperand GPR64pi> { + def i16 : SIMDLdStSingleH<0, R, opcode, size, asm, + (outs), (ins listtype:$Vt, VectorIndexH:$idx, + GPR64sp:$Rn), []>; + + def i16_POST : SIMDLdStSingleHPost<0, R, opcode, size, asm, + (outs GPR64sp:$wback), + (ins listtype:$Vt, VectorIndexH:$idx, + GPR64sp:$Rn, GPR64pi:$Xm)>; +} +let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in +multiclass SIMDStSingleS opcode, bits<2> size,string asm, + RegisterOperand listtype, RegisterOperand GPR64pi> { + def i32 : SIMDLdStSingleS<0, R, opcode, size, asm, + (outs), (ins listtype:$Vt, VectorIndexS:$idx, + GPR64sp:$Rn), []>; + + def i32_POST : SIMDLdStSingleSPost<0, R, opcode, size, asm, + (outs GPR64sp:$wback), + (ins listtype:$Vt, VectorIndexS:$idx, + GPR64sp:$Rn, GPR64pi:$Xm)>; +} +let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in +multiclass SIMDStSingleD opcode, bits<2> size, string asm, + RegisterOperand listtype, RegisterOperand GPR64pi> { + def i64 : SIMDLdStSingleD<0, R, opcode, size, asm, + (outs), (ins listtype:$Vt, VectorIndexD:$idx, + GPR64sp:$Rn), []>; + + def i64_POST : SIMDLdStSingleDPost<0, R, opcode, size, asm, + (outs GPR64sp:$wback), + (ins listtype:$Vt, VectorIndexD:$idx, + GPR64sp:$Rn, GPR64pi:$Xm)>; +} + +multiclass SIMDLdStSingleAliases { + // E.g. "ld1 { v0.8b }[0], [x1], #1" + // "ld1\t$Vt, [$Rn], #1" + // may get mapped to + // (LD1Rv8b_POST VecListOne8b:$Vt, GPR64sp:$Rn, XZR) + def : InstAlias(NAME # Type # "_POST") + GPR64sp:$Rn, + !cast("VecList" # Count # layout):$Vt, + idxtype:$idx, XZR), 1>; + + // E.g. "ld1.8b { v0 }[0], [x1], #1" + // "ld1.8b\t$Vt, [$Rn], #1" + // may get mapped to + // (LD1Rv8b_POST VecListOne64:$Vt, GPR64sp:$Rn, XZR) + def : InstAlias(NAME # Type # "_POST") + GPR64sp:$Rn, + !cast("VecList" # Count # "128"):$Vt, + idxtype:$idx, XZR), 0>; + + // E.g. "ld1.8b { v0 }[0], [x1]" + // "ld1.8b\t$Vt, [$Rn]" + // may get mapped to + // (LD1Rv8b VecListOne64:$Vt, GPR64sp:$Rn) + def : InstAlias(NAME # Type) + !cast("VecList" # Count # "128"):$Vt, + idxtype:$idx, GPR64sp:$Rn), 0>; + + // E.g. 
"ld1.8b { v0 }[0], [x1], x2" + // "ld1.8b\t$Vt, [$Rn], $Xm" + // may get mapped to + // (LD1Rv8b_POST VecListOne64:$Vt, GPR64sp:$Rn, GPR64pi1:$Xm) + def : InstAlias(NAME # Type # "_POST") + GPR64sp:$Rn, + !cast("VecList" # Count # "128"):$Vt, + idxtype:$idx, + !cast("GPR64pi" # Offset):$Xm), 0>; +} + +multiclass SIMDLdSt1SingleAliases { + defm : SIMDLdStSingleAliases; + defm : SIMDLdStSingleAliases; + defm : SIMDLdStSingleAliases; + defm : SIMDLdStSingleAliases; +} + +multiclass SIMDLdSt2SingleAliases { + defm : SIMDLdStSingleAliases; + defm : SIMDLdStSingleAliases; + defm : SIMDLdStSingleAliases; + defm : SIMDLdStSingleAliases; +} + +multiclass SIMDLdSt3SingleAliases { + defm : SIMDLdStSingleAliases; + defm : SIMDLdStSingleAliases; + defm : SIMDLdStSingleAliases; + defm : SIMDLdStSingleAliases; +} + +multiclass SIMDLdSt4SingleAliases { + defm : SIMDLdStSingleAliases; + defm : SIMDLdStSingleAliases; + defm : SIMDLdStSingleAliases; + defm : SIMDLdStSingleAliases; +} +} // end of 'let Predicates = [HasNEON]' + +//---------------------------------------------------------------------------- +// Crypto extensions +//---------------------------------------------------------------------------- + +let Predicates = [HasCrypto] in { +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class AESBase opc, string asm, dag outs, dag ins, string cstr, + list pat> + : I, + Sched<[WriteV]>{ + bits<5> Rd; + bits<5> Rn; + let Inst{31-16} = 0b0100111000101000; + let Inst{15-12} = opc; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +class AESInst opc, string asm, Intrinsic OpNode> + : AESBase; + +class AESTiedInst opc, string asm, Intrinsic OpNode> + : AESBase; + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class SHA3OpTiedInst opc, string asm, string dst_lhs_kind, + dag oops, dag iops, list pat> + : I, + Sched<[WriteV]>{ + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + let Inst{31-21} = 0b01011110000; + let Inst{20-16} = Rm; + let Inst{15} = 0; + let Inst{14-12} = opc; + let Inst{11-10} = 0b00; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +class SHATiedInstQSV opc, string asm, Intrinsic OpNode> + : SHA3OpTiedInst; + +class SHATiedInstVVV opc, string asm, Intrinsic OpNode> + : SHA3OpTiedInst; + +class SHATiedInstQQV opc, string asm, Intrinsic OpNode> + : SHA3OpTiedInst; + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class SHA2OpInst opc, string asm, string kind, + string cstr, dag oops, dag iops, + list pat> + : I, + Sched<[WriteV]>{ + bits<5> Rd; + bits<5> Rn; + let Inst{31-16} = 0b0101111000101000; + let Inst{15-12} = opc; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +class SHATiedInstVV opc, string asm, Intrinsic OpNode> + : SHA2OpInst; + +class SHAInstSS opc, string asm, Intrinsic OpNode> + : SHA2OpInst; +} // end of 'let Predicates = [HasCrypto]' + +// Allow the size specifier tokens to be upper case, not just lower. 
+def : TokenAlias<".8B", ".8b">;
+def : TokenAlias<".4H", ".4h">;
+def : TokenAlias<".2S", ".2s">;
+def : TokenAlias<".1D", ".1d">;
+def : TokenAlias<".16B", ".16b">;
+def : TokenAlias<".8H", ".8h">;
+def : TokenAlias<".4S", ".4s">;
+def : TokenAlias<".2D", ".2d">;
+def : TokenAlias<".1Q", ".1q">;
+def : TokenAlias<".B", ".b">;
+def : TokenAlias<".H", ".h">;
+def : TokenAlias<".S", ".s">;
+def : TokenAlias<".D", ".d">;
+def : TokenAlias<".Q", ".q">;
diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp
new file mode 100644
index 00000000000..52e3b333eb0
--- /dev/null
+++ b/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -0,0 +1,2065 @@
+//===- AArch64InstrInfo.cpp - AArch64 Instruction Information -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the AArch64 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64InstrInfo.h"
+#include "AArch64Subtarget.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define GET_INSTRINFO_CTOR_DTOR
+#include "AArch64GenInstrInfo.inc"
+
+AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI)
+    : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP),
+      RI(this, &STI), Subtarget(STI) {}
+
+/// GetInstSize - Return the number of bytes of code the specified
+/// instruction may be. This returns the maximum number of bytes.
+unsigned AArch64InstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
+  const MCInstrDesc &Desc = MI->getDesc();
+
+  switch (Desc.getOpcode()) {
+  default:
+    // Anything not explicitly designated otherwise is a normal 4-byte insn.
+    return 4;
+  case TargetOpcode::DBG_VALUE:
+  case TargetOpcode::EH_LABEL:
+  case TargetOpcode::IMPLICIT_DEF:
+  case TargetOpcode::KILL:
+    return 0;
+  }
+
+  llvm_unreachable("GetInstSizeInBytes()- Unable to determine insn size");
+}
+
+static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target,
+                            SmallVectorImpl<MachineOperand> &Cond) {
+  // Block ends with fall-through condbranch.
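+  // The Cond vector built here (and consumed by AnalyzeBranch,
+  // ReverseBranchCondition and insertSelect below) takes one of three shapes,
+  // matching the cases of this switch:
+  //   b.cc      -> [cc]
+  //   cbz/cbnz  -> [-1, opcode, reg]
+  //   tbz/tbnz  -> [-1, opcode, reg, bit]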
+ switch (LastInst->getOpcode()) { + default: + llvm_unreachable("Unknown branch instruction?"); + case AArch64::Bcc: + Target = LastInst->getOperand(1).getMBB(); + Cond.push_back(LastInst->getOperand(0)); + break; + case AArch64::CBZW: + case AArch64::CBZX: + case AArch64::CBNZW: + case AArch64::CBNZX: + Target = LastInst->getOperand(1).getMBB(); + Cond.push_back(MachineOperand::CreateImm(-1)); + Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode())); + Cond.push_back(LastInst->getOperand(0)); + break; + case AArch64::TBZW: + case AArch64::TBZX: + case AArch64::TBNZW: + case AArch64::TBNZX: + Target = LastInst->getOperand(2).getMBB(); + Cond.push_back(MachineOperand::CreateImm(-1)); + Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode())); + Cond.push_back(LastInst->getOperand(0)); + Cond.push_back(LastInst->getOperand(1)); + } +} + +// Branch analysis. +bool AArch64InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, + MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl &Cond, + bool AllowModify) const { + // If the block has no terminators, it just falls into the block after it. + MachineBasicBlock::iterator I = MBB.end(); + if (I == MBB.begin()) + return false; + --I; + while (I->isDebugValue()) { + if (I == MBB.begin()) + return false; + --I; + } + if (!isUnpredicatedTerminator(I)) + return false; + + // Get the last instruction in the block. + MachineInstr *LastInst = I; + + // If there is only one terminator instruction, process it. + unsigned LastOpc = LastInst->getOpcode(); + if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) { + if (isUncondBranchOpcode(LastOpc)) { + TBB = LastInst->getOperand(0).getMBB(); + return false; + } + if (isCondBranchOpcode(LastOpc)) { + // Block ends with fall-through condbranch. + parseCondBranch(LastInst, TBB, Cond); + return false; + } + return true; // Can't handle indirect branch. + } + + // Get the instruction before it if it is a terminator. + MachineInstr *SecondLastInst = I; + unsigned SecondLastOpc = SecondLastInst->getOpcode(); + + // If AllowModify is true and the block ends with two or more unconditional + // branches, delete all but the first unconditional branch. + if (AllowModify && isUncondBranchOpcode(LastOpc)) { + while (isUncondBranchOpcode(SecondLastOpc)) { + LastInst->eraseFromParent(); + LastInst = SecondLastInst; + LastOpc = LastInst->getOpcode(); + if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) { + // Return now the only terminator is an unconditional branch. + TBB = LastInst->getOperand(0).getMBB(); + return false; + } else { + SecondLastInst = I; + SecondLastOpc = SecondLastInst->getOpcode(); + } + } + } + + // If there are three terminators, we don't know what sort of block this is. + if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(--I)) + return true; + + // If the block ends with a B and a Bcc, handle it. + if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) { + parseCondBranch(SecondLastInst, TBB, Cond); + FBB = LastInst->getOperand(0).getMBB(); + return false; + } + + // If the block ends with two unconditional branches, handle it. The second + // one is not executed, so remove it. + if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) { + TBB = SecondLastInst->getOperand(0).getMBB(); + I = LastInst; + if (AllowModify) + I->eraseFromParent(); + return false; + } + + // ...likewise if it ends with an indirect branch followed by an unconditional + // branch. 
+ if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) { + I = LastInst; + if (AllowModify) + I->eraseFromParent(); + return true; + } + + // Otherwise, can't handle this. + return true; +} + +bool AArch64InstrInfo::ReverseBranchCondition( + SmallVectorImpl &Cond) const { + if (Cond[0].getImm() != -1) { + // Regular Bcc + AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm(); + Cond[0].setImm(AArch64CC::getInvertedCondCode(CC)); + } else { + // Folded compare-and-branch + switch (Cond[1].getImm()) { + default: + llvm_unreachable("Unknown conditional branch!"); + case AArch64::CBZW: + Cond[1].setImm(AArch64::CBNZW); + break; + case AArch64::CBNZW: + Cond[1].setImm(AArch64::CBZW); + break; + case AArch64::CBZX: + Cond[1].setImm(AArch64::CBNZX); + break; + case AArch64::CBNZX: + Cond[1].setImm(AArch64::CBZX); + break; + case AArch64::TBZW: + Cond[1].setImm(AArch64::TBNZW); + break; + case AArch64::TBNZW: + Cond[1].setImm(AArch64::TBZW); + break; + case AArch64::TBZX: + Cond[1].setImm(AArch64::TBNZX); + break; + case AArch64::TBNZX: + Cond[1].setImm(AArch64::TBZX); + break; + } + } + + return false; +} + +unsigned AArch64InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { + MachineBasicBlock::iterator I = MBB.end(); + if (I == MBB.begin()) + return 0; + --I; + while (I->isDebugValue()) { + if (I == MBB.begin()) + return 0; + --I; + } + if (!isUncondBranchOpcode(I->getOpcode()) && + !isCondBranchOpcode(I->getOpcode())) + return 0; + + // Remove the branch. + I->eraseFromParent(); + + I = MBB.end(); + + if (I == MBB.begin()) + return 1; + --I; + if (!isCondBranchOpcode(I->getOpcode())) + return 1; + + // Remove the branch. + I->eraseFromParent(); + return 2; +} + +void AArch64InstrInfo::instantiateCondBranch( + MachineBasicBlock &MBB, DebugLoc DL, MachineBasicBlock *TBB, + const SmallVectorImpl &Cond) const { + if (Cond[0].getImm() != -1) { + // Regular Bcc + BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB); + } else { + // Folded compare-and-branch + const MachineInstrBuilder MIB = + BuildMI(&MBB, DL, get(Cond[1].getImm())).addReg(Cond[2].getReg()); + if (Cond.size() > 3) + MIB.addImm(Cond[3].getImm()); + MIB.addMBB(TBB); + } +} + +unsigned AArch64InstrInfo::InsertBranch( + MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, + const SmallVectorImpl &Cond, DebugLoc DL) const { + // Shouldn't be a fall through. + assert(TBB && "InsertBranch must not be told to insert a fallthrough"); + + if (!FBB) { + if (Cond.empty()) // Unconditional branch? + BuildMI(&MBB, DL, get(AArch64::B)).addMBB(TBB); + else + instantiateCondBranch(MBB, DL, TBB, Cond); + return 1; + } + + // Two-way conditional branch. + instantiateCondBranch(MBB, DL, TBB, Cond); + BuildMI(&MBB, DL, get(AArch64::B)).addMBB(FBB); + return 2; +} + +// Find the original register that VReg is copied from. +static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) { + while (TargetRegisterInfo::isVirtualRegister(VReg)) { + const MachineInstr *DefMI = MRI.getVRegDef(VReg); + if (!DefMI->isFullCopy()) + return VReg; + VReg = DefMI->getOperand(1).getReg(); + } + return VReg; +} + +// Determine if VReg is defined by an instruction that can be folded into a +// csel instruction. If so, return the folded opcode, and the replacement +// register. 
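+// For example, the cases recognised below are:
+//   add Wd, Wn, #1     (x + 1)  -> folded into csinc
+//   orn Wd, wzr, Wm    (~x)     -> folded into csinv
+//   sub Wd, wzr, Wm    (-x)     -> folded into csneg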
+static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg, + unsigned *NewVReg = nullptr) { + VReg = removeCopies(MRI, VReg); + if (!TargetRegisterInfo::isVirtualRegister(VReg)) + return 0; + + bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg)); + const MachineInstr *DefMI = MRI.getVRegDef(VReg); + unsigned Opc = 0; + unsigned SrcOpNum = 0; + switch (DefMI->getOpcode()) { + case AArch64::ADDSXri: + case AArch64::ADDSWri: + // if NZCV is used, do not fold. + if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1) + return 0; + // fall-through to ADDXri and ADDWri. + case AArch64::ADDXri: + case AArch64::ADDWri: + // add x, 1 -> csinc. + if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 || + DefMI->getOperand(3).getImm() != 0) + return 0; + SrcOpNum = 1; + Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr; + break; + + case AArch64::ORNXrr: + case AArch64::ORNWrr: { + // not x -> csinv, represented as orn dst, xzr, src. + unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg()); + if (ZReg != AArch64::XZR && ZReg != AArch64::WZR) + return 0; + SrcOpNum = 2; + Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr; + break; + } + + case AArch64::SUBSXrr: + case AArch64::SUBSWrr: + // if NZCV is used, do not fold. + if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1) + return 0; + // fall-through to SUBXrr and SUBWrr. + case AArch64::SUBXrr: + case AArch64::SUBWrr: { + // neg x -> csneg, represented as sub dst, xzr, src. + unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg()); + if (ZReg != AArch64::XZR && ZReg != AArch64::WZR) + return 0; + SrcOpNum = 2; + Opc = Is64Bit ? AArch64::CSNEGXr : AArch64::CSNEGWr; + break; + } + default: + return 0; + } + assert(Opc && SrcOpNum && "Missing parameters"); + + if (NewVReg) + *NewVReg = DefMI->getOperand(SrcOpNum).getReg(); + return Opc; +} + +bool AArch64InstrInfo::canInsertSelect( + const MachineBasicBlock &MBB, const SmallVectorImpl &Cond, + unsigned TrueReg, unsigned FalseReg, int &CondCycles, int &TrueCycles, + int &FalseCycles) const { + // Check register classes. + const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + const TargetRegisterClass *RC = + RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg)); + if (!RC) + return false; + + // Expanding cbz/tbz requires an extra cycle of latency on the condition. + unsigned ExtraCondLat = Cond.size() != 1; + + // GPRs are handled by csel. + // FIXME: Fold in x+1, -x, and ~x when applicable. + if (AArch64::GPR64allRegClass.hasSubClassEq(RC) || + AArch64::GPR32allRegClass.hasSubClassEq(RC)) { + // Single-cycle csel, csinc, csinv, and csneg. + CondCycles = 1 + ExtraCondLat; + TrueCycles = FalseCycles = 1; + if (canFoldIntoCSel(MRI, TrueReg)) + TrueCycles = 0; + else if (canFoldIntoCSel(MRI, FalseReg)) + FalseCycles = 0; + return true; + } + + // Scalar floating point is handled by fcsel. + // FIXME: Form fabs, fmin, and fmax when applicable. + if (AArch64::FPR64RegClass.hasSubClassEq(RC) || + AArch64::FPR32RegClass.hasSubClassEq(RC)) { + CondCycles = 5 + ExtraCondLat; + TrueCycles = FalseCycles = 2; + return true; + } + + // Can't do vectors. 
+ return false; +} + +void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DstReg, + const SmallVectorImpl &Cond, + unsigned TrueReg, unsigned FalseReg) const { + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + + // Parse the condition code, see parseCondBranch() above. + AArch64CC::CondCode CC; + switch (Cond.size()) { + default: + llvm_unreachable("Unknown condition opcode in Cond"); + case 1: // b.cc + CC = AArch64CC::CondCode(Cond[0].getImm()); + break; + case 3: { // cbz/cbnz + // We must insert a compare against 0. + bool Is64Bit; + switch (Cond[1].getImm()) { + default: + llvm_unreachable("Unknown branch opcode in Cond"); + case AArch64::CBZW: + Is64Bit = 0; + CC = AArch64CC::EQ; + break; + case AArch64::CBZX: + Is64Bit = 1; + CC = AArch64CC::EQ; + break; + case AArch64::CBNZW: + Is64Bit = 0; + CC = AArch64CC::NE; + break; + case AArch64::CBNZX: + Is64Bit = 1; + CC = AArch64CC::NE; + break; + } + unsigned SrcReg = Cond[2].getReg(); + if (Is64Bit) { + // cmp reg, #0 is actually subs xzr, reg, #0. + MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass); + BuildMI(MBB, I, DL, get(AArch64::SUBSXri), AArch64::XZR) + .addReg(SrcReg) + .addImm(0) + .addImm(0); + } else { + MRI.constrainRegClass(SrcReg, &AArch64::GPR32spRegClass); + BuildMI(MBB, I, DL, get(AArch64::SUBSWri), AArch64::WZR) + .addReg(SrcReg) + .addImm(0) + .addImm(0); + } + break; + } + case 4: { // tbz/tbnz + // We must insert a tst instruction. + switch (Cond[1].getImm()) { + default: + llvm_unreachable("Unknown branch opcode in Cond"); + case AArch64::TBZW: + case AArch64::TBZX: + CC = AArch64CC::EQ; + break; + case AArch64::TBNZW: + case AArch64::TBNZX: + CC = AArch64CC::NE; + break; + } + // cmp reg, #foo is actually ands xzr, reg, #1< 64 bit extension case, these instructions can do + // much more. + if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31) + return false; + // This is a signed or unsigned 32 -> 64 bit extension. + SrcReg = MI.getOperand(1).getReg(); + DstReg = MI.getOperand(0).getReg(); + SubIdx = AArch64::sub_32; + return true; + } +} + +/// analyzeCompare - For a comparison instruction, return the source registers +/// in SrcReg and SrcReg2, and the value it compares against in CmpValue. +/// Return true if the comparison instruction can be analyzed. +bool AArch64InstrInfo::analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, + unsigned &SrcReg2, int &CmpMask, + int &CmpValue) const { + switch (MI->getOpcode()) { + default: + break; + case AArch64::SUBSWrr: + case AArch64::SUBSWrs: + case AArch64::SUBSWrx: + case AArch64::SUBSXrr: + case AArch64::SUBSXrs: + case AArch64::SUBSXrx: + case AArch64::ADDSWrr: + case AArch64::ADDSWrs: + case AArch64::ADDSWrx: + case AArch64::ADDSXrr: + case AArch64::ADDSXrs: + case AArch64::ADDSXrx: + // Replace SUBSWrr with SUBWrr if NZCV is not used. + SrcReg = MI->getOperand(1).getReg(); + SrcReg2 = MI->getOperand(2).getReg(); + CmpMask = ~0; + CmpValue = 0; + return true; + case AArch64::SUBSWri: + case AArch64::ADDSWri: + case AArch64::SUBSXri: + case AArch64::ADDSXri: + SrcReg = MI->getOperand(1).getReg(); + SrcReg2 = 0; + CmpMask = ~0; + CmpValue = MI->getOperand(2).getImm(); + return true; + case AArch64::ANDSWri: + case AArch64::ANDSXri: + // ANDS does not use the same encoding scheme as the others xxxS + // instructions. 
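+    // Its immediate operand is a bitmask-encoded logical immediate, so it is
+    // decoded (below) before being reported back as CmpValue.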
+ SrcReg = MI->getOperand(1).getReg(); + SrcReg2 = 0; + CmpMask = ~0; + CmpValue = AArch64_AM::decodeLogicalImmediate( + MI->getOperand(2).getImm(), + MI->getOpcode() == AArch64::ANDSWri ? 32 : 64); + return true; + } + + return false; +} + +static bool UpdateOperandRegClass(MachineInstr *Instr) { + MachineBasicBlock *MBB = Instr->getParent(); + assert(MBB && "Can't get MachineBasicBlock here"); + MachineFunction *MF = MBB->getParent(); + assert(MF && "Can't get MachineFunction here"); + const TargetMachine *TM = &MF->getTarget(); + const TargetInstrInfo *TII = TM->getInstrInfo(); + const TargetRegisterInfo *TRI = TM->getRegisterInfo(); + MachineRegisterInfo *MRI = &MF->getRegInfo(); + + for (unsigned OpIdx = 0, EndIdx = Instr->getNumOperands(); OpIdx < EndIdx; + ++OpIdx) { + MachineOperand &MO = Instr->getOperand(OpIdx); + const TargetRegisterClass *OpRegCstraints = + Instr->getRegClassConstraint(OpIdx, TII, TRI); + + // If there's no constraint, there's nothing to do. + if (!OpRegCstraints) + continue; + // If the operand is a frame index, there's nothing to do here. + // A frame index operand will resolve correctly during PEI. + if (MO.isFI()) + continue; + + assert(MO.isReg() && + "Operand has register constraints without being a register!"); + + unsigned Reg = MO.getReg(); + if (TargetRegisterInfo::isPhysicalRegister(Reg)) { + if (!OpRegCstraints->contains(Reg)) + return false; + } else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) && + !MRI->constrainRegClass(Reg, OpRegCstraints)) + return false; + } + + return true; +} + +/// optimizeCompareInstr - Convert the instruction supplying the argument to the +/// comparison into one that sets the zero bit in the flags register. +bool AArch64InstrInfo::optimizeCompareInstr( + MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, int CmpMask, + int CmpValue, const MachineRegisterInfo *MRI) const { + + // Replace SUBSWrr with SUBWrr if NZCV is not used. + int Cmp_NZCV = CmpInstr->findRegisterDefOperandIdx(AArch64::NZCV, true); + if (Cmp_NZCV != -1) { + unsigned NewOpc; + switch (CmpInstr->getOpcode()) { + default: + return false; + case AArch64::ADDSWrr: NewOpc = AArch64::ADDWrr; break; + case AArch64::ADDSWri: NewOpc = AArch64::ADDWri; break; + case AArch64::ADDSWrs: NewOpc = AArch64::ADDWrs; break; + case AArch64::ADDSWrx: NewOpc = AArch64::ADDWrx; break; + case AArch64::ADDSXrr: NewOpc = AArch64::ADDXrr; break; + case AArch64::ADDSXri: NewOpc = AArch64::ADDXri; break; + case AArch64::ADDSXrs: NewOpc = AArch64::ADDXrs; break; + case AArch64::ADDSXrx: NewOpc = AArch64::ADDXrx; break; + case AArch64::SUBSWrr: NewOpc = AArch64::SUBWrr; break; + case AArch64::SUBSWri: NewOpc = AArch64::SUBWri; break; + case AArch64::SUBSWrs: NewOpc = AArch64::SUBWrs; break; + case AArch64::SUBSWrx: NewOpc = AArch64::SUBWrx; break; + case AArch64::SUBSXrr: NewOpc = AArch64::SUBXrr; break; + case AArch64::SUBSXri: NewOpc = AArch64::SUBXri; break; + case AArch64::SUBSXrs: NewOpc = AArch64::SUBXrs; break; + case AArch64::SUBSXrx: NewOpc = AArch64::SUBXrx; break; + } + + const MCInstrDesc &MCID = get(NewOpc); + CmpInstr->setDesc(MCID); + CmpInstr->RemoveOperand(Cmp_NZCV); + bool succeeded = UpdateOperandRegClass(CmpInstr); + (void)succeeded; + assert(succeeded && "Some operands reg class are incompatible!"); + return true; + } + + // Continue only if we have a "ri" where immediate is zero. + if (CmpValue != 0 || SrcReg2 != 0) + return false; + + // CmpInstr is a Compare instruction if destination register is not used. 
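+  // (When every check below succeeds, the overall effect is, for example,
+  //  rewriting "add w8, w9, w10; cmp w8, #0; b.eq ..." into
+  //  "adds w8, w9, w10; b.eq ..." and deleting the compare.)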
+ if (!MRI->use_nodbg_empty(CmpInstr->getOperand(0).getReg())) + return false; + + // Get the unique definition of SrcReg. + MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg); + if (!MI) + return false; + + // We iterate backward, starting from the instruction before CmpInstr and + // stop when reaching the definition of the source register or done with the + // basic block, to check whether NZCV is used or modified in between. + MachineBasicBlock::iterator I = CmpInstr, E = MI, + B = CmpInstr->getParent()->begin(); + + // Early exit if CmpInstr is at the beginning of the BB. + if (I == B) + return false; + + // Check whether the definition of SrcReg is in the same basic block as + // Compare. If not, we can't optimize away the Compare. + if (MI->getParent() != CmpInstr->getParent()) + return false; + + // Check that NZCV isn't set between the comparison instruction and the one we + // want to change. + const TargetRegisterInfo *TRI = &getRegisterInfo(); + for (--I; I != E; --I) { + const MachineInstr &Instr = *I; + + if (Instr.modifiesRegister(AArch64::NZCV, TRI) || + Instr.readsRegister(AArch64::NZCV, TRI)) + // This instruction modifies or uses NZCV after the one we want to + // change. We can't do this transformation. + return false; + if (I == B) + // The 'and' is below the comparison instruction. + return false; + } + + unsigned NewOpc = MI->getOpcode(); + switch (MI->getOpcode()) { + default: + return false; + case AArch64::ADDSWrr: + case AArch64::ADDSWri: + case AArch64::ADDSXrr: + case AArch64::ADDSXri: + case AArch64::SUBSWrr: + case AArch64::SUBSWri: + case AArch64::SUBSXrr: + case AArch64::SUBSXri: + break; + case AArch64::ADDWrr: NewOpc = AArch64::ADDSWrr; break; + case AArch64::ADDWri: NewOpc = AArch64::ADDSWri; break; + case AArch64::ADDXrr: NewOpc = AArch64::ADDSXrr; break; + case AArch64::ADDXri: NewOpc = AArch64::ADDSXri; break; + case AArch64::ADCWr: NewOpc = AArch64::ADCSWr; break; + case AArch64::ADCXr: NewOpc = AArch64::ADCSXr; break; + case AArch64::SUBWrr: NewOpc = AArch64::SUBSWrr; break; + case AArch64::SUBWri: NewOpc = AArch64::SUBSWri; break; + case AArch64::SUBXrr: NewOpc = AArch64::SUBSXrr; break; + case AArch64::SUBXri: NewOpc = AArch64::SUBSXri; break; + case AArch64::SBCWr: NewOpc = AArch64::SBCSWr; break; + case AArch64::SBCXr: NewOpc = AArch64::SBCSXr; break; + case AArch64::ANDWri: NewOpc = AArch64::ANDSWri; break; + case AArch64::ANDXri: NewOpc = AArch64::ANDSXri; break; + } + + // Scan forward for the use of NZCV. + // When checking against MI: if it's a conditional code requires + // checking of V bit, then this is not safe to do. + // It is safe to remove CmpInstr if NZCV is redefined or killed. + // If we are done with the basic block, we need to check whether NZCV is + // live-out. + bool IsSafe = false; + for (MachineBasicBlock::iterator I = CmpInstr, + E = CmpInstr->getParent()->end(); + !IsSafe && ++I != E;) { + const MachineInstr &Instr = *I; + for (unsigned IO = 0, EO = Instr.getNumOperands(); !IsSafe && IO != EO; + ++IO) { + const MachineOperand &MO = Instr.getOperand(IO); + if (MO.isRegMask() && MO.clobbersPhysReg(AArch64::NZCV)) { + IsSafe = true; + break; + } + if (!MO.isReg() || MO.getReg() != AArch64::NZCV) + continue; + if (MO.isDef()) { + IsSafe = true; + break; + } + + // Decode the condition code. 
+ unsigned Opc = Instr.getOpcode(); + AArch64CC::CondCode CC; + switch (Opc) { + default: + return false; + case AArch64::Bcc: + CC = (AArch64CC::CondCode)Instr.getOperand(IO - 2).getImm(); + break; + case AArch64::CSINVWr: + case AArch64::CSINVXr: + case AArch64::CSINCWr: + case AArch64::CSINCXr: + case AArch64::CSELWr: + case AArch64::CSELXr: + case AArch64::CSNEGWr: + case AArch64::CSNEGXr: + case AArch64::FCSELSrrr: + case AArch64::FCSELDrrr: + CC = (AArch64CC::CondCode)Instr.getOperand(IO - 1).getImm(); + break; + } + + // It is not safe to remove Compare instruction if Overflow(V) is used. + switch (CC) { + default: + // NZCV can be used multiple times, we should continue. + break; + case AArch64CC::VS: + case AArch64CC::VC: + case AArch64CC::GE: + case AArch64CC::LT: + case AArch64CC::GT: + case AArch64CC::LE: + return false; + } + } + } + + // If NZCV is not killed nor re-defined, we should check whether it is + // live-out. If it is live-out, do not optimize. + if (!IsSafe) { + MachineBasicBlock *ParentBlock = CmpInstr->getParent(); + for (auto *MBB : ParentBlock->successors()) + if (MBB->isLiveIn(AArch64::NZCV)) + return false; + } + + // Update the instruction to set NZCV. + MI->setDesc(get(NewOpc)); + CmpInstr->eraseFromParent(); + bool succeeded = UpdateOperandRegClass(MI); + (void)succeeded; + assert(succeeded && "Some operands reg class are incompatible!"); + MI->addRegisterDefined(AArch64::NZCV, TRI); + return true; +} + +/// Return true if this is this instruction has a non-zero immediate +bool AArch64InstrInfo::hasShiftedReg(const MachineInstr *MI) const { + switch (MI->getOpcode()) { + default: + break; + case AArch64::ADDSWrs: + case AArch64::ADDSXrs: + case AArch64::ADDWrs: + case AArch64::ADDXrs: + case AArch64::ANDSWrs: + case AArch64::ANDSXrs: + case AArch64::ANDWrs: + case AArch64::ANDXrs: + case AArch64::BICSWrs: + case AArch64::BICSXrs: + case AArch64::BICWrs: + case AArch64::BICXrs: + case AArch64::CRC32Brr: + case AArch64::CRC32CBrr: + case AArch64::CRC32CHrr: + case AArch64::CRC32CWrr: + case AArch64::CRC32CXrr: + case AArch64::CRC32Hrr: + case AArch64::CRC32Wrr: + case AArch64::CRC32Xrr: + case AArch64::EONWrs: + case AArch64::EONXrs: + case AArch64::EORWrs: + case AArch64::EORXrs: + case AArch64::ORNWrs: + case AArch64::ORNXrs: + case AArch64::ORRWrs: + case AArch64::ORRXrs: + case AArch64::SUBSWrs: + case AArch64::SUBSXrs: + case AArch64::SUBWrs: + case AArch64::SUBXrs: + if (MI->getOperand(3).isImm()) { + unsigned val = MI->getOperand(3).getImm(); + return (val != 0); + } + break; + } + return false; +} + +/// Return true if this is this instruction has a non-zero immediate +bool AArch64InstrInfo::hasExtendedReg(const MachineInstr *MI) const { + switch (MI->getOpcode()) { + default: + break; + case AArch64::ADDSWrx: + case AArch64::ADDSXrx: + case AArch64::ADDSXrx64: + case AArch64::ADDWrx: + case AArch64::ADDXrx: + case AArch64::ADDXrx64: + case AArch64::SUBSWrx: + case AArch64::SUBSXrx: + case AArch64::SUBSXrx64: + case AArch64::SUBWrx: + case AArch64::SUBXrx: + case AArch64::SUBXrx64: + if (MI->getOperand(3).isImm()) { + unsigned val = MI->getOperand(3).getImm(); + return (val != 0); + } + break; + } + + return false; +} + +// Return true if this instruction simply sets its single destination register +// to zero. This is equivalent to a register rename of the zero-register. 
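+// Examples of the forms matched below: "movz w0, #0", "and w0, wzr, #imm",
+// and a plain COPY from WZR.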
+bool AArch64InstrInfo::isGPRZero(const MachineInstr *MI) const { + switch (MI->getOpcode()) { + default: + break; + case AArch64::MOVZWi: + case AArch64::MOVZXi: // movz Rd, #0 (LSL #0) + if (MI->getOperand(1).isImm() && MI->getOperand(1).getImm() == 0) { + assert(MI->getDesc().getNumOperands() == 3 && + MI->getOperand(2).getImm() == 0 && "invalid MOVZi operands"); + return true; + } + break; + case AArch64::ANDWri: // and Rd, Rzr, #imm + return MI->getOperand(1).getReg() == AArch64::WZR; + case AArch64::ANDXri: + return MI->getOperand(1).getReg() == AArch64::XZR; + case TargetOpcode::COPY: + return MI->getOperand(1).getReg() == AArch64::WZR; + } + return false; +} + +// Return true if this instruction simply renames a general register without +// modifying bits. +bool AArch64InstrInfo::isGPRCopy(const MachineInstr *MI) const { + switch (MI->getOpcode()) { + default: + break; + case TargetOpcode::COPY: { + // GPR32 copies will by lowered to ORRXrs + unsigned DstReg = MI->getOperand(0).getReg(); + return (AArch64::GPR32RegClass.contains(DstReg) || + AArch64::GPR64RegClass.contains(DstReg)); + } + case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0) + if (MI->getOperand(1).getReg() == AArch64::XZR) { + assert(MI->getDesc().getNumOperands() == 4 && + MI->getOperand(3).getImm() == 0 && "invalid ORRrs operands"); + return true; + } + case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0) + if (MI->getOperand(2).getImm() == 0) { + assert(MI->getDesc().getNumOperands() == 4 && + MI->getOperand(3).getImm() == 0 && "invalid ADDXri operands"); + return true; + } + } + return false; +} + +// Return true if this instruction simply renames a general register without +// modifying bits. +bool AArch64InstrInfo::isFPRCopy(const MachineInstr *MI) const { + switch (MI->getOpcode()) { + default: + break; + case TargetOpcode::COPY: { + // FPR64 copies will by lowered to ORR.16b + unsigned DstReg = MI->getOperand(0).getReg(); + return (AArch64::FPR64RegClass.contains(DstReg) || + AArch64::FPR128RegClass.contains(DstReg)); + } + case AArch64::ORRv16i8: + if (MI->getOperand(1).getReg() == MI->getOperand(2).getReg()) { + assert(MI->getDesc().getNumOperands() == 3 && MI->getOperand(0).isReg() && + "invalid ORRv16i8 operands"); + return true; + } + } + return false; +} + +unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr *MI, + int &FrameIndex) const { + switch (MI->getOpcode()) { + default: + break; + case AArch64::LDRWui: + case AArch64::LDRXui: + case AArch64::LDRBui: + case AArch64::LDRHui: + case AArch64::LDRSui: + case AArch64::LDRDui: + case AArch64::LDRQui: + if (MI->getOperand(0).getSubReg() == 0 && MI->getOperand(1).isFI() && + MI->getOperand(2).isImm() && MI->getOperand(2).getImm() == 0) { + FrameIndex = MI->getOperand(1).getIndex(); + return MI->getOperand(0).getReg(); + } + break; + } + + return 0; +} + +unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr *MI, + int &FrameIndex) const { + switch (MI->getOpcode()) { + default: + break; + case AArch64::STRWui: + case AArch64::STRXui: + case AArch64::STRBui: + case AArch64::STRHui: + case AArch64::STRSui: + case AArch64::STRDui: + case AArch64::STRQui: + if (MI->getOperand(0).getSubReg() == 0 && MI->getOperand(1).isFI() && + MI->getOperand(2).isImm() && MI->getOperand(2).getImm() == 0) { + FrameIndex = MI->getOperand(1).getIndex(); + return MI->getOperand(0).getReg(); + } + break; + } + return 0; +} + +/// Return true if this is load/store scales or extends its register offset. 
+/// This refers to scaling a dynamic index as opposed to scaled immediates. +/// MI should be a memory op that allows scaled addressing. +bool AArch64InstrInfo::isScaledAddr(const MachineInstr *MI) const { + switch (MI->getOpcode()) { + default: + break; + case AArch64::LDRBBroW: + case AArch64::LDRBroW: + case AArch64::LDRDroW: + case AArch64::LDRHHroW: + case AArch64::LDRHroW: + case AArch64::LDRQroW: + case AArch64::LDRSBWroW: + case AArch64::LDRSBXroW: + case AArch64::LDRSHWroW: + case AArch64::LDRSHXroW: + case AArch64::LDRSWroW: + case AArch64::LDRSroW: + case AArch64::LDRWroW: + case AArch64::LDRXroW: + case AArch64::STRBBroW: + case AArch64::STRBroW: + case AArch64::STRDroW: + case AArch64::STRHHroW: + case AArch64::STRHroW: + case AArch64::STRQroW: + case AArch64::STRSroW: + case AArch64::STRWroW: + case AArch64::STRXroW: + case AArch64::LDRBBroX: + case AArch64::LDRBroX: + case AArch64::LDRDroX: + case AArch64::LDRHHroX: + case AArch64::LDRHroX: + case AArch64::LDRQroX: + case AArch64::LDRSBWroX: + case AArch64::LDRSBXroX: + case AArch64::LDRSHWroX: + case AArch64::LDRSHXroX: + case AArch64::LDRSWroX: + case AArch64::LDRSroX: + case AArch64::LDRWroX: + case AArch64::LDRXroX: + case AArch64::STRBBroX: + case AArch64::STRBroX: + case AArch64::STRDroX: + case AArch64::STRHHroX: + case AArch64::STRHroX: + case AArch64::STRQroX: + case AArch64::STRSroX: + case AArch64::STRWroX: + case AArch64::STRXroX: + + unsigned Val = MI->getOperand(3).getImm(); + AArch64_AM::ShiftExtendType ExtType = AArch64_AM::getMemExtendType(Val); + return (ExtType != AArch64_AM::UXTX) || AArch64_AM::getMemDoShift(Val); + } + return false; +} + +/// Check all MachineMemOperands for a hint to suppress pairing. +bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr *MI) const { + assert(MOSuppressPair < (1 << MachineMemOperand::MOTargetNumBits) && + "Too many target MO flags"); + for (auto *MM : MI->memoperands()) { + if (MM->getFlags() & + (MOSuppressPair << MachineMemOperand::MOTargetStartBit)) { + return true; + } + } + return false; +} + +/// Set a flag on the first MachineMemOperand to suppress pairing. +void AArch64InstrInfo::suppressLdStPair(MachineInstr *MI) const { + if (MI->memoperands_empty()) + return; + + assert(MOSuppressPair < (1 << MachineMemOperand::MOTargetNumBits) && + "Too many target MO flags"); + (*MI->memoperands_begin()) + ->setFlags(MOSuppressPair << MachineMemOperand::MOTargetStartBit); +} + +bool +AArch64InstrInfo::getLdStBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, + unsigned &Offset, + const TargetRegisterInfo *TRI) const { + switch (LdSt->getOpcode()) { + default: + return false; + case AArch64::STRSui: + case AArch64::STRDui: + case AArch64::STRQui: + case AArch64::STRXui: + case AArch64::STRWui: + case AArch64::LDRSui: + case AArch64::LDRDui: + case AArch64::LDRQui: + case AArch64::LDRXui: + case AArch64::LDRWui: + if (!LdSt->getOperand(1).isReg() || !LdSt->getOperand(2).isImm()) + return false; + BaseReg = LdSt->getOperand(1).getReg(); + MachineFunction &MF = *LdSt->getParent()->getParent(); + unsigned Width = getRegClass(LdSt->getDesc(), 0, TRI, MF)->getSize(); + Offset = LdSt->getOperand(2).getImm() * Width; + return true; + }; +} + +/// Detect opportunities for ldp/stp formation. +/// +/// Only called for LdSt for which getLdStBaseRegImmOfs returns true. +bool AArch64InstrInfo::shouldClusterLoads(MachineInstr *FirstLdSt, + MachineInstr *SecondLdSt, + unsigned NumLoads) const { + // Only cluster up to a single pair. 
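+  // (For example, "ldr x0, [x8]" and "ldr x1, [x8, #8]" satisfy these checks
+  //  and may later be merged into "ldp x0, x1, [x8]".)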
+ if (NumLoads > 1) + return false; + if (FirstLdSt->getOpcode() != SecondLdSt->getOpcode()) + return false; + // getLdStBaseRegImmOfs guarantees that oper 2 isImm. + unsigned Ofs1 = FirstLdSt->getOperand(2).getImm(); + // Allow 6 bits of positive range. + if (Ofs1 > 64) + return false; + // The caller should already have ordered First/SecondLdSt by offset. + unsigned Ofs2 = SecondLdSt->getOperand(2).getImm(); + return Ofs1 + 1 == Ofs2; +} + +bool AArch64InstrInfo::shouldScheduleAdjacent(MachineInstr *First, + MachineInstr *Second) const { + // Cyclone can fuse CMN, CMP followed by Bcc. + + // FIXME: B0 can also fuse: + // AND, BIC, ORN, ORR, or EOR (optional S) followed by Bcc or CBZ or CBNZ. + if (Second->getOpcode() != AArch64::Bcc) + return false; + switch (First->getOpcode()) { + default: + return false; + case AArch64::SUBSWri: + case AArch64::ADDSWri: + case AArch64::ANDSWri: + case AArch64::SUBSXri: + case AArch64::ADDSXri: + case AArch64::ANDSXri: + return true; + } +} + +MachineInstr *AArch64InstrInfo::emitFrameIndexDebugValue(MachineFunction &MF, + int FrameIx, + uint64_t Offset, + const MDNode *MDPtr, + DebugLoc DL) const { + MachineInstrBuilder MIB = BuildMI(MF, DL, get(AArch64::DBG_VALUE)) + .addFrameIndex(FrameIx) + .addImm(0) + .addImm(Offset) + .addMetadata(MDPtr); + return &*MIB; +} + +static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB, + unsigned Reg, unsigned SubIdx, + unsigned State, + const TargetRegisterInfo *TRI) { + if (!SubIdx) + return MIB.addReg(Reg, State); + + if (TargetRegisterInfo::isPhysicalRegister(Reg)) + return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State); + return MIB.addReg(Reg, State, SubIdx); +} + +static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg, + unsigned NumRegs) { + // We really want the positive remainder mod 32 here, that happens to be + // easily obtainable with a mask. + return ((DestReg - SrcReg) & 0x1f) < NumRegs; +} + +void AArch64InstrInfo::copyPhysRegTuple( + MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, bool KillSrc, unsigned Opcode, + llvm::ArrayRef Indices) const { + assert(getSubTarget().hasNEON() && + "Unexpected register copy without NEON"); + const TargetRegisterInfo *TRI = &getRegisterInfo(); + uint16_t DestEncoding = TRI->getEncodingValue(DestReg); + uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg); + unsigned NumRegs = Indices.size(); + + int SubReg = 0, End = NumRegs, Incr = 1; + if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) { + SubReg = NumRegs - 1; + End = -1; + Incr = -1; + } + + for (; SubReg != End; SubReg += Incr) { + const MachineInstrBuilder &MIB = BuildMI(MBB, I, DL, get(Opcode)); + AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI); + AddSubReg(MIB, SrcReg, Indices[SubReg], 0, TRI); + AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI); + } +} + +void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const { + if (AArch64::GPR32spRegClass.contains(DestReg) && + (AArch64::GPR32spRegClass.contains(SrcReg) || SrcReg == AArch64::WZR)) { + const TargetRegisterInfo *TRI = &getRegisterInfo(); + + if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) { + // If either operand is WSP, expand to ADD #0. + if (Subtarget.hasZeroCycleRegMove()) { + // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move. 
+ unsigned DestRegX = TRI->getMatchingSuperReg(DestReg, AArch64::sub_32, + &AArch64::GPR64spRegClass); + unsigned SrcRegX = TRI->getMatchingSuperReg(SrcReg, AArch64::sub_32, + &AArch64::GPR64spRegClass); + // This instruction is reading and writing X registers. This may upset + // the register scavenger and machine verifier, so we need to indicate + // that we are reading an undefined value from SrcRegX, but a proper + // value from SrcReg. + BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestRegX) + .addReg(SrcRegX, RegState::Undef) + .addImm(0) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)) + .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc)); + } else { + BuildMI(MBB, I, DL, get(AArch64::ADDWri), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)) + .addImm(0) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); + } + } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroing()) { + BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg).addImm(0).addImm( + AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); + } else { + if (Subtarget.hasZeroCycleRegMove()) { + // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move. + unsigned DestRegX = TRI->getMatchingSuperReg(DestReg, AArch64::sub_32, + &AArch64::GPR64spRegClass); + unsigned SrcRegX = TRI->getMatchingSuperReg(SrcReg, AArch64::sub_32, + &AArch64::GPR64spRegClass); + // This instruction is reading and writing X registers. This may upset + // the register scavenger and machine verifier, so we need to indicate + // that we are reading an undefined value from SrcRegX, but a proper + // value from SrcReg. + BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestRegX) + .addReg(AArch64::XZR) + .addReg(SrcRegX, RegState::Undef) + .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc)); + } else { + // Otherwise, expand to ORR WZR. + BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg) + .addReg(AArch64::WZR) + .addReg(SrcReg, getKillRegState(KillSrc)); + } + } + return; + } + + if (AArch64::GPR64spRegClass.contains(DestReg) && + (AArch64::GPR64spRegClass.contains(SrcReg) || SrcReg == AArch64::XZR)) { + if (DestReg == AArch64::SP || SrcReg == AArch64::SP) { + // If either operand is SP, expand to ADD #0. + BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)) + .addImm(0) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); + } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroing()) { + BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg).addImm(0).addImm( + AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); + } else { + // Otherwise, expand to ORR XZR. + BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg) + .addReg(AArch64::XZR) + .addReg(SrcReg, getKillRegState(KillSrc)); + } + return; + } + + // Copy a DDDD register quad by copying the individual sub-registers. + if (AArch64::DDDDRegClass.contains(DestReg) && + AArch64::DDDDRegClass.contains(SrcReg)) { + static const unsigned Indices[] = { AArch64::dsub0, AArch64::dsub1, + AArch64::dsub2, AArch64::dsub3 }; + copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8, + Indices); + return; + } + + // Copy a DDD register triple by copying the individual sub-registers. 
+ if (AArch64::DDDRegClass.contains(DestReg) && + AArch64::DDDRegClass.contains(SrcReg)) { + static const unsigned Indices[] = { AArch64::dsub0, AArch64::dsub1, + AArch64::dsub2 }; + copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8, + Indices); + return; + } + + // Copy a DD register pair by copying the individual sub-registers. + if (AArch64::DDRegClass.contains(DestReg) && + AArch64::DDRegClass.contains(SrcReg)) { + static const unsigned Indices[] = { AArch64::dsub0, AArch64::dsub1 }; + copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8, + Indices); + return; + } + + // Copy a QQQQ register quad by copying the individual sub-registers. + if (AArch64::QQQQRegClass.contains(DestReg) && + AArch64::QQQQRegClass.contains(SrcReg)) { + static const unsigned Indices[] = { AArch64::qsub0, AArch64::qsub1, + AArch64::qsub2, AArch64::qsub3 }; + copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8, + Indices); + return; + } + + // Copy a QQQ register triple by copying the individual sub-registers. + if (AArch64::QQQRegClass.contains(DestReg) && + AArch64::QQQRegClass.contains(SrcReg)) { + static const unsigned Indices[] = { AArch64::qsub0, AArch64::qsub1, + AArch64::qsub2 }; + copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8, + Indices); + return; + } + + // Copy a QQ register pair by copying the individual sub-registers. + if (AArch64::QQRegClass.contains(DestReg) && + AArch64::QQRegClass.contains(SrcReg)) { + static const unsigned Indices[] = { AArch64::qsub0, AArch64::qsub1 }; + copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8, + Indices); + return; + } + + if (AArch64::FPR128RegClass.contains(DestReg) && + AArch64::FPR128RegClass.contains(SrcReg)) { + if(getSubTarget().hasNEON()) { + BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg) + .addReg(SrcReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + } else { + BuildMI(MBB, I, DL, get(AArch64::STRQpre)) + .addReg(AArch64::SP, RegState::Define) + .addReg(SrcReg, getKillRegState(KillSrc)) + .addReg(AArch64::SP) + .addImm(-16); + BuildMI(MBB, I, DL, get(AArch64::LDRQpre)) + .addReg(AArch64::SP, RegState::Define) + .addReg(DestReg, RegState::Define) + .addReg(AArch64::SP) + .addImm(16); + } + return; + } + + if (AArch64::FPR64RegClass.contains(DestReg) && + AArch64::FPR64RegClass.contains(SrcReg)) { + if(getSubTarget().hasNEON()) { + DestReg = RI.getMatchingSuperReg(DestReg, AArch64::dsub, + &AArch64::FPR128RegClass); + SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::dsub, + &AArch64::FPR128RegClass); + BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg) + .addReg(SrcReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + } else { + BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + } + return; + } + + if (AArch64::FPR32RegClass.contains(DestReg) && + AArch64::FPR32RegClass.contains(SrcReg)) { + if(getSubTarget().hasNEON()) { + DestReg = RI.getMatchingSuperReg(DestReg, AArch64::ssub, + &AArch64::FPR128RegClass); + SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::ssub, + &AArch64::FPR128RegClass); + BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg) + .addReg(SrcReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + } else { + BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + } + return; + } + + if (AArch64::FPR16RegClass.contains(DestReg) && + AArch64::FPR16RegClass.contains(SrcReg)) { + if(getSubTarget().hasNEON()) { + DestReg = 
RI.getMatchingSuperReg(DestReg, AArch64::hsub, + &AArch64::FPR128RegClass); + SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub, + &AArch64::FPR128RegClass); + BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg) + .addReg(SrcReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + } else { + DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub, + &AArch64::FPR32RegClass); + SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub, + &AArch64::FPR32RegClass); + BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + } + return; + } + + if (AArch64::FPR8RegClass.contains(DestReg) && + AArch64::FPR8RegClass.contains(SrcReg)) { + if(getSubTarget().hasNEON()) { + DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub, + &AArch64::FPR128RegClass); + SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub, + &AArch64::FPR128RegClass); + BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg) + .addReg(SrcReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + } else { + DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub, + &AArch64::FPR32RegClass); + SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub, + &AArch64::FPR32RegClass); + BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + } + return; + } + + // Copies between GPR64 and FPR64. + if (AArch64::FPR64RegClass.contains(DestReg) && + AArch64::GPR64RegClass.contains(SrcReg)) { + BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + return; + } + if (AArch64::GPR64RegClass.contains(DestReg) && + AArch64::FPR64RegClass.contains(SrcReg)) { + BuildMI(MBB, I, DL, get(AArch64::FMOVDXr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + return; + } + // Copies between GPR32 and FPR32. 
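+  // (These lower to "fmov s0, w0" and "fmov w0, s0" style moves,
+  //  respectively.)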
+ if (AArch64::FPR32RegClass.contains(DestReg) && + AArch64::GPR32RegClass.contains(SrcReg)) { + BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + return; + } + if (AArch64::GPR32RegClass.contains(DestReg) && + AArch64::FPR32RegClass.contains(SrcReg)) { + BuildMI(MBB, I, DL, get(AArch64::FMOVSWr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + return; + } + + assert(0 && "unimplemented reg-to-reg copy"); +} + +void AArch64InstrInfo::storeRegToStackSlot( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned SrcReg, + bool isKill, int FI, const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + DebugLoc DL; + if (MBBI != MBB.end()) + DL = MBBI->getDebugLoc(); + MachineFunction &MF = *MBB.getParent(); + MachineFrameInfo &MFI = *MF.getFrameInfo(); + unsigned Align = MFI.getObjectAlignment(FI); + + MachinePointerInfo PtrInfo(PseudoSourceValue::getFixedStack(FI)); + MachineMemOperand *MMO = MF.getMachineMemOperand( + PtrInfo, MachineMemOperand::MOStore, MFI.getObjectSize(FI), Align); + unsigned Opc = 0; + bool Offset = true; + switch (RC->getSize()) { + case 1: + if (AArch64::FPR8RegClass.hasSubClassEq(RC)) + Opc = AArch64::STRBui; + break; + case 2: + if (AArch64::FPR16RegClass.hasSubClassEq(RC)) + Opc = AArch64::STRHui; + break; + case 4: + if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) { + Opc = AArch64::STRWui; + if (TargetRegisterInfo::isVirtualRegister(SrcReg)) + MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass); + else + assert(SrcReg != AArch64::WSP); + } else if (AArch64::FPR32RegClass.hasSubClassEq(RC)) + Opc = AArch64::STRSui; + break; + case 8: + if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) { + Opc = AArch64::STRXui; + if (TargetRegisterInfo::isVirtualRegister(SrcReg)) + MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass); + else + assert(SrcReg != AArch64::SP); + } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) + Opc = AArch64::STRDui; + break; + case 16: + if (AArch64::FPR128RegClass.hasSubClassEq(RC)) + Opc = AArch64::STRQui; + else if (AArch64::DDRegClass.hasSubClassEq(RC)) { + assert(getSubTarget().hasNEON() && + "Unexpected register store without NEON"); + Opc = AArch64::ST1Twov1d, Offset = false; + } + break; + case 24: + if (AArch64::DDDRegClass.hasSubClassEq(RC)) { + assert(getSubTarget().hasNEON() && + "Unexpected register store without NEON"); + Opc = AArch64::ST1Threev1d, Offset = false; + } + break; + case 32: + if (AArch64::DDDDRegClass.hasSubClassEq(RC)) { + assert(getSubTarget().hasNEON() && + "Unexpected register store without NEON"); + Opc = AArch64::ST1Fourv1d, Offset = false; + } else if (AArch64::QQRegClass.hasSubClassEq(RC)) { + assert(getSubTarget().hasNEON() && + "Unexpected register store without NEON"); + Opc = AArch64::ST1Twov2d, Offset = false; + } + break; + case 48: + if (AArch64::QQQRegClass.hasSubClassEq(RC)) { + assert(getSubTarget().hasNEON() && + "Unexpected register store without NEON"); + Opc = AArch64::ST1Threev2d, Offset = false; + } + break; + case 64: + if (AArch64::QQQQRegClass.hasSubClassEq(RC)) { + assert(getSubTarget().hasNEON() && + "Unexpected register store without NEON"); + Opc = AArch64::ST1Fourv2d, Offset = false; + } + break; + } + assert(Opc && "Unknown register class"); + + const MachineInstrBuilder &MI = BuildMI(MBB, MBBI, DL, get(Opc)) + .addReg(SrcReg, getKillRegState(isKill)) + .addFrameIndex(FI); + + if (Offset) + MI.addImm(0); + MI.addMemOperand(MMO); +} + +void 
AArch64InstrInfo::loadRegFromStackSlot( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned DestReg, + int FI, const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + DebugLoc DL; + if (MBBI != MBB.end()) + DL = MBBI->getDebugLoc(); + MachineFunction &MF = *MBB.getParent(); + MachineFrameInfo &MFI = *MF.getFrameInfo(); + unsigned Align = MFI.getObjectAlignment(FI); + MachinePointerInfo PtrInfo(PseudoSourceValue::getFixedStack(FI)); + MachineMemOperand *MMO = MF.getMachineMemOperand( + PtrInfo, MachineMemOperand::MOLoad, MFI.getObjectSize(FI), Align); + + unsigned Opc = 0; + bool Offset = true; + switch (RC->getSize()) { + case 1: + if (AArch64::FPR8RegClass.hasSubClassEq(RC)) + Opc = AArch64::LDRBui; + break; + case 2: + if (AArch64::FPR16RegClass.hasSubClassEq(RC)) + Opc = AArch64::LDRHui; + break; + case 4: + if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) { + Opc = AArch64::LDRWui; + if (TargetRegisterInfo::isVirtualRegister(DestReg)) + MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass); + else + assert(DestReg != AArch64::WSP); + } else if (AArch64::FPR32RegClass.hasSubClassEq(RC)) + Opc = AArch64::LDRSui; + break; + case 8: + if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) { + Opc = AArch64::LDRXui; + if (TargetRegisterInfo::isVirtualRegister(DestReg)) + MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass); + else + assert(DestReg != AArch64::SP); + } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) + Opc = AArch64::LDRDui; + break; + case 16: + if (AArch64::FPR128RegClass.hasSubClassEq(RC)) + Opc = AArch64::LDRQui; + else if (AArch64::DDRegClass.hasSubClassEq(RC)) { + assert(getSubTarget().hasNEON() && + "Unexpected register load without NEON"); + Opc = AArch64::LD1Twov1d, Offset = false; + } + break; + case 24: + if (AArch64::DDDRegClass.hasSubClassEq(RC)) { + assert(getSubTarget().hasNEON() && + "Unexpected register load without NEON"); + Opc = AArch64::LD1Threev1d, Offset = false; + } + break; + case 32: + if (AArch64::DDDDRegClass.hasSubClassEq(RC)) { + assert(getSubTarget().hasNEON() && + "Unexpected register load without NEON"); + Opc = AArch64::LD1Fourv1d, Offset = false; + } else if (AArch64::QQRegClass.hasSubClassEq(RC)) { + assert(getSubTarget().hasNEON() && + "Unexpected register load without NEON"); + Opc = AArch64::LD1Twov2d, Offset = false; + } + break; + case 48: + if (AArch64::QQQRegClass.hasSubClassEq(RC)) { + assert(getSubTarget().hasNEON() && + "Unexpected register load without NEON"); + Opc = AArch64::LD1Threev2d, Offset = false; + } + break; + case 64: + if (AArch64::QQQQRegClass.hasSubClassEq(RC)) { + assert(getSubTarget().hasNEON() && + "Unexpected register load without NEON"); + Opc = AArch64::LD1Fourv2d, Offset = false; + } + break; + } + assert(Opc && "Unknown register class"); + + const MachineInstrBuilder &MI = BuildMI(MBB, MBBI, DL, get(Opc)) + .addReg(DestReg, getDefRegState(true)) + .addFrameIndex(FI); + if (Offset) + MI.addImm(0); + MI.addMemOperand(MMO); +} + +void llvm::emitFrameOffset(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, int Offset, + const AArch64InstrInfo *TII, + MachineInstr::MIFlag Flag, bool SetNZCV) { + if (DestReg == SrcReg && Offset == 0) + return; + + bool isSub = Offset < 0; + if (isSub) + Offset = -Offset; + + // FIXME: If the offset won't fit in 24-bits, compute the offset into a + // scratch register. 
If DestReg is a virtual register, use it as the + // scratch register; otherwise, create a new virtual register (to be + // replaced by the scavenger at the end of PEI). That case can be optimized + // slightly if DestReg is SP which is always 16-byte aligned, so the scratch + // register can be loaded with offset%8 and the add/sub can use an extending + // instruction with LSL#3. + // Currently the function handles any offsets but generates a poor sequence + // of code. + // assert(Offset < (1 << 24) && "unimplemented reg plus immediate"); + + unsigned Opc; + if (SetNZCV) + Opc = isSub ? AArch64::SUBSXri : AArch64::ADDSXri; + else + Opc = isSub ? AArch64::SUBXri : AArch64::ADDXri; + const unsigned MaxEncoding = 0xfff; + const unsigned ShiftSize = 12; + const unsigned MaxEncodableValue = MaxEncoding << ShiftSize; + while (((unsigned)Offset) >= (1 << ShiftSize)) { + unsigned ThisVal; + if (((unsigned)Offset) > MaxEncodableValue) { + ThisVal = MaxEncodableValue; + } else { + ThisVal = Offset & MaxEncodableValue; + } + assert((ThisVal >> ShiftSize) <= MaxEncoding && + "Encoding cannot handle value that big"); + BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg) + .addReg(SrcReg) + .addImm(ThisVal >> ShiftSize) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftSize)) + .setMIFlag(Flag); + + SrcReg = DestReg; + Offset -= ThisVal; + if (Offset == 0) + return; + } + BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg) + .addReg(SrcReg) + .addImm(Offset) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)) + .setMIFlag(Flag); +} + +MachineInstr * +AArch64InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, + const SmallVectorImpl &Ops, + int FrameIndex) const { + // This is a bit of a hack. Consider this instruction: + // + // %vreg0 = COPY %SP; GPR64all:%vreg0 + // + // We explicitly chose GPR64all for the virtual register so such a copy might + // be eliminated by RegisterCoalescer. However, that may not be possible, and + // %vreg0 may even spill. We can't spill %SP, and since it is in the GPR64all + // register class, TargetInstrInfo::foldMemoryOperand() is going to try. + // + // To prevent that, we are going to constrain the %vreg0 register class here. + // + // + // + if (MI->isCopy()) { + unsigned DstReg = MI->getOperand(0).getReg(); + unsigned SrcReg = MI->getOperand(1).getReg(); + if (SrcReg == AArch64::SP && + TargetRegisterInfo::isVirtualRegister(DstReg)) { + MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass); + return nullptr; + } + if (DstReg == AArch64::SP && + TargetRegisterInfo::isVirtualRegister(SrcReg)) { + MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass); + return nullptr; + } + } + + // Cannot fold. + return nullptr; +} + +int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset, + bool *OutUseUnscaledOp, + unsigned *OutUnscaledOp, + int *EmittableOffset) { + int Scale = 1; + bool IsSigned = false; + // The ImmIdx should be changed case by case if it is not 2. + unsigned ImmIdx = 2; + unsigned UnscaledOp = 0; + // Set output values in case of early exit. + if (EmittableOffset) + *EmittableOffset = 0; + if (OutUseUnscaledOp) + *OutUseUnscaledOp = false; + if (OutUnscaledOp) + *OutUnscaledOp = 0; + switch (MI.getOpcode()) { + default: + assert(0 && "unhandled opcode in rewriteAArch64FrameIndex"); + // Vector spills/fills can't take an immediate offset. 
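The ST1/LD1 multi-register forms used for the D- and Q-tuple spills below have no immediate-offset addressing mode, which is why those cases report AArch64FrameOffsetCannotUpdate and leave the address to be materialized separately. For reference, a standalone sketch of the 12-bit chunking the emitFrameOffset loop above performs (plain C++, not the LLVM API; the real code also threads NZCV and MI flags):

#include <cstdio>
#include <cstdlib>
#include <vector>

// Split a byte offset into the (imm12, optionally LSL #12) chunks that the
// emitFrameOffset loop above would emit as successive ADD/SUB instructions.
std::vector<unsigned> chunkFrameOffset(int Offset) {
  unsigned Remaining = std::abs(Offset);
  const unsigned MaxEncoding = 0xfff, ShiftSize = 12;
  const unsigned MaxEncodableValue = MaxEncoding << ShiftSize; // 0xfff000
  std::vector<unsigned> Chunks;
  while (Remaining >= (1u << ShiftSize)) {
    unsigned ThisVal = Remaining > MaxEncodableValue
                           ? MaxEncodableValue
                           : (Remaining & MaxEncodableValue);
    Chunks.push_back(ThisVal); // ADD/SUB ..., #(ThisVal >> 12), lsl #12
    Remaining -= ThisVal;
    if (Remaining == 0)
      return Chunks;
  }
  Chunks.push_back(Remaining); // final ADD/SUB ..., #Remaining
  return Chunks;
}

int main() {
  for (unsigned C : chunkFrameOffset(0x1234567))
    std::printf("0x%x\n", C); // 0xfff000, 0x235000, 0x567
}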
+ case AArch64::LD1Twov2d: + case AArch64::LD1Threev2d: + case AArch64::LD1Fourv2d: + case AArch64::LD1Twov1d: + case AArch64::LD1Threev1d: + case AArch64::LD1Fourv1d: + case AArch64::ST1Twov2d: + case AArch64::ST1Threev2d: + case AArch64::ST1Fourv2d: + case AArch64::ST1Twov1d: + case AArch64::ST1Threev1d: + case AArch64::ST1Fourv1d: + return AArch64FrameOffsetCannotUpdate; + case AArch64::PRFMui: + Scale = 8; + UnscaledOp = AArch64::PRFUMi; + break; + case AArch64::LDRXui: + Scale = 8; + UnscaledOp = AArch64::LDURXi; + break; + case AArch64::LDRWui: + Scale = 4; + UnscaledOp = AArch64::LDURWi; + break; + case AArch64::LDRBui: + Scale = 1; + UnscaledOp = AArch64::LDURBi; + break; + case AArch64::LDRHui: + Scale = 2; + UnscaledOp = AArch64::LDURHi; + break; + case AArch64::LDRSui: + Scale = 4; + UnscaledOp = AArch64::LDURSi; + break; + case AArch64::LDRDui: + Scale = 8; + UnscaledOp = AArch64::LDURDi; + break; + case AArch64::LDRQui: + Scale = 16; + UnscaledOp = AArch64::LDURQi; + break; + case AArch64::LDRBBui: + Scale = 1; + UnscaledOp = AArch64::LDURBBi; + break; + case AArch64::LDRHHui: + Scale = 2; + UnscaledOp = AArch64::LDURHHi; + break; + case AArch64::LDRSBXui: + Scale = 1; + UnscaledOp = AArch64::LDURSBXi; + break; + case AArch64::LDRSBWui: + Scale = 1; + UnscaledOp = AArch64::LDURSBWi; + break; + case AArch64::LDRSHXui: + Scale = 2; + UnscaledOp = AArch64::LDURSHXi; + break; + case AArch64::LDRSHWui: + Scale = 2; + UnscaledOp = AArch64::LDURSHWi; + break; + case AArch64::LDRSWui: + Scale = 4; + UnscaledOp = AArch64::LDURSWi; + break; + + case AArch64::STRXui: + Scale = 8; + UnscaledOp = AArch64::STURXi; + break; + case AArch64::STRWui: + Scale = 4; + UnscaledOp = AArch64::STURWi; + break; + case AArch64::STRBui: + Scale = 1; + UnscaledOp = AArch64::STURBi; + break; + case AArch64::STRHui: + Scale = 2; + UnscaledOp = AArch64::STURHi; + break; + case AArch64::STRSui: + Scale = 4; + UnscaledOp = AArch64::STURSi; + break; + case AArch64::STRDui: + Scale = 8; + UnscaledOp = AArch64::STURDi; + break; + case AArch64::STRQui: + Scale = 16; + UnscaledOp = AArch64::STURQi; + break; + case AArch64::STRBBui: + Scale = 1; + UnscaledOp = AArch64::STURBBi; + break; + case AArch64::STRHHui: + Scale = 2; + UnscaledOp = AArch64::STURHHi; + break; + + case AArch64::LDPXi: + case AArch64::LDPDi: + case AArch64::STPXi: + case AArch64::STPDi: + IsSigned = true; + Scale = 8; + break; + case AArch64::LDPQi: + case AArch64::STPQi: + IsSigned = true; + Scale = 16; + break; + case AArch64::LDPWi: + case AArch64::LDPSi: + case AArch64::STPWi: + case AArch64::STPSi: + IsSigned = true; + Scale = 4; + break; + + case AArch64::LDURXi: + case AArch64::LDURWi: + case AArch64::LDURBi: + case AArch64::LDURHi: + case AArch64::LDURSi: + case AArch64::LDURDi: + case AArch64::LDURQi: + case AArch64::LDURHHi: + case AArch64::LDURBBi: + case AArch64::LDURSBXi: + case AArch64::LDURSBWi: + case AArch64::LDURSHXi: + case AArch64::LDURSHWi: + case AArch64::LDURSWi: + case AArch64::STURXi: + case AArch64::STURWi: + case AArch64::STURBi: + case AArch64::STURHi: + case AArch64::STURSi: + case AArch64::STURDi: + case AArch64::STURQi: + case AArch64::STURBBi: + case AArch64::STURHHi: + Scale = 1; + break; + } + + Offset += MI.getOperand(ImmIdx).getImm() * Scale; + + bool useUnscaledOp = false; + // If the offset doesn't match the scale, we rewrite the instruction to + // use the unscaled instruction instead. Likewise, if we have a negative + // offset (and have an unscaled op to use). 
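The choice being set up here is between the two immediate forms the plain loads and stores offer: the "ui" opcodes above take an unsigned 12-bit immediate scaled by the access size, while the LDUR/STUR ("unscaled") forms take a signed 9-bit byte offset (the LDP/STP cases use a signed 7-bit scaled immediate instead). A quick standalone check of the first two ranges, mirroring the MaskBits logic below (illustrative only):

#include <cstdio>

// "ui" forms: unsigned imm12 scaled by the access size (Scale).
bool fitsScaled(long Offset, int Scale) {
  return Offset >= 0 && Offset % Scale == 0 && Offset / Scale <= 0xfff;
}
// Unscaled LDUR/STUR forms: signed 9-bit byte offset.
bool fitsUnscaled(long Offset) { return Offset >= -256 && Offset <= 255; }

int main() {
  // An 8-byte access at offset -8 needs the unscaled form; at offset 32760
  // (4095 * 8) the scaled form still works.
  std::printf("%d %d %d\n", fitsScaled(-8, 8), fitsUnscaled(-8),
              fitsScaled(32760, 8));
}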
+ if ((Offset & (Scale - 1)) != 0 || (Offset < 0 && UnscaledOp != 0)) + useUnscaledOp = true; + + // Use an unscaled addressing mode if the instruction has a negative offset + // (or if the instruction is already using an unscaled addressing mode). + unsigned MaskBits; + if (IsSigned) { + // ldp/stp instructions. + MaskBits = 7; + Offset /= Scale; + } else if (UnscaledOp == 0 || useUnscaledOp) { + MaskBits = 9; + IsSigned = true; + Scale = 1; + } else { + MaskBits = 12; + IsSigned = false; + Offset /= Scale; + } + + // Attempt to fold address computation. + int MaxOff = (1 << (MaskBits - IsSigned)) - 1; + int MinOff = (IsSigned ? (-MaxOff - 1) : 0); + if (Offset >= MinOff && Offset <= MaxOff) { + if (EmittableOffset) + *EmittableOffset = Offset; + Offset = 0; + } else { + int NewOff = Offset < 0 ? MinOff : MaxOff; + if (EmittableOffset) + *EmittableOffset = NewOff; + Offset = (Offset - NewOff) * Scale; + } + if (OutUseUnscaledOp) + *OutUseUnscaledOp = useUnscaledOp; + if (OutUnscaledOp) + *OutUnscaledOp = UnscaledOp; + return AArch64FrameOffsetCanUpdate | + (Offset == 0 ? AArch64FrameOffsetIsLegal : 0); +} + +bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, + unsigned FrameReg, int &Offset, + const AArch64InstrInfo *TII) { + unsigned Opcode = MI.getOpcode(); + unsigned ImmIdx = FrameRegIdx + 1; + + if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) { + Offset += MI.getOperand(ImmIdx).getImm(); + emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(), + MI.getOperand(0).getReg(), FrameReg, Offset, TII, + MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri)); + MI.eraseFromParent(); + Offset = 0; + return true; + } + + int NewOffset; + unsigned UnscaledOp; + bool UseUnscaledOp; + int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp, + &UnscaledOp, &NewOffset); + if (Status & AArch64FrameOffsetCanUpdate) { + if (Status & AArch64FrameOffsetIsLegal) + // Replace the FrameIndex with FrameReg. + MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); + if (UseUnscaledOp) + MI.setDesc(TII->get(UnscaledOp)); + + MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset); + return Offset == 0; + } + + return false; +} + +void AArch64InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { + NopInst.setOpcode(AArch64::HINT); + NopInst.addOperand(MCOperand::CreateImm(0)); +} diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h new file mode 100644 index 00000000000..90ce75f26d4 --- /dev/null +++ b/lib/Target/AArch64/AArch64InstrInfo.h @@ -0,0 +1,231 @@ +//===- AArch64InstrInfo.h - AArch64 Instruction Information -----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the AArch64 implementation of the TargetInstrInfo class. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TARGET_AArch64INSTRINFO_H +#define LLVM_TARGET_AArch64INSTRINFO_H + +#include "AArch64.h" +#include "AArch64RegisterInfo.h" +#include "llvm/Target/TargetInstrInfo.h" + +#define GET_INSTRINFO_HEADER +#include "AArch64GenInstrInfo.inc" + +namespace llvm { + +class AArch64Subtarget; +class AArch64TargetMachine; + +class AArch64InstrInfo : public AArch64GenInstrInfo { + // Reserve bits in the MachineMemOperand target hint flags, starting at 1. 
+ // They will be shifted into MOTargetHintStart when accessed. + enum TargetMemOperandFlags { + MOSuppressPair = 1 + }; + + const AArch64RegisterInfo RI; + const AArch64Subtarget &Subtarget; + +public: + explicit AArch64InstrInfo(const AArch64Subtarget &STI); + + /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As + /// such, whenever a client has an instance of instruction info, it should + /// always be able to get register info as well (through this method). + const AArch64RegisterInfo &getRegisterInfo() const { return RI; } + + const AArch64Subtarget &getSubTarget() const { return Subtarget; } + + unsigned GetInstSizeInBytes(const MachineInstr *MI) const; + + bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg, + unsigned &DstReg, unsigned &SubIdx) const override; + + unsigned isLoadFromStackSlot(const MachineInstr *MI, + int &FrameIndex) const override; + unsigned isStoreToStackSlot(const MachineInstr *MI, + int &FrameIndex) const override; + + /// Returns true if there is a shiftable register and that the shift value + /// is non-zero. + bool hasShiftedReg(const MachineInstr *MI) const; + + /// Returns true if there is an extendable register and that the extending + /// value is non-zero. + bool hasExtendedReg(const MachineInstr *MI) const; + + /// \brief Does this instruction set its full destination register to zero? + bool isGPRZero(const MachineInstr *MI) const; + + /// \brief Does this instruction rename a GPR without modifying bits? + bool isGPRCopy(const MachineInstr *MI) const; + + /// \brief Does this instruction rename an FPR without modifying bits? + bool isFPRCopy(const MachineInstr *MI) const; + + /// Return true if this is load/store scales or extends its register offset. + /// This refers to scaling a dynamic index as opposed to scaled immediates. + /// MI should be a memory op that allows scaled addressing. + bool isScaledAddr(const MachineInstr *MI) const; + + /// Return true if pairing the given load or store is hinted to be + /// unprofitable. + bool isLdStPairSuppressed(const MachineInstr *MI) const; + + /// Hint that pairing the given load or store is unprofitable. 
+ void suppressLdStPair(MachineInstr *MI) const; + + bool getLdStBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, + unsigned &Offset, + const TargetRegisterInfo *TRI) const override; + + bool enableClusterLoads() const override { return true; } + + bool shouldClusterLoads(MachineInstr *FirstLdSt, MachineInstr *SecondLdSt, + unsigned NumLoads) const override; + + bool shouldScheduleAdjacent(MachineInstr *First, + MachineInstr *Second) const override; + + MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF, int FrameIx, + uint64_t Offset, const MDNode *MDPtr, + DebugLoc DL) const; + void copyPhysRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, + DebugLoc DL, unsigned DestReg, unsigned SrcReg, + bool KillSrc, unsigned Opcode, + llvm::ArrayRef Indices) const; + void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, + DebugLoc DL, unsigned DestReg, unsigned SrcReg, + bool KillSrc) const override; + + void storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, unsigned SrcReg, + bool isKill, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const override; + + void loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, unsigned DestReg, + int FrameIndex, const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const override; + + MachineInstr * + foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, + const SmallVectorImpl &Ops, + int FrameIndex) const override; + + bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl &Cond, + bool AllowModify = false) const override; + unsigned RemoveBranch(MachineBasicBlock &MBB) const override; + unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, + MachineBasicBlock *FBB, + const SmallVectorImpl &Cond, + DebugLoc DL) const override; + bool + ReverseBranchCondition(SmallVectorImpl &Cond) const override; + bool canInsertSelect(const MachineBasicBlock &, + const SmallVectorImpl &Cond, unsigned, + unsigned, int &, int &, int &) const override; + void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + DebugLoc DL, unsigned DstReg, + const SmallVectorImpl &Cond, + unsigned TrueReg, unsigned FalseReg) const override; + void getNoopForMachoTarget(MCInst &NopInst) const override; + + /// analyzeCompare - For a comparison instruction, return the source registers + /// in SrcReg and SrcReg2, and the value it compares against in CmpValue. + /// Return true if the comparison instruction can be analyzed. + bool analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, + unsigned &SrcReg2, int &CmpMask, + int &CmpValue) const override; + /// optimizeCompareInstr - Convert the instruction supplying the argument to + /// the comparison into one that sets the zero bit in the flags register. + bool optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, + unsigned SrcReg2, int CmpMask, int CmpValue, + const MachineRegisterInfo *MRI) const override; + +private: + void instantiateCondBranch(MachineBasicBlock &MBB, DebugLoc DL, + MachineBasicBlock *TBB, + const SmallVectorImpl &Cond) const; +}; + +/// emitFrameOffset - Emit instructions as needed to set DestReg to SrcReg +/// plus Offset. This is intended to be used from within the prolog/epilog +/// insertion (PEI) pass, where a virtual scratch register may be allocated +/// if necessary, to be replaced by the scavenger at the end of PEI. 
+void emitFrameOffset(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + DebugLoc DL, unsigned DestReg, unsigned SrcReg, int Offset, + const AArch64InstrInfo *TII, + MachineInstr::MIFlag = MachineInstr::NoFlags, + bool SetNZCV = false); + +/// rewriteAArch64FrameIndex - Rewrite MI to access 'Offset' bytes from the +/// FP. Return false if the offset could not be handled directly in MI, and +/// return the left-over portion by reference. +bool rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, + unsigned FrameReg, int &Offset, + const AArch64InstrInfo *TII); + +/// \brief Use to report the frame offset status in isAArch64FrameOffsetLegal. +enum AArch64FrameOffsetStatus { + AArch64FrameOffsetCannotUpdate = 0x0, ///< Offset cannot apply. + AArch64FrameOffsetIsLegal = 0x1, ///< Offset is legal. + AArch64FrameOffsetCanUpdate = 0x2 ///< Offset can apply, at least partly. +}; + +/// \brief Check if the @p Offset is a valid frame offset for @p MI. +/// The returned value reports the validity of the frame offset for @p MI. +/// It uses the values defined by AArch64FrameOffsetStatus for that. +/// If result == AArch64FrameOffsetCannotUpdate, @p MI cannot be updated to +/// use an offset.eq +/// If result & AArch64FrameOffsetIsLegal, @p Offset can completely be +/// rewriten in @p MI. +/// If result & AArch64FrameOffsetCanUpdate, @p Offset contains the +/// amount that is off the limit of the legal offset. +/// If set, @p OutUseUnscaledOp will contain the whether @p MI should be +/// turned into an unscaled operator, which opcode is in @p OutUnscaledOp. +/// If set, @p EmittableOffset contains the amount that can be set in @p MI +/// (possibly with @p OutUnscaledOp if OutUseUnscaledOp is true) and that +/// is a legal offset. +int isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset, + bool *OutUseUnscaledOp = nullptr, + unsigned *OutUnscaledOp = nullptr, + int *EmittableOffset = nullptr); + +static inline bool isUncondBranchOpcode(int Opc) { return Opc == AArch64::B; } + +static inline bool isCondBranchOpcode(int Opc) { + switch (Opc) { + case AArch64::Bcc: + case AArch64::CBZW: + case AArch64::CBZX: + case AArch64::CBNZW: + case AArch64::CBNZX: + case AArch64::TBZW: + case AArch64::TBZX: + case AArch64::TBNZW: + case AArch64::TBNZX: + return true; + default: + return false; + } +} + +static inline bool isIndirectBranchOpcode(int Opc) { return Opc == AArch64::BR; } + +} // end namespace llvm + +#endif diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td new file mode 100644 index 00000000000..9ad36e8740d --- /dev/null +++ b/lib/Target/AArch64/AArch64InstrInfo.td @@ -0,0 +1,5284 @@ +//=- AArch64InstrInfo.td - Describe the AArch64 Instructions -*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// AArch64 Instruction definitions. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// ARM Instruction Predicate Definitions. 
+// +def HasFPARMv8 : Predicate<"Subtarget->hasFPARMv8()">, + AssemblerPredicate<"FeatureFPARMv8", "fp-armv8">; +def HasNEON : Predicate<"Subtarget->hasNEON()">, + AssemblerPredicate<"FeatureNEON", "neon">; +def HasCrypto : Predicate<"Subtarget->hasCrypto()">, + AssemblerPredicate<"FeatureCrypto", "crypto">; +def HasCRC : Predicate<"Subtarget->hasCRC()">, + AssemblerPredicate<"FeatureCRC", "crc">; +def IsLE : Predicate<"Subtarget->isLittleEndian()">; +def IsBE : Predicate<"!Subtarget->isLittleEndian()">; + +//===----------------------------------------------------------------------===// +// AArch64-specific DAG Nodes. +// + +// SDTBinaryArithWithFlagsOut - RES1, FLAGS = op LHS, RHS +def SDTBinaryArithWithFlagsOut : SDTypeProfile<2, 2, + [SDTCisSameAs<0, 2>, + SDTCisSameAs<0, 3>, + SDTCisInt<0>, SDTCisVT<1, i32>]>; + +// SDTBinaryArithWithFlagsIn - RES1, FLAGS = op LHS, RHS, FLAGS +def SDTBinaryArithWithFlagsIn : SDTypeProfile<1, 3, + [SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>, + SDTCisInt<0>, + SDTCisVT<3, i32>]>; + +// SDTBinaryArithWithFlagsInOut - RES1, FLAGS = op LHS, RHS, FLAGS +def SDTBinaryArithWithFlagsInOut : SDTypeProfile<2, 3, + [SDTCisSameAs<0, 2>, + SDTCisSameAs<0, 3>, + SDTCisInt<0>, + SDTCisVT<1, i32>, + SDTCisVT<4, i32>]>; + +def SDT_AArch64Brcond : SDTypeProfile<0, 3, + [SDTCisVT<0, OtherVT>, SDTCisVT<1, i32>, + SDTCisVT<2, i32>]>; +def SDT_AArch64cbz : SDTypeProfile<0, 2, [SDTCisInt<0>, SDTCisVT<1, OtherVT>]>; +def SDT_AArch64tbz : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, + SDTCisVT<2, OtherVT>]>; + + +def SDT_AArch64CSel : SDTypeProfile<1, 4, + [SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>, + SDTCisInt<3>, + SDTCisVT<4, i32>]>; +def SDT_AArch64FCmp : SDTypeProfile<0, 2, + [SDTCisFP<0>, + SDTCisSameAs<0, 1>]>; +def SDT_AArch64Dup : SDTypeProfile<1, 1, [SDTCisVec<0>]>; +def SDT_AArch64DupLane : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisInt<2>]>; +def SDT_AArch64Zip : SDTypeProfile<1, 2, [SDTCisVec<0>, + SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>]>; +def SDT_AArch64MOVIedit : SDTypeProfile<1, 1, [SDTCisInt<1>]>; +def SDT_AArch64MOVIshift : SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>; +def SDT_AArch64vecimm : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisInt<2>, SDTCisInt<3>]>; +def SDT_AArch64UnaryVec: SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>; +def SDT_AArch64ExtVec: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>, SDTCisInt<3>]>; +def SDT_AArch64vshift : SDTypeProfile<1, 2, [SDTCisSameAs<0,1>, SDTCisInt<2>]>; + +def SDT_AArch64unvec : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>; +def SDT_AArch64fcmpz : SDTypeProfile<1, 1, []>; +def SDT_AArch64fcmp : SDTypeProfile<1, 2, [SDTCisSameAs<1,2>]>; +def SDT_AArch64binvec : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>]>; +def SDT_AArch64trivec : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>, + SDTCisSameAs<0,3>]>; +def SDT_AArch64TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>]>; +def SDT_AArch64PREFETCH : SDTypeProfile<0, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<1>]>; + +def SDT_AArch64ITOF : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>; + +def SDT_AArch64TLSDescCall : SDTypeProfile<0, -2, [SDTCisPtrTy<0>, + SDTCisPtrTy<1>]>; +def SDT_AArch64WrapperLarge : SDTypeProfile<1, 4, + [SDTCisVT<0, i64>, SDTCisVT<1, i32>, + SDTCisSameAs<1, 2>, SDTCisSameAs<1, 3>, + SDTCisSameAs<1, 4>]>; + + +// Node definitions. 
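The node definitions that follow bind the AArch64ISD opcodes to the type profiles above; in particular, the *WithFlagsOut profiles describe nodes that yield both the arithmetic result and a 32-bit NZCV value. A scalar model of what the 32-bit flag-setting add computes (illustrative C++, not LLVM code):

#include <cstdint>
#include <cstdio>
#include <utility>

// Model of a 32-bit ADDS: returns the sum plus an NZCV word laid out as
// N=bit31, Z=bit30, C=bit29, V=bit28 (the AArch64 flag positions).
std::pair<uint32_t, uint32_t> adds32(uint32_t A, uint32_t B) {
  uint32_t R = A + B;
  bool N = (R >> 31) != 0;
  bool Z = R == 0;
  bool C = R < A;                             // unsigned carry-out
  bool V = ((~(A ^ B) & (A ^ R)) >> 31) != 0; // signed overflow
  uint32_t NZCV = ((uint32_t)N << 31) | ((uint32_t)Z << 30) |
                  ((uint32_t)C << 29) | ((uint32_t)V << 28);
  return {R, NZCV};
}

int main() {
  auto [R, NZCV] = adds32(0xffffffffu, 1); // wraps to 0: Z and C set
  std::printf("result=%u NZCV=0x%08x\n", R, NZCV);
}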
+def AArch64adrp : SDNode<"AArch64ISD::ADRP", SDTIntUnaryOp, []>; +def AArch64addlow : SDNode<"AArch64ISD::ADDlow", SDTIntBinOp, []>; +def AArch64LOADgot : SDNode<"AArch64ISD::LOADgot", SDTIntUnaryOp>; +def AArch64callseq_start : SDNode<"ISD::CALLSEQ_START", + SDCallSeqStart<[ SDTCisVT<0, i32> ]>, + [SDNPHasChain, SDNPOutGlue]>; +def AArch64callseq_end : SDNode<"ISD::CALLSEQ_END", + SDCallSeqEnd<[ SDTCisVT<0, i32>, + SDTCisVT<1, i32> ]>, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; +def AArch64call : SDNode<"AArch64ISD::CALL", + SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, + SDNPVariadic]>; +def AArch64brcond : SDNode<"AArch64ISD::BRCOND", SDT_AArch64Brcond, + [SDNPHasChain]>; +def AArch64cbz : SDNode<"AArch64ISD::CBZ", SDT_AArch64cbz, + [SDNPHasChain]>; +def AArch64cbnz : SDNode<"AArch64ISD::CBNZ", SDT_AArch64cbz, + [SDNPHasChain]>; +def AArch64tbz : SDNode<"AArch64ISD::TBZ", SDT_AArch64tbz, + [SDNPHasChain]>; +def AArch64tbnz : SDNode<"AArch64ISD::TBNZ", SDT_AArch64tbz, + [SDNPHasChain]>; + + +def AArch64csel : SDNode<"AArch64ISD::CSEL", SDT_AArch64CSel>; +def AArch64csinv : SDNode<"AArch64ISD::CSINV", SDT_AArch64CSel>; +def AArch64csneg : SDNode<"AArch64ISD::CSNEG", SDT_AArch64CSel>; +def AArch64csinc : SDNode<"AArch64ISD::CSINC", SDT_AArch64CSel>; +def AArch64retflag : SDNode<"AArch64ISD::RET_FLAG", SDTNone, + [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; +def AArch64adc : SDNode<"AArch64ISD::ADC", SDTBinaryArithWithFlagsIn >; +def AArch64sbc : SDNode<"AArch64ISD::SBC", SDTBinaryArithWithFlagsIn>; +def AArch64add_flag : SDNode<"AArch64ISD::ADDS", SDTBinaryArithWithFlagsOut, + [SDNPCommutative]>; +def AArch64sub_flag : SDNode<"AArch64ISD::SUBS", SDTBinaryArithWithFlagsOut>; +def AArch64and_flag : SDNode<"AArch64ISD::ANDS", SDTBinaryArithWithFlagsOut, + [SDNPCommutative]>; +def AArch64adc_flag : SDNode<"AArch64ISD::ADCS", SDTBinaryArithWithFlagsInOut>; +def AArch64sbc_flag : SDNode<"AArch64ISD::SBCS", SDTBinaryArithWithFlagsInOut>; + +def AArch64threadpointer : SDNode<"AArch64ISD::THREAD_POINTER", SDTPtrLeaf>; + +def AArch64fcmp : SDNode<"AArch64ISD::FCMP", SDT_AArch64FCmp>; + +def AArch64fmax : SDNode<"AArch64ISD::FMAX", SDTFPBinOp>; +def AArch64fmin : SDNode<"AArch64ISD::FMIN", SDTFPBinOp>; + +def AArch64dup : SDNode<"AArch64ISD::DUP", SDT_AArch64Dup>; +def AArch64duplane8 : SDNode<"AArch64ISD::DUPLANE8", SDT_AArch64DupLane>; +def AArch64duplane16 : SDNode<"AArch64ISD::DUPLANE16", SDT_AArch64DupLane>; +def AArch64duplane32 : SDNode<"AArch64ISD::DUPLANE32", SDT_AArch64DupLane>; +def AArch64duplane64 : SDNode<"AArch64ISD::DUPLANE64", SDT_AArch64DupLane>; + +def AArch64zip1 : SDNode<"AArch64ISD::ZIP1", SDT_AArch64Zip>; +def AArch64zip2 : SDNode<"AArch64ISD::ZIP2", SDT_AArch64Zip>; +def AArch64uzp1 : SDNode<"AArch64ISD::UZP1", SDT_AArch64Zip>; +def AArch64uzp2 : SDNode<"AArch64ISD::UZP2", SDT_AArch64Zip>; +def AArch64trn1 : SDNode<"AArch64ISD::TRN1", SDT_AArch64Zip>; +def AArch64trn2 : SDNode<"AArch64ISD::TRN2", SDT_AArch64Zip>; + +def AArch64movi_edit : SDNode<"AArch64ISD::MOVIedit", SDT_AArch64MOVIedit>; +def AArch64movi_shift : SDNode<"AArch64ISD::MOVIshift", SDT_AArch64MOVIshift>; +def AArch64movi_msl : SDNode<"AArch64ISD::MOVImsl", SDT_AArch64MOVIshift>; +def AArch64mvni_shift : SDNode<"AArch64ISD::MVNIshift", SDT_AArch64MOVIshift>; +def AArch64mvni_msl : SDNode<"AArch64ISD::MVNImsl", SDT_AArch64MOVIshift>; +def AArch64movi : SDNode<"AArch64ISD::MOVI", SDT_AArch64MOVIedit>; +def AArch64fmov : SDNode<"AArch64ISD::FMOV", SDT_AArch64MOVIedit>; + 
+def AArch64rev16 : SDNode<"AArch64ISD::REV16", SDT_AArch64UnaryVec>; +def AArch64rev32 : SDNode<"AArch64ISD::REV32", SDT_AArch64UnaryVec>; +def AArch64rev64 : SDNode<"AArch64ISD::REV64", SDT_AArch64UnaryVec>; +def AArch64ext : SDNode<"AArch64ISD::EXT", SDT_AArch64ExtVec>; + +def AArch64vashr : SDNode<"AArch64ISD::VASHR", SDT_AArch64vshift>; +def AArch64vlshr : SDNode<"AArch64ISD::VLSHR", SDT_AArch64vshift>; +def AArch64vshl : SDNode<"AArch64ISD::VSHL", SDT_AArch64vshift>; +def AArch64sqshli : SDNode<"AArch64ISD::SQSHL_I", SDT_AArch64vshift>; +def AArch64uqshli : SDNode<"AArch64ISD::UQSHL_I", SDT_AArch64vshift>; +def AArch64sqshlui : SDNode<"AArch64ISD::SQSHLU_I", SDT_AArch64vshift>; +def AArch64srshri : SDNode<"AArch64ISD::SRSHR_I", SDT_AArch64vshift>; +def AArch64urshri : SDNode<"AArch64ISD::URSHR_I", SDT_AArch64vshift>; + +def AArch64not: SDNode<"AArch64ISD::NOT", SDT_AArch64unvec>; +def AArch64bit: SDNode<"AArch64ISD::BIT", SDT_AArch64trivec>; +def AArch64bsl: SDNode<"AArch64ISD::BSL", SDT_AArch64trivec>; + +def AArch64cmeq: SDNode<"AArch64ISD::CMEQ", SDT_AArch64binvec>; +def AArch64cmge: SDNode<"AArch64ISD::CMGE", SDT_AArch64binvec>; +def AArch64cmgt: SDNode<"AArch64ISD::CMGT", SDT_AArch64binvec>; +def AArch64cmhi: SDNode<"AArch64ISD::CMHI", SDT_AArch64binvec>; +def AArch64cmhs: SDNode<"AArch64ISD::CMHS", SDT_AArch64binvec>; + +def AArch64fcmeq: SDNode<"AArch64ISD::FCMEQ", SDT_AArch64fcmp>; +def AArch64fcmge: SDNode<"AArch64ISD::FCMGE", SDT_AArch64fcmp>; +def AArch64fcmgt: SDNode<"AArch64ISD::FCMGT", SDT_AArch64fcmp>; + +def AArch64cmeqz: SDNode<"AArch64ISD::CMEQz", SDT_AArch64unvec>; +def AArch64cmgez: SDNode<"AArch64ISD::CMGEz", SDT_AArch64unvec>; +def AArch64cmgtz: SDNode<"AArch64ISD::CMGTz", SDT_AArch64unvec>; +def AArch64cmlez: SDNode<"AArch64ISD::CMLEz", SDT_AArch64unvec>; +def AArch64cmltz: SDNode<"AArch64ISD::CMLTz", SDT_AArch64unvec>; +def AArch64cmtst : PatFrag<(ops node:$LHS, node:$RHS), + (AArch64not (AArch64cmeqz (and node:$LHS, node:$RHS)))>; + +def AArch64fcmeqz: SDNode<"AArch64ISD::FCMEQz", SDT_AArch64fcmpz>; +def AArch64fcmgez: SDNode<"AArch64ISD::FCMGEz", SDT_AArch64fcmpz>; +def AArch64fcmgtz: SDNode<"AArch64ISD::FCMGTz", SDT_AArch64fcmpz>; +def AArch64fcmlez: SDNode<"AArch64ISD::FCMLEz", SDT_AArch64fcmpz>; +def AArch64fcmltz: SDNode<"AArch64ISD::FCMLTz", SDT_AArch64fcmpz>; + +def AArch64bici: SDNode<"AArch64ISD::BICi", SDT_AArch64vecimm>; +def AArch64orri: SDNode<"AArch64ISD::ORRi", SDT_AArch64vecimm>; + +def AArch64neg : SDNode<"AArch64ISD::NEG", SDT_AArch64unvec>; + +def AArch64tcret: SDNode<"AArch64ISD::TC_RETURN", SDT_AArch64TCRET, + [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; + +def AArch64Prefetch : SDNode<"AArch64ISD::PREFETCH", SDT_AArch64PREFETCH, + [SDNPHasChain, SDNPSideEffect]>; + +def AArch64sitof: SDNode<"AArch64ISD::SITOF", SDT_AArch64ITOF>; +def AArch64uitof: SDNode<"AArch64ISD::UITOF", SDT_AArch64ITOF>; + +def AArch64tlsdesc_call : SDNode<"AArch64ISD::TLSDESC_CALL", + SDT_AArch64TLSDescCall, + [SDNPInGlue, SDNPOutGlue, SDNPHasChain, + SDNPVariadic]>; + +def AArch64WrapperLarge : SDNode<"AArch64ISD::WrapperLarge", + SDT_AArch64WrapperLarge>; + + +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// + +// AArch64 Instruction Predicate Definitions. 
+// +def HasZCZ : Predicate<"Subtarget->hasZeroCycleZeroing()">; +def NoZCZ : Predicate<"!Subtarget->hasZeroCycleZeroing()">; +def IsDarwin : Predicate<"Subtarget->isTargetDarwin()">; +def IsNotDarwin: Predicate<"!Subtarget->isTargetDarwin()">; +def ForCodeSize : Predicate<"ForCodeSize">; +def NotForCodeSize : Predicate<"!ForCodeSize">; + +include "AArch64InstrFormats.td" + +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Miscellaneous instructions. +//===----------------------------------------------------------------------===// + +let Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1 in { +def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt), + [(AArch64callseq_start timm:$amt)]>; +def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), + [(AArch64callseq_end timm:$amt1, timm:$amt2)]>; +} // Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1 + +let isReMaterializable = 1, isCodeGenOnly = 1 in { +// FIXME: The following pseudo instructions are only needed because remat +// cannot handle multiple instructions. When that changes, they can be +// removed, along with the AArch64Wrapper node. + +let AddedComplexity = 10 in +def LOADgot : Pseudo<(outs GPR64:$dst), (ins i64imm:$addr), + [(set GPR64:$dst, (AArch64LOADgot tglobaladdr:$addr))]>, + Sched<[WriteLDAdr]>; + +// The MOVaddr instruction should match only when the add is not folded +// into a load or store address. +def MOVaddr + : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low), + [(set GPR64:$dst, (AArch64addlow (AArch64adrp tglobaladdr:$hi), + tglobaladdr:$low))]>, + Sched<[WriteAdrAdr]>; +def MOVaddrJT + : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low), + [(set GPR64:$dst, (AArch64addlow (AArch64adrp tjumptable:$hi), + tjumptable:$low))]>, + Sched<[WriteAdrAdr]>; +def MOVaddrCP + : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low), + [(set GPR64:$dst, (AArch64addlow (AArch64adrp tconstpool:$hi), + tconstpool:$low))]>, + Sched<[WriteAdrAdr]>; +def MOVaddrBA + : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low), + [(set GPR64:$dst, (AArch64addlow (AArch64adrp tblockaddress:$hi), + tblockaddress:$low))]>, + Sched<[WriteAdrAdr]>; +def MOVaddrTLS + : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low), + [(set GPR64:$dst, (AArch64addlow (AArch64adrp tglobaltlsaddr:$hi), + tglobaltlsaddr:$low))]>, + Sched<[WriteAdrAdr]>; +def MOVaddrEXT + : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low), + [(set GPR64:$dst, (AArch64addlow (AArch64adrp texternalsym:$hi), + texternalsym:$low))]>, + Sched<[WriteAdrAdr]>; + +} // isReMaterializable, isCodeGenOnly + +def : Pat<(AArch64LOADgot tglobaltlsaddr:$addr), + (LOADgot tglobaltlsaddr:$addr)>; + +def : Pat<(AArch64LOADgot texternalsym:$addr), + (LOADgot texternalsym:$addr)>; + +def : Pat<(AArch64LOADgot tconstpool:$addr), + (LOADgot tconstpool:$addr)>; + +//===----------------------------------------------------------------------===// +// System instructions. +//===----------------------------------------------------------------------===// + +def HINT : HintI<"hint">; +def : InstAlias<"nop", (HINT 0b000)>; +def : InstAlias<"yield",(HINT 0b001)>; +def : InstAlias<"wfe", (HINT 0b010)>; +def : InstAlias<"wfi", (HINT 0b011)>; +def : InstAlias<"sev", (HINT 0b100)>; +def : InstAlias<"sevl", (HINT 0b101)>; + + // As far as LLVM is concerned this writes to the system's exclusive monitors. 
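For context on that comment: the exclusive monitors are the state behind AArch64's load-exclusive/store-exclusive (LDXR/STXR) retry loops, which is how atomic read-modify-write is built on ARMv8.0, and CLREX clears them. A C++-level equivalent of such a loop (illustrative; the compare-exchange lowers to an LDXR/STXR pair on these cores):

#include <atomic>
#include <cstdio>

// Portable equivalent of an LDXR/STXR retry loop: keep trying the
// compare-exchange until the exclusive store succeeds.
int fetch_add_seqcst(std::atomic<int> &V, int Delta) {
  int Old = V.load();
  while (!V.compare_exchange_weak(Old, Old + Delta))
    ; // a failed exclusive store (or a cleared monitor) just retries
  return Old;
}

int main() {
  std::atomic<int> V{41};
  int Old = fetch_add_seqcst(V, 1);
  std::printf("%d -> %d\n", Old, V.load());
}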
+let mayLoad = 1, mayStore = 1 in +def CLREX : CRmSystemI; + +def DMB : CRmSystemI; +def DSB : CRmSystemI; +def ISB : CRmSystemI; +def : InstAlias<"clrex", (CLREX 0xf)>; +def : InstAlias<"isb", (ISB 0xf)>; + +def MRS : MRSI; +def MSR : MSRI; +def MSRpstate: MSRpstateI; + +// The thread pointer (on Linux, at least, where this has been implemented) is +// TPIDR_EL0. +def : Pat<(AArch64threadpointer), (MRS 0xde82)>; + +// Generic system instructions +def SYSxt : SystemXtI<0, "sys">; +def SYSLxt : SystemLXtI<1, "sysl">; + +def : InstAlias<"sys $op1, $Cn, $Cm, $op2", + (SYSxt imm0_7:$op1, sys_cr_op:$Cn, + sys_cr_op:$Cm, imm0_7:$op2, XZR)>; + +//===----------------------------------------------------------------------===// +// Move immediate instructions. +//===----------------------------------------------------------------------===// + +defm MOVK : InsertImmediate<0b11, "movk">; +defm MOVN : MoveImmediate<0b00, "movn">; + +let PostEncoderMethod = "fixMOVZ" in +defm MOVZ : MoveImmediate<0b10, "movz">; + +// First group of aliases covers an implicit "lsl #0". +def : InstAlias<"movk $dst, $imm", (MOVKWi GPR32:$dst, imm0_65535:$imm, 0)>; +def : InstAlias<"movk $dst, $imm", (MOVKXi GPR64:$dst, imm0_65535:$imm, 0)>; +def : InstAlias<"movn $dst, $imm", (MOVNWi GPR32:$dst, imm0_65535:$imm, 0)>; +def : InstAlias<"movn $dst, $imm", (MOVNXi GPR64:$dst, imm0_65535:$imm, 0)>; +def : InstAlias<"movz $dst, $imm", (MOVZWi GPR32:$dst, imm0_65535:$imm, 0)>; +def : InstAlias<"movz $dst, $imm", (MOVZXi GPR64:$dst, imm0_65535:$imm, 0)>; + +// Next, we have various ELF relocations with the ":XYZ_g0:sym" syntax. +def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g3:$sym, 48)>; +def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g2:$sym, 32)>; +def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g1:$sym, 16)>; +def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g0:$sym, 0)>; + +def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g3:$sym, 48)>; +def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g2:$sym, 32)>; +def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g1:$sym, 16)>; +def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g0:$sym, 0)>; + +def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g3:$sym, 48)>; +def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g2:$sym, 32)>; +def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g1:$sym, 16)>; +def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g0:$sym, 0)>; + +def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movz_symbol_g1:$sym, 16)>; +def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movz_symbol_g0:$sym, 0)>; + +def : InstAlias<"movn $Rd, $sym", (MOVNWi GPR32:$Rd, movz_symbol_g1:$sym, 16)>; +def : InstAlias<"movn $Rd, $sym", (MOVNWi GPR32:$Rd, movz_symbol_g0:$sym, 0)>; + +def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movk_symbol_g1:$sym, 16)>; +def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movk_symbol_g0:$sym, 0)>; + +// Final group of aliases covers true "mov $Rd, $imm" cases. 
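Background for these aliases and the multiclass below: a general "mov Rd, #imm" is assembled to whichever of MOVZ/MOVN (plus MOVK for further pieces) can build the constant from 16-bit chunks at shifts of 0/16/32/48. A tiny standalone dump of those chunks for one constant (illustrative only, not the actual alias-selection logic):

#include <cstdint>
#include <cstdio>

// Print the four 16-bit chunks a 64-bit immediate splits into; MOVZ/MOVN set
// the first interesting chunk and MOVK patches in the rest.
int main() {
  uint64_t Imm = 0x00cafe00deadbeefULL;
  for (int Shift = 0; Shift < 64; Shift += 16)
    std::printf("lsl #%-2d -> 0x%04llx\n", Shift,
                (unsigned long long)((Imm >> Shift) & 0xffff));
}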
+multiclass movw_mov_alias { + def _asmoperand : AsmOperandClass { + let Name = basename # width # "_lsl" # shift # "MovAlias"; + let PredicateMethod = "is" # basename # "MovAlias<" # width # ", " + # shift # ">"; + let RenderMethod = "add" # basename # "MovAliasOperands<" # shift # ">"; + } + + def _movimm : Operand { + let ParserMatchClass = !cast(NAME # "_asmoperand"); + } + + def : InstAlias<"mov $Rd, $imm", + (INST GPR:$Rd, !cast(NAME # "_movimm"):$imm, shift)>; +} + +defm : movw_mov_alias<"MOVZ", MOVZWi, GPR32, 32, 0>; +defm : movw_mov_alias<"MOVZ", MOVZWi, GPR32, 32, 16>; + +defm : movw_mov_alias<"MOVZ", MOVZXi, GPR64, 64, 0>; +defm : movw_mov_alias<"MOVZ", MOVZXi, GPR64, 64, 16>; +defm : movw_mov_alias<"MOVZ", MOVZXi, GPR64, 64, 32>; +defm : movw_mov_alias<"MOVZ", MOVZXi, GPR64, 64, 48>; + +defm : movw_mov_alias<"MOVN", MOVNWi, GPR32, 32, 0>; +defm : movw_mov_alias<"MOVN", MOVNWi, GPR32, 32, 16>; + +defm : movw_mov_alias<"MOVN", MOVNXi, GPR64, 64, 0>; +defm : movw_mov_alias<"MOVN", MOVNXi, GPR64, 64, 16>; +defm : movw_mov_alias<"MOVN", MOVNXi, GPR64, 64, 32>; +defm : movw_mov_alias<"MOVN", MOVNXi, GPR64, 64, 48>; + +let isReMaterializable = 1, isCodeGenOnly = 1, isMoveImm = 1, + isAsCheapAsAMove = 1 in { +// FIXME: The following pseudo instructions are only needed because remat +// cannot handle multiple instructions. When that changes, we can select +// directly to the real instructions and get rid of these pseudos. + +def MOVi32imm + : Pseudo<(outs GPR32:$dst), (ins i32imm:$src), + [(set GPR32:$dst, imm:$src)]>, + Sched<[WriteImm]>; +def MOVi64imm + : Pseudo<(outs GPR64:$dst), (ins i64imm:$src), + [(set GPR64:$dst, imm:$src)]>, + Sched<[WriteImm]>; +} // isReMaterializable, isCodeGenOnly + +// If possible, we want to use MOVi32imm even for 64-bit moves. This gives the +// eventual expansion code fewer bits to worry about getting right. Marshalling +// the types is a little tricky though: +def i64imm_32bit : ImmLeaf(Imm); +}]>; + +def trunc_imm : SDNodeXFormgetTargetConstant(N->getZExtValue(), MVT::i32); +}]>; + +def : Pat<(i64 i64imm_32bit:$src), + (SUBREG_TO_REG (i64 0), (MOVi32imm (trunc_imm imm:$src)), sub_32)>; + +// Deal with the various forms of (ELF) large addressing with MOVZ/MOVK +// sequences. +def : Pat<(AArch64WrapperLarge tglobaladdr:$g3, tglobaladdr:$g2, + tglobaladdr:$g1, tglobaladdr:$g0), + (MOVKXi (MOVKXi (MOVKXi (MOVZXi tglobaladdr:$g3, 48), + tglobaladdr:$g2, 32), + tglobaladdr:$g1, 16), + tglobaladdr:$g0, 0)>; + +def : Pat<(AArch64WrapperLarge tblockaddress:$g3, tblockaddress:$g2, + tblockaddress:$g1, tblockaddress:$g0), + (MOVKXi (MOVKXi (MOVKXi (MOVZXi tblockaddress:$g3, 48), + tblockaddress:$g2, 32), + tblockaddress:$g1, 16), + tblockaddress:$g0, 0)>; + +def : Pat<(AArch64WrapperLarge tconstpool:$g3, tconstpool:$g2, + tconstpool:$g1, tconstpool:$g0), + (MOVKXi (MOVKXi (MOVKXi (MOVZXi tconstpool:$g3, 48), + tconstpool:$g2, 32), + tconstpool:$g1, 16), + tconstpool:$g0, 0)>; + +def : Pat<(AArch64WrapperLarge tjumptable:$g3, tjumptable:$g2, + tjumptable:$g1, tjumptable:$g0), + (MOVKXi (MOVKXi (MOVKXi (MOVZXi tjumptable:$g3, 48), + tjumptable:$g2, 32), + tjumptable:$g1, 16), + tjumptable:$g0, 0)>; + + +//===----------------------------------------------------------------------===// +// Arithmetic instructions. +//===----------------------------------------------------------------------===// + +// Add/subtract with carry. 
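As a reminder of why the carry-aware nodes exist: a 128-bit add is lowered to an ADDS on the low halves (producing the carry) followed by an ADC/ADCS on the high halves. A portable sketch of that split (illustrative C++):

#include <cstdint>
#include <cstdio>

// 128-bit addition out of 64-bit pieces: the low add provides the carry the
// high add consumes, which is the ADDS/ADC pairing the defs below enable.
int main() {
  uint64_t ALo = ~0ULL, AHi = 1, BLo = 1, BHi = 2;
  uint64_t Lo = ALo + BLo;         // ADDS xLo, ...
  uint64_t Carry = Lo < ALo;       // carry-out of the low half
  uint64_t Hi = AHi + BHi + Carry; // ADC xHi, ...
  std::printf("hi=0x%llx lo=0x%llx\n", (unsigned long long)Hi,
              (unsigned long long)Lo); // hi=0x4 lo=0x0
}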
+defm ADC : AddSubCarry<0, "adc", "adcs", AArch64adc, AArch64adc_flag>; +defm SBC : AddSubCarry<1, "sbc", "sbcs", AArch64sbc, AArch64sbc_flag>; + +def : InstAlias<"ngc $dst, $src", (SBCWr GPR32:$dst, WZR, GPR32:$src)>; +def : InstAlias<"ngc $dst, $src", (SBCXr GPR64:$dst, XZR, GPR64:$src)>; +def : InstAlias<"ngcs $dst, $src", (SBCSWr GPR32:$dst, WZR, GPR32:$src)>; +def : InstAlias<"ngcs $dst, $src", (SBCSXr GPR64:$dst, XZR, GPR64:$src)>; + +// Add/subtract +defm ADD : AddSub<0, "add", add>; +defm SUB : AddSub<1, "sub">; + +def : InstAlias<"mov $dst, $src", + (ADDWri GPR32sponly:$dst, GPR32sp:$src, 0, 0)>; +def : InstAlias<"mov $dst, $src", + (ADDWri GPR32sp:$dst, GPR32sponly:$src, 0, 0)>; +def : InstAlias<"mov $dst, $src", + (ADDXri GPR64sponly:$dst, GPR64sp:$src, 0, 0)>; +def : InstAlias<"mov $dst, $src", + (ADDXri GPR64sp:$dst, GPR64sponly:$src, 0, 0)>; + +defm ADDS : AddSubS<0, "adds", AArch64add_flag, "cmn">; +defm SUBS : AddSubS<1, "subs", AArch64sub_flag, "cmp">; + +// Use SUBS instead of SUB to enable CSE between SUBS and SUB. +def : Pat<(sub GPR32sp:$Rn, addsub_shifted_imm32:$imm), + (SUBSWri GPR32sp:$Rn, addsub_shifted_imm32:$imm)>; +def : Pat<(sub GPR64sp:$Rn, addsub_shifted_imm64:$imm), + (SUBSXri GPR64sp:$Rn, addsub_shifted_imm64:$imm)>; +def : Pat<(sub GPR32:$Rn, GPR32:$Rm), + (SUBSWrr GPR32:$Rn, GPR32:$Rm)>; +def : Pat<(sub GPR64:$Rn, GPR64:$Rm), + (SUBSXrr GPR64:$Rn, GPR64:$Rm)>; +def : Pat<(sub GPR32:$Rn, arith_shifted_reg32:$Rm), + (SUBSWrs GPR32:$Rn, arith_shifted_reg32:$Rm)>; +def : Pat<(sub GPR64:$Rn, arith_shifted_reg64:$Rm), + (SUBSXrs GPR64:$Rn, arith_shifted_reg64:$Rm)>; +def : Pat<(sub GPR32sp:$R2, arith_extended_reg32:$R3), + (SUBSWrx GPR32sp:$R2, arith_extended_reg32:$R3)>; +def : Pat<(sub GPR64sp:$R2, arith_extended_reg32to64:$R3), + (SUBSXrx GPR64sp:$R2, arith_extended_reg32to64:$R3)>; + +// Because of the immediate format for add/sub-imm instructions, the +// expression (add x, -1) must be transformed to (SUB{W,X}ri x, 1). +// These patterns capture that transformation. +let AddedComplexity = 1 in { +def : Pat<(add GPR32:$Rn, neg_addsub_shifted_imm32:$imm), + (SUBSWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>; +def : Pat<(add GPR64:$Rn, neg_addsub_shifted_imm64:$imm), + (SUBSXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>; +def : Pat<(sub GPR32:$Rn, neg_addsub_shifted_imm32:$imm), + (ADDWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>; +def : Pat<(sub GPR64:$Rn, neg_addsub_shifted_imm64:$imm), + (ADDXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>; +} + +// Because of the immediate format for add/sub-imm instructions, the +// expression (add x, -1) must be transformed to (SUB{W,X}ri x, 1). +// These patterns capture that transformation. 
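Since the add/sub immediate field is an unsigned 12-bit value (optionally shifted left by 12), a negative addend can only be encoded by flipping the opcode, which is what the patterns around this comment do. A trivial standalone sketch of that rewrite (illustrative):

#include <cstdio>

// Pick the encodable form: "add Rd, Rn, #imm" for imm in [0, 4095], otherwise
// flip to "sub" with the negated immediate, as the patterns here do.
int main() {
  int Addend = -1;
  bool UseSub = Addend < 0;
  unsigned Imm12 = UseSub ? (unsigned)(-Addend) : (unsigned)Addend;
  std::printf("%s w0, w0, #%u\n", UseSub ? "sub" : "add", Imm12);
}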
+let AddedComplexity = 1 in { +def : Pat<(AArch64add_flag GPR32:$Rn, neg_addsub_shifted_imm32:$imm), + (SUBSWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>; +def : Pat<(AArch64add_flag GPR64:$Rn, neg_addsub_shifted_imm64:$imm), + (SUBSXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>; +def : Pat<(AArch64sub_flag GPR32:$Rn, neg_addsub_shifted_imm32:$imm), + (ADDSWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>; +def : Pat<(AArch64sub_flag GPR64:$Rn, neg_addsub_shifted_imm64:$imm), + (ADDSXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>; +} + +def : InstAlias<"neg $dst, $src", (SUBWrs GPR32:$dst, WZR, GPR32:$src, 0), 3>; +def : InstAlias<"neg $dst, $src", (SUBXrs GPR64:$dst, XZR, GPR64:$src, 0), 3>; +def : InstAlias<"neg $dst, $src$shift", + (SUBWrs GPR32:$dst, WZR, GPR32:$src, arith_shift32:$shift), 2>; +def : InstAlias<"neg $dst, $src$shift", + (SUBXrs GPR64:$dst, XZR, GPR64:$src, arith_shift64:$shift), 2>; + +def : InstAlias<"negs $dst, $src", (SUBSWrs GPR32:$dst, WZR, GPR32:$src, 0), 3>; +def : InstAlias<"negs $dst, $src", (SUBSXrs GPR64:$dst, XZR, GPR64:$src, 0), 3>; +def : InstAlias<"negs $dst, $src$shift", + (SUBSWrs GPR32:$dst, WZR, GPR32:$src, arith_shift32:$shift), 2>; +def : InstAlias<"negs $dst, $src$shift", + (SUBSXrs GPR64:$dst, XZR, GPR64:$src, arith_shift64:$shift), 2>; + + +// Unsigned/Signed divide +defm UDIV : Div<0, "udiv", udiv>; +defm SDIV : Div<1, "sdiv", sdiv>; +let isCodeGenOnly = 1 in { +defm UDIV_Int : Div<0, "udiv", int_aarch64_udiv>; +defm SDIV_Int : Div<1, "sdiv", int_aarch64_sdiv>; +} + +// Variable shift +defm ASRV : Shift<0b10, "asr", sra>; +defm LSLV : Shift<0b00, "lsl", shl>; +defm LSRV : Shift<0b01, "lsr", srl>; +defm RORV : Shift<0b11, "ror", rotr>; + +def : ShiftAlias<"asrv", ASRVWr, GPR32>; +def : ShiftAlias<"asrv", ASRVXr, GPR64>; +def : ShiftAlias<"lslv", LSLVWr, GPR32>; +def : ShiftAlias<"lslv", LSLVXr, GPR64>; +def : ShiftAlias<"lsrv", LSRVWr, GPR32>; +def : ShiftAlias<"lsrv", LSRVXr, GPR64>; +def : ShiftAlias<"rorv", RORVWr, GPR32>; +def : ShiftAlias<"rorv", RORVXr, GPR64>; + +// Multiply-add +let AddedComplexity = 7 in { +defm MADD : MulAccum<0, "madd", add>; +defm MSUB : MulAccum<1, "msub", sub>; + +def : Pat<(i32 (mul GPR32:$Rn, GPR32:$Rm)), + (MADDWrrr GPR32:$Rn, GPR32:$Rm, WZR)>; +def : Pat<(i64 (mul GPR64:$Rn, GPR64:$Rm)), + (MADDXrrr GPR64:$Rn, GPR64:$Rm, XZR)>; + +def : Pat<(i32 (ineg (mul GPR32:$Rn, GPR32:$Rm))), + (MSUBWrrr GPR32:$Rn, GPR32:$Rm, WZR)>; +def : Pat<(i64 (ineg (mul GPR64:$Rn, GPR64:$Rm))), + (MSUBXrrr GPR64:$Rn, GPR64:$Rm, XZR)>; +} // AddedComplexity = 7 + +let AddedComplexity = 5 in { +def SMADDLrrr : WideMulAccum<0, 0b001, "smaddl", add, sext>; +def SMSUBLrrr : WideMulAccum<1, 0b001, "smsubl", sub, sext>; +def UMADDLrrr : WideMulAccum<0, 0b101, "umaddl", add, zext>; +def UMSUBLrrr : WideMulAccum<1, 0b101, "umsubl", sub, zext>; + +def : Pat<(i64 (mul (sext GPR32:$Rn), (sext GPR32:$Rm))), + (SMADDLrrr GPR32:$Rn, GPR32:$Rm, XZR)>; +def : Pat<(i64 (mul (zext GPR32:$Rn), (zext GPR32:$Rm))), + (UMADDLrrr GPR32:$Rn, GPR32:$Rm, XZR)>; + +def : Pat<(i64 (ineg (mul (sext GPR32:$Rn), (sext GPR32:$Rm)))), + (SMSUBLrrr GPR32:$Rn, GPR32:$Rm, XZR)>; +def : Pat<(i64 (ineg (mul (zext GPR32:$Rn), (zext GPR32:$Rm)))), + (UMSUBLrrr GPR32:$Rn, GPR32:$Rm, XZR)>; +} // AddedComplexity = 5 + +def : MulAccumWAlias<"mul", MADDWrrr>; +def : MulAccumXAlias<"mul", MADDXrrr>; +def : MulAccumWAlias<"mneg", MSUBWrrr>; +def : MulAccumXAlias<"mneg", MSUBXrrr>; +def : WideMulAccumAlias<"smull", SMADDLrrr>; +def : WideMulAccumAlias<"smnegl", SMSUBLrrr>; +def : 
WideMulAccumAlias<"umull", UMADDLrrr>; +def : WideMulAccumAlias<"umnegl", UMSUBLrrr>; + +// Multiply-high +def SMULHrr : MulHi<0b010, "smulh", mulhs>; +def UMULHrr : MulHi<0b110, "umulh", mulhu>; + +// CRC32 +def CRC32Brr : BaseCRC32<0, 0b00, 0, GPR32, int_aarch64_crc32b, "crc32b">; +def CRC32Hrr : BaseCRC32<0, 0b01, 0, GPR32, int_aarch64_crc32h, "crc32h">; +def CRC32Wrr : BaseCRC32<0, 0b10, 0, GPR32, int_aarch64_crc32w, "crc32w">; +def CRC32Xrr : BaseCRC32<1, 0b11, 0, GPR64, int_aarch64_crc32x, "crc32x">; + +def CRC32CBrr : BaseCRC32<0, 0b00, 1, GPR32, int_aarch64_crc32cb, "crc32cb">; +def CRC32CHrr : BaseCRC32<0, 0b01, 1, GPR32, int_aarch64_crc32ch, "crc32ch">; +def CRC32CWrr : BaseCRC32<0, 0b10, 1, GPR32, int_aarch64_crc32cw, "crc32cw">; +def CRC32CXrr : BaseCRC32<1, 0b11, 1, GPR64, int_aarch64_crc32cx, "crc32cx">; + + +//===----------------------------------------------------------------------===// +// Logical instructions. +//===----------------------------------------------------------------------===// + +// (immediate) +defm ANDS : LogicalImmS<0b11, "ands", AArch64and_flag>; +defm AND : LogicalImm<0b00, "and", and>; +defm EOR : LogicalImm<0b10, "eor", xor>; +defm ORR : LogicalImm<0b01, "orr", or>; + +// FIXME: these aliases *are* canonical sometimes (when movz can't be +// used). Actually, it seems to be working right now, but putting logical_immXX +// here is a bit dodgy on the AsmParser side too. +def : InstAlias<"mov $dst, $imm", (ORRWri GPR32sp:$dst, WZR, + logical_imm32:$imm), 0>; +def : InstAlias<"mov $dst, $imm", (ORRXri GPR64sp:$dst, XZR, + logical_imm64:$imm), 0>; + + +// (register) +defm ANDS : LogicalRegS<0b11, 0, "ands", AArch64and_flag>; +defm BICS : LogicalRegS<0b11, 1, "bics", + BinOpFrag<(AArch64and_flag node:$LHS, (not node:$RHS))>>; +defm AND : LogicalReg<0b00, 0, "and", and>; +defm BIC : LogicalReg<0b00, 1, "bic", + BinOpFrag<(and node:$LHS, (not node:$RHS))>>; +defm EON : LogicalReg<0b10, 1, "eon", + BinOpFrag<(xor node:$LHS, (not node:$RHS))>>; +defm EOR : LogicalReg<0b10, 0, "eor", xor>; +defm ORN : LogicalReg<0b01, 1, "orn", + BinOpFrag<(or node:$LHS, (not node:$RHS))>>; +defm ORR : LogicalReg<0b01, 0, "orr", or>; + +def : InstAlias<"mov $dst, $src", (ORRWrs GPR32:$dst, WZR, GPR32:$src, 0), 2>; +def : InstAlias<"mov $dst, $src", (ORRXrs GPR64:$dst, XZR, GPR64:$src, 0), 2>; + +def : InstAlias<"mvn $Wd, $Wm", (ORNWrs GPR32:$Wd, WZR, GPR32:$Wm, 0), 3>; +def : InstAlias<"mvn $Xd, $Xm", (ORNXrs GPR64:$Xd, XZR, GPR64:$Xm, 0), 3>; + +def : InstAlias<"mvn $Wd, $Wm$sh", + (ORNWrs GPR32:$Wd, WZR, GPR32:$Wm, logical_shift32:$sh), 2>; +def : InstAlias<"mvn $Xd, $Xm$sh", + (ORNXrs GPR64:$Xd, XZR, GPR64:$Xm, logical_shift64:$sh), 2>; + +def : InstAlias<"tst $src1, $src2", + (ANDSWri WZR, GPR32:$src1, logical_imm32:$src2), 2>; +def : InstAlias<"tst $src1, $src2", + (ANDSXri XZR, GPR64:$src1, logical_imm64:$src2), 2>; + +def : InstAlias<"tst $src1, $src2", + (ANDSWrs WZR, GPR32:$src1, GPR32:$src2, 0), 3>; +def : InstAlias<"tst $src1, $src2", + (ANDSXrs XZR, GPR64:$src1, GPR64:$src2, 0), 3>; + +def : InstAlias<"tst $src1, $src2$sh", + (ANDSWrs WZR, GPR32:$src1, GPR32:$src2, logical_shift32:$sh), 2>; +def : InstAlias<"tst $src1, $src2$sh", + (ANDSXrs XZR, GPR64:$src1, GPR64:$src2, logical_shift64:$sh), 2>; + + +def : Pat<(not GPR32:$Wm), (ORNWrr WZR, GPR32:$Wm)>; +def : Pat<(not GPR64:$Xm), (ORNXrr XZR, GPR64:$Xm)>; + + +//===----------------------------------------------------------------------===// +// One operand data processing instructions. 
+//===----------------------------------------------------------------------===// + +defm CLS : OneOperandData<0b101, "cls">; +defm CLZ : OneOperandData<0b100, "clz", ctlz>; +defm RBIT : OneOperandData<0b000, "rbit">; +def REV16Wr : OneWRegData<0b001, "rev16", + UnOpFrag<(rotr (bswap node:$LHS), (i64 16))>>; +def REV16Xr : OneXRegData<0b001, "rev16", null_frag>; + +def : Pat<(cttz GPR32:$Rn), + (CLZWr (RBITWr GPR32:$Rn))>; +def : Pat<(cttz GPR64:$Rn), + (CLZXr (RBITXr GPR64:$Rn))>; +def : Pat<(ctlz (or (shl (xor (sra GPR32:$Rn, (i64 31)), GPR32:$Rn), (i64 1)), + (i32 1))), + (CLSWr GPR32:$Rn)>; +def : Pat<(ctlz (or (shl (xor (sra GPR64:$Rn, (i64 63)), GPR64:$Rn), (i64 1)), + (i64 1))), + (CLSXr GPR64:$Rn)>; + +// Unlike the other one operand instructions, the instructions with the "rev" +// mnemonic do *not* just different in the size bit, but actually use different +// opcode bits for the different sizes. +def REVWr : OneWRegData<0b010, "rev", bswap>; +def REVXr : OneXRegData<0b011, "rev", bswap>; +def REV32Xr : OneXRegData<0b010, "rev32", + UnOpFrag<(rotr (bswap node:$LHS), (i64 32))>>; + +// The bswap commutes with the rotr so we want a pattern for both possible +// orders. +def : Pat<(bswap (rotr GPR32:$Rn, (i64 16))), (REV16Wr GPR32:$Rn)>; +def : Pat<(bswap (rotr GPR64:$Rn, (i64 32))), (REV32Xr GPR64:$Rn)>; + +//===----------------------------------------------------------------------===// +// Bitfield immediate extraction instruction. +//===----------------------------------------------------------------------===// +let neverHasSideEffects = 1 in +defm EXTR : ExtractImm<"extr">; +def : InstAlias<"ror $dst, $src, $shift", + (EXTRWrri GPR32:$dst, GPR32:$src, GPR32:$src, imm0_31:$shift)>; +def : InstAlias<"ror $dst, $src, $shift", + (EXTRXrri GPR64:$dst, GPR64:$src, GPR64:$src, imm0_63:$shift)>; + +def : Pat<(rotr GPR32:$Rn, (i64 imm0_31:$imm)), + (EXTRWrri GPR32:$Rn, GPR32:$Rn, imm0_31:$imm)>; +def : Pat<(rotr GPR64:$Rn, (i64 imm0_63:$imm)), + (EXTRXrri GPR64:$Rn, GPR64:$Rn, imm0_63:$imm)>; + +//===----------------------------------------------------------------------===// +// Other bitfield immediate instructions. +//===----------------------------------------------------------------------===// +let neverHasSideEffects = 1 in { +defm BFM : BitfieldImmWith2RegArgs<0b01, "bfm">; +defm SBFM : BitfieldImm<0b00, "sbfm">; +defm UBFM : BitfieldImm<0b10, "ubfm">; +} + +def i32shift_a : Operand, SDNodeXFormgetZExtValue()) & 0x1f; + return CurDAG->getTargetConstant(enc, MVT::i64); +}]>; + +def i32shift_b : Operand, SDNodeXFormgetZExtValue(); + return CurDAG->getTargetConstant(enc, MVT::i64); +}]>; + +// min(7, 31 - shift_amt) +def i32shift_sext_i8 : Operand, SDNodeXFormgetZExtValue(); + enc = enc > 7 ? 7 : enc; + return CurDAG->getTargetConstant(enc, MVT::i64); +}]>; + +// min(15, 31 - shift_amt) +def i32shift_sext_i16 : Operand, SDNodeXFormgetZExtValue(); + enc = enc > 15 ? 15 : enc; + return CurDAG->getTargetConstant(enc, MVT::i64); +}]>; + +def i64shift_a : Operand, SDNodeXFormgetZExtValue()) & 0x3f; + return CurDAG->getTargetConstant(enc, MVT::i64); +}]>; + +def i64shift_b : Operand, SDNodeXFormgetZExtValue(); + return CurDAG->getTargetConstant(enc, MVT::i64); +}]>; + +// min(7, 63 - shift_amt) +def i64shift_sext_i8 : Operand, SDNodeXFormgetZExtValue(); + enc = enc > 7 ? 7 : enc; + return CurDAG->getTargetConstant(enc, MVT::i64); +}]>; + +// min(15, 63 - shift_amt) +def i64shift_sext_i16 : Operand, SDNodeXFormgetZExtValue(); + enc = enc > 15 ? 
15 : enc; + return CurDAG->getTargetConstant(enc, MVT::i64); +}]>; + +// min(31, 63 - shift_amt) +def i64shift_sext_i32 : Operand, SDNodeXFormgetZExtValue(); + enc = enc > 31 ? 31 : enc; + return CurDAG->getTargetConstant(enc, MVT::i64); +}]>; + +def : Pat<(shl GPR32:$Rn, (i64 imm0_31:$imm)), + (UBFMWri GPR32:$Rn, (i64 (i32shift_a imm0_31:$imm)), + (i64 (i32shift_b imm0_31:$imm)))>; +def : Pat<(shl GPR64:$Rn, (i64 imm0_63:$imm)), + (UBFMXri GPR64:$Rn, (i64 (i64shift_a imm0_63:$imm)), + (i64 (i64shift_b imm0_63:$imm)))>; + +let AddedComplexity = 10 in { +def : Pat<(sra GPR32:$Rn, (i64 imm0_31:$imm)), + (SBFMWri GPR32:$Rn, imm0_31:$imm, 31)>; +def : Pat<(sra GPR64:$Rn, (i64 imm0_63:$imm)), + (SBFMXri GPR64:$Rn, imm0_63:$imm, 63)>; +} + +def : InstAlias<"asr $dst, $src, $shift", + (SBFMWri GPR32:$dst, GPR32:$src, imm0_31:$shift, 31)>; +def : InstAlias<"asr $dst, $src, $shift", + (SBFMXri GPR64:$dst, GPR64:$src, imm0_63:$shift, 63)>; +def : InstAlias<"sxtb $dst, $src", (SBFMWri GPR32:$dst, GPR32:$src, 0, 7)>; +def : InstAlias<"sxtb $dst, $src", (SBFMXri GPR64:$dst, GPR64:$src, 0, 7)>; +def : InstAlias<"sxth $dst, $src", (SBFMWri GPR32:$dst, GPR32:$src, 0, 15)>; +def : InstAlias<"sxth $dst, $src", (SBFMXri GPR64:$dst, GPR64:$src, 0, 15)>; +def : InstAlias<"sxtw $dst, $src", (SBFMXri GPR64:$dst, GPR64:$src, 0, 31)>; + +def : Pat<(srl GPR32:$Rn, (i64 imm0_31:$imm)), + (UBFMWri GPR32:$Rn, imm0_31:$imm, 31)>; +def : Pat<(srl GPR64:$Rn, (i64 imm0_63:$imm)), + (UBFMXri GPR64:$Rn, imm0_63:$imm, 63)>; + +def : InstAlias<"lsr $dst, $src, $shift", + (UBFMWri GPR32:$dst, GPR32:$src, imm0_31:$shift, 31)>; +def : InstAlias<"lsr $dst, $src, $shift", + (UBFMXri GPR64:$dst, GPR64:$src, imm0_63:$shift, 63)>; +def : InstAlias<"uxtb $dst, $src", (UBFMWri GPR32:$dst, GPR32:$src, 0, 7)>; +def : InstAlias<"uxtb $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 7)>; +def : InstAlias<"uxth $dst, $src", (UBFMWri GPR32:$dst, GPR32:$src, 0, 15)>; +def : InstAlias<"uxth $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 15)>; +def : InstAlias<"uxtw $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 31)>; + +//===----------------------------------------------------------------------===// +// Conditionally set flags instructions. +//===----------------------------------------------------------------------===// +defm CCMN : CondSetFlagsImm<0, "ccmn">; +defm CCMP : CondSetFlagsImm<1, "ccmp">; + +defm CCMN : CondSetFlagsReg<0, "ccmn">; +defm CCMP : CondSetFlagsReg<1, "ccmp">; + +//===----------------------------------------------------------------------===// +// Conditional select instructions. 
+//===----------------------------------------------------------------------===// +defm CSEL : CondSelect<0, 0b00, "csel">; + +def inc : PatFrag<(ops node:$in), (add node:$in, 1)>; +defm CSINC : CondSelectOp<0, 0b01, "csinc", inc>; +defm CSINV : CondSelectOp<1, 0b00, "csinv", not>; +defm CSNEG : CondSelectOp<1, 0b01, "csneg", ineg>; + +def : Pat<(AArch64csinv GPR32:$tval, GPR32:$fval, (i32 imm:$cc), NZCV), + (CSINVWr GPR32:$tval, GPR32:$fval, (i32 imm:$cc))>; +def : Pat<(AArch64csinv GPR64:$tval, GPR64:$fval, (i32 imm:$cc), NZCV), + (CSINVXr GPR64:$tval, GPR64:$fval, (i32 imm:$cc))>; +def : Pat<(AArch64csneg GPR32:$tval, GPR32:$fval, (i32 imm:$cc), NZCV), + (CSNEGWr GPR32:$tval, GPR32:$fval, (i32 imm:$cc))>; +def : Pat<(AArch64csneg GPR64:$tval, GPR64:$fval, (i32 imm:$cc), NZCV), + (CSNEGXr GPR64:$tval, GPR64:$fval, (i32 imm:$cc))>; +def : Pat<(AArch64csinc GPR32:$tval, GPR32:$fval, (i32 imm:$cc), NZCV), + (CSINCWr GPR32:$tval, GPR32:$fval, (i32 imm:$cc))>; +def : Pat<(AArch64csinc GPR64:$tval, GPR64:$fval, (i32 imm:$cc), NZCV), + (CSINCXr GPR64:$tval, GPR64:$fval, (i32 imm:$cc))>; + +def : Pat<(AArch64csel (i32 0), (i32 1), (i32 imm:$cc), NZCV), + (CSINCWr WZR, WZR, (i32 imm:$cc))>; +def : Pat<(AArch64csel (i64 0), (i64 1), (i32 imm:$cc), NZCV), + (CSINCXr XZR, XZR, (i32 imm:$cc))>; +def : Pat<(AArch64csel (i32 0), (i32 -1), (i32 imm:$cc), NZCV), + (CSINVWr WZR, WZR, (i32 imm:$cc))>; +def : Pat<(AArch64csel (i64 0), (i64 -1), (i32 imm:$cc), NZCV), + (CSINVXr XZR, XZR, (i32 imm:$cc))>; + +// The inverse of the condition code from the alias instruction is what is used +// in the aliased instruction. The parser all ready inverts the condition code +// for these aliases. +def : InstAlias<"cset $dst, $cc", + (CSINCWr GPR32:$dst, WZR, WZR, inv_ccode:$cc)>; +def : InstAlias<"cset $dst, $cc", + (CSINCXr GPR64:$dst, XZR, XZR, inv_ccode:$cc)>; + +def : InstAlias<"csetm $dst, $cc", + (CSINVWr GPR32:$dst, WZR, WZR, inv_ccode:$cc)>; +def : InstAlias<"csetm $dst, $cc", + (CSINVXr GPR64:$dst, XZR, XZR, inv_ccode:$cc)>; + +def : InstAlias<"cinc $dst, $src, $cc", + (CSINCWr GPR32:$dst, GPR32:$src, GPR32:$src, inv_ccode:$cc)>; +def : InstAlias<"cinc $dst, $src, $cc", + (CSINCXr GPR64:$dst, GPR64:$src, GPR64:$src, inv_ccode:$cc)>; + +def : InstAlias<"cinv $dst, $src, $cc", + (CSINVWr GPR32:$dst, GPR32:$src, GPR32:$src, inv_ccode:$cc)>; +def : InstAlias<"cinv $dst, $src, $cc", + (CSINVXr GPR64:$dst, GPR64:$src, GPR64:$src, inv_ccode:$cc)>; + +def : InstAlias<"cneg $dst, $src, $cc", + (CSNEGWr GPR32:$dst, GPR32:$src, GPR32:$src, inv_ccode:$cc)>; +def : InstAlias<"cneg $dst, $src, $cc", + (CSNEGXr GPR64:$dst, GPR64:$src, GPR64:$src, inv_ccode:$cc)>; + +//===----------------------------------------------------------------------===// +// PC-relative instructions. +//===----------------------------------------------------------------------===// +let isReMaterializable = 1 in { +let neverHasSideEffects = 1, mayStore = 0, mayLoad = 0 in { +def ADR : ADRI<0, "adr", adrlabel, []>; +} // neverHasSideEffects = 1 + +def ADRP : ADRI<1, "adrp", adrplabel, + [(set GPR64:$Xd, (AArch64adrp tglobaladdr:$label))]>; +} // isReMaterializable = 1 + +// page address of a constant pool entry, block address +def : Pat<(AArch64adrp tconstpool:$cp), (ADRP tconstpool:$cp)>; +def : Pat<(AArch64adrp tblockaddress:$cp), (ADRP tblockaddress:$cp)>; + +//===----------------------------------------------------------------------===// +// Unconditional branch (register) instructions. 
+//===----------------------------------------------------------------------===// + +let isReturn = 1, isTerminator = 1, isBarrier = 1 in { +def RET : BranchReg<0b0010, "ret", []>; +def DRPS : SpecialReturn<0b0101, "drps">; +def ERET : SpecialReturn<0b0100, "eret">; +} // isReturn = 1, isTerminator = 1, isBarrier = 1 + +// Default to the LR register. +def : InstAlias<"ret", (RET LR)>; + +let isCall = 1, Defs = [LR], Uses = [SP] in { +def BLR : BranchReg<0b0001, "blr", [(AArch64call GPR64:$Rn)]>; +} // isCall + +let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in { +def BR : BranchReg<0b0000, "br", [(brind GPR64:$Rn)]>; +} // isBranch, isTerminator, isBarrier, isIndirectBranch + +// Create a separate pseudo-instruction for codegen to use so that we don't +// flag lr as used in every function. It'll be restored before the RET by the +// epilogue if it's legitimately used. +def RET_ReallyLR : Pseudo<(outs), (ins), [(AArch64retflag)]> { + let isTerminator = 1; + let isBarrier = 1; + let isReturn = 1; +} + +// This is a directive-like pseudo-instruction. The purpose is to insert an +// R_AARCH64_TLSDESC_CALL relocation at the offset of the following instruction +// (which in the usual case is a BLR). +let hasSideEffects = 1 in +def TLSDESCCALL : Pseudo<(outs), (ins i64imm:$sym), []> { + let AsmString = ".tlsdesccall $sym"; +} + +// Pseudo-instruction representing a BLR with attached TLSDESC relocation. It +// gets expanded to two MCInsts during lowering. +let isCall = 1, Defs = [LR] in +def TLSDESC_BLR + : Pseudo<(outs), (ins GPR64:$dest, i64imm:$sym), + [(AArch64tlsdesc_call GPR64:$dest, tglobaltlsaddr:$sym)]>; + +def : Pat<(AArch64tlsdesc_call GPR64:$dest, texternalsym:$sym), + (TLSDESC_BLR GPR64:$dest, texternalsym:$sym)>; +//===----------------------------------------------------------------------===// +// Conditional branch (immediate) instruction. +//===----------------------------------------------------------------------===// +def Bcc : BranchCond; + +//===----------------------------------------------------------------------===// +// Compare-and-branch instructions. +//===----------------------------------------------------------------------===// +defm CBZ : CmpBranch<0, "cbz", AArch64cbz>; +defm CBNZ : CmpBranch<1, "cbnz", AArch64cbnz>; + +//===----------------------------------------------------------------------===// +// Test-bit-and-branch instructions. +//===----------------------------------------------------------------------===// +defm TBZ : TestBranch<0, "tbz", AArch64tbz>; +defm TBNZ : TestBranch<1, "tbnz", AArch64tbnz>; + +//===----------------------------------------------------------------------===// +// Unconditional branch (immediate) instructions. +//===----------------------------------------------------------------------===// +let isBranch = 1, isTerminator = 1, isBarrier = 1 in { +def B : BranchImm<0, "b", [(br bb:$addr)]>; +} // isBranch, isTerminator, isBarrier + +let isCall = 1, Defs = [LR], Uses = [SP] in { +def BL : CallImm<1, "bl", [(AArch64call tglobaladdr:$addr)]>; +} // isCall +def : Pat<(AArch64call texternalsym:$func), (BL texternalsym:$func)>; + +//===----------------------------------------------------------------------===// +// Exception generation instructions. 
+//===----------------------------------------------------------------------===// +def BRK : ExceptionGeneration<0b001, 0b00, "brk">; +def DCPS1 : ExceptionGeneration<0b101, 0b01, "dcps1">; +def DCPS2 : ExceptionGeneration<0b101, 0b10, "dcps2">; +def DCPS3 : ExceptionGeneration<0b101, 0b11, "dcps3">; +def HLT : ExceptionGeneration<0b010, 0b00, "hlt">; +def HVC : ExceptionGeneration<0b000, 0b10, "hvc">; +def SMC : ExceptionGeneration<0b000, 0b11, "smc">; +def SVC : ExceptionGeneration<0b000, 0b01, "svc">; + +// DCPSn defaults to an immediate operand of zero if unspecified. +def : InstAlias<"dcps1", (DCPS1 0)>; +def : InstAlias<"dcps2", (DCPS2 0)>; +def : InstAlias<"dcps3", (DCPS3 0)>; + +//===----------------------------------------------------------------------===// +// Load instructions. +//===----------------------------------------------------------------------===// + +// Pair (indexed, offset) +defm LDPW : LoadPairOffset<0b00, 0, GPR32, simm7s4, "ldp">; +defm LDPX : LoadPairOffset<0b10, 0, GPR64, simm7s8, "ldp">; +defm LDPS : LoadPairOffset<0b00, 1, FPR32, simm7s4, "ldp">; +defm LDPD : LoadPairOffset<0b01, 1, FPR64, simm7s8, "ldp">; +defm LDPQ : LoadPairOffset<0b10, 1, FPR128, simm7s16, "ldp">; + +defm LDPSW : LoadPairOffset<0b01, 0, GPR64, simm7s4, "ldpsw">; + +// Pair (pre-indexed) +def LDPWpre : LoadPairPreIdx<0b00, 0, GPR32, simm7s4, "ldp">; +def LDPXpre : LoadPairPreIdx<0b10, 0, GPR64, simm7s8, "ldp">; +def LDPSpre : LoadPairPreIdx<0b00, 1, FPR32, simm7s4, "ldp">; +def LDPDpre : LoadPairPreIdx<0b01, 1, FPR64, simm7s8, "ldp">; +def LDPQpre : LoadPairPreIdx<0b10, 1, FPR128, simm7s16, "ldp">; + +def LDPSWpre : LoadPairPreIdx<0b01, 0, GPR64, simm7s4, "ldpsw">; + +// Pair (post-indexed) +def LDPWpost : LoadPairPostIdx<0b00, 0, GPR32, simm7s4, "ldp">; +def LDPXpost : LoadPairPostIdx<0b10, 0, GPR64, simm7s8, "ldp">; +def LDPSpost : LoadPairPostIdx<0b00, 1, FPR32, simm7s4, "ldp">; +def LDPDpost : LoadPairPostIdx<0b01, 1, FPR64, simm7s8, "ldp">; +def LDPQpost : LoadPairPostIdx<0b10, 1, FPR128, simm7s16, "ldp">; + +def LDPSWpost : LoadPairPostIdx<0b01, 0, GPR64, simm7s4, "ldpsw">; + + +// Pair (no allocate) +defm LDNPW : LoadPairNoAlloc<0b00, 0, GPR32, simm7s4, "ldnp">; +defm LDNPX : LoadPairNoAlloc<0b10, 0, GPR64, simm7s8, "ldnp">; +defm LDNPS : LoadPairNoAlloc<0b00, 1, FPR32, simm7s4, "ldnp">; +defm LDNPD : LoadPairNoAlloc<0b01, 1, FPR64, simm7s8, "ldnp">; +defm LDNPQ : LoadPairNoAlloc<0b10, 1, FPR128, simm7s16, "ldnp">; + +//--- +// (register offset) +//--- + +// Integer +defm LDRBB : Load8RO<0b00, 0, 0b01, GPR32, "ldrb", i32, zextloadi8>; +defm LDRHH : Load16RO<0b01, 0, 0b01, GPR32, "ldrh", i32, zextloadi16>; +defm LDRW : Load32RO<0b10, 0, 0b01, GPR32, "ldr", i32, load>; +defm LDRX : Load64RO<0b11, 0, 0b01, GPR64, "ldr", i64, load>; + +// Floating-point +defm LDRB : Load8RO<0b00, 1, 0b01, FPR8, "ldr", untyped, load>; +defm LDRH : Load16RO<0b01, 1, 0b01, FPR16, "ldr", f16, load>; +defm LDRS : Load32RO<0b10, 1, 0b01, FPR32, "ldr", f32, load>; +defm LDRD : Load64RO<0b11, 1, 0b01, FPR64, "ldr", f64, load>; +defm LDRQ : Load128RO<0b00, 1, 0b11, FPR128, "ldr", f128, load>; + +// Load sign-extended half-word +defm LDRSHW : Load16RO<0b01, 0, 0b11, GPR32, "ldrsh", i32, sextloadi16>; +defm LDRSHX : Load16RO<0b01, 0, 0b10, GPR64, "ldrsh", i64, sextloadi16>; + +// Load sign-extended byte +defm LDRSBW : Load8RO<0b00, 0, 0b11, GPR32, "ldrsb", i32, sextloadi8>; +defm LDRSBX : Load8RO<0b00, 0, 0b10, GPR64, "ldrsb", i64, sextloadi8>; + +// Load sign-extended word +defm LDRSW : Load32RO<0b10, 0, 0b10, 
GPR64, "ldrsw", i64, sextloadi32>; + +// Pre-fetch. +defm PRFM : PrefetchRO<0b11, 0, 0b10, "prfm">; + +// For regular load, we do not have any alignment requirement. +// Thus, it is safe to directly map the vector loads with interesting +// addressing modes. +// FIXME: We could do the same for bitconvert to floating point vectors. +multiclass ScalToVecROLoadPat { + def : Pat<(VecTy (scalar_to_vector (ScalTy + (loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$offset))))), + (INSERT_SUBREG (VecTy (IMPLICIT_DEF)), + (LOADW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$offset), + sub)>; + + def : Pat<(VecTy (scalar_to_vector (ScalTy + (loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$offset))))), + (INSERT_SUBREG (VecTy (IMPLICIT_DEF)), + (LOADX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$offset), + sub)>; +} + +let AddedComplexity = 10 in { +defm : ScalToVecROLoadPat; +defm : ScalToVecROLoadPat; + +defm : ScalToVecROLoadPat; +defm : ScalToVecROLoadPat; + +defm : ScalToVecROLoadPat; +defm : ScalToVecROLoadPat; + +defm : ScalToVecROLoadPat; +defm : ScalToVecROLoadPat; + +defm : ScalToVecROLoadPat; + +defm : ScalToVecROLoadPat; + + +def : Pat <(v1i64 (scalar_to_vector (i64 + (load (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm, + ro_Wextend64:$extend))))), + (LDRDroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend)>; + +def : Pat <(v1i64 (scalar_to_vector (i64 + (load (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm, + ro_Xextend64:$extend))))), + (LDRDroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend)>; +} + +// Match all load 64 bits width whose type is compatible with FPR64 +multiclass VecROLoadPat { + + def : Pat<(VecTy (load (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend))), + (LOADW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>; + + def : Pat<(VecTy (load (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend))), + (LOADX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>; +} + +let AddedComplexity = 10 in { +let Predicates = [IsLE] in { + // We must do vector loads with LD1 in big-endian. + defm : VecROLoadPat; + defm : VecROLoadPat; + defm : VecROLoadPat; + defm : VecROLoadPat; +} + +defm : VecROLoadPat; +defm : VecROLoadPat; + +// Match all load 128 bits width whose type is compatible with FPR128 +let Predicates = [IsLE] in { + // We must do vector loads with LD1 in big-endian. 
+ defm : VecROLoadPat; + defm : VecROLoadPat; + defm : VecROLoadPat; + defm : VecROLoadPat; + defm : VecROLoadPat; + defm : VecROLoadPat; +} +} // AddedComplexity = 10 + +// zextload -> i64 +multiclass ExtLoadTo64ROPat { + def : Pat<(i64 (loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend))), + (SUBREG_TO_REG (i64 0), + (INSTW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend), + sub_32)>; + + def : Pat<(i64 (loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend))), + (SUBREG_TO_REG (i64 0), + (INSTX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend), + sub_32)>; +} + +let AddedComplexity = 10 in { + defm : ExtLoadTo64ROPat; + defm : ExtLoadTo64ROPat; + defm : ExtLoadTo64ROPat; + + // zextloadi1 -> zextloadi8 + defm : ExtLoadTo64ROPat; + + // extload -> zextload + defm : ExtLoadTo64ROPat; + defm : ExtLoadTo64ROPat; + defm : ExtLoadTo64ROPat; + + // extloadi1 -> zextloadi8 + defm : ExtLoadTo64ROPat; +} + + +// zextload -> i64 +multiclass ExtLoadTo32ROPat { + def : Pat<(i32 (loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend))), + (INSTW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>; + + def : Pat<(i32 (loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend))), + (INSTX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>; + +} + +let AddedComplexity = 10 in { + // extload -> zextload + defm : ExtLoadTo32ROPat; + defm : ExtLoadTo32ROPat; + defm : ExtLoadTo32ROPat; + + // zextloadi1 -> zextloadi8 + defm : ExtLoadTo32ROPat; +} + +//--- +// (unsigned immediate) +//--- +defm LDRX : LoadUI<0b11, 0, 0b01, GPR64, uimm12s8, "ldr", + [(set GPR64:$Rt, + (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)))]>; +defm LDRW : LoadUI<0b10, 0, 0b01, GPR32, uimm12s4, "ldr", + [(set GPR32:$Rt, + (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset)))]>; +defm LDRB : LoadUI<0b00, 1, 0b01, FPR8, uimm12s1, "ldr", + [(set FPR8:$Rt, + (load (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset)))]>; +defm LDRH : LoadUI<0b01, 1, 0b01, FPR16, uimm12s2, "ldr", + [(set (f16 FPR16:$Rt), + (load (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset)))]>; +defm LDRS : LoadUI<0b10, 1, 0b01, FPR32, uimm12s4, "ldr", + [(set (f32 FPR32:$Rt), + (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset)))]>; +defm LDRD : LoadUI<0b11, 1, 0b01, FPR64, uimm12s8, "ldr", + [(set (f64 FPR64:$Rt), + (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)))]>; +defm LDRQ : LoadUI<0b00, 1, 0b11, FPR128, uimm12s16, "ldr", + [(set (f128 FPR128:$Rt), + (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)))]>; + +// For regular load, we do not have any alignment requirement. +// Thus, it is safe to directly map the vector loads with interesting +// addressing modes. +// FIXME: We could do the same for bitconvert to floating point vectors. 
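The scalar_to_vector patterns below are easier to follow with their reference semantics in mind. A minimal sketch in plain C++ (the helper name and the zero-initialisation are illustrative only; in the DAG the untouched lanes come from IMPLICIT_DEF and are undefined): the loaded byte goes straight into lane 0 of the vector register, which is what the INSERT_SUBREG of the LDRB result into the bsub subregister expresses.

  #include <cstdint>

  // Reference semantics of (v16i8 (scalar_to_vector (i32 (extloadi8 addr)))):
  // one byte is loaded and placed in lane 0; all other lanes are unspecified.
  struct V16i8 { uint8_t Lane[16]; };

  V16i8 scalarToVectorLoadByte(const uint8_t *Addr) { // hypothetical helper
    V16i8 V = {};      // stands in for IMPLICIT_DEF; the lanes are really undefined
    V.Lane[0] = *Addr; // LDRBui/LDRBro load directly into the b-subregister
    return V;
  }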
+def : Pat <(v8i8 (scalar_to_vector (i32 + (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))), + (INSERT_SUBREG (v8i8 (IMPLICIT_DEF)), + (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>; +def : Pat <(v16i8 (scalar_to_vector (i32 + (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))), + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>; +def : Pat <(v4i16 (scalar_to_vector (i32 + (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))), + (INSERT_SUBREG (v4i16 (IMPLICIT_DEF)), + (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>; +def : Pat <(v8i16 (scalar_to_vector (i32 + (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))), + (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), + (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>; +def : Pat <(v2i32 (scalar_to_vector (i32 + (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))), + (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)), + (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>; +def : Pat <(v4i32 (scalar_to_vector (i32 + (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))), + (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), + (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>; +def : Pat <(v1i64 (scalar_to_vector (i64 + (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))))), + (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; +def : Pat <(v2i64 (scalar_to_vector (i64 + (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))))), + (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), + (LDRDui GPR64sp:$Rn, uimm12s8:$offset), dsub)>; + +// Match all load 64 bits width whose type is compatible with FPR64 +let Predicates = [IsLE] in { + // We must use LD1 to perform vector loads in big-endian. + def : Pat<(v2f32 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))), + (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; + def : Pat<(v8i8 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))), + (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; + def : Pat<(v4i16 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))), + (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; + def : Pat<(v2i32 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))), + (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; +} +def : Pat<(v1f64 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))), + (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; +def : Pat<(v1i64 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))), + (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; + +// Match all load 128 bits width whose type is compatible with FPR128 +let Predicates = [IsLE] in { + // We must use LD1 to perform vector loads in big-endian. 
+ def : Pat<(v4f32 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))), + (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; + def : Pat<(v2f64 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))), + (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; + def : Pat<(v16i8 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))), + (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; + def : Pat<(v8i16 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))), + (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; + def : Pat<(v4i32 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))), + (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; + def : Pat<(v2i64 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))), + (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; +} +def : Pat<(f128 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))), + (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; + +defm LDRHH : LoadUI<0b01, 0, 0b01, GPR32, uimm12s2, "ldrh", + [(set GPR32:$Rt, + (zextloadi16 (am_indexed16 GPR64sp:$Rn, + uimm12s2:$offset)))]>; +defm LDRBB : LoadUI<0b00, 0, 0b01, GPR32, uimm12s1, "ldrb", + [(set GPR32:$Rt, + (zextloadi8 (am_indexed8 GPR64sp:$Rn, + uimm12s1:$offset)))]>; +// zextload -> i64 +def : Pat<(i64 (zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))), + (SUBREG_TO_REG (i64 0), (LDRBBui GPR64sp:$Rn, uimm12s1:$offset), sub_32)>; +def : Pat<(i64 (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))), + (SUBREG_TO_REG (i64 0), (LDRHHui GPR64sp:$Rn, uimm12s2:$offset), sub_32)>; + +// zextloadi1 -> zextloadi8 +def : Pat<(i32 (zextloadi1 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))), + (LDRBBui GPR64sp:$Rn, uimm12s1:$offset)>; +def : Pat<(i64 (zextloadi1 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))), + (SUBREG_TO_REG (i64 0), (LDRBBui GPR64sp:$Rn, uimm12s1:$offset), sub_32)>; + +// extload -> zextload +def : Pat<(i32 (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))), + (LDRHHui GPR64sp:$Rn, uimm12s2:$offset)>; +def : Pat<(i32 (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))), + (LDRBBui GPR64sp:$Rn, uimm12s1:$offset)>; +def : Pat<(i32 (extloadi1 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))), + (LDRBBui GPR64sp:$Rn, uimm12s1:$offset)>; +def : Pat<(i64 (extloadi32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))), + (SUBREG_TO_REG (i64 0), (LDRWui GPR64sp:$Rn, uimm12s4:$offset), sub_32)>; +def : Pat<(i64 (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))), + (SUBREG_TO_REG (i64 0), (LDRHHui GPR64sp:$Rn, uimm12s2:$offset), sub_32)>; +def : Pat<(i64 (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))), + (SUBREG_TO_REG (i64 0), (LDRBBui GPR64sp:$Rn, uimm12s1:$offset), sub_32)>; +def : Pat<(i64 (extloadi1 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))), + (SUBREG_TO_REG (i64 0), (LDRBBui GPR64sp:$Rn, uimm12s1:$offset), sub_32)>; + +// load sign-extended half-word +defm LDRSHW : LoadUI<0b01, 0, 0b11, GPR32, uimm12s2, "ldrsh", + [(set GPR32:$Rt, + (sextloadi16 (am_indexed16 GPR64sp:$Rn, + uimm12s2:$offset)))]>; +defm LDRSHX : LoadUI<0b01, 0, 0b10, GPR64, uimm12s2, "ldrsh", + [(set GPR64:$Rt, + (sextloadi16 (am_indexed16 GPR64sp:$Rn, + uimm12s2:$offset)))]>; + +// load sign-extended byte +defm LDRSBW : LoadUI<0b00, 0, 0b11, GPR32, uimm12s1, "ldrsb", + [(set GPR32:$Rt, + (sextloadi8 (am_indexed8 GPR64sp:$Rn, + uimm12s1:$offset)))]>; +defm LDRSBX : LoadUI<0b00, 0, 0b10, GPR64, uimm12s1, "ldrsb", + [(set GPR64:$Rt, + (sextloadi8 (am_indexed8 GPR64sp:$Rn, + uimm12s1:$offset)))]>; + +// load sign-extended word +defm LDRSW : LoadUI<0b10, 0, 0b10, GPR64, uimm12s4, "ldrsw", + [(set GPR64:$Rt, + (sextloadi32 (am_indexed32 GPR64sp:$Rn, 
+ uimm12s4:$offset)))]>; + +// load zero-extended word +def : Pat<(i64 (zextloadi32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))), + (SUBREG_TO_REG (i64 0), (LDRWui GPR64sp:$Rn, uimm12s4:$offset), sub_32)>; + +// Pre-fetch. +def PRFMui : PrefetchUI<0b11, 0, 0b10, "prfm", + [(AArch64Prefetch imm:$Rt, + (am_indexed64 GPR64sp:$Rn, + uimm12s8:$offset))]>; + +def : InstAlias<"prfm $Rt, [$Rn]", (PRFMui prfop:$Rt, GPR64sp:$Rn, 0)>; + +//--- +// (literal) +def LDRWl : LoadLiteral<0b00, 0, GPR32, "ldr">; +def LDRXl : LoadLiteral<0b01, 0, GPR64, "ldr">; +def LDRSl : LoadLiteral<0b00, 1, FPR32, "ldr">; +def LDRDl : LoadLiteral<0b01, 1, FPR64, "ldr">; +def LDRQl : LoadLiteral<0b10, 1, FPR128, "ldr">; + +// load sign-extended word +def LDRSWl : LoadLiteral<0b10, 0, GPR64, "ldrsw">; + +// prefetch +def PRFMl : PrefetchLiteral<0b11, 0, "prfm", []>; +// [(AArch64Prefetch imm:$Rt, tglobaladdr:$label)]>; + +//--- +// (unscaled immediate) +defm LDURX : LoadUnscaled<0b11, 0, 0b01, GPR64, "ldur", + [(set GPR64:$Rt, + (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset)))]>; +defm LDURW : LoadUnscaled<0b10, 0, 0b01, GPR32, "ldur", + [(set GPR32:$Rt, + (load (am_unscaled32 GPR64sp:$Rn, simm9:$offset)))]>; +defm LDURB : LoadUnscaled<0b00, 1, 0b01, FPR8, "ldur", + [(set FPR8:$Rt, + (load (am_unscaled8 GPR64sp:$Rn, simm9:$offset)))]>; +defm LDURH : LoadUnscaled<0b01, 1, 0b01, FPR16, "ldur", + [(set FPR16:$Rt, + (load (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>; +defm LDURS : LoadUnscaled<0b10, 1, 0b01, FPR32, "ldur", + [(set (f32 FPR32:$Rt), + (load (am_unscaled32 GPR64sp:$Rn, simm9:$offset)))]>; +defm LDURD : LoadUnscaled<0b11, 1, 0b01, FPR64, "ldur", + [(set (f64 FPR64:$Rt), + (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset)))]>; +defm LDURQ : LoadUnscaled<0b00, 1, 0b11, FPR128, "ldur", + [(set (f128 FPR128:$Rt), + (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset)))]>; + +defm LDURHH + : LoadUnscaled<0b01, 0, 0b01, GPR32, "ldurh", + [(set GPR32:$Rt, + (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>; +defm LDURBB + : LoadUnscaled<0b00, 0, 0b01, GPR32, "ldurb", + [(set GPR32:$Rt, + (zextloadi8 (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>; + +// Match all load 64 bits width whose type is compatible with FPR64 +let Predicates = [IsLE] in { + def : Pat<(v2f32 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))), + (LDURDi GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(v2i32 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))), + (LDURDi GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(v4i16 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))), + (LDURDi GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(v8i8 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))), + (LDURDi GPR64sp:$Rn, simm9:$offset)>; +} +def : Pat<(v1f64 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))), + (LDURDi GPR64sp:$Rn, simm9:$offset)>; +def : Pat<(v1i64 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))), + (LDURDi GPR64sp:$Rn, simm9:$offset)>; + +// Match all load 128 bits width whose type is compatible with FPR128 +let Predicates = [IsLE] in { + def : Pat<(v2f64 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))), + (LDURQi GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(v2i64 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))), + (LDURQi GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(v4f32 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))), + (LDURQi GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(v4i32 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))), + (LDURQi GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(v8i16 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))), 
+ (LDURQi GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(v16i8 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))), + (LDURQi GPR64sp:$Rn, simm9:$offset)>; +} + +// anyext -> zext +def : Pat<(i32 (extloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))), + (LDURHHi GPR64sp:$Rn, simm9:$offset)>; +def : Pat<(i32 (extloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))), + (LDURBBi GPR64sp:$Rn, simm9:$offset)>; +def : Pat<(i32 (extloadi1 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))), + (LDURBBi GPR64sp:$Rn, simm9:$offset)>; +def : Pat<(i64 (extloadi32 (am_unscaled32 GPR64sp:$Rn, simm9:$offset))), + (SUBREG_TO_REG (i64 0), (LDURWi GPR64sp:$Rn, simm9:$offset), sub_32)>; +def : Pat<(i64 (extloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))), + (SUBREG_TO_REG (i64 0), (LDURHHi GPR64sp:$Rn, simm9:$offset), sub_32)>; +def : Pat<(i64 (extloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))), + (SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>; +def : Pat<(i64 (extloadi1 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))), + (SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>; +// unscaled zext +def : Pat<(i32 (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))), + (LDURHHi GPR64sp:$Rn, simm9:$offset)>; +def : Pat<(i32 (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))), + (LDURBBi GPR64sp:$Rn, simm9:$offset)>; +def : Pat<(i32 (zextloadi1 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))), + (LDURBBi GPR64sp:$Rn, simm9:$offset)>; +def : Pat<(i64 (zextloadi32 (am_unscaled32 GPR64sp:$Rn, simm9:$offset))), + (SUBREG_TO_REG (i64 0), (LDURWi GPR64sp:$Rn, simm9:$offset), sub_32)>; +def : Pat<(i64 (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))), + (SUBREG_TO_REG (i64 0), (LDURHHi GPR64sp:$Rn, simm9:$offset), sub_32)>; +def : Pat<(i64 (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))), + (SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>; +def : Pat<(i64 (zextloadi1 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))), + (SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>; + + +//--- +// LDR mnemonics fall back to LDUR for negative or unaligned offsets. + +// Define new assembler match classes as we want to only match these when +// the don't otherwise match the scaled addressing mode for LDR/STR. Don't +// associate a DiagnosticType either, as we want the diagnostic for the +// canonical form (the scaled operand) to take precedence. 
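To make the fallback concrete, here is a minimal sketch in plain C++ of the two offset forms involved (the helper names are illustrative, not the real AsmParser predicates): the scaled unsigned 12-bit immediate used by the uimm12sN operands above, and the signed 9-bit immediate used by the LDUR/STUR instructions. The SImm9OffsetFB classes defined next accept exactly the offsets that pass the second check but not the first, so the generic mnemonic still assembles for them.

  #include <cstdint>

  // Scaled form (uimm12sN): non-negative, a multiple of the access size, and
  // the scaled value must fit in an unsigned 12-bit field (0..4095).
  constexpr bool FitsScaledUImm12(int64_t Offset, int64_t AccessSize) {
    return Offset >= 0 && Offset % AccessSize == 0 && Offset / AccessSize <= 4095;
  }

  // Unscaled form (simm9): any byte offset in [-256, 255].
  constexpr bool FitsSImm9(int64_t Offset) { return Offset >= -256 && Offset <= 255; }

  // e.g. "ldr w0, [x1, #-4]" and "ldr w0, [x1, #2]" fail the scaled check for a
  // 4-byte access but pass the simm9 check, so they match the LDUR aliases below.
  static_assert(!FitsScaledUImm12(-4, 4) && FitsSImm9(-4), "negative offset");
  static_assert(!FitsScaledUImm12(2, 4) && FitsSImm9(2), "unaligned offset");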
+class SImm9OffsetOperand<int Width> : AsmOperandClass {
+ let Name = "SImm9OffsetFB" # Width;
+ let PredicateMethod = "isSImm9OffsetFB<" # Width # ">";
+ let RenderMethod = "addImmOperands";
+}
+
+def SImm9OffsetFB8Operand : SImm9OffsetOperand<8>;
+def SImm9OffsetFB16Operand : SImm9OffsetOperand<16>;
+def SImm9OffsetFB32Operand : SImm9OffsetOperand<32>;
+def SImm9OffsetFB64Operand : SImm9OffsetOperand<64>;
+def SImm9OffsetFB128Operand : SImm9OffsetOperand<128>;
+
+def simm9_offset_fb8 : Operand<i64> {
+ let ParserMatchClass = SImm9OffsetFB8Operand;
+}
+def simm9_offset_fb16 : Operand<i64> {
+ let ParserMatchClass = SImm9OffsetFB16Operand;
+}
+def simm9_offset_fb32 : Operand<i64> {
+ let ParserMatchClass = SImm9OffsetFB32Operand;
+}
+def simm9_offset_fb64 : Operand<i64> {
+ let ParserMatchClass = SImm9OffsetFB64Operand;
+}
+def simm9_offset_fb128 : Operand<i64> {
+ let ParserMatchClass = SImm9OffsetFB128Operand;
+}
+
+def : InstAlias<"ldr $Rt, [$Rn, $offset]",
+ (LDURXi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>;
+def : InstAlias<"ldr $Rt, [$Rn, $offset]",
+ (LDURWi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>;
+def : InstAlias<"ldr $Rt, [$Rn, $offset]",
+ (LDURBi FPR8:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>;
+def : InstAlias<"ldr $Rt, [$Rn, $offset]",
+ (LDURHi FPR16:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>;
+def : InstAlias<"ldr $Rt, [$Rn, $offset]",
+ (LDURSi FPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>;
+def : InstAlias<"ldr $Rt, [$Rn, $offset]",
+ (LDURDi FPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>;
+def : InstAlias<"ldr $Rt, [$Rn, $offset]",
+ (LDURQi FPR128:$Rt, GPR64sp:$Rn, simm9_offset_fb128:$offset), 0>;
+
+// zextload -> i64
+def : Pat<(i64 (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
+ (SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+def : Pat<(i64 (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))),
+ (SUBREG_TO_REG (i64 0), (LDURHHi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+
+// load sign-extended half-word
+defm LDURSHW
+ : LoadUnscaled<0b01, 0, 0b11, GPR32, "ldursh",
+ [(set GPR32:$Rt,
+ (sextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>;
+defm LDURSHX
+ : LoadUnscaled<0b01, 0, 0b10, GPR64, "ldursh",
+ [(set GPR64:$Rt,
+ (sextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>;
+
+// load sign-extended byte
+defm LDURSBW
+ : LoadUnscaled<0b00, 0, 0b11, GPR32, "ldursb",
+ [(set GPR32:$Rt,
+ (sextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset)))]>;
+defm LDURSBX
+ : LoadUnscaled<0b00, 0, 0b10, GPR64, "ldursb",
+ [(set GPR64:$Rt,
+ (sextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset)))]>;
+
+// load sign-extended word
+defm LDURSW
+ : LoadUnscaled<0b10, 0, 0b10, GPR64, "ldursw",
+ [(set GPR64:$Rt,
+ (sextloadi32 (am_unscaled32 GPR64sp:$Rn, simm9:$offset)))]>;
+
+// zero and sign extending aliases from generic LDR* mnemonics to LDUR*.
+def : InstAlias<"ldrb $Rt, [$Rn, $offset]", + (LDURBBi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>; +def : InstAlias<"ldrh $Rt, [$Rn, $offset]", + (LDURHHi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>; +def : InstAlias<"ldrsb $Rt, [$Rn, $offset]", + (LDURSBWi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>; +def : InstAlias<"ldrsb $Rt, [$Rn, $offset]", + (LDURSBXi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>; +def : InstAlias<"ldrsh $Rt, [$Rn, $offset]", + (LDURSHWi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>; +def : InstAlias<"ldrsh $Rt, [$Rn, $offset]", + (LDURSHXi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>; +def : InstAlias<"ldrsw $Rt, [$Rn, $offset]", + (LDURSWi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>; + +// Pre-fetch. +defm PRFUM : PrefetchUnscaled<0b11, 0, 0b10, "prfum", + [(AArch64Prefetch imm:$Rt, + (am_unscaled64 GPR64sp:$Rn, simm9:$offset))]>; + +//--- +// (unscaled immediate, unprivileged) +defm LDTRX : LoadUnprivileged<0b11, 0, 0b01, GPR64, "ldtr">; +defm LDTRW : LoadUnprivileged<0b10, 0, 0b01, GPR32, "ldtr">; + +defm LDTRH : LoadUnprivileged<0b01, 0, 0b01, GPR32, "ldtrh">; +defm LDTRB : LoadUnprivileged<0b00, 0, 0b01, GPR32, "ldtrb">; + +// load sign-extended half-word +defm LDTRSHW : LoadUnprivileged<0b01, 0, 0b11, GPR32, "ldtrsh">; +defm LDTRSHX : LoadUnprivileged<0b01, 0, 0b10, GPR64, "ldtrsh">; + +// load sign-extended byte +defm LDTRSBW : LoadUnprivileged<0b00, 0, 0b11, GPR32, "ldtrsb">; +defm LDTRSBX : LoadUnprivileged<0b00, 0, 0b10, GPR64, "ldtrsb">; + +// load sign-extended word +defm LDTRSW : LoadUnprivileged<0b10, 0, 0b10, GPR64, "ldtrsw">; + +//--- +// (immediate pre-indexed) +def LDRWpre : LoadPreIdx<0b10, 0, 0b01, GPR32, "ldr">; +def LDRXpre : LoadPreIdx<0b11, 0, 0b01, GPR64, "ldr">; +def LDRBpre : LoadPreIdx<0b00, 1, 0b01, FPR8, "ldr">; +def LDRHpre : LoadPreIdx<0b01, 1, 0b01, FPR16, "ldr">; +def LDRSpre : LoadPreIdx<0b10, 1, 0b01, FPR32, "ldr">; +def LDRDpre : LoadPreIdx<0b11, 1, 0b01, FPR64, "ldr">; +def LDRQpre : LoadPreIdx<0b00, 1, 0b11, FPR128, "ldr">; + +// load sign-extended half-word +def LDRSHWpre : LoadPreIdx<0b01, 0, 0b11, GPR32, "ldrsh">; +def LDRSHXpre : LoadPreIdx<0b01, 0, 0b10, GPR64, "ldrsh">; + +// load sign-extended byte +def LDRSBWpre : LoadPreIdx<0b00, 0, 0b11, GPR32, "ldrsb">; +def LDRSBXpre : LoadPreIdx<0b00, 0, 0b10, GPR64, "ldrsb">; + +// load zero-extended byte +def LDRBBpre : LoadPreIdx<0b00, 0, 0b01, GPR32, "ldrb">; +def LDRHHpre : LoadPreIdx<0b01, 0, 0b01, GPR32, "ldrh">; + +// load sign-extended word +def LDRSWpre : LoadPreIdx<0b10, 0, 0b10, GPR64, "ldrsw">; + +//--- +// (immediate post-indexed) +def LDRWpost : LoadPostIdx<0b10, 0, 0b01, GPR32, "ldr">; +def LDRXpost : LoadPostIdx<0b11, 0, 0b01, GPR64, "ldr">; +def LDRBpost : LoadPostIdx<0b00, 1, 0b01, FPR8, "ldr">; +def LDRHpost : LoadPostIdx<0b01, 1, 0b01, FPR16, "ldr">; +def LDRSpost : LoadPostIdx<0b10, 1, 0b01, FPR32, "ldr">; +def LDRDpost : LoadPostIdx<0b11, 1, 0b01, FPR64, "ldr">; +def LDRQpost : LoadPostIdx<0b00, 1, 0b11, FPR128, "ldr">; + +// load sign-extended half-word +def LDRSHWpost : LoadPostIdx<0b01, 0, 0b11, GPR32, "ldrsh">; +def LDRSHXpost : LoadPostIdx<0b01, 0, 0b10, GPR64, "ldrsh">; + +// load sign-extended byte +def LDRSBWpost : LoadPostIdx<0b00, 0, 0b11, GPR32, "ldrsb">; +def LDRSBXpost : LoadPostIdx<0b00, 0, 0b10, GPR64, "ldrsb">; + +// load zero-extended byte +def LDRBBpost : LoadPostIdx<0b00, 0, 0b01, GPR32, "ldrb">; +def LDRHHpost : LoadPostIdx<0b01, 0, 0b01, GPR32, "ldrh">; + +// load 
sign-extended word +def LDRSWpost : LoadPostIdx<0b10, 0, 0b10, GPR64, "ldrsw">; + +//===----------------------------------------------------------------------===// +// Store instructions. +//===----------------------------------------------------------------------===// + +// Pair (indexed, offset) +// FIXME: Use dedicated range-checked addressing mode operand here. +defm STPW : StorePairOffset<0b00, 0, GPR32, simm7s4, "stp">; +defm STPX : StorePairOffset<0b10, 0, GPR64, simm7s8, "stp">; +defm STPS : StorePairOffset<0b00, 1, FPR32, simm7s4, "stp">; +defm STPD : StorePairOffset<0b01, 1, FPR64, simm7s8, "stp">; +defm STPQ : StorePairOffset<0b10, 1, FPR128, simm7s16, "stp">; + +// Pair (pre-indexed) +def STPWpre : StorePairPreIdx<0b00, 0, GPR32, simm7s4, "stp">; +def STPXpre : StorePairPreIdx<0b10, 0, GPR64, simm7s8, "stp">; +def STPSpre : StorePairPreIdx<0b00, 1, FPR32, simm7s4, "stp">; +def STPDpre : StorePairPreIdx<0b01, 1, FPR64, simm7s8, "stp">; +def STPQpre : StorePairPreIdx<0b10, 1, FPR128, simm7s16, "stp">; + +// Pair (pre-indexed) +def STPWpost : StorePairPostIdx<0b00, 0, GPR32, simm7s4, "stp">; +def STPXpost : StorePairPostIdx<0b10, 0, GPR64, simm7s8, "stp">; +def STPSpost : StorePairPostIdx<0b00, 1, FPR32, simm7s4, "stp">; +def STPDpost : StorePairPostIdx<0b01, 1, FPR64, simm7s8, "stp">; +def STPQpost : StorePairPostIdx<0b10, 1, FPR128, simm7s16, "stp">; + +// Pair (no allocate) +defm STNPW : StorePairNoAlloc<0b00, 0, GPR32, simm7s4, "stnp">; +defm STNPX : StorePairNoAlloc<0b10, 0, GPR64, simm7s8, "stnp">; +defm STNPS : StorePairNoAlloc<0b00, 1, FPR32, simm7s4, "stnp">; +defm STNPD : StorePairNoAlloc<0b01, 1, FPR64, simm7s8, "stnp">; +defm STNPQ : StorePairNoAlloc<0b10, 1, FPR128, simm7s16, "stnp">; + +//--- +// (Register offset) + +// Integer +defm STRBB : Store8RO< 0b00, 0, 0b00, GPR32, "strb", i32, truncstorei8>; +defm STRHH : Store16RO<0b01, 0, 0b00, GPR32, "strh", i32, truncstorei16>; +defm STRW : Store32RO<0b10, 0, 0b00, GPR32, "str", i32, store>; +defm STRX : Store64RO<0b11, 0, 0b00, GPR64, "str", i64, store>; + + +// Floating-point +defm STRB : Store8RO< 0b00, 1, 0b00, FPR8, "str", untyped, store>; +defm STRH : Store16RO<0b01, 1, 0b00, FPR16, "str", f16, store>; +defm STRS : Store32RO<0b10, 1, 0b00, FPR32, "str", f32, store>; +defm STRD : Store64RO<0b11, 1, 0b00, FPR64, "str", f64, store>; +defm STRQ : Store128RO<0b00, 1, 0b10, FPR128, "str", f128, store>; + +multiclass TruncStoreFrom64ROPat { + + def : Pat<(storeop GPR64:$Rt, + (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)), + (STRW (EXTRACT_SUBREG GPR64:$Rt, sub_32), + GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>; + + def : Pat<(storeop GPR64:$Rt, + (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)), + (STRX (EXTRACT_SUBREG GPR64:$Rt, sub_32), + GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>; +} + +let AddedComplexity = 10 in { + // truncstore i64 + defm : TruncStoreFrom64ROPat; + defm : TruncStoreFrom64ROPat; + defm : TruncStoreFrom64ROPat; +} + +multiclass VecROStorePat { + def : Pat<(store (VecTy FPR:$Rt), + (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)), + (STRW FPR:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>; + + def : Pat<(store (VecTy FPR:$Rt), + (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)), + (STRX FPR:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>; +} + +let AddedComplexity = 10 in { +// Match all store 64 bits width whose type is compatible with FPR64 +let Predicates = [IsLE] in { + // We must use ST1 to store vectors in big-endian. 
+ defm : VecROStorePat; + defm : VecROStorePat; + defm : VecROStorePat; + defm : VecROStorePat; +} + +defm : VecROStorePat; +defm : VecROStorePat; + +// Match all store 128 bits width whose type is compatible with FPR128 +let Predicates = [IsLE] in { + // We must use ST1 to store vectors in big-endian. + defm : VecROStorePat; + defm : VecROStorePat; + defm : VecROStorePat; + defm : VecROStorePat; + defm : VecROStorePat; + defm : VecROStorePat; +} +} // AddedComplexity = 10 + +//--- +// (unsigned immediate) +defm STRX : StoreUI<0b11, 0, 0b00, GPR64, uimm12s8, "str", + [(store GPR64:$Rt, + (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))]>; +defm STRW : StoreUI<0b10, 0, 0b00, GPR32, uimm12s4, "str", + [(store GPR32:$Rt, + (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))]>; +defm STRB : StoreUI<0b00, 1, 0b00, FPR8, uimm12s1, "str", + [(store FPR8:$Rt, + (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))]>; +defm STRH : StoreUI<0b01, 1, 0b00, FPR16, uimm12s2, "str", + [(store (f16 FPR16:$Rt), + (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))]>; +defm STRS : StoreUI<0b10, 1, 0b00, FPR32, uimm12s4, "str", + [(store (f32 FPR32:$Rt), + (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))]>; +defm STRD : StoreUI<0b11, 1, 0b00, FPR64, uimm12s8, "str", + [(store (f64 FPR64:$Rt), + (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))]>; +defm STRQ : StoreUI<0b00, 1, 0b10, FPR128, uimm12s16, "str", []>; + +defm STRHH : StoreUI<0b01, 0, 0b00, GPR32, uimm12s2, "strh", + [(truncstorei16 GPR32:$Rt, + (am_indexed16 GPR64sp:$Rn, + uimm12s2:$offset))]>; +defm STRBB : StoreUI<0b00, 0, 0b00, GPR32, uimm12s1, "strb", + [(truncstorei8 GPR32:$Rt, + (am_indexed8 GPR64sp:$Rn, + uimm12s1:$offset))]>; + +// Match all store 64 bits width whose type is compatible with FPR64 +let AddedComplexity = 10 in { +let Predicates = [IsLE] in { + // We must use ST1 to store vectors in big-endian. + def : Pat<(store (v2f32 FPR64:$Rt), + (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)), + (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>; + def : Pat<(store (v8i8 FPR64:$Rt), + (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)), + (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>; + def : Pat<(store (v4i16 FPR64:$Rt), + (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)), + (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>; + def : Pat<(store (v2i32 FPR64:$Rt), + (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)), + (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>; +} +def : Pat<(store (v1f64 FPR64:$Rt), + (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)), + (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>; +def : Pat<(store (v1i64 FPR64:$Rt), + (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)), + (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>; + +// Match all store 128 bits width whose type is compatible with FPR128 +let Predicates = [IsLE] in { + // We must use ST1 to store vectors in big-endian. 
+ def : Pat<(store (v4f32 FPR128:$Rt), + (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)), + (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>; + def : Pat<(store (v2f64 FPR128:$Rt), + (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)), + (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>; + def : Pat<(store (v16i8 FPR128:$Rt), + (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)), + (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>; + def : Pat<(store (v8i16 FPR128:$Rt), + (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)), + (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>; + def : Pat<(store (v4i32 FPR128:$Rt), + (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)), + (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>; + def : Pat<(store (v2i64 FPR128:$Rt), + (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)), + (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>; +} +def : Pat<(store (f128 FPR128:$Rt), + (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)), + (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>; + +// truncstore i64 +def : Pat<(truncstorei32 GPR64:$Rt, + (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset)), + (STRWui (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, uimm12s4:$offset)>; +def : Pat<(truncstorei16 GPR64:$Rt, + (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset)), + (STRHHui (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, uimm12s2:$offset)>; +def : Pat<(truncstorei8 GPR64:$Rt, (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset)), + (STRBBui (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, uimm12s1:$offset)>; + +} // AddedComplexity = 10 + +//--- +// (unscaled immediate) +defm STURX : StoreUnscaled<0b11, 0, 0b00, GPR64, "stur", + [(store GPR64:$Rt, + (am_unscaled64 GPR64sp:$Rn, simm9:$offset))]>; +defm STURW : StoreUnscaled<0b10, 0, 0b00, GPR32, "stur", + [(store GPR32:$Rt, + (am_unscaled32 GPR64sp:$Rn, simm9:$offset))]>; +defm STURB : StoreUnscaled<0b00, 1, 0b00, FPR8, "stur", + [(store FPR8:$Rt, + (am_unscaled8 GPR64sp:$Rn, simm9:$offset))]>; +defm STURH : StoreUnscaled<0b01, 1, 0b00, FPR16, "stur", + [(store (f16 FPR16:$Rt), + (am_unscaled16 GPR64sp:$Rn, simm9:$offset))]>; +defm STURS : StoreUnscaled<0b10, 1, 0b00, FPR32, "stur", + [(store (f32 FPR32:$Rt), + (am_unscaled32 GPR64sp:$Rn, simm9:$offset))]>; +defm STURD : StoreUnscaled<0b11, 1, 0b00, FPR64, "stur", + [(store (f64 FPR64:$Rt), + (am_unscaled64 GPR64sp:$Rn, simm9:$offset))]>; +defm STURQ : StoreUnscaled<0b00, 1, 0b10, FPR128, "stur", + [(store (f128 FPR128:$Rt), + (am_unscaled128 GPR64sp:$Rn, simm9:$offset))]>; +defm STURHH : StoreUnscaled<0b01, 0, 0b00, GPR32, "sturh", + [(truncstorei16 GPR32:$Rt, + (am_unscaled16 GPR64sp:$Rn, simm9:$offset))]>; +defm STURBB : StoreUnscaled<0b00, 0, 0b00, GPR32, "sturb", + [(truncstorei8 GPR32:$Rt, + (am_unscaled8 GPR64sp:$Rn, simm9:$offset))]>; + +// Match all store 64 bits width whose type is compatible with FPR64 +let Predicates = [IsLE] in { + // We must use ST1 to store vectors in big-endian. 
+ def : Pat<(store (v2f32 FPR64:$Rt), + (am_unscaled64 GPR64sp:$Rn, simm9:$offset)), + (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(store (v8i8 FPR64:$Rt), + (am_unscaled64 GPR64sp:$Rn, simm9:$offset)), + (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(store (v4i16 FPR64:$Rt), + (am_unscaled64 GPR64sp:$Rn, simm9:$offset)), + (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(store (v2i32 FPR64:$Rt), + (am_unscaled64 GPR64sp:$Rn, simm9:$offset)), + (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>; +} +def : Pat<(store (v1f64 FPR64:$Rt), (am_unscaled64 GPR64sp:$Rn, simm9:$offset)), + (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>; +def : Pat<(store (v1i64 FPR64:$Rt), (am_unscaled64 GPR64sp:$Rn, simm9:$offset)), + (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>; + +// Match all store 128 bits width whose type is compatible with FPR128 +let Predicates = [IsLE] in { + // We must use ST1 to store vectors in big-endian. + def : Pat<(store (v4f32 FPR128:$Rt), + (am_unscaled128 GPR64sp:$Rn, simm9:$offset)), + (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(store (v2f64 FPR128:$Rt), + (am_unscaled128 GPR64sp:$Rn, simm9:$offset)), + (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(store (v16i8 FPR128:$Rt), + (am_unscaled128 GPR64sp:$Rn, simm9:$offset)), + (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(store (v8i16 FPR128:$Rt), + (am_unscaled128 GPR64sp:$Rn, simm9:$offset)), + (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(store (v4i32 FPR128:$Rt), + (am_unscaled128 GPR64sp:$Rn, simm9:$offset)), + (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(store (v2i64 FPR128:$Rt), + (am_unscaled128 GPR64sp:$Rn, simm9:$offset)), + (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(store (v2f64 FPR128:$Rt), + (am_unscaled128 GPR64sp:$Rn, simm9:$offset)), + (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>; +} + +// unscaled i64 truncating stores +def : Pat<(truncstorei32 GPR64:$Rt, (am_unscaled32 GPR64sp:$Rn, simm9:$offset)), + (STURWi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>; +def : Pat<(truncstorei16 GPR64:$Rt, (am_unscaled16 GPR64sp:$Rn, simm9:$offset)), + (STURHHi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>; +def : Pat<(truncstorei8 GPR64:$Rt, (am_unscaled8 GPR64sp:$Rn, simm9:$offset)), + (STURBBi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>; + +//--- +// STR mnemonics fall back to STUR for negative or unaligned offsets. 
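The store side mirrors the load side: a negative or unaligned offset cannot be encoded in the scaled STR form, so the aliases below hand it to the unscaled STUR encoding instead. A self-contained C++ illustration under the same assumptions as the load-side sketch (the helper name is again illustrative only):

  #include <cstdint>

  constexpr bool ScaledStrCanEncode(int64_t Offset, int64_t AccessSize) {
    return Offset >= 0 && Offset % AccessSize == 0 && Offset / AccessSize <= 4095;
  }
  // "str x0, [x1, #-8]" cannot use the scaled form, so it becomes STURXi;
  // "str x0, [x1, #16]" stays in the scaled STRXui form.
  static_assert(!ScaledStrCanEncode(-8, 8), "falls back to the STUR encoding");
  static_assert(ScaledStrCanEncode(16, 8), "keeps the scaled STR encoding");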
+def : InstAlias<"str $Rt, [$Rn, $offset]", + (STURXi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>; +def : InstAlias<"str $Rt, [$Rn, $offset]", + (STURWi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>; +def : InstAlias<"str $Rt, [$Rn, $offset]", + (STURBi FPR8:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>; +def : InstAlias<"str $Rt, [$Rn, $offset]", + (STURHi FPR16:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>; +def : InstAlias<"str $Rt, [$Rn, $offset]", + (STURSi FPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>; +def : InstAlias<"str $Rt, [$Rn, $offset]", + (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>; +def : InstAlias<"str $Rt, [$Rn, $offset]", + (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9_offset_fb128:$offset), 0>; + +def : InstAlias<"strb $Rt, [$Rn, $offset]", + (STURBBi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>; +def : InstAlias<"strh $Rt, [$Rn, $offset]", + (STURHHi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>; + +//--- +// (unscaled immediate, unprivileged) +defm STTRW : StoreUnprivileged<0b10, 0, 0b00, GPR32, "sttr">; +defm STTRX : StoreUnprivileged<0b11, 0, 0b00, GPR64, "sttr">; + +defm STTRH : StoreUnprivileged<0b01, 0, 0b00, GPR32, "sttrh">; +defm STTRB : StoreUnprivileged<0b00, 0, 0b00, GPR32, "sttrb">; + +//--- +// (immediate pre-indexed) +def STRWpre : StorePreIdx<0b10, 0, 0b00, GPR32, "str", pre_store, i32>; +def STRXpre : StorePreIdx<0b11, 0, 0b00, GPR64, "str", pre_store, i64>; +def STRBpre : StorePreIdx<0b00, 1, 0b00, FPR8, "str", pre_store, untyped>; +def STRHpre : StorePreIdx<0b01, 1, 0b00, FPR16, "str", pre_store, f16>; +def STRSpre : StorePreIdx<0b10, 1, 0b00, FPR32, "str", pre_store, f32>; +def STRDpre : StorePreIdx<0b11, 1, 0b00, FPR64, "str", pre_store, f64>; +def STRQpre : StorePreIdx<0b00, 1, 0b10, FPR128, "str", pre_store, f128>; + +def STRBBpre : StorePreIdx<0b00, 0, 0b00, GPR32, "strb", pre_truncsti8, i32>; +def STRHHpre : StorePreIdx<0b01, 0, 0b00, GPR32, "strh", pre_truncsti16, i32>; + +// truncstore i64 +def : Pat<(pre_truncsti32 GPR64:$Rt, GPR64sp:$addr, simm9:$off), + (STRWpre (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr, + simm9:$off)>; +def : Pat<(pre_truncsti16 GPR64:$Rt, GPR64sp:$addr, simm9:$off), + (STRHHpre (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr, + simm9:$off)>; +def : Pat<(pre_truncsti8 GPR64:$Rt, GPR64sp:$addr, simm9:$off), + (STRBBpre (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr, + simm9:$off)>; + +def : Pat<(pre_store (v8i8 FPR64:$Rt), GPR64sp:$addr, simm9:$off), + (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(pre_store (v4i16 FPR64:$Rt), GPR64sp:$addr, simm9:$off), + (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(pre_store (v2i32 FPR64:$Rt), GPR64sp:$addr, simm9:$off), + (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(pre_store (v2f32 FPR64:$Rt), GPR64sp:$addr, simm9:$off), + (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(pre_store (v1i64 FPR64:$Rt), GPR64sp:$addr, simm9:$off), + (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(pre_store (v1f64 FPR64:$Rt), GPR64sp:$addr, simm9:$off), + (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; + +def : Pat<(pre_store (v16i8 FPR128:$Rt), GPR64sp:$addr, simm9:$off), + (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(pre_store (v8i16 FPR128:$Rt), GPR64sp:$addr, simm9:$off), + (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(pre_store (v4i32 FPR128:$Rt), GPR64sp:$addr, simm9:$off), + (STRQpre FPR128:$Rt, GPR64sp:$addr, 
simm9:$off)>; +def : Pat<(pre_store (v4f32 FPR128:$Rt), GPR64sp:$addr, simm9:$off), + (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(pre_store (v2i64 FPR128:$Rt), GPR64sp:$addr, simm9:$off), + (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(pre_store (v2f64 FPR128:$Rt), GPR64sp:$addr, simm9:$off), + (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; + +//--- +// (immediate post-indexed) +def STRWpost : StorePostIdx<0b10, 0, 0b00, GPR32, "str", post_store, i32>; +def STRXpost : StorePostIdx<0b11, 0, 0b00, GPR64, "str", post_store, i64>; +def STRBpost : StorePostIdx<0b00, 1, 0b00, FPR8, "str", post_store, untyped>; +def STRHpost : StorePostIdx<0b01, 1, 0b00, FPR16, "str", post_store, f16>; +def STRSpost : StorePostIdx<0b10, 1, 0b00, FPR32, "str", post_store, f32>; +def STRDpost : StorePostIdx<0b11, 1, 0b00, FPR64, "str", post_store, f64>; +def STRQpost : StorePostIdx<0b00, 1, 0b10, FPR128, "str", post_store, f128>; + +def STRBBpost : StorePostIdx<0b00, 0, 0b00, GPR32, "strb", post_truncsti8, i32>; +def STRHHpost : StorePostIdx<0b01, 0, 0b00, GPR32, "strh", post_truncsti16, i32>; + +// truncstore i64 +def : Pat<(post_truncsti32 GPR64:$Rt, GPR64sp:$addr, simm9:$off), + (STRWpost (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr, + simm9:$off)>; +def : Pat<(post_truncsti16 GPR64:$Rt, GPR64sp:$addr, simm9:$off), + (STRHHpost (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr, + simm9:$off)>; +def : Pat<(post_truncsti8 GPR64:$Rt, GPR64sp:$addr, simm9:$off), + (STRBBpost (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr, + simm9:$off)>; + +def : Pat<(post_store (v8i8 FPR64:$Rt), GPR64sp:$addr, simm9:$off), + (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(post_store (v4i16 FPR64:$Rt), GPR64sp:$addr, simm9:$off), + (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(post_store (v2i32 FPR64:$Rt), GPR64sp:$addr, simm9:$off), + (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(post_store (v2f32 FPR64:$Rt), GPR64sp:$addr, simm9:$off), + (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(post_store (v1i64 FPR64:$Rt), GPR64sp:$addr, simm9:$off), + (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(post_store (v1f64 FPR64:$Rt), GPR64sp:$addr, simm9:$off), + (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; + +def : Pat<(post_store (v16i8 FPR128:$Rt), GPR64sp:$addr, simm9:$off), + (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(post_store (v8i16 FPR128:$Rt), GPR64sp:$addr, simm9:$off), + (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(post_store (v4i32 FPR128:$Rt), GPR64sp:$addr, simm9:$off), + (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(post_store (v4f32 FPR128:$Rt), GPR64sp:$addr, simm9:$off), + (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(post_store (v2i64 FPR128:$Rt), GPR64sp:$addr, simm9:$off), + (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(post_store (v2f64 FPR128:$Rt), GPR64sp:$addr, simm9:$off), + (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; + +//===----------------------------------------------------------------------===// +// Load/store exclusive instructions. 
+//===----------------------------------------------------------------------===// + +def LDARW : LoadAcquire <0b10, 1, 1, 0, 1, GPR32, "ldar">; +def LDARX : LoadAcquire <0b11, 1, 1, 0, 1, GPR64, "ldar">; +def LDARB : LoadAcquire <0b00, 1, 1, 0, 1, GPR32, "ldarb">; +def LDARH : LoadAcquire <0b01, 1, 1, 0, 1, GPR32, "ldarh">; + +def LDAXRW : LoadExclusive <0b10, 0, 1, 0, 1, GPR32, "ldaxr">; +def LDAXRX : LoadExclusive <0b11, 0, 1, 0, 1, GPR64, "ldaxr">; +def LDAXRB : LoadExclusive <0b00, 0, 1, 0, 1, GPR32, "ldaxrb">; +def LDAXRH : LoadExclusive <0b01, 0, 1, 0, 1, GPR32, "ldaxrh">; + +def LDXRW : LoadExclusive <0b10, 0, 1, 0, 0, GPR32, "ldxr">; +def LDXRX : LoadExclusive <0b11, 0, 1, 0, 0, GPR64, "ldxr">; +def LDXRB : LoadExclusive <0b00, 0, 1, 0, 0, GPR32, "ldxrb">; +def LDXRH : LoadExclusive <0b01, 0, 1, 0, 0, GPR32, "ldxrh">; + +def STLRW : StoreRelease <0b10, 1, 0, 0, 1, GPR32, "stlr">; +def STLRX : StoreRelease <0b11, 1, 0, 0, 1, GPR64, "stlr">; +def STLRB : StoreRelease <0b00, 1, 0, 0, 1, GPR32, "stlrb">; +def STLRH : StoreRelease <0b01, 1, 0, 0, 1, GPR32, "stlrh">; + +def STLXRW : StoreExclusive<0b10, 0, 0, 0, 1, GPR32, "stlxr">; +def STLXRX : StoreExclusive<0b11, 0, 0, 0, 1, GPR64, "stlxr">; +def STLXRB : StoreExclusive<0b00, 0, 0, 0, 1, GPR32, "stlxrb">; +def STLXRH : StoreExclusive<0b01, 0, 0, 0, 1, GPR32, "stlxrh">; + +def STXRW : StoreExclusive<0b10, 0, 0, 0, 0, GPR32, "stxr">; +def STXRX : StoreExclusive<0b11, 0, 0, 0, 0, GPR64, "stxr">; +def STXRB : StoreExclusive<0b00, 0, 0, 0, 0, GPR32, "stxrb">; +def STXRH : StoreExclusive<0b01, 0, 0, 0, 0, GPR32, "stxrh">; + +def LDAXPW : LoadExclusivePair<0b10, 0, 1, 1, 1, GPR32, "ldaxp">; +def LDAXPX : LoadExclusivePair<0b11, 0, 1, 1, 1, GPR64, "ldaxp">; + +def LDXPW : LoadExclusivePair<0b10, 0, 1, 1, 0, GPR32, "ldxp">; +def LDXPX : LoadExclusivePair<0b11, 0, 1, 1, 0, GPR64, "ldxp">; + +def STLXPW : StoreExclusivePair<0b10, 0, 0, 1, 1, GPR32, "stlxp">; +def STLXPX : StoreExclusivePair<0b11, 0, 0, 1, 1, GPR64, "stlxp">; + +def STXPW : StoreExclusivePair<0b10, 0, 0, 1, 0, GPR32, "stxp">; +def STXPX : StoreExclusivePair<0b11, 0, 0, 1, 0, GPR64, "stxp">; + +//===----------------------------------------------------------------------===// +// Scaled floating point to integer conversion instructions. 
+//===----------------------------------------------------------------------===// + +defm FCVTAS : FPToIntegerUnscaled<0b00, 0b100, "fcvtas", int_aarch64_neon_fcvtas>; +defm FCVTAU : FPToIntegerUnscaled<0b00, 0b101, "fcvtau", int_aarch64_neon_fcvtau>; +defm FCVTMS : FPToIntegerUnscaled<0b10, 0b000, "fcvtms", int_aarch64_neon_fcvtms>; +defm FCVTMU : FPToIntegerUnscaled<0b10, 0b001, "fcvtmu", int_aarch64_neon_fcvtmu>; +defm FCVTNS : FPToIntegerUnscaled<0b00, 0b000, "fcvtns", int_aarch64_neon_fcvtns>; +defm FCVTNU : FPToIntegerUnscaled<0b00, 0b001, "fcvtnu", int_aarch64_neon_fcvtnu>; +defm FCVTPS : FPToIntegerUnscaled<0b01, 0b000, "fcvtps", int_aarch64_neon_fcvtps>; +defm FCVTPU : FPToIntegerUnscaled<0b01, 0b001, "fcvtpu", int_aarch64_neon_fcvtpu>; +defm FCVTZS : FPToIntegerUnscaled<0b11, 0b000, "fcvtzs", fp_to_sint>; +defm FCVTZU : FPToIntegerUnscaled<0b11, 0b001, "fcvtzu", fp_to_uint>; +defm FCVTZS : FPToIntegerScaled<0b11, 0b000, "fcvtzs", fp_to_sint>; +defm FCVTZU : FPToIntegerScaled<0b11, 0b001, "fcvtzu", fp_to_uint>; +let isCodeGenOnly = 1 in { +defm FCVTZS_Int : FPToIntegerUnscaled<0b11, 0b000, "fcvtzs", int_aarch64_neon_fcvtzs>; +defm FCVTZU_Int : FPToIntegerUnscaled<0b11, 0b001, "fcvtzu", int_aarch64_neon_fcvtzu>; +defm FCVTZS_Int : FPToIntegerScaled<0b11, 0b000, "fcvtzs", int_aarch64_neon_fcvtzs>; +defm FCVTZU_Int : FPToIntegerScaled<0b11, 0b001, "fcvtzu", int_aarch64_neon_fcvtzu>; +} + +//===----------------------------------------------------------------------===// +// Scaled integer to floating point conversion instructions. +//===----------------------------------------------------------------------===// + +defm SCVTF : IntegerToFP<0, "scvtf", sint_to_fp>; +defm UCVTF : IntegerToFP<1, "ucvtf", uint_to_fp>; + +//===----------------------------------------------------------------------===// +// Unscaled integer to floating point conversion instruction. +//===----------------------------------------------------------------------===// + +defm FMOV : UnscaledConversion<"fmov">; + +def : Pat<(f32 (fpimm0)), (FMOVWSr WZR)>, Requires<[NoZCZ]>; +def : Pat<(f64 (fpimm0)), (FMOVXDr XZR)>, Requires<[NoZCZ]>; + +//===----------------------------------------------------------------------===// +// Floating point conversion instruction. +//===----------------------------------------------------------------------===// + +defm FCVT : FPConversion<"fcvt">; + +def : Pat<(f32_to_f16 FPR32:$Rn), + (i32 (COPY_TO_REGCLASS + (f32 (SUBREG_TO_REG (i32 0), (FCVTHSr FPR32:$Rn), hsub)), + GPR32))>; + +def FCVTSHpseudo : Pseudo<(outs FPR32:$Rd), (ins FPR32:$Rn), + [(set (f32 FPR32:$Rd), (f16_to_f32 i32:$Rn))]>; + +//===----------------------------------------------------------------------===// +// Floating point single operand instructions. +//===----------------------------------------------------------------------===// + +defm FABS : SingleOperandFPData<0b0001, "fabs", fabs>; +defm FMOV : SingleOperandFPData<0b0000, "fmov">; +defm FNEG : SingleOperandFPData<0b0010, "fneg", fneg>; +defm FRINTA : SingleOperandFPData<0b1100, "frinta", frnd>; +defm FRINTI : SingleOperandFPData<0b1111, "frinti", fnearbyint>; +defm FRINTM : SingleOperandFPData<0b1010, "frintm", ffloor>; +defm FRINTN : SingleOperandFPData<0b1000, "frintn", int_aarch64_neon_frintn>; +defm FRINTP : SingleOperandFPData<0b1001, "frintp", fceil>; + +def : Pat<(v1f64 (int_aarch64_neon_frintn (v1f64 FPR64:$Rn))), + (FRINTNDr FPR64:$Rn)>; + +// FRINTX is inserted to set the flags as required by FENV_ACCESS ON behavior +// in the C spec. 
Setting hasSideEffects ensures it is not DCE'd. +// +// TODO: We should really model the FPSR flags correctly. This is really ugly. +let hasSideEffects = 1 in { +defm FRINTX : SingleOperandFPData<0b1110, "frintx", frint>; +} + +defm FRINTZ : SingleOperandFPData<0b1011, "frintz", ftrunc>; + +let SchedRW = [WriteFDiv] in { +defm FSQRT : SingleOperandFPData<0b0011, "fsqrt", fsqrt>; +} + +//===----------------------------------------------------------------------===// +// Floating point two operand instructions. +//===----------------------------------------------------------------------===// + +defm FADD : TwoOperandFPData<0b0010, "fadd", fadd>; +let SchedRW = [WriteFDiv] in { +defm FDIV : TwoOperandFPData<0b0001, "fdiv", fdiv>; +} +defm FMAXNM : TwoOperandFPData<0b0110, "fmaxnm", int_aarch64_neon_fmaxnm>; +defm FMAX : TwoOperandFPData<0b0100, "fmax", AArch64fmax>; +defm FMINNM : TwoOperandFPData<0b0111, "fminnm", int_aarch64_neon_fminnm>; +defm FMIN : TwoOperandFPData<0b0101, "fmin", AArch64fmin>; +let SchedRW = [WriteFMul] in { +defm FMUL : TwoOperandFPData<0b0000, "fmul", fmul>; +defm FNMUL : TwoOperandFPDataNeg<0b1000, "fnmul", fmul>; +} +defm FSUB : TwoOperandFPData<0b0011, "fsub", fsub>; + +def : Pat<(v1f64 (AArch64fmax (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), + (FMAXDrr FPR64:$Rn, FPR64:$Rm)>; +def : Pat<(v1f64 (AArch64fmin (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), + (FMINDrr FPR64:$Rn, FPR64:$Rm)>; +def : Pat<(v1f64 (int_aarch64_neon_fmaxnm (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), + (FMAXNMDrr FPR64:$Rn, FPR64:$Rm)>; +def : Pat<(v1f64 (int_aarch64_neon_fminnm (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), + (FMINNMDrr FPR64:$Rn, FPR64:$Rm)>; + +//===----------------------------------------------------------------------===// +// Floating point three operand instructions. +//===----------------------------------------------------------------------===// + +defm FMADD : ThreeOperandFPData<0, 0, "fmadd", fma>; +defm FMSUB : ThreeOperandFPData<0, 1, "fmsub", + TriOpFrag<(fma node:$LHS, (fneg node:$MHS), node:$RHS)> >; +defm FNMADD : ThreeOperandFPData<1, 0, "fnmadd", + TriOpFrag<(fneg (fma node:$LHS, node:$MHS, node:$RHS))> >; +defm FNMSUB : ThreeOperandFPData<1, 1, "fnmsub", + TriOpFrag<(fma node:$LHS, node:$MHS, (fneg node:$RHS))> >; + +// The following def pats catch the case where the LHS of an FMA is negated. +// The TriOpFrag above catches the case where the middle operand is negated. + +// N.b. FMSUB etc have the accumulator at the *end* of (outs), unlike +// the NEON variant. +def : Pat<(f32 (fma (fneg FPR32:$Rn), FPR32:$Rm, FPR32:$Ra)), + (FMSUBSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>; + +def : Pat<(f64 (fma (fneg FPR64:$Rn), FPR64:$Rm, FPR64:$Ra)), + (FMSUBDrrr FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>; + +// We handled -(a + b*c) for FNMADD above, now it's time for "(-a) + (-b)*c" and +// "(-a) + b*(-c)". +def : Pat<(f32 (fma (fneg FPR32:$Rn), FPR32:$Rm, (fneg FPR32:$Ra))), + (FNMADDSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>; + +def : Pat<(f64 (fma (fneg FPR64:$Rn), FPR64:$Rm, (fneg FPR64:$Ra))), + (FNMADDDrrr FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>; + +def : Pat<(f32 (fma FPR32:$Rn, (fneg FPR32:$Rm), (fneg FPR32:$Ra))), + (FNMADDSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>; + +def : Pat<(f64 (fma FPR64:$Rn, (fneg FPR64:$Rm), (fneg FPR64:$Ra))), + (FNMADDDrrr FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>; + +//===----------------------------------------------------------------------===// +// Floating point comparison instructions. 
+//===----------------------------------------------------------------------===// + +defm FCMPE : FPComparison<1, "fcmpe">; +defm FCMP : FPComparison<0, "fcmp", AArch64fcmp>; + +//===----------------------------------------------------------------------===// +// Floating point conditional comparison instructions. +//===----------------------------------------------------------------------===// + +defm FCCMPE : FPCondComparison<1, "fccmpe">; +defm FCCMP : FPCondComparison<0, "fccmp">; + +//===----------------------------------------------------------------------===// +// Floating point conditional select instruction. +//===----------------------------------------------------------------------===// + +defm FCSEL : FPCondSelect<"fcsel">; + +// CSEL instructions providing f128 types need to be handled by a +// pseudo-instruction since the eventual code will need to introduce basic +// blocks and control flow. +def F128CSEL : Pseudo<(outs FPR128:$Rd), + (ins FPR128:$Rn, FPR128:$Rm, ccode:$cond), + [(set (f128 FPR128:$Rd), + (AArch64csel FPR128:$Rn, FPR128:$Rm, + (i32 imm:$cond), NZCV))]> { + let Uses = [NZCV]; + let usesCustomInserter = 1; +} + + +//===----------------------------------------------------------------------===// +// Floating point immediate move. +//===----------------------------------------------------------------------===// + +let isReMaterializable = 1 in { +defm FMOV : FPMoveImmediate<"fmov">; +} + +//===----------------------------------------------------------------------===// +// Advanced SIMD two vector instructions. +//===----------------------------------------------------------------------===// + +defm ABS : SIMDTwoVectorBHSD<0, 0b01011, "abs", int_aarch64_neon_abs>; +defm CLS : SIMDTwoVectorBHS<0, 0b00100, "cls", int_aarch64_neon_cls>; +defm CLZ : SIMDTwoVectorBHS<1, 0b00100, "clz", ctlz>; +defm CMEQ : SIMDCmpTwoVector<0, 0b01001, "cmeq", AArch64cmeqz>; +defm CMGE : SIMDCmpTwoVector<1, 0b01000, "cmge", AArch64cmgez>; +defm CMGT : SIMDCmpTwoVector<0, 0b01000, "cmgt", AArch64cmgtz>; +defm CMLE : SIMDCmpTwoVector<1, 0b01001, "cmle", AArch64cmlez>; +defm CMLT : SIMDCmpTwoVector<0, 0b01010, "cmlt", AArch64cmltz>; +defm CNT : SIMDTwoVectorB<0, 0b00, 0b00101, "cnt", ctpop>; +defm FABS : SIMDTwoVectorFP<0, 1, 0b01111, "fabs", fabs>; + +defm FCMEQ : SIMDFPCmpTwoVector<0, 1, 0b01101, "fcmeq", AArch64fcmeqz>; +defm FCMGE : SIMDFPCmpTwoVector<1, 1, 0b01100, "fcmge", AArch64fcmgez>; +defm FCMGT : SIMDFPCmpTwoVector<0, 1, 0b01100, "fcmgt", AArch64fcmgtz>; +defm FCMLE : SIMDFPCmpTwoVector<1, 1, 0b01101, "fcmle", AArch64fcmlez>; +defm FCMLT : SIMDFPCmpTwoVector<0, 1, 0b01110, "fcmlt", AArch64fcmltz>; +defm FCVTAS : SIMDTwoVectorFPToInt<0,0,0b11100, "fcvtas",int_aarch64_neon_fcvtas>; +defm FCVTAU : SIMDTwoVectorFPToInt<1,0,0b11100, "fcvtau",int_aarch64_neon_fcvtau>; +defm FCVTL : SIMDFPWidenTwoVector<0, 0, 0b10111, "fcvtl">; +def : Pat<(v4f32 (int_aarch64_neon_vcvthf2fp (v4i16 V64:$Rn))), + (FCVTLv4i16 V64:$Rn)>; +def : Pat<(v4f32 (int_aarch64_neon_vcvthf2fp (extract_subvector (v8i16 V128:$Rn), + (i64 4)))), + (FCVTLv8i16 V128:$Rn)>; +def : Pat<(v2f64 (fextend (v2f32 V64:$Rn))), (FCVTLv2i32 V64:$Rn)>; +def : Pat<(v2f64 (fextend (v2f32 (extract_subvector (v4f32 V128:$Rn), + (i64 2))))), + (FCVTLv4i32 V128:$Rn)>; + +defm FCVTMS : SIMDTwoVectorFPToInt<0,0,0b11011, "fcvtms",int_aarch64_neon_fcvtms>; +defm FCVTMU : SIMDTwoVectorFPToInt<1,0,0b11011, "fcvtmu",int_aarch64_neon_fcvtmu>; +defm FCVTNS : SIMDTwoVectorFPToInt<0,0,0b11010, "fcvtns",int_aarch64_neon_fcvtns>; +defm FCVTNU : 
SIMDTwoVectorFPToInt<1,0,0b11010, "fcvtnu",int_aarch64_neon_fcvtnu>; +defm FCVTN : SIMDFPNarrowTwoVector<0, 0, 0b10110, "fcvtn">; +def : Pat<(v4i16 (int_aarch64_neon_vcvtfp2hf (v4f32 V128:$Rn))), + (FCVTNv4i16 V128:$Rn)>; +def : Pat<(concat_vectors V64:$Rd, + (v4i16 (int_aarch64_neon_vcvtfp2hf (v4f32 V128:$Rn)))), + (FCVTNv8i16 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; +def : Pat<(v2f32 (fround (v2f64 V128:$Rn))), (FCVTNv2i32 V128:$Rn)>; +def : Pat<(concat_vectors V64:$Rd, (v2f32 (fround (v2f64 V128:$Rn)))), + (FCVTNv4i32 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; +defm FCVTPS : SIMDTwoVectorFPToInt<0,1,0b11010, "fcvtps",int_aarch64_neon_fcvtps>; +defm FCVTPU : SIMDTwoVectorFPToInt<1,1,0b11010, "fcvtpu",int_aarch64_neon_fcvtpu>; +defm FCVTXN : SIMDFPInexactCvtTwoVector<1, 0, 0b10110, "fcvtxn", + int_aarch64_neon_fcvtxn>; +defm FCVTZS : SIMDTwoVectorFPToInt<0, 1, 0b11011, "fcvtzs", fp_to_sint>; +defm FCVTZU : SIMDTwoVectorFPToInt<1, 1, 0b11011, "fcvtzu", fp_to_uint>; +let isCodeGenOnly = 1 in { +defm FCVTZS_Int : SIMDTwoVectorFPToInt<0, 1, 0b11011, "fcvtzs", + int_aarch64_neon_fcvtzs>; +defm FCVTZU_Int : SIMDTwoVectorFPToInt<1, 1, 0b11011, "fcvtzu", + int_aarch64_neon_fcvtzu>; +} +defm FNEG : SIMDTwoVectorFP<1, 1, 0b01111, "fneg", fneg>; +defm FRECPE : SIMDTwoVectorFP<0, 1, 0b11101, "frecpe", int_aarch64_neon_frecpe>; +defm FRINTA : SIMDTwoVectorFP<1, 0, 0b11000, "frinta", frnd>; +defm FRINTI : SIMDTwoVectorFP<1, 1, 0b11001, "frinti", fnearbyint>; +defm FRINTM : SIMDTwoVectorFP<0, 0, 0b11001, "frintm", ffloor>; +defm FRINTN : SIMDTwoVectorFP<0, 0, 0b11000, "frintn", int_aarch64_neon_frintn>; +defm FRINTP : SIMDTwoVectorFP<0, 1, 0b11000, "frintp", fceil>; +defm FRINTX : SIMDTwoVectorFP<1, 0, 0b11001, "frintx", frint>; +defm FRINTZ : SIMDTwoVectorFP<0, 1, 0b11001, "frintz", ftrunc>; +defm FRSQRTE: SIMDTwoVectorFP<1, 1, 0b11101, "frsqrte", int_aarch64_neon_frsqrte>; +defm FSQRT : SIMDTwoVectorFP<1, 1, 0b11111, "fsqrt", fsqrt>; +defm NEG : SIMDTwoVectorBHSD<1, 0b01011, "neg", + UnOpFrag<(sub immAllZerosV, node:$LHS)> >; +defm NOT : SIMDTwoVectorB<1, 0b00, 0b00101, "not", vnot>; +// Aliases for MVN -> NOT. 
+def : InstAlias<"mvn{ $Vd.8b, $Vn.8b|.8b $Vd, $Vn}", + (NOTv8i8 V64:$Vd, V64:$Vn)>; +def : InstAlias<"mvn{ $Vd.16b, $Vn.16b|.16b $Vd, $Vn}", + (NOTv16i8 V128:$Vd, V128:$Vn)>; + +def : Pat<(AArch64neg (v8i8 V64:$Rn)), (NEGv8i8 V64:$Rn)>; +def : Pat<(AArch64neg (v16i8 V128:$Rn)), (NEGv16i8 V128:$Rn)>; +def : Pat<(AArch64neg (v4i16 V64:$Rn)), (NEGv4i16 V64:$Rn)>; +def : Pat<(AArch64neg (v8i16 V128:$Rn)), (NEGv8i16 V128:$Rn)>; +def : Pat<(AArch64neg (v2i32 V64:$Rn)), (NEGv2i32 V64:$Rn)>; +def : Pat<(AArch64neg (v4i32 V128:$Rn)), (NEGv4i32 V128:$Rn)>; +def : Pat<(AArch64neg (v2i64 V128:$Rn)), (NEGv2i64 V128:$Rn)>; + +def : Pat<(AArch64not (v8i8 V64:$Rn)), (NOTv8i8 V64:$Rn)>; +def : Pat<(AArch64not (v16i8 V128:$Rn)), (NOTv16i8 V128:$Rn)>; +def : Pat<(AArch64not (v4i16 V64:$Rn)), (NOTv8i8 V64:$Rn)>; +def : Pat<(AArch64not (v8i16 V128:$Rn)), (NOTv16i8 V128:$Rn)>; +def : Pat<(AArch64not (v2i32 V64:$Rn)), (NOTv8i8 V64:$Rn)>; +def : Pat<(AArch64not (v1i64 V64:$Rn)), (NOTv8i8 V64:$Rn)>; +def : Pat<(AArch64not (v4i32 V128:$Rn)), (NOTv16i8 V128:$Rn)>; +def : Pat<(AArch64not (v2i64 V128:$Rn)), (NOTv16i8 V128:$Rn)>; + +def : Pat<(vnot (v4i16 V64:$Rn)), (NOTv8i8 V64:$Rn)>; +def : Pat<(vnot (v8i16 V128:$Rn)), (NOTv16i8 V128:$Rn)>; +def : Pat<(vnot (v2i32 V64:$Rn)), (NOTv8i8 V64:$Rn)>; +def : Pat<(vnot (v4i32 V128:$Rn)), (NOTv16i8 V128:$Rn)>; +def : Pat<(vnot (v2i64 V128:$Rn)), (NOTv16i8 V128:$Rn)>; + +defm RBIT : SIMDTwoVectorB<1, 0b01, 0b00101, "rbit", int_aarch64_neon_rbit>; +defm REV16 : SIMDTwoVectorB<0, 0b00, 0b00001, "rev16", AArch64rev16>; +defm REV32 : SIMDTwoVectorBH<1, 0b00000, "rev32", AArch64rev32>; +defm REV64 : SIMDTwoVectorBHS<0, 0b00000, "rev64", AArch64rev64>; +defm SADALP : SIMDLongTwoVectorTied<0, 0b00110, "sadalp", + BinOpFrag<(add node:$LHS, (int_aarch64_neon_saddlp node:$RHS))> >; +defm SADDLP : SIMDLongTwoVector<0, 0b00010, "saddlp", int_aarch64_neon_saddlp>; +defm SCVTF : SIMDTwoVectorIntToFP<0, 0, 0b11101, "scvtf", sint_to_fp>; +defm SHLL : SIMDVectorLShiftLongBySizeBHS; +defm SQABS : SIMDTwoVectorBHSD<0, 0b00111, "sqabs", int_aarch64_neon_sqabs>; +defm SQNEG : SIMDTwoVectorBHSD<1, 0b00111, "sqneg", int_aarch64_neon_sqneg>; +defm SQXTN : SIMDMixedTwoVector<0, 0b10100, "sqxtn", int_aarch64_neon_sqxtn>; +defm SQXTUN : SIMDMixedTwoVector<1, 0b10010, "sqxtun", int_aarch64_neon_sqxtun>; +defm SUQADD : SIMDTwoVectorBHSDTied<0, 0b00011, "suqadd",int_aarch64_neon_suqadd>; +defm UADALP : SIMDLongTwoVectorTied<1, 0b00110, "uadalp", + BinOpFrag<(add node:$LHS, (int_aarch64_neon_uaddlp node:$RHS))> >; +defm UADDLP : SIMDLongTwoVector<1, 0b00010, "uaddlp", + int_aarch64_neon_uaddlp>; +defm UCVTF : SIMDTwoVectorIntToFP<1, 0, 0b11101, "ucvtf", uint_to_fp>; +defm UQXTN : SIMDMixedTwoVector<1, 0b10100, "uqxtn", int_aarch64_neon_uqxtn>; +defm URECPE : SIMDTwoVectorS<0, 1, 0b11100, "urecpe", int_aarch64_neon_urecpe>; +defm URSQRTE: SIMDTwoVectorS<1, 1, 0b11100, "ursqrte", int_aarch64_neon_ursqrte>; +defm USQADD : SIMDTwoVectorBHSDTied<1, 0b00011, "usqadd",int_aarch64_neon_usqadd>; +defm XTN : SIMDMixedTwoVector<0, 0b10010, "xtn", trunc>; + +def : Pat<(v2f32 (AArch64rev64 V64:$Rn)), (REV64v2i32 V64:$Rn)>; +def : Pat<(v4f32 (AArch64rev64 V128:$Rn)), (REV64v4i32 V128:$Rn)>; + +// Patterns for vector long shift (by element width). These need to match all +// three of zext, sext and anyext so it's easier to pull the patterns out of the +// definition. 
+multiclass SIMDVectorLShiftLongBySizeBHSPats<SDPatternOperator ext> {
+  def : Pat<(AArch64vshl (v8i16 (ext (v8i8 V64:$Rn))), (i32 8)),
+            (SHLLv8i8 V64:$Rn)>;
+  def : Pat<(AArch64vshl (v8i16 (ext (extract_high_v16i8 V128:$Rn))), (i32 8)),
+            (SHLLv16i8 V128:$Rn)>;
+  def : Pat<(AArch64vshl (v4i32 (ext (v4i16 V64:$Rn))), (i32 16)),
+            (SHLLv4i16 V64:$Rn)>;
+  def : Pat<(AArch64vshl (v4i32 (ext (extract_high_v8i16 V128:$Rn))), (i32 16)),
+            (SHLLv8i16 V128:$Rn)>;
+  def : Pat<(AArch64vshl (v2i64 (ext (v2i32 V64:$Rn))), (i32 32)),
+            (SHLLv2i32 V64:$Rn)>;
+  def : Pat<(AArch64vshl (v2i64 (ext (extract_high_v4i32 V128:$Rn))), (i32 32)),
+            (SHLLv4i32 V128:$Rn)>;
+}
+
+defm : SIMDVectorLShiftLongBySizeBHSPats<anyext>;
+defm : SIMDVectorLShiftLongBySizeBHSPats<zext>;
+defm : SIMDVectorLShiftLongBySizeBHSPats<sext>;
+
+//===----------------------------------------------------------------------===//
+// Advanced SIMD three vector instructions.
+//===----------------------------------------------------------------------===//
+
+defm ADD : SIMDThreeSameVector<0, 0b10000, "add", add>;
+defm ADDP : SIMDThreeSameVector<0, 0b10111, "addp", int_aarch64_neon_addp>;
+defm CMEQ : SIMDThreeSameVector<1, 0b10001, "cmeq", AArch64cmeq>;
+defm CMGE : SIMDThreeSameVector<0, 0b00111, "cmge", AArch64cmge>;
+defm CMGT : SIMDThreeSameVector<0, 0b00110, "cmgt", AArch64cmgt>;
+defm CMHI : SIMDThreeSameVector<1, 0b00110, "cmhi", AArch64cmhi>;
+defm CMHS : SIMDThreeSameVector<1, 0b00111, "cmhs", AArch64cmhs>;
+defm CMTST : SIMDThreeSameVector<0, 0b10001, "cmtst", AArch64cmtst>;
+defm FABD : SIMDThreeSameVectorFP<1,1,0b11010,"fabd", int_aarch64_neon_fabd>;
+defm FACGE : SIMDThreeSameVectorFPCmp<1,0,0b11101,"facge",int_aarch64_neon_facge>;
+defm FACGT : SIMDThreeSameVectorFPCmp<1,1,0b11101,"facgt",int_aarch64_neon_facgt>;
+defm FADDP : SIMDThreeSameVectorFP<1,0,0b11010,"faddp",int_aarch64_neon_addp>;
+defm FADD : SIMDThreeSameVectorFP<0,0,0b11010,"fadd", fadd>;
+defm FCMEQ : SIMDThreeSameVectorFPCmp<0, 0, 0b11100, "fcmeq", AArch64fcmeq>;
+defm FCMGE : SIMDThreeSameVectorFPCmp<1, 0, 0b11100, "fcmge", AArch64fcmge>;
+defm FCMGT : SIMDThreeSameVectorFPCmp<1, 1, 0b11100, "fcmgt", AArch64fcmgt>;
+defm FDIV : SIMDThreeSameVectorFP<1,0,0b11111,"fdiv", fdiv>;
+defm FMAXNMP : SIMDThreeSameVectorFP<1,0,0b11000,"fmaxnmp", int_aarch64_neon_fmaxnmp>;
+defm FMAXNM : SIMDThreeSameVectorFP<0,0,0b11000,"fmaxnm", int_aarch64_neon_fmaxnm>;
+defm FMAXP : SIMDThreeSameVectorFP<1,0,0b11110,"fmaxp", int_aarch64_neon_fmaxp>;
+defm FMAX : SIMDThreeSameVectorFP<0,0,0b11110,"fmax", AArch64fmax>;
+defm FMINNMP : SIMDThreeSameVectorFP<1,1,0b11000,"fminnmp", int_aarch64_neon_fminnmp>;
+defm FMINNM : SIMDThreeSameVectorFP<0,1,0b11000,"fminnm", int_aarch64_neon_fminnm>;
+defm FMINP : SIMDThreeSameVectorFP<1,1,0b11110,"fminp", int_aarch64_neon_fminp>;
+defm FMIN : SIMDThreeSameVectorFP<0,1,0b11110,"fmin", AArch64fmin>;
+
+// NOTE: The operands of the PatFrag are reordered on FMLA/FMLS because the
+// instruction expects the addend first, while the fma intrinsic puts it last.
+defm FMLA : SIMDThreeSameVectorFPTied<0, 0, 0b11001, "fmla",
+      TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)> >;
+defm FMLS : SIMDThreeSameVectorFPTied<0, 1, 0b11001, "fmls",
+      TriOpFrag<(fma node:$MHS, (fneg node:$RHS), node:$LHS)> >;
+
+// The following def pats catch the case where the LHS of an FMA is negated.
+// The TriOpFrag above catches the case where the middle operand is negated.
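[Editorial note: the scalar identity behind the negated-LHS patterns that follow can be sketched in C. fmaf is the standard <math.h> fused multiply-add; the helper name is illustrative:]

    #include <math.h>

    /* fma(-a, b, c) computes c - a*b with a single rounding, so it can be
       selected as a fused multiply-subtract (scalar FMSUB, vector FMLS),
       which is the case the following def : Pat entries cover. */
    float negated_lhs_fma(float a, float b, float c) {
        return fmaf(-a, b, c);
    }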
+def : Pat<(v2f32 (fma (fneg V64:$Rn), V64:$Rm, V64:$Rd)), + (FMLSv2f32 V64:$Rd, V64:$Rn, V64:$Rm)>; + +def : Pat<(v4f32 (fma (fneg V128:$Rn), V128:$Rm, V128:$Rd)), + (FMLSv4f32 V128:$Rd, V128:$Rn, V128:$Rm)>; + +def : Pat<(v2f64 (fma (fneg V128:$Rn), V128:$Rm, V128:$Rd)), + (FMLSv2f64 V128:$Rd, V128:$Rn, V128:$Rm)>; + +defm FMULX : SIMDThreeSameVectorFP<0,0,0b11011,"fmulx", int_aarch64_neon_fmulx>; +defm FMUL : SIMDThreeSameVectorFP<1,0,0b11011,"fmul", fmul>; +defm FRECPS : SIMDThreeSameVectorFP<0,0,0b11111,"frecps", int_aarch64_neon_frecps>; +defm FRSQRTS : SIMDThreeSameVectorFP<0,1,0b11111,"frsqrts", int_aarch64_neon_frsqrts>; +defm FSUB : SIMDThreeSameVectorFP<0,1,0b11010,"fsub", fsub>; +defm MLA : SIMDThreeSameVectorBHSTied<0, 0b10010, "mla", + TriOpFrag<(add node:$LHS, (mul node:$MHS, node:$RHS))> >; +defm MLS : SIMDThreeSameVectorBHSTied<1, 0b10010, "mls", + TriOpFrag<(sub node:$LHS, (mul node:$MHS, node:$RHS))> >; +defm MUL : SIMDThreeSameVectorBHS<0, 0b10011, "mul", mul>; +defm PMUL : SIMDThreeSameVectorB<1, 0b10011, "pmul", int_aarch64_neon_pmul>; +defm SABA : SIMDThreeSameVectorBHSTied<0, 0b01111, "saba", + TriOpFrag<(add node:$LHS, (int_aarch64_neon_sabd node:$MHS, node:$RHS))> >; +defm SABD : SIMDThreeSameVectorBHS<0,0b01110,"sabd", int_aarch64_neon_sabd>; +defm SHADD : SIMDThreeSameVectorBHS<0,0b00000,"shadd", int_aarch64_neon_shadd>; +defm SHSUB : SIMDThreeSameVectorBHS<0,0b00100,"shsub", int_aarch64_neon_shsub>; +defm SMAXP : SIMDThreeSameVectorBHS<0,0b10100,"smaxp", int_aarch64_neon_smaxp>; +defm SMAX : SIMDThreeSameVectorBHS<0,0b01100,"smax", int_aarch64_neon_smax>; +defm SMINP : SIMDThreeSameVectorBHS<0,0b10101,"sminp", int_aarch64_neon_sminp>; +defm SMIN : SIMDThreeSameVectorBHS<0,0b01101,"smin", int_aarch64_neon_smin>; +defm SQADD : SIMDThreeSameVector<0,0b00001,"sqadd", int_aarch64_neon_sqadd>; +defm SQDMULH : SIMDThreeSameVectorHS<0,0b10110,"sqdmulh",int_aarch64_neon_sqdmulh>; +defm SQRDMULH : SIMDThreeSameVectorHS<1,0b10110,"sqrdmulh",int_aarch64_neon_sqrdmulh>; +defm SQRSHL : SIMDThreeSameVector<0,0b01011,"sqrshl", int_aarch64_neon_sqrshl>; +defm SQSHL : SIMDThreeSameVector<0,0b01001,"sqshl", int_aarch64_neon_sqshl>; +defm SQSUB : SIMDThreeSameVector<0,0b00101,"sqsub", int_aarch64_neon_sqsub>; +defm SRHADD : SIMDThreeSameVectorBHS<0,0b00010,"srhadd",int_aarch64_neon_srhadd>; +defm SRSHL : SIMDThreeSameVector<0,0b01010,"srshl", int_aarch64_neon_srshl>; +defm SSHL : SIMDThreeSameVector<0,0b01000,"sshl", int_aarch64_neon_sshl>; +defm SUB : SIMDThreeSameVector<1,0b10000,"sub", sub>; +defm UABA : SIMDThreeSameVectorBHSTied<1, 0b01111, "uaba", + TriOpFrag<(add node:$LHS, (int_aarch64_neon_uabd node:$MHS, node:$RHS))> >; +defm UABD : SIMDThreeSameVectorBHS<1,0b01110,"uabd", int_aarch64_neon_uabd>; +defm UHADD : SIMDThreeSameVectorBHS<1,0b00000,"uhadd", int_aarch64_neon_uhadd>; +defm UHSUB : SIMDThreeSameVectorBHS<1,0b00100,"uhsub", int_aarch64_neon_uhsub>; +defm UMAXP : SIMDThreeSameVectorBHS<1,0b10100,"umaxp", int_aarch64_neon_umaxp>; +defm UMAX : SIMDThreeSameVectorBHS<1,0b01100,"umax", int_aarch64_neon_umax>; +defm UMINP : SIMDThreeSameVectorBHS<1,0b10101,"uminp", int_aarch64_neon_uminp>; +defm UMIN : SIMDThreeSameVectorBHS<1,0b01101,"umin", int_aarch64_neon_umin>; +defm UQADD : SIMDThreeSameVector<1,0b00001,"uqadd", int_aarch64_neon_uqadd>; +defm UQRSHL : SIMDThreeSameVector<1,0b01011,"uqrshl", int_aarch64_neon_uqrshl>; +defm UQSHL : SIMDThreeSameVector<1,0b01001,"uqshl", int_aarch64_neon_uqshl>; +defm UQSUB : SIMDThreeSameVector<1,0b00101,"uqsub", 
int_aarch64_neon_uqsub>; +defm URHADD : SIMDThreeSameVectorBHS<1,0b00010,"urhadd", int_aarch64_neon_urhadd>; +defm URSHL : SIMDThreeSameVector<1,0b01010,"urshl", int_aarch64_neon_urshl>; +defm USHL : SIMDThreeSameVector<1,0b01000,"ushl", int_aarch64_neon_ushl>; + +defm AND : SIMDLogicalThreeVector<0, 0b00, "and", and>; +defm BIC : SIMDLogicalThreeVector<0, 0b01, "bic", + BinOpFrag<(and node:$LHS, (vnot node:$RHS))> >; +defm BIF : SIMDLogicalThreeVector<1, 0b11, "bif">; +defm BIT : SIMDLogicalThreeVectorTied<1, 0b10, "bit", AArch64bit>; +defm BSL : SIMDLogicalThreeVectorTied<1, 0b01, "bsl", + TriOpFrag<(or (and node:$LHS, node:$MHS), (and (vnot node:$LHS), node:$RHS))>>; +defm EOR : SIMDLogicalThreeVector<1, 0b00, "eor", xor>; +defm ORN : SIMDLogicalThreeVector<0, 0b11, "orn", + BinOpFrag<(or node:$LHS, (vnot node:$RHS))> >; +defm ORR : SIMDLogicalThreeVector<0, 0b10, "orr", or>; + +def : Pat<(AArch64bsl (v8i8 V64:$Rd), V64:$Rn, V64:$Rm), + (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; +def : Pat<(AArch64bsl (v4i16 V64:$Rd), V64:$Rn, V64:$Rm), + (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; +def : Pat<(AArch64bsl (v2i32 V64:$Rd), V64:$Rn, V64:$Rm), + (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; +def : Pat<(AArch64bsl (v1i64 V64:$Rd), V64:$Rn, V64:$Rm), + (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; + +def : Pat<(AArch64bsl (v16i8 V128:$Rd), V128:$Rn, V128:$Rm), + (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; +def : Pat<(AArch64bsl (v8i16 V128:$Rd), V128:$Rn, V128:$Rm), + (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; +def : Pat<(AArch64bsl (v4i32 V128:$Rd), V128:$Rn, V128:$Rm), + (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; +def : Pat<(AArch64bsl (v2i64 V128:$Rd), V128:$Rn, V128:$Rm), + (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; + +def : InstAlias<"mov{\t$dst.16b, $src.16b|.16b\t$dst, $src}", + (ORRv16i8 V128:$dst, V128:$src, V128:$src), 1>; +def : InstAlias<"mov{\t$dst.8h, $src.8h|.8h\t$dst, $src}", + (ORRv16i8 V128:$dst, V128:$src, V128:$src), 0>; +def : InstAlias<"mov{\t$dst.4s, $src.4s|.4s\t$dst, $src}", + (ORRv16i8 V128:$dst, V128:$src, V128:$src), 0>; +def : InstAlias<"mov{\t$dst.2d, $src.2d|.2d\t$dst, $src}", + (ORRv16i8 V128:$dst, V128:$src, V128:$src), 0>; + +def : InstAlias<"mov{\t$dst.8b, $src.8b|.8b\t$dst, $src}", + (ORRv8i8 V64:$dst, V64:$src, V64:$src), 1>; +def : InstAlias<"mov{\t$dst.4h, $src.4h|.4h\t$dst, $src}", + (ORRv8i8 V64:$dst, V64:$src, V64:$src), 0>; +def : InstAlias<"mov{\t$dst.2s, $src.2s|.2s\t$dst, $src}", + (ORRv8i8 V64:$dst, V64:$src, V64:$src), 0>; +def : InstAlias<"mov{\t$dst.1d, $src.1d|.1d\t$dst, $src}", + (ORRv8i8 V64:$dst, V64:$src, V64:$src), 0>; + +def : InstAlias<"{cmls\t$dst.8b, $src1.8b, $src2.8b" # + "|cmls.8b\t$dst, $src1, $src2}", + (CMHSv8i8 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{cmls\t$dst.16b, $src1.16b, $src2.16b" # + "|cmls.16b\t$dst, $src1, $src2}", + (CMHSv16i8 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{cmls\t$dst.4h, $src1.4h, $src2.4h" # + "|cmls.4h\t$dst, $src1, $src2}", + (CMHSv4i16 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{cmls\t$dst.8h, $src1.8h, $src2.8h" # + "|cmls.8h\t$dst, $src1, $src2}", + (CMHSv8i16 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{cmls\t$dst.2s, $src1.2s, $src2.2s" # + "|cmls.2s\t$dst, $src1, $src2}", + (CMHSv2i32 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{cmls\t$dst.4s, $src1.4s, $src2.4s" # + "|cmls.4s\t$dst, $src1, $src2}", + (CMHSv4i32 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{cmls\t$dst.2d, $src1.2d, $src2.2d" # + "|cmls.2d\t$dst, $src1, $src2}", + (CMHSv2i64 
V128:$dst, V128:$src2, V128:$src1), 0>; + +def : InstAlias<"{cmlo\t$dst.8b, $src1.8b, $src2.8b" # + "|cmlo.8b\t$dst, $src1, $src2}", + (CMHIv8i8 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{cmlo\t$dst.16b, $src1.16b, $src2.16b" # + "|cmlo.16b\t$dst, $src1, $src2}", + (CMHIv16i8 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{cmlo\t$dst.4h, $src1.4h, $src2.4h" # + "|cmlo.4h\t$dst, $src1, $src2}", + (CMHIv4i16 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{cmlo\t$dst.8h, $src1.8h, $src2.8h" # + "|cmlo.8h\t$dst, $src1, $src2}", + (CMHIv8i16 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{cmlo\t$dst.2s, $src1.2s, $src2.2s" # + "|cmlo.2s\t$dst, $src1, $src2}", + (CMHIv2i32 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{cmlo\t$dst.4s, $src1.4s, $src2.4s" # + "|cmlo.4s\t$dst, $src1, $src2}", + (CMHIv4i32 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{cmlo\t$dst.2d, $src1.2d, $src2.2d" # + "|cmlo.2d\t$dst, $src1, $src2}", + (CMHIv2i64 V128:$dst, V128:$src2, V128:$src1), 0>; + +def : InstAlias<"{cmle\t$dst.8b, $src1.8b, $src2.8b" # + "|cmle.8b\t$dst, $src1, $src2}", + (CMGEv8i8 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{cmle\t$dst.16b, $src1.16b, $src2.16b" # + "|cmle.16b\t$dst, $src1, $src2}", + (CMGEv16i8 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{cmle\t$dst.4h, $src1.4h, $src2.4h" # + "|cmle.4h\t$dst, $src1, $src2}", + (CMGEv4i16 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{cmle\t$dst.8h, $src1.8h, $src2.8h" # + "|cmle.8h\t$dst, $src1, $src2}", + (CMGEv8i16 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{cmle\t$dst.2s, $src1.2s, $src2.2s" # + "|cmle.2s\t$dst, $src1, $src2}", + (CMGEv2i32 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{cmle\t$dst.4s, $src1.4s, $src2.4s" # + "|cmle.4s\t$dst, $src1, $src2}", + (CMGEv4i32 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{cmle\t$dst.2d, $src1.2d, $src2.2d" # + "|cmle.2d\t$dst, $src1, $src2}", + (CMGEv2i64 V128:$dst, V128:$src2, V128:$src1), 0>; + +def : InstAlias<"{cmlt\t$dst.8b, $src1.8b, $src2.8b" # + "|cmlt.8b\t$dst, $src1, $src2}", + (CMGTv8i8 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{cmlt\t$dst.16b, $src1.16b, $src2.16b" # + "|cmlt.16b\t$dst, $src1, $src2}", + (CMGTv16i8 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{cmlt\t$dst.4h, $src1.4h, $src2.4h" # + "|cmlt.4h\t$dst, $src1, $src2}", + (CMGTv4i16 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{cmlt\t$dst.8h, $src1.8h, $src2.8h" # + "|cmlt.8h\t$dst, $src1, $src2}", + (CMGTv8i16 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{cmlt\t$dst.2s, $src1.2s, $src2.2s" # + "|cmlt.2s\t$dst, $src1, $src2}", + (CMGTv2i32 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{cmlt\t$dst.4s, $src1.4s, $src2.4s" # + "|cmlt.4s\t$dst, $src1, $src2}", + (CMGTv4i32 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{cmlt\t$dst.2d, $src1.2d, $src2.2d" # + "|cmlt.2d\t$dst, $src1, $src2}", + (CMGTv2i64 V128:$dst, V128:$src2, V128:$src1), 0>; + +def : InstAlias<"{fcmle\t$dst.2s, $src1.2s, $src2.2s" # + "|fcmle.2s\t$dst, $src1, $src2}", + (FCMGEv2f32 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{fcmle\t$dst.4s, $src1.4s, $src2.4s" # + "|fcmle.4s\t$dst, $src1, $src2}", + (FCMGEv4f32 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{fcmle\t$dst.2d, $src1.2d, $src2.2d" # + "|fcmle.2d\t$dst, $src1, $src2}", + (FCMGEv2f64 V128:$dst, V128:$src2, V128:$src1), 0>; + +def : InstAlias<"{fcmlt\t$dst.2s, $src1.2s, $src2.2s" # + 
"|fcmlt.2s\t$dst, $src1, $src2}", + (FCMGTv2f32 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{fcmlt\t$dst.4s, $src1.4s, $src2.4s" # + "|fcmlt.4s\t$dst, $src1, $src2}", + (FCMGTv4f32 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{fcmlt\t$dst.2d, $src1.2d, $src2.2d" # + "|fcmlt.2d\t$dst, $src1, $src2}", + (FCMGTv2f64 V128:$dst, V128:$src2, V128:$src1), 0>; + +def : InstAlias<"{facle\t$dst.2s, $src1.2s, $src2.2s" # + "|facle.2s\t$dst, $src1, $src2}", + (FACGEv2f32 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{facle\t$dst.4s, $src1.4s, $src2.4s" # + "|facle.4s\t$dst, $src1, $src2}", + (FACGEv4f32 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{facle\t$dst.2d, $src1.2d, $src2.2d" # + "|facle.2d\t$dst, $src1, $src2}", + (FACGEv2f64 V128:$dst, V128:$src2, V128:$src1), 0>; + +def : InstAlias<"{faclt\t$dst.2s, $src1.2s, $src2.2s" # + "|faclt.2s\t$dst, $src1, $src2}", + (FACGTv2f32 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{faclt\t$dst.4s, $src1.4s, $src2.4s" # + "|faclt.4s\t$dst, $src1, $src2}", + (FACGTv4f32 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{faclt\t$dst.2d, $src1.2d, $src2.2d" # + "|faclt.2d\t$dst, $src1, $src2}", + (FACGTv2f64 V128:$dst, V128:$src2, V128:$src1), 0>; + +//===----------------------------------------------------------------------===// +// Advanced SIMD three scalar instructions. +//===----------------------------------------------------------------------===// + +defm ADD : SIMDThreeScalarD<0, 0b10000, "add", add>; +defm CMEQ : SIMDThreeScalarD<1, 0b10001, "cmeq", AArch64cmeq>; +defm CMGE : SIMDThreeScalarD<0, 0b00111, "cmge", AArch64cmge>; +defm CMGT : SIMDThreeScalarD<0, 0b00110, "cmgt", AArch64cmgt>; +defm CMHI : SIMDThreeScalarD<1, 0b00110, "cmhi", AArch64cmhi>; +defm CMHS : SIMDThreeScalarD<1, 0b00111, "cmhs", AArch64cmhs>; +defm CMTST : SIMDThreeScalarD<0, 0b10001, "cmtst", AArch64cmtst>; +defm FABD : SIMDThreeScalarSD<1, 1, 0b11010, "fabd", int_aarch64_sisd_fabd>; +def : Pat<(v1f64 (int_aarch64_neon_fabd (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), + (FABD64 FPR64:$Rn, FPR64:$Rm)>; +defm FACGE : SIMDThreeScalarFPCmp<1, 0, 0b11101, "facge", + int_aarch64_neon_facge>; +defm FACGT : SIMDThreeScalarFPCmp<1, 1, 0b11101, "facgt", + int_aarch64_neon_facgt>; +defm FCMEQ : SIMDThreeScalarFPCmp<0, 0, 0b11100, "fcmeq", AArch64fcmeq>; +defm FCMGE : SIMDThreeScalarFPCmp<1, 0, 0b11100, "fcmge", AArch64fcmge>; +defm FCMGT : SIMDThreeScalarFPCmp<1, 1, 0b11100, "fcmgt", AArch64fcmgt>; +defm FMULX : SIMDThreeScalarSD<0, 0, 0b11011, "fmulx", int_aarch64_neon_fmulx>; +defm FRECPS : SIMDThreeScalarSD<0, 0, 0b11111, "frecps", int_aarch64_neon_frecps>; +defm FRSQRTS : SIMDThreeScalarSD<0, 1, 0b11111, "frsqrts", int_aarch64_neon_frsqrts>; +defm SQADD : SIMDThreeScalarBHSD<0, 0b00001, "sqadd", int_aarch64_neon_sqadd>; +defm SQDMULH : SIMDThreeScalarHS< 0, 0b10110, "sqdmulh", int_aarch64_neon_sqdmulh>; +defm SQRDMULH : SIMDThreeScalarHS< 1, 0b10110, "sqrdmulh", int_aarch64_neon_sqrdmulh>; +defm SQRSHL : SIMDThreeScalarBHSD<0, 0b01011, "sqrshl",int_aarch64_neon_sqrshl>; +defm SQSHL : SIMDThreeScalarBHSD<0, 0b01001, "sqshl", int_aarch64_neon_sqshl>; +defm SQSUB : SIMDThreeScalarBHSD<0, 0b00101, "sqsub", int_aarch64_neon_sqsub>; +defm SRSHL : SIMDThreeScalarD< 0, 0b01010, "srshl", int_aarch64_neon_srshl>; +defm SSHL : SIMDThreeScalarD< 0, 0b01000, "sshl", int_aarch64_neon_sshl>; +defm SUB : SIMDThreeScalarD< 1, 0b10000, "sub", sub>; +defm UQADD : SIMDThreeScalarBHSD<1, 0b00001, "uqadd", int_aarch64_neon_uqadd>; +defm UQRSHL : 
SIMDThreeScalarBHSD<1, 0b01011, "uqrshl",int_aarch64_neon_uqrshl>; +defm UQSHL : SIMDThreeScalarBHSD<1, 0b01001, "uqshl", int_aarch64_neon_uqshl>; +defm UQSUB : SIMDThreeScalarBHSD<1, 0b00101, "uqsub", int_aarch64_neon_uqsub>; +defm URSHL : SIMDThreeScalarD< 1, 0b01010, "urshl", int_aarch64_neon_urshl>; +defm USHL : SIMDThreeScalarD< 1, 0b01000, "ushl", int_aarch64_neon_ushl>; + +def : InstAlias<"cmls $dst, $src1, $src2", + (CMHSv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; +def : InstAlias<"cmle $dst, $src1, $src2", + (CMGEv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; +def : InstAlias<"cmlo $dst, $src1, $src2", + (CMHIv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; +def : InstAlias<"cmlt $dst, $src1, $src2", + (CMGTv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; +def : InstAlias<"fcmle $dst, $src1, $src2", + (FCMGE32 FPR32:$dst, FPR32:$src2, FPR32:$src1), 0>; +def : InstAlias<"fcmle $dst, $src1, $src2", + (FCMGE64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; +def : InstAlias<"fcmlt $dst, $src1, $src2", + (FCMGT32 FPR32:$dst, FPR32:$src2, FPR32:$src1), 0>; +def : InstAlias<"fcmlt $dst, $src1, $src2", + (FCMGT64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; +def : InstAlias<"facle $dst, $src1, $src2", + (FACGE32 FPR32:$dst, FPR32:$src2, FPR32:$src1), 0>; +def : InstAlias<"facle $dst, $src1, $src2", + (FACGE64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; +def : InstAlias<"faclt $dst, $src1, $src2", + (FACGT32 FPR32:$dst, FPR32:$src2, FPR32:$src1), 0>; +def : InstAlias<"faclt $dst, $src1, $src2", + (FACGT64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; + +//===----------------------------------------------------------------------===// +// Advanced SIMD three scalar instructions (mixed operands). +//===----------------------------------------------------------------------===// +defm SQDMULL : SIMDThreeScalarMixedHS<0, 0b11010, "sqdmull", + int_aarch64_neon_sqdmulls_scalar>; +defm SQDMLAL : SIMDThreeScalarMixedTiedHS<0, 0b10010, "sqdmlal">; +defm SQDMLSL : SIMDThreeScalarMixedTiedHS<0, 0b10110, "sqdmlsl">; + +def : Pat<(i64 (int_aarch64_neon_sqadd (i64 FPR64:$Rd), + (i64 (int_aarch64_neon_sqdmulls_scalar (i32 FPR32:$Rn), + (i32 FPR32:$Rm))))), + (SQDMLALi32 FPR64:$Rd, FPR32:$Rn, FPR32:$Rm)>; +def : Pat<(i64 (int_aarch64_neon_sqsub (i64 FPR64:$Rd), + (i64 (int_aarch64_neon_sqdmulls_scalar (i32 FPR32:$Rn), + (i32 FPR32:$Rm))))), + (SQDMLSLi32 FPR64:$Rd, FPR32:$Rn, FPR32:$Rm)>; + +//===----------------------------------------------------------------------===// +// Advanced SIMD two scalar instructions. 
+//===----------------------------------------------------------------------===// + +defm ABS : SIMDTwoScalarD< 0, 0b01011, "abs", int_aarch64_neon_abs>; +defm CMEQ : SIMDCmpTwoScalarD< 0, 0b01001, "cmeq", AArch64cmeqz>; +defm CMGE : SIMDCmpTwoScalarD< 1, 0b01000, "cmge", AArch64cmgez>; +defm CMGT : SIMDCmpTwoScalarD< 0, 0b01000, "cmgt", AArch64cmgtz>; +defm CMLE : SIMDCmpTwoScalarD< 1, 0b01001, "cmle", AArch64cmlez>; +defm CMLT : SIMDCmpTwoScalarD< 0, 0b01010, "cmlt", AArch64cmltz>; +defm FCMEQ : SIMDCmpTwoScalarSD<0, 1, 0b01101, "fcmeq", AArch64fcmeqz>; +defm FCMGE : SIMDCmpTwoScalarSD<1, 1, 0b01100, "fcmge", AArch64fcmgez>; +defm FCMGT : SIMDCmpTwoScalarSD<0, 1, 0b01100, "fcmgt", AArch64fcmgtz>; +defm FCMLE : SIMDCmpTwoScalarSD<1, 1, 0b01101, "fcmle", AArch64fcmlez>; +defm FCMLT : SIMDCmpTwoScalarSD<0, 1, 0b01110, "fcmlt", AArch64fcmltz>; +defm FCVTAS : SIMDTwoScalarSD< 0, 0, 0b11100, "fcvtas">; +defm FCVTAU : SIMDTwoScalarSD< 1, 0, 0b11100, "fcvtau">; +defm FCVTMS : SIMDTwoScalarSD< 0, 0, 0b11011, "fcvtms">; +defm FCVTMU : SIMDTwoScalarSD< 1, 0, 0b11011, "fcvtmu">; +defm FCVTNS : SIMDTwoScalarSD< 0, 0, 0b11010, "fcvtns">; +defm FCVTNU : SIMDTwoScalarSD< 1, 0, 0b11010, "fcvtnu">; +defm FCVTPS : SIMDTwoScalarSD< 0, 1, 0b11010, "fcvtps">; +defm FCVTPU : SIMDTwoScalarSD< 1, 1, 0b11010, "fcvtpu">; +def FCVTXNv1i64 : SIMDInexactCvtTwoScalar<0b10110, "fcvtxn">; +defm FCVTZS : SIMDTwoScalarSD< 0, 1, 0b11011, "fcvtzs">; +defm FCVTZU : SIMDTwoScalarSD< 1, 1, 0b11011, "fcvtzu">; +defm FRECPE : SIMDTwoScalarSD< 0, 1, 0b11101, "frecpe">; +defm FRECPX : SIMDTwoScalarSD< 0, 1, 0b11111, "frecpx">; +defm FRSQRTE : SIMDTwoScalarSD< 1, 1, 0b11101, "frsqrte">; +defm NEG : SIMDTwoScalarD< 1, 0b01011, "neg", + UnOpFrag<(sub immAllZerosV, node:$LHS)> >; +defm SCVTF : SIMDTwoScalarCVTSD< 0, 0, 0b11101, "scvtf", AArch64sitof>; +defm SQABS : SIMDTwoScalarBHSD< 0, 0b00111, "sqabs", int_aarch64_neon_sqabs>; +defm SQNEG : SIMDTwoScalarBHSD< 1, 0b00111, "sqneg", int_aarch64_neon_sqneg>; +defm SQXTN : SIMDTwoScalarMixedBHS< 0, 0b10100, "sqxtn", int_aarch64_neon_scalar_sqxtn>; +defm SQXTUN : SIMDTwoScalarMixedBHS< 1, 0b10010, "sqxtun", int_aarch64_neon_scalar_sqxtun>; +defm SUQADD : SIMDTwoScalarBHSDTied< 0, 0b00011, "suqadd", + int_aarch64_neon_suqadd>; +defm UCVTF : SIMDTwoScalarCVTSD< 1, 0, 0b11101, "ucvtf", AArch64uitof>; +defm UQXTN : SIMDTwoScalarMixedBHS<1, 0b10100, "uqxtn", int_aarch64_neon_scalar_uqxtn>; +defm USQADD : SIMDTwoScalarBHSDTied< 1, 0b00011, "usqadd", + int_aarch64_neon_usqadd>; + +def : Pat<(AArch64neg (v1i64 V64:$Rn)), (NEGv1i64 V64:$Rn)>; + +def : Pat<(v1i64 (int_aarch64_neon_fcvtas (v1f64 FPR64:$Rn))), + (FCVTASv1i64 FPR64:$Rn)>; +def : Pat<(v1i64 (int_aarch64_neon_fcvtau (v1f64 FPR64:$Rn))), + (FCVTAUv1i64 FPR64:$Rn)>; +def : Pat<(v1i64 (int_aarch64_neon_fcvtms (v1f64 FPR64:$Rn))), + (FCVTMSv1i64 FPR64:$Rn)>; +def : Pat<(v1i64 (int_aarch64_neon_fcvtmu (v1f64 FPR64:$Rn))), + (FCVTMUv1i64 FPR64:$Rn)>; +def : Pat<(v1i64 (int_aarch64_neon_fcvtns (v1f64 FPR64:$Rn))), + (FCVTNSv1i64 FPR64:$Rn)>; +def : Pat<(v1i64 (int_aarch64_neon_fcvtnu (v1f64 FPR64:$Rn))), + (FCVTNUv1i64 FPR64:$Rn)>; +def : Pat<(v1i64 (int_aarch64_neon_fcvtps (v1f64 FPR64:$Rn))), + (FCVTPSv1i64 FPR64:$Rn)>; +def : Pat<(v1i64 (int_aarch64_neon_fcvtpu (v1f64 FPR64:$Rn))), + (FCVTPUv1i64 FPR64:$Rn)>; + +def : Pat<(f32 (int_aarch64_neon_frecpe (f32 FPR32:$Rn))), + (FRECPEv1i32 FPR32:$Rn)>; +def : Pat<(f64 (int_aarch64_neon_frecpe (f64 FPR64:$Rn))), + (FRECPEv1i64 FPR64:$Rn)>; +def : Pat<(v1f64 (int_aarch64_neon_frecpe (v1f64 
FPR64:$Rn))), + (FRECPEv1i64 FPR64:$Rn)>; + +def : Pat<(f32 (int_aarch64_neon_frecpx (f32 FPR32:$Rn))), + (FRECPXv1i32 FPR32:$Rn)>; +def : Pat<(f64 (int_aarch64_neon_frecpx (f64 FPR64:$Rn))), + (FRECPXv1i64 FPR64:$Rn)>; + +def : Pat<(f32 (int_aarch64_neon_frsqrte (f32 FPR32:$Rn))), + (FRSQRTEv1i32 FPR32:$Rn)>; +def : Pat<(f64 (int_aarch64_neon_frsqrte (f64 FPR64:$Rn))), + (FRSQRTEv1i64 FPR64:$Rn)>; +def : Pat<(v1f64 (int_aarch64_neon_frsqrte (v1f64 FPR64:$Rn))), + (FRSQRTEv1i64 FPR64:$Rn)>; + +// If an integer is about to be converted to a floating point value, +// just load it on the floating point unit. +// Here are the patterns for 8 and 16-bits to float. +// 8-bits -> float. +multiclass UIntToFPROLoadPat { + def : Pat<(DstTy (uint_to_fp (SrcTy + (loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, + ro.Wext:$extend))))), + (UCVTF (INSERT_SUBREG (DstTy (IMPLICIT_DEF)), + (LDRW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend), + sub))>; + + def : Pat<(DstTy (uint_to_fp (SrcTy + (loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, + ro.Wext:$extend))))), + (UCVTF (INSERT_SUBREG (DstTy (IMPLICIT_DEF)), + (LDRX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend), + sub))>; +} + +defm : UIntToFPROLoadPat; +def : Pat <(f32 (uint_to_fp (i32 + (zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))), + (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)), + (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub))>; +def : Pat <(f32 (uint_to_fp (i32 + (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))))), + (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)), + (LDURBi GPR64sp:$Rn, simm9:$offset), bsub))>; +// 16-bits -> float. +defm : UIntToFPROLoadPat; +def : Pat <(f32 (uint_to_fp (i32 + (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))), + (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)), + (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub))>; +def : Pat <(f32 (uint_to_fp (i32 + (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))))), + (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)), + (LDURHi GPR64sp:$Rn, simm9:$offset), hsub))>; +// 32-bits are handled in target specific dag combine: +// performIntToFpCombine. +// 64-bits integer to 32-bits floating point, not possible with +// UCVTF on floating point registers (both source and destination +// must have the same size). + +// Here are the patterns for 8, 16, 32, and 64-bits to double. +// 8-bits -> double. +defm : UIntToFPROLoadPat; +def : Pat <(f64 (uint_to_fp (i32 + (zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))), + (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)), + (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub))>; +def : Pat <(f64 (uint_to_fp (i32 + (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))))), + (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)), + (LDURBi GPR64sp:$Rn, simm9:$offset), bsub))>; +// 16-bits -> double. +defm : UIntToFPROLoadPat; +def : Pat <(f64 (uint_to_fp (i32 + (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))), + (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)), + (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub))>; +def : Pat <(f64 (uint_to_fp (i32 + (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))))), + (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)), + (LDURHi GPR64sp:$Rn, simm9:$offset), hsub))>; +// 32-bits -> double. 
+defm : UIntToFPROLoadPat; +def : Pat <(f64 (uint_to_fp (i32 + (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))), + (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)), + (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub))>; +def : Pat <(f64 (uint_to_fp (i32 + (load (am_unscaled32 GPR64sp:$Rn, simm9:$offset))))), + (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)), + (LDURSi GPR64sp:$Rn, simm9:$offset), ssub))>; +// 64-bits -> double are handled in target specific dag combine: +// performIntToFpCombine. + +//===----------------------------------------------------------------------===// +// Advanced SIMD three different-sized vector instructions. +//===----------------------------------------------------------------------===// + +defm ADDHN : SIMDNarrowThreeVectorBHS<0,0b0100,"addhn", int_aarch64_neon_addhn>; +defm SUBHN : SIMDNarrowThreeVectorBHS<0,0b0110,"subhn", int_aarch64_neon_subhn>; +defm RADDHN : SIMDNarrowThreeVectorBHS<1,0b0100,"raddhn",int_aarch64_neon_raddhn>; +defm RSUBHN : SIMDNarrowThreeVectorBHS<1,0b0110,"rsubhn",int_aarch64_neon_rsubhn>; +defm PMULL : SIMDDifferentThreeVectorBD<0,0b1110,"pmull",int_aarch64_neon_pmull>; +defm SABAL : SIMDLongThreeVectorTiedBHSabal<0,0b0101,"sabal", + int_aarch64_neon_sabd>; +defm SABDL : SIMDLongThreeVectorBHSabdl<0, 0b0111, "sabdl", + int_aarch64_neon_sabd>; +defm SADDL : SIMDLongThreeVectorBHS< 0, 0b0000, "saddl", + BinOpFrag<(add (sext node:$LHS), (sext node:$RHS))>>; +defm SADDW : SIMDWideThreeVectorBHS< 0, 0b0001, "saddw", + BinOpFrag<(add node:$LHS, (sext node:$RHS))>>; +defm SMLAL : SIMDLongThreeVectorTiedBHS<0, 0b1000, "smlal", + TriOpFrag<(add node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>; +defm SMLSL : SIMDLongThreeVectorTiedBHS<0, 0b1010, "smlsl", + TriOpFrag<(sub node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>; +defm SMULL : SIMDLongThreeVectorBHS<0, 0b1100, "smull", int_aarch64_neon_smull>; +defm SQDMLAL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1001, "sqdmlal", + int_aarch64_neon_sqadd>; +defm SQDMLSL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1011, "sqdmlsl", + int_aarch64_neon_sqsub>; +defm SQDMULL : SIMDLongThreeVectorHS<0, 0b1101, "sqdmull", + int_aarch64_neon_sqdmull>; +defm SSUBL : SIMDLongThreeVectorBHS<0, 0b0010, "ssubl", + BinOpFrag<(sub (sext node:$LHS), (sext node:$RHS))>>; +defm SSUBW : SIMDWideThreeVectorBHS<0, 0b0011, "ssubw", + BinOpFrag<(sub node:$LHS, (sext node:$RHS))>>; +defm UABAL : SIMDLongThreeVectorTiedBHSabal<1, 0b0101, "uabal", + int_aarch64_neon_uabd>; +defm UABDL : SIMDLongThreeVectorBHSabdl<1, 0b0111, "uabdl", + int_aarch64_neon_uabd>; +defm UADDL : SIMDLongThreeVectorBHS<1, 0b0000, "uaddl", + BinOpFrag<(add (zext node:$LHS), (zext node:$RHS))>>; +defm UADDW : SIMDWideThreeVectorBHS<1, 0b0001, "uaddw", + BinOpFrag<(add node:$LHS, (zext node:$RHS))>>; +defm UMLAL : SIMDLongThreeVectorTiedBHS<1, 0b1000, "umlal", + TriOpFrag<(add node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>; +defm UMLSL : SIMDLongThreeVectorTiedBHS<1, 0b1010, "umlsl", + TriOpFrag<(sub node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>; +defm UMULL : SIMDLongThreeVectorBHS<1, 0b1100, "umull", int_aarch64_neon_umull>; +defm USUBL : SIMDLongThreeVectorBHS<1, 0b0010, "usubl", + BinOpFrag<(sub (zext node:$LHS), (zext node:$RHS))>>; +defm USUBW : SIMDWideThreeVectorBHS< 1, 0b0011, "usubw", + BinOpFrag<(sub node:$LHS, (zext node:$RHS))>>; + +// Patterns for 64-bit pmull +def : Pat<(int_aarch64_neon_pmull64 V64:$Rn, V64:$Rm), + (PMULLv1i64 V64:$Rn, V64:$Rm)>; +def : Pat<(int_aarch64_neon_pmull64 
(vector_extract (v2i64 V128:$Rn), (i64 1)), + (vector_extract (v2i64 V128:$Rm), (i64 1))), + (PMULLv2i64 V128:$Rn, V128:$Rm)>; + +// CodeGen patterns for addhn and subhn instructions, which can actually be +// written in LLVM IR without too much difficulty. + +// ADDHN +def : Pat<(v8i8 (trunc (v8i16 (AArch64vlshr (add V128:$Rn, V128:$Rm), (i32 8))))), + (ADDHNv8i16_v8i8 V128:$Rn, V128:$Rm)>; +def : Pat<(v4i16 (trunc (v4i32 (AArch64vlshr (add V128:$Rn, V128:$Rm), + (i32 16))))), + (ADDHNv4i32_v4i16 V128:$Rn, V128:$Rm)>; +def : Pat<(v2i32 (trunc (v2i64 (AArch64vlshr (add V128:$Rn, V128:$Rm), + (i32 32))))), + (ADDHNv2i64_v2i32 V128:$Rn, V128:$Rm)>; +def : Pat<(concat_vectors (v8i8 V64:$Rd), + (trunc (v8i16 (AArch64vlshr (add V128:$Rn, V128:$Rm), + (i32 8))))), + (ADDHNv8i16_v16i8 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub), + V128:$Rn, V128:$Rm)>; +def : Pat<(concat_vectors (v4i16 V64:$Rd), + (trunc (v4i32 (AArch64vlshr (add V128:$Rn, V128:$Rm), + (i32 16))))), + (ADDHNv4i32_v8i16 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub), + V128:$Rn, V128:$Rm)>; +def : Pat<(concat_vectors (v2i32 V64:$Rd), + (trunc (v2i64 (AArch64vlshr (add V128:$Rn, V128:$Rm), + (i32 32))))), + (ADDHNv2i64_v4i32 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub), + V128:$Rn, V128:$Rm)>; + +// SUBHN +def : Pat<(v8i8 (trunc (v8i16 (AArch64vlshr (sub V128:$Rn, V128:$Rm), (i32 8))))), + (SUBHNv8i16_v8i8 V128:$Rn, V128:$Rm)>; +def : Pat<(v4i16 (trunc (v4i32 (AArch64vlshr (sub V128:$Rn, V128:$Rm), + (i32 16))))), + (SUBHNv4i32_v4i16 V128:$Rn, V128:$Rm)>; +def : Pat<(v2i32 (trunc (v2i64 (AArch64vlshr (sub V128:$Rn, V128:$Rm), + (i32 32))))), + (SUBHNv2i64_v2i32 V128:$Rn, V128:$Rm)>; +def : Pat<(concat_vectors (v8i8 V64:$Rd), + (trunc (v8i16 (AArch64vlshr (sub V128:$Rn, V128:$Rm), + (i32 8))))), + (SUBHNv8i16_v16i8 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub), + V128:$Rn, V128:$Rm)>; +def : Pat<(concat_vectors (v4i16 V64:$Rd), + (trunc (v4i32 (AArch64vlshr (sub V128:$Rn, V128:$Rm), + (i32 16))))), + (SUBHNv4i32_v8i16 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub), + V128:$Rn, V128:$Rm)>; +def : Pat<(concat_vectors (v2i32 V64:$Rd), + (trunc (v2i64 (AArch64vlshr (sub V128:$Rn, V128:$Rm), + (i32 32))))), + (SUBHNv2i64_v4i32 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub), + V128:$Rn, V128:$Rm)>; + +//---------------------------------------------------------------------------- +// AdvSIMD bitwise extract from vector instruction. +//---------------------------------------------------------------------------- + +defm EXT : SIMDBitwiseExtract<"ext">; + +def : Pat<(v4i16 (AArch64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))), + (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>; +def : Pat<(v8i16 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))), + (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>; +def : Pat<(v2i32 (AArch64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))), + (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>; +def : Pat<(v2f32 (AArch64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))), + (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>; +def : Pat<(v4i32 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))), + (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>; +def : Pat<(v4f32 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))), + (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>; +def : Pat<(v2i64 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))), + (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>; +def : Pat<(v2f64 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))), + (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>; + +// We use EXT to handle extract_subvector to copy the upper 64-bits of a +// 128-bit vector. 
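[Editorial note: the upper-half copy described in the comment above is what the ACLE vget_high_* intrinsics expose. A minimal sketch, assuming <arm_neon.h>; the function name is made up:]

    #include <arm_neon.h>

    /* Extracting the upper 64 bits of a 128-bit vector; with the patterns
       that follow this can be selected as a single
       "ext v0.16b, v1.16b, v1.16b, #8" rather than a trip through memory. */
    uint8x8_t upper_half(uint8x16_t v) {
        return vget_high_u8(v);
    }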
+def : Pat<(v8i8 (extract_subvector V128:$Rn, (i64 8))), + (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>; +def : Pat<(v4i16 (extract_subvector V128:$Rn, (i64 4))), + (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>; +def : Pat<(v2i32 (extract_subvector V128:$Rn, (i64 2))), + (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>; +def : Pat<(v1i64 (extract_subvector V128:$Rn, (i64 1))), + (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>; +def : Pat<(v2f32 (extract_subvector V128:$Rn, (i64 2))), + (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>; +def : Pat<(v1f64 (extract_subvector V128:$Rn, (i64 1))), + (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>; + + +//---------------------------------------------------------------------------- +// AdvSIMD zip vector +//---------------------------------------------------------------------------- + +defm TRN1 : SIMDZipVector<0b010, "trn1", AArch64trn1>; +defm TRN2 : SIMDZipVector<0b110, "trn2", AArch64trn2>; +defm UZP1 : SIMDZipVector<0b001, "uzp1", AArch64uzp1>; +defm UZP2 : SIMDZipVector<0b101, "uzp2", AArch64uzp2>; +defm ZIP1 : SIMDZipVector<0b011, "zip1", AArch64zip1>; +defm ZIP2 : SIMDZipVector<0b111, "zip2", AArch64zip2>; + +//---------------------------------------------------------------------------- +// AdvSIMD TBL/TBX instructions +//---------------------------------------------------------------------------- + +defm TBL : SIMDTableLookup< 0, "tbl">; +defm TBX : SIMDTableLookupTied<1, "tbx">; + +def : Pat<(v8i8 (int_aarch64_neon_tbl1 (v16i8 VecListOne128:$Rn), (v8i8 V64:$Ri))), + (TBLv8i8One VecListOne128:$Rn, V64:$Ri)>; +def : Pat<(v16i8 (int_aarch64_neon_tbl1 (v16i8 V128:$Ri), (v16i8 V128:$Rn))), + (TBLv16i8One V128:$Ri, V128:$Rn)>; + +def : Pat<(v8i8 (int_aarch64_neon_tbx1 (v8i8 V64:$Rd), + (v16i8 VecListOne128:$Rn), (v8i8 V64:$Ri))), + (TBXv8i8One V64:$Rd, VecListOne128:$Rn, V64:$Ri)>; +def : Pat<(v16i8 (int_aarch64_neon_tbx1 (v16i8 V128:$Rd), + (v16i8 V128:$Ri), (v16i8 V128:$Rn))), + (TBXv16i8One V128:$Rd, V128:$Ri, V128:$Rn)>; + + +//---------------------------------------------------------------------------- +// AdvSIMD scalar CPY instruction +//---------------------------------------------------------------------------- + +defm CPY : SIMDScalarCPY<"cpy">; + +//---------------------------------------------------------------------------- +// AdvSIMD scalar pairwise instructions +//---------------------------------------------------------------------------- + +defm ADDP : SIMDPairwiseScalarD<0, 0b11011, "addp">; +defm FADDP : SIMDPairwiseScalarSD<1, 0, 0b01101, "faddp">; +defm FMAXNMP : SIMDPairwiseScalarSD<1, 0, 0b01100, "fmaxnmp">; +defm FMAXP : SIMDPairwiseScalarSD<1, 0, 0b01111, "fmaxp">; +defm FMINNMP : SIMDPairwiseScalarSD<1, 1, 0b01100, "fminnmp">; +defm FMINP : SIMDPairwiseScalarSD<1, 1, 0b01111, "fminp">; +def : Pat<(i64 (int_aarch64_neon_saddv (v2i64 V128:$Rn))), + (ADDPv2i64p V128:$Rn)>; +def : Pat<(i64 (int_aarch64_neon_uaddv (v2i64 V128:$Rn))), + (ADDPv2i64p V128:$Rn)>; +def : Pat<(f32 (int_aarch64_neon_faddv (v2f32 V64:$Rn))), + (FADDPv2i32p V64:$Rn)>; +def : Pat<(f32 (int_aarch64_neon_faddv (v4f32 V128:$Rn))), + (FADDPv2i32p (EXTRACT_SUBREG (FADDPv4f32 V128:$Rn, V128:$Rn), dsub))>; +def : Pat<(f64 (int_aarch64_neon_faddv (v2f64 V128:$Rn))), + (FADDPv2i64p V128:$Rn)>; +def : Pat<(f32 (int_aarch64_neon_fmaxnmv (v2f32 V64:$Rn))), + (FMAXNMPv2i32p V64:$Rn)>; +def : Pat<(f64 (int_aarch64_neon_fmaxnmv (v2f64 V128:$Rn))), + (FMAXNMPv2i64p V128:$Rn)>; +def : Pat<(f32 
(int_aarch64_neon_fmaxv (v2f32 V64:$Rn))), + (FMAXPv2i32p V64:$Rn)>; +def : Pat<(f64 (int_aarch64_neon_fmaxv (v2f64 V128:$Rn))), + (FMAXPv2i64p V128:$Rn)>; +def : Pat<(f32 (int_aarch64_neon_fminnmv (v2f32 V64:$Rn))), + (FMINNMPv2i32p V64:$Rn)>; +def : Pat<(f64 (int_aarch64_neon_fminnmv (v2f64 V128:$Rn))), + (FMINNMPv2i64p V128:$Rn)>; +def : Pat<(f32 (int_aarch64_neon_fminv (v2f32 V64:$Rn))), + (FMINPv2i32p V64:$Rn)>; +def : Pat<(f64 (int_aarch64_neon_fminv (v2f64 V128:$Rn))), + (FMINPv2i64p V128:$Rn)>; + +//---------------------------------------------------------------------------- +// AdvSIMD INS/DUP instructions +//---------------------------------------------------------------------------- + +def DUPv8i8gpr : SIMDDupFromMain<0, 0b00001, ".8b", v8i8, V64, GPR32>; +def DUPv16i8gpr : SIMDDupFromMain<1, 0b00001, ".16b", v16i8, V128, GPR32>; +def DUPv4i16gpr : SIMDDupFromMain<0, 0b00010, ".4h", v4i16, V64, GPR32>; +def DUPv8i16gpr : SIMDDupFromMain<1, 0b00010, ".8h", v8i16, V128, GPR32>; +def DUPv2i32gpr : SIMDDupFromMain<0, 0b00100, ".2s", v2i32, V64, GPR32>; +def DUPv4i32gpr : SIMDDupFromMain<1, 0b00100, ".4s", v4i32, V128, GPR32>; +def DUPv2i64gpr : SIMDDupFromMain<1, 0b01000, ".2d", v2i64, V128, GPR64>; + +def DUPv2i64lane : SIMDDup64FromElement; +def DUPv2i32lane : SIMDDup32FromElement<0, ".2s", v2i32, V64>; +def DUPv4i32lane : SIMDDup32FromElement<1, ".4s", v4i32, V128>; +def DUPv4i16lane : SIMDDup16FromElement<0, ".4h", v4i16, V64>; +def DUPv8i16lane : SIMDDup16FromElement<1, ".8h", v8i16, V128>; +def DUPv8i8lane : SIMDDup8FromElement <0, ".8b", v8i8, V64>; +def DUPv16i8lane : SIMDDup8FromElement <1, ".16b", v16i8, V128>; + +def : Pat<(v2f32 (AArch64dup (f32 FPR32:$Rn))), + (v2f32 (DUPv2i32lane + (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rn, ssub), + (i64 0)))>; +def : Pat<(v4f32 (AArch64dup (f32 FPR32:$Rn))), + (v4f32 (DUPv4i32lane + (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rn, ssub), + (i64 0)))>; +def : Pat<(v2f64 (AArch64dup (f64 FPR64:$Rn))), + (v2f64 (DUPv2i64lane + (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$Rn, dsub), + (i64 0)))>; + +def : Pat<(v2f32 (AArch64duplane32 (v4f32 V128:$Rn), VectorIndexS:$imm)), + (DUPv2i32lane V128:$Rn, VectorIndexS:$imm)>; +def : Pat<(v4f32 (AArch64duplane32 (v4f32 V128:$Rn), VectorIndexS:$imm)), + (DUPv4i32lane V128:$Rn, VectorIndexS:$imm)>; +def : Pat<(v2f64 (AArch64duplane64 (v2f64 V128:$Rn), VectorIndexD:$imm)), + (DUPv2i64lane V128:$Rn, VectorIndexD:$imm)>; + +// If there's an (AArch64dup (vector_extract ...) ...), we can use a duplane +// instruction even if the types don't match: we just have to remap the lane +// carefully. N.b. this trick only applies to truncations. 
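[Editorial note: a hedged C illustration of the lane remapping described above, assuming <arm_neon.h>; the function name is made up. Broadcasting the truncated low byte of a 32-bit lane is the same as broadcasting the corresponding byte lane, which is what the VecIndex_x4 transform below computes:]

    #include <arm_neon.h>

    /* Truncation keeps the low-order byte of the 32-bit element, so
       duplicating (uint8_t)lane[1] of a .4s vector is the same as
       duplicating byte lane 1*4 = 4 of the .16b view. */
    uint8x16_t dup_truncated_lane1(uint32x4_t v) {
        uint8_t b = (uint8_t)vgetq_lane_u32(v, 1);
        return vdupq_n_u8(b);
    }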
+def VecIndex_x2 : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(2 * N->getZExtValue(), MVT::i64);
+}]>;
+def VecIndex_x4 : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(4 * N->getZExtValue(), MVT::i64);
+}]>;
+def VecIndex_x8 : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(8 * N->getZExtValue(), MVT::i64);
+}]>;
+
+multiclass DUPWithTruncPats<ValueType ResVT, ValueType Src64VT, ValueType Src128VT,
+                            ValueType ScalVT, Instruction DUP, SDNodeXForm IdxXFORM> {
+  def : Pat<(ResVT (AArch64dup (ScalVT (vector_extract (Src128VT V128:$Rn),
+                                                        imm:$idx)))),
+            (DUP V128:$Rn, (IdxXFORM imm:$idx))>;
+
+  def : Pat<(ResVT (AArch64dup (ScalVT (vector_extract (Src64VT V64:$Rn),
+                                                        imm:$idx)))),
+            (DUP (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), (IdxXFORM imm:$idx))>;
+}
+
+defm : DUPWithTruncPats<v8i8,  v4i16, v8i16, i32, DUPv8i8lane,  VecIndex_x2>;
+defm : DUPWithTruncPats<v8i8,  v2i32, v4i32, i32, DUPv8i8lane,  VecIndex_x4>;
+defm : DUPWithTruncPats<v4i16, v2i32, v4i32, i32, DUPv4i16lane, VecIndex_x2>;
+
+defm : DUPWithTruncPats<v16i8, v4i16, v8i16, i32, DUPv16i8lane, VecIndex_x2>;
+defm : DUPWithTruncPats<v16i8, v2i32, v4i32, i32, DUPv16i8lane, VecIndex_x4>;
+defm : DUPWithTruncPats<v8i16, v2i32, v4i32, i32, DUPv8i16lane, VecIndex_x2>;
+
+multiclass DUPWithTrunci64Pats<ValueType ResVT, Instruction DUP,
+                               SDNodeXForm IdxXFORM> {
+  def : Pat<(ResVT (AArch64dup (i32 (trunc (vector_extract (v2i64 V128:$Rn),
+                                                            imm:$idx))))),
+            (DUP V128:$Rn, (IdxXFORM imm:$idx))>;
+
+  def : Pat<(ResVT (AArch64dup (i32 (trunc (vector_extract (v1i64 V64:$Rn),
+                                                            imm:$idx))))),
+            (DUP (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), (IdxXFORM imm:$idx))>;
+}
+
+defm : DUPWithTrunci64Pats<v8i8,  DUPv8i8lane,  VecIndex_x8>;
+defm : DUPWithTrunci64Pats<v4i16, DUPv4i16lane, VecIndex_x4>;
+defm : DUPWithTrunci64Pats<v2i32, DUPv2i32lane, VecIndex_x2>;
+
+defm : DUPWithTrunci64Pats<v16i8, DUPv16i8lane, VecIndex_x8>;
+defm : DUPWithTrunci64Pats<v8i16, DUPv8i16lane, VecIndex_x4>;
+defm : DUPWithTrunci64Pats<v4i32, DUPv4i32lane, VecIndex_x2>;
+
+// SMOV and UMOV definitions, with some extra patterns for convenience
+defm SMOV : SMov;
+defm UMOV : UMov;
+
+def : Pat<(sext_inreg (vector_extract (v16i8 V128:$Rn), VectorIndexB:$idx), i8),
+          (i32 (SMOVvi8to32 V128:$Rn, VectorIndexB:$idx))>;
+def : Pat<(sext_inreg (vector_extract (v16i8 V128:$Rn), VectorIndexB:$idx), i8),
+          (i64 (SMOVvi8to64 V128:$Rn, VectorIndexB:$idx))>;
+def : Pat<(sext_inreg (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),i16),
+          (i32 (SMOVvi16to32 V128:$Rn, VectorIndexH:$idx))>;
+def : Pat<(sext_inreg (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),i16),
+          (i64 (SMOVvi16to64 V128:$Rn, VectorIndexH:$idx))>;
+def : Pat<(sext_inreg (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),i16),
+          (i32 (SMOVvi16to32 V128:$Rn, VectorIndexH:$idx))>;
+def : Pat<(sext (i32 (vector_extract (v4i32 V128:$Rn), VectorIndexS:$idx))),
+          (i64 (SMOVvi32to64 V128:$Rn, VectorIndexS:$idx))>;
+
+// Extracting i8 or i16 elements will have the zero-extend transformed to
+// an 'and' mask by type legalization since neither i8 nor i16 are legal types
+// for AArch64. Match these patterns here since UMOV already zeroes out the high
+// bits of the destination register.
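[Editorial note: the zero-extension behaviour relied on by the following 'and' patterns can be seen from a small C example, assuming <arm_neon.h> and <stdint.h>; the function name is made up:]

    #include <arm_neon.h>
    #include <stdint.h>

    /* Reading byte lane 3 into a 32-bit integer: "umov w0, v0.b[3]" already
       clears the upper bits of w0, so the zero-extend (legalized to an
       'and' with 0xff) costs nothing extra. */
    uint32_t read_byte_lane3(uint8x16_t v) {
        return vgetq_lane_u8(v, 3);
    }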
+def : Pat<(and (vector_extract (v16i8 V128:$Rn), VectorIndexB:$idx), + (i32 0xff)), + (i32 (UMOVvi8 V128:$Rn, VectorIndexB:$idx))>; +def : Pat<(and (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx), + (i32 0xffff)), + (i32 (UMOVvi16 V128:$Rn, VectorIndexH:$idx))>; + +defm INS : SIMDIns; + +def : Pat<(v16i8 (scalar_to_vector GPR32:$Rn)), + (SUBREG_TO_REG (i32 0), + (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>; +def : Pat<(v8i8 (scalar_to_vector GPR32:$Rn)), + (SUBREG_TO_REG (i32 0), + (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>; + +def : Pat<(v8i16 (scalar_to_vector GPR32:$Rn)), + (SUBREG_TO_REG (i32 0), + (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>; +def : Pat<(v4i16 (scalar_to_vector GPR32:$Rn)), + (SUBREG_TO_REG (i32 0), + (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>; + +def : Pat<(v2i32 (scalar_to_vector (i32 FPR32:$Rn))), + (v2i32 (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)), + (i32 FPR32:$Rn), ssub))>; +def : Pat<(v4i32 (scalar_to_vector (i32 FPR32:$Rn))), + (v4i32 (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), + (i32 FPR32:$Rn), ssub))>; +def : Pat<(v2i64 (scalar_to_vector (i64 FPR64:$Rn))), + (v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), + (i64 FPR64:$Rn), dsub))>; + +def : Pat<(v4f32 (scalar_to_vector (f32 FPR32:$Rn))), + (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>; +def : Pat<(v2f32 (scalar_to_vector (f32 FPR32:$Rn))), + (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>; +def : Pat<(v2f64 (scalar_to_vector (f64 FPR64:$Rn))), + (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$Rn, dsub)>; + +def : Pat<(v2f32 (vector_insert (v2f32 V64:$Rn), + (f32 FPR32:$Rm), (i64 VectorIndexS:$imm))), + (EXTRACT_SUBREG + (INSvi32lane + (v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), V64:$Rn, dsub)), + VectorIndexS:$imm, + (v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rm, ssub)), + (i64 0)), + dsub)>; +def : Pat<(v4f32 (vector_insert (v4f32 V128:$Rn), + (f32 FPR32:$Rm), (i64 VectorIndexS:$imm))), + (INSvi32lane + V128:$Rn, VectorIndexS:$imm, + (v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rm, ssub)), + (i64 0))>; +def : Pat<(v2f64 (vector_insert (v2f64 V128:$Rn), + (f64 FPR64:$Rm), (i64 VectorIndexD:$imm))), + (INSvi64lane + V128:$Rn, VectorIndexD:$imm, + (v2f64 (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$Rm, dsub)), + (i64 0))>; + +// Copy an element at a constant index in one vector into a constant indexed +// element of another. 
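[Editorial note: the lane-to-lane copy described above corresponds to the ACLE vcopy intrinsics. A minimal sketch, assuming <arm_neon.h>; the function name is made up:]

    #include <arm_neon.h>

    /* Copying lane 2 of src into lane 0 of dst; with the vcopy_lane patterns
       below this should become a single "ins v0.s[0], v1.s[2]". */
    uint32x4_t copy_lane2_to_lane0(uint32x4_t dst, uint32x4_t src) {
        return vcopyq_laneq_u32(dst, 0, src, 2);
    }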
+// FIXME refactor to a shared class/dev parameterized on vector type, vector +// index type and INS extension +def : Pat<(v16i8 (int_aarch64_neon_vcopy_lane + (v16i8 V128:$Vd), VectorIndexB:$idx, (v16i8 V128:$Vs), + VectorIndexB:$idx2)), + (v16i8 (INSvi8lane + V128:$Vd, VectorIndexB:$idx, V128:$Vs, VectorIndexB:$idx2) + )>; +def : Pat<(v8i16 (int_aarch64_neon_vcopy_lane + (v8i16 V128:$Vd), VectorIndexH:$idx, (v8i16 V128:$Vs), + VectorIndexH:$idx2)), + (v8i16 (INSvi16lane + V128:$Vd, VectorIndexH:$idx, V128:$Vs, VectorIndexH:$idx2) + )>; +def : Pat<(v4i32 (int_aarch64_neon_vcopy_lane + (v4i32 V128:$Vd), VectorIndexS:$idx, (v4i32 V128:$Vs), + VectorIndexS:$idx2)), + (v4i32 (INSvi32lane + V128:$Vd, VectorIndexS:$idx, V128:$Vs, VectorIndexS:$idx2) + )>; +def : Pat<(v2i64 (int_aarch64_neon_vcopy_lane + (v2i64 V128:$Vd), VectorIndexD:$idx, (v2i64 V128:$Vs), + VectorIndexD:$idx2)), + (v2i64 (INSvi64lane + V128:$Vd, VectorIndexD:$idx, V128:$Vs, VectorIndexD:$idx2) + )>; + +multiclass Neon_INS_elt_pattern { + def : Pat<(VT128 (vector_insert V128:$src, + (VTScal (vector_extract (VT128 V128:$Rn), imm:$Immn)), + imm:$Immd)), + (INS V128:$src, imm:$Immd, V128:$Rn, imm:$Immn)>; + + def : Pat<(VT128 (vector_insert V128:$src, + (VTScal (vector_extract (VT64 V64:$Rn), imm:$Immn)), + imm:$Immd)), + (INS V128:$src, imm:$Immd, + (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), imm:$Immn)>; + + def : Pat<(VT64 (vector_insert V64:$src, + (VTScal (vector_extract (VT128 V128:$Rn), imm:$Immn)), + imm:$Immd)), + (EXTRACT_SUBREG (INS (SUBREG_TO_REG (i64 0), V64:$src, dsub), + imm:$Immd, V128:$Rn, imm:$Immn), + dsub)>; + + def : Pat<(VT64 (vector_insert V64:$src, + (VTScal (vector_extract (VT64 V64:$Rn), imm:$Immn)), + imm:$Immd)), + (EXTRACT_SUBREG + (INS (SUBREG_TO_REG (i64 0), V64:$src, dsub), imm:$Immd, + (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), imm:$Immn), + dsub)>; +} + +defm : Neon_INS_elt_pattern; +defm : Neon_INS_elt_pattern; +defm : Neon_INS_elt_pattern; +defm : Neon_INS_elt_pattern; +defm : Neon_INS_elt_pattern; +defm : Neon_INS_elt_pattern; + + +// Floating point vector extractions are codegen'd as either a sequence of +// subregister extractions, possibly fed by an INS if the lane number is +// anything other than zero. +def : Pat<(vector_extract (v2f64 V128:$Rn), 0), + (f64 (EXTRACT_SUBREG V128:$Rn, dsub))>; +def : Pat<(vector_extract (v4f32 V128:$Rn), 0), + (f32 (EXTRACT_SUBREG V128:$Rn, ssub))>; +def : Pat<(vector_extract (v2f64 V128:$Rn), VectorIndexD:$idx), + (f64 (EXTRACT_SUBREG + (INSvi64lane (v2f64 (IMPLICIT_DEF)), 0, + V128:$Rn, VectorIndexD:$idx), + dsub))>; +def : Pat<(vector_extract (v4f32 V128:$Rn), VectorIndexS:$idx), + (f32 (EXTRACT_SUBREG + (INSvi32lane (v4f32 (IMPLICIT_DEF)), 0, + V128:$Rn, VectorIndexS:$idx), + ssub))>; + +// All concat_vectors operations are canonicalised to act on i64 vectors for +// AArch64. In the general case we need an instruction, which had just as well be +// INS. 
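To make the canonicalisation note above concrete (the ConcatPat/ConcatUndefPat classes it refers to follow below), a hedged C++ sketch; the expected instruction is indicative only:

    #include <arm_neon.h>

    // Joining two 64-bit halves: expected to lower to one "mov v0.d[1], v1.d[0]"
    // (INSvi64lane), since concat_vectors is treated as acting on i64 lanes.
    int32x4_t join(int32x2_t lo, int32x2_t hi) { return vcombine_s32(lo, hi); }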
+class ConcatPat + : Pat<(DstTy (concat_vectors (SrcTy V64:$Rd), V64:$Rn)), + (INSvi64lane (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), 1, + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub), 0)>; + +def : ConcatPat; +def : ConcatPat; +def : ConcatPat; +def : ConcatPat; +def : ConcatPat; +def : ConcatPat; + +// If the high lanes are undef, though, we can just ignore them: +class ConcatUndefPat + : Pat<(DstTy (concat_vectors (SrcTy V64:$Rn), undef)), + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub)>; + +def : ConcatUndefPat; +def : ConcatUndefPat; +def : ConcatUndefPat; +def : ConcatUndefPat; +def : ConcatUndefPat; +def : ConcatUndefPat; + +//---------------------------------------------------------------------------- +// AdvSIMD across lanes instructions +//---------------------------------------------------------------------------- + +defm ADDV : SIMDAcrossLanesBHS<0, 0b11011, "addv">; +defm SMAXV : SIMDAcrossLanesBHS<0, 0b01010, "smaxv">; +defm SMINV : SIMDAcrossLanesBHS<0, 0b11010, "sminv">; +defm UMAXV : SIMDAcrossLanesBHS<1, 0b01010, "umaxv">; +defm UMINV : SIMDAcrossLanesBHS<1, 0b11010, "uminv">; +defm SADDLV : SIMDAcrossLanesHSD<0, 0b00011, "saddlv">; +defm UADDLV : SIMDAcrossLanesHSD<1, 0b00011, "uaddlv">; +defm FMAXNMV : SIMDAcrossLanesS<0b01100, 0, "fmaxnmv", int_aarch64_neon_fmaxnmv>; +defm FMAXV : SIMDAcrossLanesS<0b01111, 0, "fmaxv", int_aarch64_neon_fmaxv>; +defm FMINNMV : SIMDAcrossLanesS<0b01100, 1, "fminnmv", int_aarch64_neon_fminnmv>; +defm FMINV : SIMDAcrossLanesS<0b01111, 1, "fminv", int_aarch64_neon_fminv>; + +multiclass SIMDAcrossLanesSignedIntrinsic { +// If there is a sign extension after this intrinsic, consume it as smov already +// performed it + def : Pat<(i32 (sext_inreg (i32 (intOp (v8i8 V64:$Rn))), i8)), + (i32 (SMOVvi8to32 + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub), + (i64 0)))>; + def : Pat<(i32 (intOp (v8i8 V64:$Rn))), + (i32 (SMOVvi8to32 + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub), + (i64 0)))>; +// If there is a sign extension after this intrinsic, consume it as smov already +// performed it +def : Pat<(i32 (sext_inreg (i32 (intOp (v16i8 V128:$Rn))), i8)), + (i32 (SMOVvi8to32 + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub), + (i64 0)))>; +def : Pat<(i32 (intOp (v16i8 V128:$Rn))), + (i32 (SMOVvi8to32 + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub), + (i64 0)))>; +// If there is a sign extension after this intrinsic, consume it as smov already +// performed it +def : Pat<(i32 (sext_inreg (i32 (intOp (v4i16 V64:$Rn))), i16)), + (i32 (SMOVvi16to32 + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub), + (i64 0)))>; +def : Pat<(i32 (intOp (v4i16 V64:$Rn))), + (i32 (SMOVvi16to32 + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub), + (i64 0)))>; +// If there is a sign extension after this intrinsic, consume it as smov already +// performed it +def : Pat<(i32 (sext_inreg (i32 (intOp (v8i16 V128:$Rn))), i16)), + (i32 (SMOVvi16to32 + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub), + (i64 0)))>; +def : Pat<(i32 (intOp (v8i16 V128:$Rn))), + (i32 (SMOVvi16to32 + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub), + (i64 0)))>; + +def : Pat<(i32 (intOp (v4i32 V128:$Rn))), + (i32 
(EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v4i32v")) V128:$Rn), ssub), + ssub))>; +} + +multiclass SIMDAcrossLanesUnsignedIntrinsic { +// If there is a masking operation keeping only what has been actually +// generated, consume it. + def : Pat<(i32 (and (i32 (intOp (v8i8 V64:$Rn))), maski8_or_more)), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub), + ssub))>; + def : Pat<(i32 (intOp (v8i8 V64:$Rn))), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub), + ssub))>; +// If there is a masking operation keeping only what has been actually +// generated, consume it. +def : Pat<(i32 (and (i32 (intOp (v16i8 V128:$Rn))), maski8_or_more)), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub), + ssub))>; +def : Pat<(i32 (intOp (v16i8 V128:$Rn))), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub), + ssub))>; + +// If there is a masking operation keeping only what has been actually +// generated, consume it. +def : Pat<(i32 (and (i32 (intOp (v4i16 V64:$Rn))), maski16_or_more)), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub), + ssub))>; +def : Pat<(i32 (intOp (v4i16 V64:$Rn))), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub), + ssub))>; +// If there is a masking operation keeping only what has been actually +// generated, consume it. +def : Pat<(i32 (and (i32 (intOp (v8i16 V128:$Rn))), maski16_or_more)), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub), + ssub))>; +def : Pat<(i32 (intOp (v8i16 V128:$Rn))), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub), + ssub))>; + +def : Pat<(i32 (intOp (v4i32 V128:$Rn))), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v4i32v")) V128:$Rn), ssub), + ssub))>; + +} + +multiclass SIMDAcrossLanesSignedLongIntrinsic { + def : Pat<(i32 (intOp (v8i8 V64:$Rn))), + (i32 (SMOVvi16to32 + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v8i8v")) V64:$Rn), hsub), + (i64 0)))>; +def : Pat<(i32 (intOp (v16i8 V128:$Rn))), + (i32 (SMOVvi16to32 + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v16i8v")) V128:$Rn), hsub), + (i64 0)))>; + +def : Pat<(i32 (intOp (v4i16 V64:$Rn))), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v4i16v")) V64:$Rn), ssub), + ssub))>; +def : Pat<(i32 (intOp (v8i16 V128:$Rn))), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v8i16v")) V128:$Rn), ssub), + ssub))>; + +def : Pat<(i64 (intOp (v4i32 V128:$Rn))), + (i64 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v4i32v")) V128:$Rn), dsub), + dsub))>; +} + +multiclass SIMDAcrossLanesUnsignedLongIntrinsic { + def : Pat<(i32 (intOp (v8i8 V64:$Rn))), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v8i8v")) V64:$Rn), hsub), + ssub))>; +def : Pat<(i32 (intOp (v16i8 V128:$Rn))), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + 
(!cast(!strconcat(baseOpc, "v16i8v")) V128:$Rn), hsub), + ssub))>; + +def : Pat<(i32 (intOp (v4i16 V64:$Rn))), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v4i16v")) V64:$Rn), ssub), + ssub))>; +def : Pat<(i32 (intOp (v8i16 V128:$Rn))), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v8i16v")) V128:$Rn), ssub), + ssub))>; + +def : Pat<(i64 (intOp (v4i32 V128:$Rn))), + (i64 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v4i32v")) V128:$Rn), dsub), + dsub))>; +} + +defm : SIMDAcrossLanesSignedIntrinsic<"ADDV", int_aarch64_neon_saddv>; +// vaddv_[su]32 is special; -> ADDP Vd.2S,Vn.2S,Vm.2S; return Vd.s[0];Vn==Vm +def : Pat<(i32 (int_aarch64_neon_saddv (v2i32 V64:$Rn))), + (EXTRACT_SUBREG (ADDPv2i32 V64:$Rn, V64:$Rn), ssub)>; + +defm : SIMDAcrossLanesUnsignedIntrinsic<"ADDV", int_aarch64_neon_uaddv>; +// vaddv_[su]32 is special; -> ADDP Vd.2S,Vn.2S,Vm.2S; return Vd.s[0];Vn==Vm +def : Pat<(i32 (int_aarch64_neon_uaddv (v2i32 V64:$Rn))), + (EXTRACT_SUBREG (ADDPv2i32 V64:$Rn, V64:$Rn), ssub)>; + +defm : SIMDAcrossLanesSignedIntrinsic<"SMAXV", int_aarch64_neon_smaxv>; +def : Pat<(i32 (int_aarch64_neon_smaxv (v2i32 V64:$Rn))), + (EXTRACT_SUBREG (SMAXPv2i32 V64:$Rn, V64:$Rn), ssub)>; + +defm : SIMDAcrossLanesSignedIntrinsic<"SMINV", int_aarch64_neon_sminv>; +def : Pat<(i32 (int_aarch64_neon_sminv (v2i32 V64:$Rn))), + (EXTRACT_SUBREG (SMINPv2i32 V64:$Rn, V64:$Rn), ssub)>; + +defm : SIMDAcrossLanesUnsignedIntrinsic<"UMAXV", int_aarch64_neon_umaxv>; +def : Pat<(i32 (int_aarch64_neon_umaxv (v2i32 V64:$Rn))), + (EXTRACT_SUBREG (UMAXPv2i32 V64:$Rn, V64:$Rn), ssub)>; + +defm : SIMDAcrossLanesUnsignedIntrinsic<"UMINV", int_aarch64_neon_uminv>; +def : Pat<(i32 (int_aarch64_neon_uminv (v2i32 V64:$Rn))), + (EXTRACT_SUBREG (UMINPv2i32 V64:$Rn, V64:$Rn), ssub)>; + +defm : SIMDAcrossLanesSignedLongIntrinsic<"SADDLV", int_aarch64_neon_saddlv>; +defm : SIMDAcrossLanesUnsignedLongIntrinsic<"UADDLV", int_aarch64_neon_uaddlv>; + +// The vaddlv_s32 intrinsic gets mapped to SADDLP. +def : Pat<(i64 (int_aarch64_neon_saddlv (v2i32 V64:$Rn))), + (i64 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (SADDLPv2i32_v1i64 V64:$Rn), dsub), + dsub))>; +// The vaddlv_u32 intrinsic gets mapped to UADDLP. 
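A short C++/ACLE illustration of the two-lane special cases called out above (the UADDLV pattern itself follows below); names and exact codegen are indicative only:

    #include <arm_neon.h>
    #include <cstdint>

    // Only two lanes, so a pairwise "addp v0.2s, v0.2s, v0.2s" plus a scalar
    // move is expected rather than a full across-lanes ADDV.
    int32_t sum_2s(int32x2_t v) { return vaddv_s32(v); }

    // Long form: expected to map onto UADDLP (pairwise long add).
    uint64_t sumlong_2s(uint32x2_t v) { return vaddlv_u32(v); }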
+def : Pat<(i64 (int_aarch64_neon_uaddlv (v2i32 V64:$Rn))), + (i64 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (UADDLPv2i32_v1i64 V64:$Rn), dsub), + dsub))>; + +//------------------------------------------------------------------------------ +// AdvSIMD modified immediate instructions +//------------------------------------------------------------------------------ + +// AdvSIMD BIC +defm BIC : SIMDModifiedImmVectorShiftTied<1, 0b11, 0b01, "bic", AArch64bici>; +// AdvSIMD ORR +defm ORR : SIMDModifiedImmVectorShiftTied<0, 0b11, 0b01, "orr", AArch64orri>; + +def : InstAlias<"bic $Vd.4h, $imm", (BICv4i16 V64:$Vd, imm0_255:$imm, 0)>; +def : InstAlias<"bic $Vd.8h, $imm", (BICv8i16 V128:$Vd, imm0_255:$imm, 0)>; +def : InstAlias<"bic $Vd.2s, $imm", (BICv2i32 V64:$Vd, imm0_255:$imm, 0)>; +def : InstAlias<"bic $Vd.4s, $imm", (BICv4i32 V128:$Vd, imm0_255:$imm, 0)>; + +def : InstAlias<"bic.4h $Vd, $imm", (BICv4i16 V64:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"bic.8h $Vd, $imm", (BICv8i16 V128:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"bic.2s $Vd, $imm", (BICv2i32 V64:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"bic.4s $Vd, $imm", (BICv4i32 V128:$Vd, imm0_255:$imm, 0), 0>; + +def : InstAlias<"orr $Vd.4h, $imm", (ORRv4i16 V64:$Vd, imm0_255:$imm, 0)>; +def : InstAlias<"orr $Vd.8h, $imm", (ORRv8i16 V128:$Vd, imm0_255:$imm, 0)>; +def : InstAlias<"orr $Vd.2s, $imm", (ORRv2i32 V64:$Vd, imm0_255:$imm, 0)>; +def : InstAlias<"orr $Vd.4s, $imm", (ORRv4i32 V128:$Vd, imm0_255:$imm, 0)>; + +def : InstAlias<"orr.4h $Vd, $imm", (ORRv4i16 V64:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"orr.8h $Vd, $imm", (ORRv8i16 V128:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"orr.2s $Vd, $imm", (ORRv2i32 V64:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"orr.4s $Vd, $imm", (ORRv4i32 V128:$Vd, imm0_255:$imm, 0), 0>; + +// AdvSIMD FMOV +def FMOVv2f64_ns : SIMDModifiedImmVectorNoShift<1, 1, 0b1111, V128, fpimm8, + "fmov", ".2d", + [(set (v2f64 V128:$Rd), (AArch64fmov imm0_255:$imm8))]>; +def FMOVv2f32_ns : SIMDModifiedImmVectorNoShift<0, 0, 0b1111, V64, fpimm8, + "fmov", ".2s", + [(set (v2f32 V64:$Rd), (AArch64fmov imm0_255:$imm8))]>; +def FMOVv4f32_ns : SIMDModifiedImmVectorNoShift<1, 0, 0b1111, V128, fpimm8, + "fmov", ".4s", + [(set (v4f32 V128:$Rd), (AArch64fmov imm0_255:$imm8))]>; + +// AdvSIMD MOVI + +// EDIT byte mask: scalar +let isReMaterializable = 1, isAsCheapAsAMove = 1 in +def MOVID : SIMDModifiedImmScalarNoShift<0, 1, 0b1110, "movi", + [(set FPR64:$Rd, simdimmtype10:$imm8)]>; +// The movi_edit node has the immediate value already encoded, so we use +// a plain imm0_255 here. +def : Pat<(f64 (AArch64movi_edit imm0_255:$shift)), + (MOVID imm0_255:$shift)>; + +def : Pat<(v1i64 immAllZerosV), (MOVID (i32 0))>; +def : Pat<(v2i32 immAllZerosV), (MOVID (i32 0))>; +def : Pat<(v4i16 immAllZerosV), (MOVID (i32 0))>; +def : Pat<(v8i8 immAllZerosV), (MOVID (i32 0))>; + +def : Pat<(v1i64 immAllOnesV), (MOVID (i32 255))>; +def : Pat<(v2i32 immAllOnesV), (MOVID (i32 255))>; +def : Pat<(v4i16 immAllOnesV), (MOVID (i32 255))>; +def : Pat<(v8i8 immAllOnesV), (MOVID (i32 255))>; + +// EDIT byte mask: 2d + +// The movi_edit node has the immediate value already encoded, so we use +// a plain imm0_255 in the pattern +let isReMaterializable = 1, isAsCheapAsAMove = 1 in +def MOVIv2d_ns : SIMDModifiedImmVectorNoShift<1, 1, 0b1110, V128, + simdimmtype10, + "movi", ".2d", + [(set (v2i64 V128:$Rd), (AArch64movi_edit imm0_255:$imm8))]>; + + +// Use movi.2d to materialize 0.0 if the HW does zero-cycle zeroing. 
+// Complexity is added to break a tie with a plain MOVI. +let AddedComplexity = 1 in { +def : Pat<(f32 fpimm0), + (f32 (EXTRACT_SUBREG (v2i64 (MOVIv2d_ns (i32 0))), ssub))>, + Requires<[HasZCZ]>; +def : Pat<(f64 fpimm0), + (f64 (EXTRACT_SUBREG (v2i64 (MOVIv2d_ns (i32 0))), dsub))>, + Requires<[HasZCZ]>; +} + +def : Pat<(v2i64 immAllZerosV), (MOVIv2d_ns (i32 0))>; +def : Pat<(v4i32 immAllZerosV), (MOVIv2d_ns (i32 0))>; +def : Pat<(v8i16 immAllZerosV), (MOVIv2d_ns (i32 0))>; +def : Pat<(v16i8 immAllZerosV), (MOVIv2d_ns (i32 0))>; + +def : Pat<(v2i64 immAllOnesV), (MOVIv2d_ns (i32 255))>; +def : Pat<(v4i32 immAllOnesV), (MOVIv2d_ns (i32 255))>; +def : Pat<(v8i16 immAllOnesV), (MOVIv2d_ns (i32 255))>; +def : Pat<(v16i8 immAllOnesV), (MOVIv2d_ns (i32 255))>; + +def : Pat<(v2f64 (AArch64dup (f64 fpimm0))), (MOVIv2d_ns (i32 0))>; +def : Pat<(v4f32 (AArch64dup (f32 fpimm0))), (MOVIv2d_ns (i32 0))>; + +// EDIT per word & halfword: 2s, 4h, 4s, & 8h +defm MOVI : SIMDModifiedImmVectorShift<0, 0b10, 0b00, "movi">; + +def : InstAlias<"movi $Vd.4h, $imm", (MOVIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"movi $Vd.8h, $imm", (MOVIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"movi $Vd.2s, $imm", (MOVIv2i32 V64:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"movi $Vd.4s, $imm", (MOVIv4i32 V128:$Vd, imm0_255:$imm, 0), 0>; + +def : InstAlias<"movi.4h $Vd, $imm", (MOVIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"movi.8h $Vd, $imm", (MOVIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"movi.2s $Vd, $imm", (MOVIv2i32 V64:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"movi.4s $Vd, $imm", (MOVIv4i32 V128:$Vd, imm0_255:$imm, 0), 0>; + +def : Pat<(v2i32 (AArch64movi_shift imm0_255:$imm8, (i32 imm:$shift))), + (MOVIv2i32 imm0_255:$imm8, imm:$shift)>; +def : Pat<(v4i32 (AArch64movi_shift imm0_255:$imm8, (i32 imm:$shift))), + (MOVIv4i32 imm0_255:$imm8, imm:$shift)>; +def : Pat<(v4i16 (AArch64movi_shift imm0_255:$imm8, (i32 imm:$shift))), + (MOVIv4i16 imm0_255:$imm8, imm:$shift)>; +def : Pat<(v8i16 (AArch64movi_shift imm0_255:$imm8, (i32 imm:$shift))), + (MOVIv8i16 imm0_255:$imm8, imm:$shift)>; + +// EDIT per word: 2s & 4s with MSL shifter +def MOVIv2s_msl : SIMDModifiedImmMoveMSL<0, 0, {1,1,0,?}, V64, "movi", ".2s", + [(set (v2i32 V64:$Rd), + (AArch64movi_msl imm0_255:$imm8, (i32 imm:$shift)))]>; +def MOVIv4s_msl : SIMDModifiedImmMoveMSL<1, 0, {1,1,0,?}, V128, "movi", ".4s", + [(set (v4i32 V128:$Rd), + (AArch64movi_msl imm0_255:$imm8, (i32 imm:$shift)))]>; + +// Per byte: 8b & 16b +def MOVIv8b_ns : SIMDModifiedImmVectorNoShift<0, 0, 0b1110, V64, imm0_255, + "movi", ".8b", + [(set (v8i8 V64:$Rd), (AArch64movi imm0_255:$imm8))]>; +def MOVIv16b_ns : SIMDModifiedImmVectorNoShift<1, 0, 0b1110, V128, imm0_255, + "movi", ".16b", + [(set (v16i8 V128:$Rd), (AArch64movi imm0_255:$imm8))]>; + +// AdvSIMD MVNI + +// EDIT per word & halfword: 2s, 4h, 4s, & 8h +defm MVNI : SIMDModifiedImmVectorShift<1, 0b10, 0b00, "mvni">; + +def : InstAlias<"mvni $Vd.4h, $imm", (MVNIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"mvni $Vd.8h, $imm", (MVNIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"mvni $Vd.2s, $imm", (MVNIv2i32 V64:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"mvni $Vd.4s, $imm", (MVNIv4i32 V128:$Vd, imm0_255:$imm, 0), 0>; + +def : InstAlias<"mvni.4h $Vd, $imm", (MVNIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"mvni.8h $Vd, $imm", (MVNIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"mvni.2s $Vd, $imm", (MVNIv2i32 V64:$Vd, imm0_255:$imm, 0), 0>; +def : 
InstAlias<"mvni.4s $Vd, $imm", (MVNIv4i32 V128:$Vd, imm0_255:$imm, 0), 0>; + +def : Pat<(v2i32 (AArch64mvni_shift imm0_255:$imm8, (i32 imm:$shift))), + (MVNIv2i32 imm0_255:$imm8, imm:$shift)>; +def : Pat<(v4i32 (AArch64mvni_shift imm0_255:$imm8, (i32 imm:$shift))), + (MVNIv4i32 imm0_255:$imm8, imm:$shift)>; +def : Pat<(v4i16 (AArch64mvni_shift imm0_255:$imm8, (i32 imm:$shift))), + (MVNIv4i16 imm0_255:$imm8, imm:$shift)>; +def : Pat<(v8i16 (AArch64mvni_shift imm0_255:$imm8, (i32 imm:$shift))), + (MVNIv8i16 imm0_255:$imm8, imm:$shift)>; + +// EDIT per word: 2s & 4s with MSL shifter +def MVNIv2s_msl : SIMDModifiedImmMoveMSL<0, 1, {1,1,0,?}, V64, "mvni", ".2s", + [(set (v2i32 V64:$Rd), + (AArch64mvni_msl imm0_255:$imm8, (i32 imm:$shift)))]>; +def MVNIv4s_msl : SIMDModifiedImmMoveMSL<1, 1, {1,1,0,?}, V128, "mvni", ".4s", + [(set (v4i32 V128:$Rd), + (AArch64mvni_msl imm0_255:$imm8, (i32 imm:$shift)))]>; + +//---------------------------------------------------------------------------- +// AdvSIMD indexed element +//---------------------------------------------------------------------------- + +let neverHasSideEffects = 1 in { + defm FMLA : SIMDFPIndexedSDTied<0, 0b0001, "fmla">; + defm FMLS : SIMDFPIndexedSDTied<0, 0b0101, "fmls">; +} + +// NOTE: Operands are reordered in the FMLA/FMLS PatFrags because the +// instruction expects the addend first, while the intrinsic expects it last. + +// On the other hand, there are quite a few valid combinatorial options due to +// the commutativity of multiplication and the fact that (-x) * y = x * (-y). +defm : SIMDFPIndexedSDTiedPatterns<"FMLA", + TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)>>; +defm : SIMDFPIndexedSDTiedPatterns<"FMLA", + TriOpFrag<(fma node:$MHS, node:$RHS, node:$LHS)>>; + +defm : SIMDFPIndexedSDTiedPatterns<"FMLS", + TriOpFrag<(fma node:$MHS, (fneg node:$RHS), node:$LHS)> >; +defm : SIMDFPIndexedSDTiedPatterns<"FMLS", + TriOpFrag<(fma node:$RHS, (fneg node:$MHS), node:$LHS)> >; +defm : SIMDFPIndexedSDTiedPatterns<"FMLS", + TriOpFrag<(fma (fneg node:$RHS), node:$MHS, node:$LHS)> >; +defm : SIMDFPIndexedSDTiedPatterns<"FMLS", + TriOpFrag<(fma (fneg node:$MHS), node:$RHS, node:$LHS)> >; + +multiclass FMLSIndexedAfterNegPatterns { + // 3 variants for the .2s version: DUPLANE from 128-bit, DUPLANE from 64-bit + // and DUP scalar. + def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), + (AArch64duplane32 (v4f32 (fneg V128:$Rm)), + VectorIndexS:$idx))), + (FMLSv2i32_indexed V64:$Rd, V64:$Rn, V128:$Rm, VectorIndexS:$idx)>; + def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), + (v2f32 (AArch64duplane32 + (v4f32 (insert_subvector undef, + (v2f32 (fneg V64:$Rm)), + (i32 0))), + VectorIndexS:$idx)))), + (FMLSv2i32_indexed V64:$Rd, V64:$Rn, + (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), + VectorIndexS:$idx)>; + def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), + (AArch64dup (f32 (fneg FPR32Op:$Rm))))), + (FMLSv2i32_indexed V64:$Rd, V64:$Rn, + (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>; + + // 3 variants for the .4s version: DUPLANE from 128-bit, DUPLANE from 64-bit + // and DUP scalar. 
+ def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), + (AArch64duplane32 (v4f32 (fneg V128:$Rm)), + VectorIndexS:$idx))), + (FMLSv4i32_indexed V128:$Rd, V128:$Rn, V128:$Rm, + VectorIndexS:$idx)>; + def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), + (v4f32 (AArch64duplane32 + (v4f32 (insert_subvector undef, + (v2f32 (fneg V64:$Rm)), + (i32 0))), + VectorIndexS:$idx)))), + (FMLSv4i32_indexed V128:$Rd, V128:$Rn, + (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), + VectorIndexS:$idx)>; + def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), + (AArch64dup (f32 (fneg FPR32Op:$Rm))))), + (FMLSv4i32_indexed V128:$Rd, V128:$Rn, + (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>; + + // 2 variants for the .2d version: DUPLANE from 128-bit, and DUP scalar + // (DUPLANE from 64-bit would be trivial). + def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), + (AArch64duplane64 (v2f64 (fneg V128:$Rm)), + VectorIndexD:$idx))), + (FMLSv2i64_indexed + V128:$Rd, V128:$Rn, V128:$Rm, VectorIndexS:$idx)>; + def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), + (AArch64dup (f64 (fneg FPR64Op:$Rm))))), + (FMLSv2i64_indexed V128:$Rd, V128:$Rn, + (SUBREG_TO_REG (i32 0), FPR64Op:$Rm, dsub), (i64 0))>; + + // 2 variants for 32-bit scalar version: extract from .2s or from .4s + def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn), + (vector_extract (v4f32 (fneg V128:$Rm)), + VectorIndexS:$idx))), + (FMLSv1i32_indexed FPR32:$Rd, FPR32:$Rn, + V128:$Rm, VectorIndexS:$idx)>; + def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn), + (vector_extract (v2f32 (fneg V64:$Rm)), + VectorIndexS:$idx))), + (FMLSv1i32_indexed FPR32:$Rd, FPR32:$Rn, + (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), VectorIndexS:$idx)>; + + // 1 variant for 64-bit scalar version: extract from .1d or from .2d + def : Pat<(f64 (OpNode (f64 FPR64:$Rd), (f64 FPR64:$Rn), + (vector_extract (v2f64 (fneg V128:$Rm)), + VectorIndexS:$idx))), + (FMLSv1i64_indexed FPR64:$Rd, FPR64:$Rn, + V128:$Rm, VectorIndexS:$idx)>; +} + +defm : FMLSIndexedAfterNegPatterns< + TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)> >; +defm : FMLSIndexedAfterNegPatterns< + TriOpFrag<(fma node:$MHS, node:$RHS, node:$LHS)> >; + +defm FMULX : SIMDFPIndexedSD<1, 0b1001, "fmulx", int_aarch64_neon_fmulx>; +defm FMUL : SIMDFPIndexedSD<0, 0b1001, "fmul", fmul>; + +def : Pat<(v2f32 (fmul V64:$Rn, (AArch64dup (f32 FPR32:$Rm)))), + (FMULv2i32_indexed V64:$Rn, + (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rm, ssub), + (i64 0))>; +def : Pat<(v4f32 (fmul V128:$Rn, (AArch64dup (f32 FPR32:$Rm)))), + (FMULv4i32_indexed V128:$Rn, + (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rm, ssub), + (i64 0))>; +def : Pat<(v2f64 (fmul V128:$Rn, (AArch64dup (f64 FPR64:$Rm)))), + (FMULv2i64_indexed V128:$Rn, + (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$Rm, dsub), + (i64 0))>; + +defm SQDMULH : SIMDIndexedHS<0, 0b1100, "sqdmulh", int_aarch64_neon_sqdmulh>; +defm SQRDMULH : SIMDIndexedHS<0, 0b1101, "sqrdmulh", int_aarch64_neon_sqrdmulh>; +defm MLA : SIMDVectorIndexedHSTied<1, 0b0000, "mla", + TriOpFrag<(add node:$LHS, (mul node:$MHS, node:$RHS))>>; +defm MLS : SIMDVectorIndexedHSTied<1, 0b0100, "mls", + TriOpFrag<(sub node:$LHS, (mul node:$MHS, node:$RHS))>>; +defm MUL : SIMDVectorIndexedHS<0, 0b1000, "mul", mul>; +defm SMLAL : SIMDVectorIndexedLongSDTied<0, 0b0010, "smlal", + TriOpFrag<(add node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>; +defm SMLSL : SIMDVectorIndexedLongSDTied<0, 0b0110, "smlsl", + TriOpFrag<(sub node:$LHS, (int_aarch64_neon_smull node:$MHS, 
node:$RHS))>>; +defm SMULL : SIMDVectorIndexedLongSD<0, 0b1010, "smull", + int_aarch64_neon_smull>; +defm SQDMLAL : SIMDIndexedLongSQDMLXSDTied<0, 0b0011, "sqdmlal", + int_aarch64_neon_sqadd>; +defm SQDMLSL : SIMDIndexedLongSQDMLXSDTied<0, 0b0111, "sqdmlsl", + int_aarch64_neon_sqsub>; +defm SQDMULL : SIMDIndexedLongSD<0, 0b1011, "sqdmull", int_aarch64_neon_sqdmull>; +defm UMLAL : SIMDVectorIndexedLongSDTied<1, 0b0010, "umlal", + TriOpFrag<(add node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>; +defm UMLSL : SIMDVectorIndexedLongSDTied<1, 0b0110, "umlsl", + TriOpFrag<(sub node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>; +defm UMULL : SIMDVectorIndexedLongSD<1, 0b1010, "umull", + int_aarch64_neon_umull>; + +// A scalar sqdmull with the second operand being a vector lane can be +// handled directly with the indexed instruction encoding. +def : Pat<(int_aarch64_neon_sqdmulls_scalar (i32 FPR32:$Rn), + (vector_extract (v4i32 V128:$Vm), + VectorIndexS:$idx)), + (SQDMULLv1i64_indexed FPR32:$Rn, V128:$Vm, VectorIndexS:$idx)>; + +//---------------------------------------------------------------------------- +// AdvSIMD scalar shift instructions +//---------------------------------------------------------------------------- +defm FCVTZS : SIMDScalarRShiftSD<0, 0b11111, "fcvtzs">; +defm FCVTZU : SIMDScalarRShiftSD<1, 0b11111, "fcvtzu">; +defm SCVTF : SIMDScalarRShiftSD<0, 0b11100, "scvtf">; +defm UCVTF : SIMDScalarRShiftSD<1, 0b11100, "ucvtf">; +// Codegen patterns for the above. We don't put these directly on the +// instructions because TableGen's type inference can't handle the truth. +// Having the same base pattern for fp <--> int totally freaks it out. +def : Pat<(int_aarch64_neon_vcvtfp2fxs FPR32:$Rn, vecshiftR32:$imm), + (FCVTZSs FPR32:$Rn, vecshiftR32:$imm)>; +def : Pat<(int_aarch64_neon_vcvtfp2fxu FPR32:$Rn, vecshiftR32:$imm), + (FCVTZUs FPR32:$Rn, vecshiftR32:$imm)>; +def : Pat<(i64 (int_aarch64_neon_vcvtfp2fxs (f64 FPR64:$Rn), vecshiftR64:$imm)), + (FCVTZSd FPR64:$Rn, vecshiftR64:$imm)>; +def : Pat<(i64 (int_aarch64_neon_vcvtfp2fxu (f64 FPR64:$Rn), vecshiftR64:$imm)), + (FCVTZUd FPR64:$Rn, vecshiftR64:$imm)>; +def : Pat<(v1i64 (int_aarch64_neon_vcvtfp2fxs (v1f64 FPR64:$Rn), + vecshiftR64:$imm)), + (FCVTZSd FPR64:$Rn, vecshiftR64:$imm)>; +def : Pat<(v1i64 (int_aarch64_neon_vcvtfp2fxu (v1f64 FPR64:$Rn), + vecshiftR64:$imm)), + (FCVTZUd FPR64:$Rn, vecshiftR64:$imm)>; +def : Pat<(int_aarch64_neon_vcvtfxs2fp FPR32:$Rn, vecshiftR32:$imm), + (SCVTFs FPR32:$Rn, vecshiftR32:$imm)>; +def : Pat<(int_aarch64_neon_vcvtfxu2fp FPR32:$Rn, vecshiftR32:$imm), + (UCVTFs FPR32:$Rn, vecshiftR32:$imm)>; +def : Pat<(f64 (int_aarch64_neon_vcvtfxs2fp (i64 FPR64:$Rn), vecshiftR64:$imm)), + (SCVTFd FPR64:$Rn, vecshiftR64:$imm)>; +def : Pat<(f64 (int_aarch64_neon_vcvtfxu2fp (i64 FPR64:$Rn), vecshiftR64:$imm)), + (UCVTFd FPR64:$Rn, vecshiftR64:$imm)>; +def : Pat<(v1f64 (int_aarch64_neon_vcvtfxs2fp (v1i64 FPR64:$Rn), + vecshiftR64:$imm)), + (SCVTFd FPR64:$Rn, vecshiftR64:$imm)>; +def : Pat<(v1f64 (int_aarch64_neon_vcvtfxu2fp (v1i64 FPR64:$Rn), + vecshiftR64:$imm)), + (UCVTFd FPR64:$Rn, vecshiftR64:$imm)>; + +defm SHL : SIMDScalarLShiftD< 0, 0b01010, "shl", AArch64vshl>; +defm SLI : SIMDScalarLShiftDTied<1, 0b01010, "sli">; +defm SQRSHRN : SIMDScalarRShiftBHS< 0, 0b10011, "sqrshrn", + int_aarch64_neon_sqrshrn>; +defm SQRSHRUN : SIMDScalarRShiftBHS< 1, 0b10001, "sqrshrun", + int_aarch64_neon_sqrshrun>; +defm SQSHLU : SIMDScalarLShiftBHSD<1, 0b01100, "sqshlu", AArch64sqshlui>; +defm SQSHL : 
SIMDScalarLShiftBHSD<0, 0b01110, "sqshl", AArch64sqshli>; +defm SQSHRN : SIMDScalarRShiftBHS< 0, 0b10010, "sqshrn", + int_aarch64_neon_sqshrn>; +defm SQSHRUN : SIMDScalarRShiftBHS< 1, 0b10000, "sqshrun", + int_aarch64_neon_sqshrun>; +defm SRI : SIMDScalarRShiftDTied< 1, 0b01000, "sri">; +defm SRSHR : SIMDScalarRShiftD< 0, 0b00100, "srshr", AArch64srshri>; +defm SRSRA : SIMDScalarRShiftDTied< 0, 0b00110, "srsra", + TriOpFrag<(add node:$LHS, + (AArch64srshri node:$MHS, node:$RHS))>>; +defm SSHR : SIMDScalarRShiftD< 0, 0b00000, "sshr", AArch64vashr>; +defm SSRA : SIMDScalarRShiftDTied< 0, 0b00010, "ssra", + TriOpFrag<(add node:$LHS, + (AArch64vashr node:$MHS, node:$RHS))>>; +defm UQRSHRN : SIMDScalarRShiftBHS< 1, 0b10011, "uqrshrn", + int_aarch64_neon_uqrshrn>; +defm UQSHL : SIMDScalarLShiftBHSD<1, 0b01110, "uqshl", AArch64uqshli>; +defm UQSHRN : SIMDScalarRShiftBHS< 1, 0b10010, "uqshrn", + int_aarch64_neon_uqshrn>; +defm URSHR : SIMDScalarRShiftD< 1, 0b00100, "urshr", AArch64urshri>; +defm URSRA : SIMDScalarRShiftDTied< 1, 0b00110, "ursra", + TriOpFrag<(add node:$LHS, + (AArch64urshri node:$MHS, node:$RHS))>>; +defm USHR : SIMDScalarRShiftD< 1, 0b00000, "ushr", AArch64vlshr>; +defm USRA : SIMDScalarRShiftDTied< 1, 0b00010, "usra", + TriOpFrag<(add node:$LHS, + (AArch64vlshr node:$MHS, node:$RHS))>>; + +//---------------------------------------------------------------------------- +// AdvSIMD vector shift instructions +//---------------------------------------------------------------------------- +defm FCVTZS:SIMDVectorRShiftSD<0, 0b11111, "fcvtzs", int_aarch64_neon_vcvtfp2fxs>; +defm FCVTZU:SIMDVectorRShiftSD<1, 0b11111, "fcvtzu", int_aarch64_neon_vcvtfp2fxu>; +defm SCVTF: SIMDVectorRShiftSDToFP<0, 0b11100, "scvtf", + int_aarch64_neon_vcvtfxs2fp>; +defm RSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10001, "rshrn", + int_aarch64_neon_rshrn>; +defm SHL : SIMDVectorLShiftBHSD<0, 0b01010, "shl", AArch64vshl>; +defm SHRN : SIMDVectorRShiftNarrowBHS<0, 0b10000, "shrn", + BinOpFrag<(trunc (AArch64vashr node:$LHS, node:$RHS))>>; +defm SLI : SIMDVectorLShiftBHSDTied<1, 0b01010, "sli", int_aarch64_neon_vsli>; +def : Pat<(v1i64 (int_aarch64_neon_vsli (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn), + (i32 vecshiftL64:$imm))), + (SLId FPR64:$Rd, FPR64:$Rn, vecshiftL64:$imm)>; +defm SQRSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10011, "sqrshrn", + int_aarch64_neon_sqrshrn>; +defm SQRSHRUN: SIMDVectorRShiftNarrowBHS<1, 0b10001, "sqrshrun", + int_aarch64_neon_sqrshrun>; +defm SQSHLU : SIMDVectorLShiftBHSD<1, 0b01100, "sqshlu", AArch64sqshlui>; +defm SQSHL : SIMDVectorLShiftBHSD<0, 0b01110, "sqshl", AArch64sqshli>; +defm SQSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10010, "sqshrn", + int_aarch64_neon_sqshrn>; +defm SQSHRUN : SIMDVectorRShiftNarrowBHS<1, 0b10000, "sqshrun", + int_aarch64_neon_sqshrun>; +defm SRI : SIMDVectorRShiftBHSDTied<1, 0b01000, "sri", int_aarch64_neon_vsri>; +def : Pat<(v1i64 (int_aarch64_neon_vsri (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn), + (i32 vecshiftR64:$imm))), + (SRId FPR64:$Rd, FPR64:$Rn, vecshiftR64:$imm)>; +defm SRSHR : SIMDVectorRShiftBHSD<0, 0b00100, "srshr", AArch64srshri>; +defm SRSRA : SIMDVectorRShiftBHSDTied<0, 0b00110, "srsra", + TriOpFrag<(add node:$LHS, + (AArch64srshri node:$MHS, node:$RHS))> >; +defm SSHLL : SIMDVectorLShiftLongBHSD<0, 0b10100, "sshll", + BinOpFrag<(AArch64vshl (sext node:$LHS), node:$RHS)>>; + +defm SSHR : SIMDVectorRShiftBHSD<0, 0b00000, "sshr", AArch64vashr>; +defm SSRA : SIMDVectorRShiftBHSDTied<0, 0b00010, "ssra", + TriOpFrag<(add node:$LHS, (AArch64vashr node:$MHS, 
node:$RHS))>>; +defm UCVTF : SIMDVectorRShiftSDToFP<1, 0b11100, "ucvtf", + int_aarch64_neon_vcvtfxu2fp>; +defm UQRSHRN : SIMDVectorRShiftNarrowBHS<1, 0b10011, "uqrshrn", + int_aarch64_neon_uqrshrn>; +defm UQSHL : SIMDVectorLShiftBHSD<1, 0b01110, "uqshl", AArch64uqshli>; +defm UQSHRN : SIMDVectorRShiftNarrowBHS<1, 0b10010, "uqshrn", + int_aarch64_neon_uqshrn>; +defm URSHR : SIMDVectorRShiftBHSD<1, 0b00100, "urshr", AArch64urshri>; +defm URSRA : SIMDVectorRShiftBHSDTied<1, 0b00110, "ursra", + TriOpFrag<(add node:$LHS, + (AArch64urshri node:$MHS, node:$RHS))> >; +defm USHLL : SIMDVectorLShiftLongBHSD<1, 0b10100, "ushll", + BinOpFrag<(AArch64vshl (zext node:$LHS), node:$RHS)>>; +defm USHR : SIMDVectorRShiftBHSD<1, 0b00000, "ushr", AArch64vlshr>; +defm USRA : SIMDVectorRShiftBHSDTied<1, 0b00010, "usra", + TriOpFrag<(add node:$LHS, (AArch64vlshr node:$MHS, node:$RHS))> >; + +// SHRN patterns for when a logical right shift was used instead of arithmetic +// (the immediate guarantees no sign bits actually end up in the result so it +// doesn't matter). +def : Pat<(v8i8 (trunc (AArch64vlshr (v8i16 V128:$Rn), vecshiftR16Narrow:$imm))), + (SHRNv8i8_shift V128:$Rn, vecshiftR16Narrow:$imm)>; +def : Pat<(v4i16 (trunc (AArch64vlshr (v4i32 V128:$Rn), vecshiftR32Narrow:$imm))), + (SHRNv4i16_shift V128:$Rn, vecshiftR32Narrow:$imm)>; +def : Pat<(v2i32 (trunc (AArch64vlshr (v2i64 V128:$Rn), vecshiftR64Narrow:$imm))), + (SHRNv2i32_shift V128:$Rn, vecshiftR64Narrow:$imm)>; + +def : Pat<(v16i8 (concat_vectors (v8i8 V64:$Rd), + (trunc (AArch64vlshr (v8i16 V128:$Rn), + vecshiftR16Narrow:$imm)))), + (SHRNv16i8_shift (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), + V128:$Rn, vecshiftR16Narrow:$imm)>; +def : Pat<(v8i16 (concat_vectors (v4i16 V64:$Rd), + (trunc (AArch64vlshr (v4i32 V128:$Rn), + vecshiftR32Narrow:$imm)))), + (SHRNv8i16_shift (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), + V128:$Rn, vecshiftR32Narrow:$imm)>; +def : Pat<(v4i32 (concat_vectors (v2i32 V64:$Rd), + (trunc (AArch64vlshr (v2i64 V128:$Rn), + vecshiftR64Narrow:$imm)))), + (SHRNv4i32_shift (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), + V128:$Rn, vecshiftR32Narrow:$imm)>; + +// Vector sign and zero extensions are implemented with SSHLL and USSHLL. +// Anyexts are implemented as zexts. +def : Pat<(v8i16 (sext (v8i8 V64:$Rn))), (SSHLLv8i8_shift V64:$Rn, (i32 0))>; +def : Pat<(v8i16 (zext (v8i8 V64:$Rn))), (USHLLv8i8_shift V64:$Rn, (i32 0))>; +def : Pat<(v8i16 (anyext (v8i8 V64:$Rn))), (USHLLv8i8_shift V64:$Rn, (i32 0))>; +def : Pat<(v4i32 (sext (v4i16 V64:$Rn))), (SSHLLv4i16_shift V64:$Rn, (i32 0))>; +def : Pat<(v4i32 (zext (v4i16 V64:$Rn))), (USHLLv4i16_shift V64:$Rn, (i32 0))>; +def : Pat<(v4i32 (anyext (v4i16 V64:$Rn))), (USHLLv4i16_shift V64:$Rn, (i32 0))>; +def : Pat<(v2i64 (sext (v2i32 V64:$Rn))), (SSHLLv2i32_shift V64:$Rn, (i32 0))>; +def : Pat<(v2i64 (zext (v2i32 V64:$Rn))), (USHLLv2i32_shift V64:$Rn, (i32 0))>; +def : Pat<(v2i64 (anyext (v2i32 V64:$Rn))), (USHLLv2i32_shift V64:$Rn, (i32 0))>; +// Also match an extend from the upper half of a 128 bit source register. 
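For reference, a hedged C++ sketch of the widening conversions these extension patterns cover (the upper-half variants mentioned in the last comment are matched just below); function names are invented and the instructions are expectations only:

    #include <arm_neon.h>

    // sext v8i8 -> v8i16: expected "sshll v0.8h, v0.8b, #0" (alias sxtl).
    int16x8_t widen_lo(int8x8_t v) { return vmovl_s8(v); }

    // Same, but from the upper half of a 128-bit source register:
    // expected "sshll2 v0.4s, v0.8h, #0" (alias sxtl2).
    int32x4_t widen_hi(int16x8_t v) { return vmovl_high_s16(v); }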
+def : Pat<(v8i16 (anyext (v8i8 (extract_subvector V128:$Rn, (i64 8)) ))), + (USHLLv16i8_shift V128:$Rn, (i32 0))>; +def : Pat<(v8i16 (zext (v8i8 (extract_subvector V128:$Rn, (i64 8)) ))), + (USHLLv16i8_shift V128:$Rn, (i32 0))>; +def : Pat<(v8i16 (sext (v8i8 (extract_subvector V128:$Rn, (i64 8)) ))), + (SSHLLv16i8_shift V128:$Rn, (i32 0))>; +def : Pat<(v4i32 (anyext (v4i16 (extract_subvector V128:$Rn, (i64 4)) ))), + (USHLLv8i16_shift V128:$Rn, (i32 0))>; +def : Pat<(v4i32 (zext (v4i16 (extract_subvector V128:$Rn, (i64 4)) ))), + (USHLLv8i16_shift V128:$Rn, (i32 0))>; +def : Pat<(v4i32 (sext (v4i16 (extract_subvector V128:$Rn, (i64 4)) ))), + (SSHLLv8i16_shift V128:$Rn, (i32 0))>; +def : Pat<(v2i64 (anyext (v2i32 (extract_subvector V128:$Rn, (i64 2)) ))), + (USHLLv4i32_shift V128:$Rn, (i32 0))>; +def : Pat<(v2i64 (zext (v2i32 (extract_subvector V128:$Rn, (i64 2)) ))), + (USHLLv4i32_shift V128:$Rn, (i32 0))>; +def : Pat<(v2i64 (sext (v2i32 (extract_subvector V128:$Rn, (i64 2)) ))), + (SSHLLv4i32_shift V128:$Rn, (i32 0))>; + +// Vector shift sxtl aliases +def : InstAlias<"sxtl.8h $dst, $src1", + (SSHLLv8i8_shift V128:$dst, V64:$src1, 0)>; +def : InstAlias<"sxtl $dst.8h, $src1.8b", + (SSHLLv8i8_shift V128:$dst, V64:$src1, 0)>; +def : InstAlias<"sxtl.4s $dst, $src1", + (SSHLLv4i16_shift V128:$dst, V64:$src1, 0)>; +def : InstAlias<"sxtl $dst.4s, $src1.4h", + (SSHLLv4i16_shift V128:$dst, V64:$src1, 0)>; +def : InstAlias<"sxtl.2d $dst, $src1", + (SSHLLv2i32_shift V128:$dst, V64:$src1, 0)>; +def : InstAlias<"sxtl $dst.2d, $src1.2s", + (SSHLLv2i32_shift V128:$dst, V64:$src1, 0)>; + +// Vector shift sxtl2 aliases +def : InstAlias<"sxtl2.8h $dst, $src1", + (SSHLLv16i8_shift V128:$dst, V128:$src1, 0)>; +def : InstAlias<"sxtl2 $dst.8h, $src1.16b", + (SSHLLv16i8_shift V128:$dst, V128:$src1, 0)>; +def : InstAlias<"sxtl2.4s $dst, $src1", + (SSHLLv8i16_shift V128:$dst, V128:$src1, 0)>; +def : InstAlias<"sxtl2 $dst.4s, $src1.8h", + (SSHLLv8i16_shift V128:$dst, V128:$src1, 0)>; +def : InstAlias<"sxtl2.2d $dst, $src1", + (SSHLLv4i32_shift V128:$dst, V128:$src1, 0)>; +def : InstAlias<"sxtl2 $dst.2d, $src1.4s", + (SSHLLv4i32_shift V128:$dst, V128:$src1, 0)>; + +// Vector shift uxtl aliases +def : InstAlias<"uxtl.8h $dst, $src1", + (USHLLv8i8_shift V128:$dst, V64:$src1, 0)>; +def : InstAlias<"uxtl $dst.8h, $src1.8b", + (USHLLv8i8_shift V128:$dst, V64:$src1, 0)>; +def : InstAlias<"uxtl.4s $dst, $src1", + (USHLLv4i16_shift V128:$dst, V64:$src1, 0)>; +def : InstAlias<"uxtl $dst.4s, $src1.4h", + (USHLLv4i16_shift V128:$dst, V64:$src1, 0)>; +def : InstAlias<"uxtl.2d $dst, $src1", + (USHLLv2i32_shift V128:$dst, V64:$src1, 0)>; +def : InstAlias<"uxtl $dst.2d, $src1.2s", + (USHLLv2i32_shift V128:$dst, V64:$src1, 0)>; + +// Vector shift uxtl2 aliases +def : InstAlias<"uxtl2.8h $dst, $src1", + (USHLLv16i8_shift V128:$dst, V128:$src1, 0)>; +def : InstAlias<"uxtl2 $dst.8h, $src1.16b", + (USHLLv16i8_shift V128:$dst, V128:$src1, 0)>; +def : InstAlias<"uxtl2.4s $dst, $src1", + (USHLLv8i16_shift V128:$dst, V128:$src1, 0)>; +def : InstAlias<"uxtl2 $dst.4s, $src1.8h", + (USHLLv8i16_shift V128:$dst, V128:$src1, 0)>; +def : InstAlias<"uxtl2.2d $dst, $src1", + (USHLLv4i32_shift V128:$dst, V128:$src1, 0)>; +def : InstAlias<"uxtl2 $dst.2d, $src1.4s", + (USHLLv4i32_shift V128:$dst, V128:$src1, 0)>; + +// If an integer is about to be converted to a floating point value, +// just load it on the floating point unit. +// These patterns are more complex because floating point loads do not +// support sign extension. 
+// The sign extension has to be explicitly added and is only supported for
+// one step: byte-to-half, half-to-word, word-to-doubleword.
+// SCVTF GPR -> FPR is 9 cycles.
+// SCVTF FPR -> FPR is 4 cycles.
+// (sign extension with lengthen) SXTL FPR -> FPR is 2 cycles.
+// Therefore, we can do 2 sign extensions and one SCVTF FPR -> FPR
+// and still be faster.
+// However, this is not good for code size.
+// 8-bits -> float. 2 sizes step-up.
+class SExtLoadi8CVTf32Pat<dag addrmode, dag INST>
+  : Pat<(f32 (sint_to_fp (i32 (sextloadi8 addrmode)))),
+        (SCVTFv1i32 (f32 (EXTRACT_SUBREG
+                            (SSHLLv4i16_shift
+                              (f64
+                                (EXTRACT_SUBREG
+                                  (SSHLLv8i8_shift
+                                    (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+                                        INST,
+                                        bsub),
+                                    0),
+                                  dsub)),
+                               0),
+                             ssub)))>, Requires<[NotForCodeSize]>;
+
+def : SExtLoadi8CVTf32Pat<(ro8.Wpat GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext),
+                          (LDRBroW GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext)>;
+def : SExtLoadi8CVTf32Pat<(ro8.Xpat GPR64sp:$Rn, GPR64:$Rm, ro8.Xext:$ext),
+                          (LDRBroX GPR64sp:$Rn, GPR64:$Rm, ro8.Xext:$ext)>;
+def : SExtLoadi8CVTf32Pat<(am_indexed8 GPR64sp:$Rn, uimm12s1:$offset),
+                          (LDRBui GPR64sp:$Rn, uimm12s1:$offset)>;
+def : SExtLoadi8CVTf32Pat<(am_unscaled8 GPR64sp:$Rn, simm9:$offset),
+                          (LDURBi GPR64sp:$Rn, simm9:$offset)>;
+
+// 16-bits -> float. 1 size step-up.
+class SExtLoadi16CVTf32Pat<dag addrmode, dag INST>
+  : Pat<(f32 (sint_to_fp (i32 (sextloadi16 addrmode)))),
+        (SCVTFv1i32 (f32 (EXTRACT_SUBREG
+                            (SSHLLv4i16_shift
+                                (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+                                  INST,
+                                  hsub),
+                                0),
+                            ssub)))>, Requires<[NotForCodeSize]>;
+
+def : SExtLoadi16CVTf32Pat<(ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext),
+                           (LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext)>;
+def : SExtLoadi16CVTf32Pat<(ro16.Xpat GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext),
+                           (LDRHroX GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext)>;
+def : SExtLoadi16CVTf32Pat<(am_indexed16 GPR64sp:$Rn, uimm12s2:$offset),
+                           (LDRHui GPR64sp:$Rn, uimm12s2:$offset)>;
+def : SExtLoadi16CVTf32Pat<(am_unscaled16 GPR64sp:$Rn, simm9:$offset),
+                           (LDURHi GPR64sp:$Rn, simm9:$offset)>;
+
+// 32-bits to 32-bits are handled in target specific dag combine:
+// performIntToFpCombine.
+// 64-bits integer to 32-bits floating point, not possible with
+// SCVTF on floating point registers (both source and destination
+// must have the same size).
+
+// Here are the patterns for 8, 16, 32, and 64-bits to double.
+// 8-bits -> double. 3 size step-up: give up.
+// 16-bits -> double. 2 size step.
+class SExtLoadi16CVTf64Pat<dag addrmode, dag INST>
+  : Pat <(f64 (sint_to_fp (i32 (sextloadi16 addrmode)))),
+         (SCVTFv1i64 (f64 (EXTRACT_SUBREG
+                             (SSHLLv2i32_shift
+                                (f64
+                                  (EXTRACT_SUBREG
+                                    (SSHLLv4i16_shift
+                                      (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+                                        INST,
+                                        hsub),
+                                      0),
+                                    dsub)),
+                               0),
+                             dsub)))>, Requires<[NotForCodeSize]>;
+
+def : SExtLoadi16CVTf64Pat<(ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext),
+                           (LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext)>;
+def : SExtLoadi16CVTf64Pat<(ro16.Xpat GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext),
+                           (LDRHroX GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext)>;
+def : SExtLoadi16CVTf64Pat<(am_indexed16 GPR64sp:$Rn, uimm12s2:$offset),
+                           (LDRHui GPR64sp:$Rn, uimm12s2:$offset)>;
+def : SExtLoadi16CVTf64Pat<(am_unscaled16 GPR64sp:$Rn, simm9:$offset),
+                           (LDURHi GPR64sp:$Rn, simm9:$offset)>;
+// 32-bits -> double. 1 size step-up.
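A hedged C++ sketch of the loads these patterns target (the 32-bit-to-double class follows below); the instruction sequences are expectations when not optimising for size, not guarantees, and the function names are invented:

    #include <cstdint>

    // 8-bit -> float: expected to stay on the FP/SIMD side, roughly
    // ldr b0; sshll; sshll; scvtf - avoiding a GPR->FPR transfer.
    float  f32_from_i8(const int8_t *p)   { return static_cast<float>(*p); }
    // 16-bit -> float: one widening step, then scvtf.
    float  f32_from_i16(const int16_t *p) { return static_cast<float>(*p); }
    // 16-bit -> double: two widening steps, then scvtf.
    double f64_from_i16(const int16_t *p) { return static_cast<double>(*p); }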
+class SExtLoadi32CVTf64Pat + : Pat <(f64 (sint_to_fp (i32 (load addrmode)))), + (SCVTFv1i64 (f64 (EXTRACT_SUBREG + (SSHLLv2i32_shift + (INSERT_SUBREG (f64 (IMPLICIT_DEF)), + INST, + ssub), + 0), + dsub)))>, Requires<[NotForCodeSize]>; + +def : SExtLoadi32CVTf64Pat<(ro32.Wpat GPR64sp:$Rn, GPR32:$Rm, ro32.Wext:$ext), + (LDRSroW GPR64sp:$Rn, GPR32:$Rm, ro32.Wext:$ext)>; +def : SExtLoadi32CVTf64Pat<(ro32.Xpat GPR64sp:$Rn, GPR64:$Rm, ro32.Xext:$ext), + (LDRSroX GPR64sp:$Rn, GPR64:$Rm, ro32.Xext:$ext)>; +def : SExtLoadi32CVTf64Pat<(am_indexed32 GPR64sp:$Rn, uimm12s4:$offset), + (LDRSui GPR64sp:$Rn, uimm12s4:$offset)>; +def : SExtLoadi32CVTf64Pat<(am_unscaled32 GPR64sp:$Rn, simm9:$offset), + (LDURSi GPR64sp:$Rn, simm9:$offset)>; + +// 64-bits -> double are handled in target specific dag combine: +// performIntToFpCombine. + + +//---------------------------------------------------------------------------- +// AdvSIMD Load-Store Structure +//---------------------------------------------------------------------------- +defm LD1 : SIMDLd1Multiple<"ld1">; +defm LD2 : SIMDLd2Multiple<"ld2">; +defm LD3 : SIMDLd3Multiple<"ld3">; +defm LD4 : SIMDLd4Multiple<"ld4">; + +defm ST1 : SIMDSt1Multiple<"st1">; +defm ST2 : SIMDSt2Multiple<"st2">; +defm ST3 : SIMDSt3Multiple<"st3">; +defm ST4 : SIMDSt4Multiple<"st4">; + +class Ld1Pat + : Pat<(ty (load GPR64sp:$Rn)), (INST GPR64sp:$Rn)>; + +def : Ld1Pat; +def : Ld1Pat; +def : Ld1Pat; +def : Ld1Pat; +def : Ld1Pat; +def : Ld1Pat; +def : Ld1Pat; +def : Ld1Pat; + +class St1Pat + : Pat<(store ty:$Vt, GPR64sp:$Rn), + (INST ty:$Vt, GPR64sp:$Rn)>; + +def : St1Pat; +def : St1Pat; +def : St1Pat; +def : St1Pat; +def : St1Pat; +def : St1Pat; +def : St1Pat; +def : St1Pat; + +//--- +// Single-element +//--- + +defm LD1R : SIMDLdR<0, 0b110, 0, "ld1r", "One", 1, 2, 4, 8>; +defm LD2R : SIMDLdR<1, 0b110, 0, "ld2r", "Two", 2, 4, 8, 16>; +defm LD3R : SIMDLdR<0, 0b111, 0, "ld3r", "Three", 3, 6, 12, 24>; +defm LD4R : SIMDLdR<1, 0b111, 0, "ld4r", "Four", 4, 8, 16, 32>; +let mayLoad = 1, neverHasSideEffects = 1 in { +defm LD1 : SIMDLdSingleBTied<0, 0b000, "ld1", VecListOneb, GPR64pi1>; +defm LD1 : SIMDLdSingleHTied<0, 0b010, 0, "ld1", VecListOneh, GPR64pi2>; +defm LD1 : SIMDLdSingleSTied<0, 0b100, 0b00, "ld1", VecListOnes, GPR64pi4>; +defm LD1 : SIMDLdSingleDTied<0, 0b100, 0b01, "ld1", VecListOned, GPR64pi8>; +defm LD2 : SIMDLdSingleBTied<1, 0b000, "ld2", VecListTwob, GPR64pi2>; +defm LD2 : SIMDLdSingleHTied<1, 0b010, 0, "ld2", VecListTwoh, GPR64pi4>; +defm LD2 : SIMDLdSingleSTied<1, 0b100, 0b00, "ld2", VecListTwos, GPR64pi8>; +defm LD2 : SIMDLdSingleDTied<1, 0b100, 0b01, "ld2", VecListTwod, GPR64pi16>; +defm LD3 : SIMDLdSingleBTied<0, 0b001, "ld3", VecListThreeb, GPR64pi3>; +defm LD3 : SIMDLdSingleHTied<0, 0b011, 0, "ld3", VecListThreeh, GPR64pi6>; +defm LD3 : SIMDLdSingleSTied<0, 0b101, 0b00, "ld3", VecListThrees, GPR64pi12>; +defm LD3 : SIMDLdSingleDTied<0, 0b101, 0b01, "ld3", VecListThreed, GPR64pi24>; +defm LD4 : SIMDLdSingleBTied<1, 0b001, "ld4", VecListFourb, GPR64pi4>; +defm LD4 : SIMDLdSingleHTied<1, 0b011, 0, "ld4", VecListFourh, GPR64pi8>; +defm LD4 : SIMDLdSingleSTied<1, 0b101, 0b00, "ld4", VecListFours, GPR64pi16>; +defm LD4 : SIMDLdSingleDTied<1, 0b101, 0b01, "ld4", VecListFourd, GPR64pi32>; +} + +def : Pat<(v8i8 (AArch64dup (i32 (extloadi8 GPR64sp:$Rn)))), + (LD1Rv8b GPR64sp:$Rn)>; +def : Pat<(v16i8 (AArch64dup (i32 (extloadi8 GPR64sp:$Rn)))), + (LD1Rv16b GPR64sp:$Rn)>; +def : Pat<(v4i16 (AArch64dup (i32 (extloadi16 GPR64sp:$Rn)))), + (LD1Rv4h GPR64sp:$Rn)>; +def : 
Pat<(v8i16 (AArch64dup (i32 (extloadi16 GPR64sp:$Rn)))), + (LD1Rv8h GPR64sp:$Rn)>; +def : Pat<(v2i32 (AArch64dup (i32 (load GPR64sp:$Rn)))), + (LD1Rv2s GPR64sp:$Rn)>; +def : Pat<(v4i32 (AArch64dup (i32 (load GPR64sp:$Rn)))), + (LD1Rv4s GPR64sp:$Rn)>; +def : Pat<(v2i64 (AArch64dup (i64 (load GPR64sp:$Rn)))), + (LD1Rv2d GPR64sp:$Rn)>; +def : Pat<(v1i64 (AArch64dup (i64 (load GPR64sp:$Rn)))), + (LD1Rv1d GPR64sp:$Rn)>; +// Grab the floating point version too +def : Pat<(v2f32 (AArch64dup (f32 (load GPR64sp:$Rn)))), + (LD1Rv2s GPR64sp:$Rn)>; +def : Pat<(v4f32 (AArch64dup (f32 (load GPR64sp:$Rn)))), + (LD1Rv4s GPR64sp:$Rn)>; +def : Pat<(v2f64 (AArch64dup (f64 (load GPR64sp:$Rn)))), + (LD1Rv2d GPR64sp:$Rn)>; +def : Pat<(v1f64 (AArch64dup (f64 (load GPR64sp:$Rn)))), + (LD1Rv1d GPR64sp:$Rn)>; + +class Ld1Lane128Pat + : Pat<(vector_insert (VTy VecListOne128:$Rd), + (STy (scalar_load GPR64sp:$Rn)), VecIndex:$idx), + (LD1 VecListOne128:$Rd, VecIndex:$idx, GPR64sp:$Rn)>; + +def : Ld1Lane128Pat; +def : Ld1Lane128Pat; +def : Ld1Lane128Pat; +def : Ld1Lane128Pat; +def : Ld1Lane128Pat; +def : Ld1Lane128Pat; + +class Ld1Lane64Pat + : Pat<(vector_insert (VTy VecListOne64:$Rd), + (STy (scalar_load GPR64sp:$Rn)), VecIndex:$idx), + (EXTRACT_SUBREG + (LD1 (SUBREG_TO_REG (i32 0), VecListOne64:$Rd, dsub), + VecIndex:$idx, GPR64sp:$Rn), + dsub)>; + +def : Ld1Lane64Pat; +def : Ld1Lane64Pat; +def : Ld1Lane64Pat; +def : Ld1Lane64Pat; + + +defm LD1 : SIMDLdSt1SingleAliases<"ld1">; +defm LD2 : SIMDLdSt2SingleAliases<"ld2">; +defm LD3 : SIMDLdSt3SingleAliases<"ld3">; +defm LD4 : SIMDLdSt4SingleAliases<"ld4">; + +// Stores +defm ST1 : SIMDStSingleB<0, 0b000, "st1", VecListOneb, GPR64pi1>; +defm ST1 : SIMDStSingleH<0, 0b010, 0, "st1", VecListOneh, GPR64pi2>; +defm ST1 : SIMDStSingleS<0, 0b100, 0b00, "st1", VecListOnes, GPR64pi4>; +defm ST1 : SIMDStSingleD<0, 0b100, 0b01, "st1", VecListOned, GPR64pi8>; + +let AddedComplexity = 15 in +class St1Lane128Pat + : Pat<(scalar_store + (STy (vector_extract (VTy VecListOne128:$Vt), VecIndex:$idx)), + GPR64sp:$Rn), + (ST1 VecListOne128:$Vt, VecIndex:$idx, GPR64sp:$Rn)>; + +def : St1Lane128Pat; +def : St1Lane128Pat; +def : St1Lane128Pat; +def : St1Lane128Pat; +def : St1Lane128Pat; +def : St1Lane128Pat; + +let AddedComplexity = 15 in +class St1Lane64Pat + : Pat<(scalar_store + (STy (vector_extract (VTy VecListOne64:$Vt), VecIndex:$idx)), + GPR64sp:$Rn), + (ST1 (SUBREG_TO_REG (i32 0), VecListOne64:$Vt, dsub), + VecIndex:$idx, GPR64sp:$Rn)>; + +def : St1Lane64Pat; +def : St1Lane64Pat; +def : St1Lane64Pat; +def : St1Lane64Pat; + +multiclass St1LanePost64Pat { + def : Pat<(scalar_store + (STy (vector_extract (VTy VecListOne64:$Vt), VecIndex:$idx)), + GPR64sp:$Rn, offset), + (ST1 (SUBREG_TO_REG (i32 0), VecListOne64:$Vt, dsub), + VecIndex:$idx, GPR64sp:$Rn, XZR)>; + + def : Pat<(scalar_store + (STy (vector_extract (VTy VecListOne64:$Vt), VecIndex:$idx)), + GPR64sp:$Rn, GPR64:$Rm), + (ST1 (SUBREG_TO_REG (i32 0), VecListOne64:$Vt, dsub), + VecIndex:$idx, GPR64sp:$Rn, $Rm)>; +} + +defm : St1LanePost64Pat; +defm : St1LanePost64Pat; +defm : St1LanePost64Pat; +defm : St1LanePost64Pat; +defm : St1LanePost64Pat; +defm : St1LanePost64Pat; + +multiclass St1LanePost128Pat { + def : Pat<(scalar_store + (STy (vector_extract (VTy VecListOne128:$Vt), VecIndex:$idx)), + GPR64sp:$Rn, offset), + (ST1 VecListOne128:$Vt, VecIndex:$idx, GPR64sp:$Rn, XZR)>; + + def : Pat<(scalar_store + (STy (vector_extract (VTy VecListOne128:$Vt), VecIndex:$idx)), + GPR64sp:$Rn, GPR64:$Rm), + (ST1 VecListOne128:$Vt, 
VecIndex:$idx, GPR64sp:$Rn, $Rm)>; +} + +defm : St1LanePost128Pat; +defm : St1LanePost128Pat; +defm : St1LanePost128Pat; +defm : St1LanePost128Pat; +defm : St1LanePost128Pat; +defm : St1LanePost128Pat; + +let mayStore = 1, neverHasSideEffects = 1 in { +defm ST2 : SIMDStSingleB<1, 0b000, "st2", VecListTwob, GPR64pi2>; +defm ST2 : SIMDStSingleH<1, 0b010, 0, "st2", VecListTwoh, GPR64pi4>; +defm ST2 : SIMDStSingleS<1, 0b100, 0b00, "st2", VecListTwos, GPR64pi8>; +defm ST2 : SIMDStSingleD<1, 0b100, 0b01, "st2", VecListTwod, GPR64pi16>; +defm ST3 : SIMDStSingleB<0, 0b001, "st3", VecListThreeb, GPR64pi3>; +defm ST3 : SIMDStSingleH<0, 0b011, 0, "st3", VecListThreeh, GPR64pi6>; +defm ST3 : SIMDStSingleS<0, 0b101, 0b00, "st3", VecListThrees, GPR64pi12>; +defm ST3 : SIMDStSingleD<0, 0b101, 0b01, "st3", VecListThreed, GPR64pi24>; +defm ST4 : SIMDStSingleB<1, 0b001, "st4", VecListFourb, GPR64pi4>; +defm ST4 : SIMDStSingleH<1, 0b011, 0, "st4", VecListFourh, GPR64pi8>; +defm ST4 : SIMDStSingleS<1, 0b101, 0b00, "st4", VecListFours, GPR64pi16>; +defm ST4 : SIMDStSingleD<1, 0b101, 0b01, "st4", VecListFourd, GPR64pi32>; +} + +defm ST1 : SIMDLdSt1SingleAliases<"st1">; +defm ST2 : SIMDLdSt2SingleAliases<"st2">; +defm ST3 : SIMDLdSt3SingleAliases<"st3">; +defm ST4 : SIMDLdSt4SingleAliases<"st4">; + +//---------------------------------------------------------------------------- +// Crypto extensions +//---------------------------------------------------------------------------- + +def AESErr : AESTiedInst<0b0100, "aese", int_aarch64_crypto_aese>; +def AESDrr : AESTiedInst<0b0101, "aesd", int_aarch64_crypto_aesd>; +def AESMCrr : AESInst< 0b0110, "aesmc", int_aarch64_crypto_aesmc>; +def AESIMCrr : AESInst< 0b0111, "aesimc", int_aarch64_crypto_aesimc>; + +def SHA1Crrr : SHATiedInstQSV<0b000, "sha1c", int_aarch64_crypto_sha1c>; +def SHA1Prrr : SHATiedInstQSV<0b001, "sha1p", int_aarch64_crypto_sha1p>; +def SHA1Mrrr : SHATiedInstQSV<0b010, "sha1m", int_aarch64_crypto_sha1m>; +def SHA1SU0rrr : SHATiedInstVVV<0b011, "sha1su0", int_aarch64_crypto_sha1su0>; +def SHA256Hrrr : SHATiedInstQQV<0b100, "sha256h", int_aarch64_crypto_sha256h>; +def SHA256H2rrr : SHATiedInstQQV<0b101, "sha256h2",int_aarch64_crypto_sha256h2>; +def SHA256SU1rrr :SHATiedInstVVV<0b110, "sha256su1",int_aarch64_crypto_sha256su1>; + +def SHA1Hrr : SHAInstSS< 0b0000, "sha1h", int_aarch64_crypto_sha1h>; +def SHA1SU1rr : SHATiedInstVV<0b0001, "sha1su1", int_aarch64_crypto_sha1su1>; +def SHA256SU0rr : SHATiedInstVV<0b0010, "sha256su0",int_aarch64_crypto_sha256su0>; + +//---------------------------------------------------------------------------- +// Compiler-pseudos +//---------------------------------------------------------------------------- +// FIXME: Like for X86, these should go in their own separate .td file. + +// Any instruction that defines a 32-bit result leaves the high half of the +// register. Truncate can be lowered to EXTRACT_SUBREG. CopyFromReg may +// be copying from a truncate. But any other 32-bit operation will zero-extend +// up to 64 bits. +// FIXME: X86 also checks for CMOV here. Do we need something similar? +def def32 : PatLeaf<(i32 GPR32:$src), [{ + return N->getOpcode() != ISD::TRUNCATE && + N->getOpcode() != TargetOpcode::EXTRACT_SUBREG && + N->getOpcode() != ISD::CopyFromReg; +}]>; + +// In the case of a 32-bit def that is known to implicitly zero-extend, +// we can use a SUBREG_TO_REG. 
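A small C++ example of the implicit zero-extension the def32 PatLeaf relies on (the SUBREG_TO_REG pattern follows below); hypothetical function, expected codegen only:

    #include <cstdint>

    // "add w0, w0, w1" already clears bits 63:32, so the i32 -> i64 zext is
    // expected to be a free SUBREG_TO_REG rather than an extra UBFM/UXTW.
    uint64_t zext_of_add(uint32_t a, uint32_t b) { return static_cast<uint64_t>(a + b); }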
+def : Pat<(i64 (zext def32:$src)), (SUBREG_TO_REG (i64 0), GPR32:$src, sub_32)>; + +// For an anyext, we don't care what the high bits are, so we can perform an +// INSERT_SUBREF into an IMPLICIT_DEF. +def : Pat<(i64 (anyext GPR32:$src)), + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32)>; + +// When we need to explicitly zero-extend, we use an unsigned bitfield move +// instruction (UBFM) on the enclosing super-reg. +def : Pat<(i64 (zext GPR32:$src)), + (UBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32), 0, 31)>; + +// To sign extend, we use a signed bitfield move instruction (SBFM) on the +// containing super-reg. +def : Pat<(i64 (sext GPR32:$src)), + (SBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32), 0, 31)>; +def : Pat<(i64 (sext_inreg GPR64:$src, i32)), (SBFMXri GPR64:$src, 0, 31)>; +def : Pat<(i64 (sext_inreg GPR64:$src, i16)), (SBFMXri GPR64:$src, 0, 15)>; +def : Pat<(i64 (sext_inreg GPR64:$src, i8)), (SBFMXri GPR64:$src, 0, 7)>; +def : Pat<(i64 (sext_inreg GPR64:$src, i1)), (SBFMXri GPR64:$src, 0, 0)>; +def : Pat<(i32 (sext_inreg GPR32:$src, i16)), (SBFMWri GPR32:$src, 0, 15)>; +def : Pat<(i32 (sext_inreg GPR32:$src, i8)), (SBFMWri GPR32:$src, 0, 7)>; +def : Pat<(i32 (sext_inreg GPR32:$src, i1)), (SBFMWri GPR32:$src, 0, 0)>; + +def : Pat<(shl (sext_inreg GPR32:$Rn, i8), (i64 imm0_31:$imm)), + (SBFMWri GPR32:$Rn, (i64 (i32shift_a imm0_31:$imm)), + (i64 (i32shift_sext_i8 imm0_31:$imm)))>; +def : Pat<(shl (sext_inreg GPR64:$Rn, i8), (i64 imm0_63:$imm)), + (SBFMXri GPR64:$Rn, (i64 (i64shift_a imm0_63:$imm)), + (i64 (i64shift_sext_i8 imm0_63:$imm)))>; + +def : Pat<(shl (sext_inreg GPR32:$Rn, i16), (i64 imm0_31:$imm)), + (SBFMWri GPR32:$Rn, (i64 (i32shift_a imm0_31:$imm)), + (i64 (i32shift_sext_i16 imm0_31:$imm)))>; +def : Pat<(shl (sext_inreg GPR64:$Rn, i16), (i64 imm0_63:$imm)), + (SBFMXri GPR64:$Rn, (i64 (i64shift_a imm0_63:$imm)), + (i64 (i64shift_sext_i16 imm0_63:$imm)))>; + +def : Pat<(shl (i64 (sext GPR32:$Rn)), (i64 imm0_63:$imm)), + (SBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$Rn, sub_32), + (i64 (i64shift_a imm0_63:$imm)), + (i64 (i64shift_sext_i32 imm0_63:$imm)))>; + +// sra patterns have an AddedComplexity of 10, so make sure we have a higher +// AddedComplexity for the following patterns since we want to match sext + sra +// patterns before we attempt to match a single sra node. +let AddedComplexity = 20 in { +// We support all sext + sra combinations which preserve at least one bit of the +// original value which is to be sign extended. E.g. we support shifts up to +// bitwidth-1 bits. +def : Pat<(sra (sext_inreg GPR32:$Rn, i8), (i64 imm0_7:$imm)), + (SBFMWri GPR32:$Rn, (i64 imm0_7:$imm), 7)>; +def : Pat<(sra (sext_inreg GPR64:$Rn, i8), (i64 imm0_7:$imm)), + (SBFMXri GPR64:$Rn, (i64 imm0_7:$imm), 7)>; + +def : Pat<(sra (sext_inreg GPR32:$Rn, i16), (i64 imm0_15:$imm)), + (SBFMWri GPR32:$Rn, (i64 imm0_15:$imm), 15)>; +def : Pat<(sra (sext_inreg GPR64:$Rn, i16), (i64 imm0_15:$imm)), + (SBFMXri GPR64:$Rn, (i64 imm0_15:$imm), 15)>; + +def : Pat<(sra (i64 (sext GPR32:$Rn)), (i64 imm0_31:$imm)), + (SBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$Rn, sub_32), + (i64 imm0_31:$imm), 31)>; +} // AddedComplexity = 20 + +// To truncate, we can simply extract from a subregister. +def : Pat<(i32 (trunc GPR64sp:$src)), + (i32 (EXTRACT_SUBREG GPR64sp:$src, sub_32))>; + +// __builtin_trap() uses the BRK instruction on AArch64. +def : Pat<(trap), (BRK 1)>; + +// Conversions within AdvSIMD types in the same register size are free. 
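Two hedged C++ one-liners for the bitfield-move and truncation patterns above (the big-endian bitconvert discussion that starts in the last comment continues below); expected codegen only:

    #include <cstdint>

    // sext of an i8 followed by a shift: the patterns above fold both into a
    // single SBFM, expected to print as one "sbfiz"-style instruction.
    int64_t sext8_shl3(int32_t x) { return static_cast<int64_t>(static_cast<int8_t>(x)) << 3; }

    // Truncation is only a subregister read; no instruction is expected.
    int32_t trunc_to_i32(int64_t x) { return static_cast<int32_t>(x); }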
+// But because we need a consistent lane ordering, in big endian many +// conversions require one or more REV instructions. +// +// Consider a simple memory load followed by a bitconvert then a store. +// v0 = load v2i32 +// v1 = BITCAST v2i32 v0 to v4i16 +// store v4i16 v2 +// +// In big endian mode every memory access has an implicit byte swap. LDR and +// STR do a 64-bit byte swap, whereas LD1/ST1 do a byte swap per lane - that +// is, they treat the vector as a sequence of elements to be byte-swapped. +// The two pairs of instructions are fundamentally incompatible. We've decided +// to use LD1/ST1 only to simplify compiler implementation. +// +// LD1/ST1 perform the equivalent of a sequence of LDR/STR + REV. This makes +// the original code sequence: +// v0 = load v2i32 +// v1 = REV v2i32 (implicit) +// v2 = BITCAST v2i32 v1 to v4i16 +// v3 = REV v4i16 v2 (implicit) +// store v4i16 v3 +// +// But this is now broken - the value stored is different to the value loaded +// due to lane reordering. To fix this, on every BITCAST we must perform two +// other REVs: +// v0 = load v2i32 +// v1 = REV v2i32 (implicit) +// v2 = REV v2i32 +// v3 = BITCAST v2i32 v2 to v4i16 +// v4 = REV v4i16 +// v5 = REV v4i16 v4 (implicit) +// store v4i16 v5 +// +// This means an extra two instructions, but actually in most cases the two REV +// instructions can be combined into one. For example: +// (REV64_2s (REV64_4h X)) === (REV32_4h X) +// +// There is also no 128-bit REV instruction. This must be synthesized with an +// EXT instruction. +// +// Most bitconverts require some sort of conversion. The only exceptions are: +// a) Identity conversions - vNfX <-> vNiX +// b) Single-lane-to-scalar - v1fX <-> fX or v1iX <-> iX +// + +let Predicates = [IsLE] in { +def : Pat<(v8i8 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; +def : Pat<(v4i16 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; +def : Pat<(v2i32 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; +def : Pat<(v2f32 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; + +def : Pat<(i64 (bitconvert (v8i8 V64:$Vn))), + (COPY_TO_REGCLASS V64:$Vn, GPR64)>; +def : Pat<(i64 (bitconvert (v4i16 V64:$Vn))), + (COPY_TO_REGCLASS V64:$Vn, GPR64)>; +def : Pat<(i64 (bitconvert (v2i32 V64:$Vn))), + (COPY_TO_REGCLASS V64:$Vn, GPR64)>; +def : Pat<(i64 (bitconvert (v2f32 V64:$Vn))), + (COPY_TO_REGCLASS V64:$Vn, GPR64)>; +def : Pat<(i64 (bitconvert (v1f64 V64:$Vn))), + (COPY_TO_REGCLASS V64:$Vn, GPR64)>; +} +let Predicates = [IsBE] in { +def : Pat<(v8i8 (bitconvert GPR64:$Xn)), + (REV64v8i8 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>; +def : Pat<(v4i16 (bitconvert GPR64:$Xn)), + (REV64v4i16 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>; +def : Pat<(v2i32 (bitconvert GPR64:$Xn)), + (REV64v2i32 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>; +def : Pat<(v2f32 (bitconvert GPR64:$Xn)), + (REV64v2i32 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>; + +def : Pat<(i64 (bitconvert (v8i8 V64:$Vn))), + (REV64v8i8 (COPY_TO_REGCLASS V64:$Vn, GPR64))>; +def : Pat<(i64 (bitconvert (v4i16 V64:$Vn))), + (REV64v4i16 (COPY_TO_REGCLASS V64:$Vn, GPR64))>; +def : Pat<(i64 (bitconvert (v2i32 V64:$Vn))), + (REV64v2i32 (COPY_TO_REGCLASS V64:$Vn, GPR64))>; +def : Pat<(i64 (bitconvert (v2f32 V64:$Vn))), + (REV64v2i32 (COPY_TO_REGCLASS V64:$Vn, GPR64))>; +} +def : Pat<(v1i64 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; +def : Pat<(v1f64 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; +def : Pat<(i64 (bitconvert (v1i64 V64:$Vn))), + (COPY_TO_REGCLASS 
V64:$Vn, GPR64)>; +def : Pat<(v1i64 (scalar_to_vector GPR64:$Xn)), + (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; +def : Pat<(v1f64 (scalar_to_vector GPR64:$Xn)), + (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; +def : Pat<(v1f64 (scalar_to_vector (f64 FPR64:$Xn))), (v1f64 FPR64:$Xn)>; + +def : Pat<(f32 (bitconvert (i32 GPR32:$Xn))), + (COPY_TO_REGCLASS GPR32:$Xn, FPR32)>; +def : Pat<(i32 (bitconvert (f32 FPR32:$Xn))), + (COPY_TO_REGCLASS FPR32:$Xn, GPR32)>; +def : Pat<(f64 (bitconvert (i64 GPR64:$Xn))), + (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; +def : Pat<(i64 (bitconvert (f64 FPR64:$Xn))), + (COPY_TO_REGCLASS FPR64:$Xn, GPR64)>; +def : Pat<(i64 (bitconvert (v1f64 V64:$Vn))), + (COPY_TO_REGCLASS V64:$Vn, GPR64)>; + +let Predicates = [IsLE] in { +def : Pat<(v1i64 (bitconvert (v2i32 FPR64:$src))), (v1i64 FPR64:$src)>; +def : Pat<(v1i64 (bitconvert (v4i16 FPR64:$src))), (v1i64 FPR64:$src)>; +def : Pat<(v1i64 (bitconvert (v8i8 FPR64:$src))), (v1i64 FPR64:$src)>; +def : Pat<(v1i64 (bitconvert (v2f32 FPR64:$src))), (v1i64 FPR64:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(v1i64 (bitconvert (v2i32 FPR64:$src))), + (v1i64 (REV64v2i32 FPR64:$src))>; +def : Pat<(v1i64 (bitconvert (v4i16 FPR64:$src))), + (v1i64 (REV64v4i16 FPR64:$src))>; +def : Pat<(v1i64 (bitconvert (v8i8 FPR64:$src))), + (v1i64 (REV64v8i8 FPR64:$src))>; +def : Pat<(v1i64 (bitconvert (v2f32 FPR64:$src))), + (v1i64 (REV64v2i32 FPR64:$src))>; +} +def : Pat<(v1i64 (bitconvert (v1f64 FPR64:$src))), (v1i64 FPR64:$src)>; +def : Pat<(v1i64 (bitconvert (f64 FPR64:$src))), (v1i64 FPR64:$src)>; + +let Predicates = [IsLE] in { +def : Pat<(v2i32 (bitconvert (v1i64 FPR64:$src))), (v2i32 FPR64:$src)>; +def : Pat<(v2i32 (bitconvert (v4i16 FPR64:$src))), (v2i32 FPR64:$src)>; +def : Pat<(v2i32 (bitconvert (v8i8 FPR64:$src))), (v2i32 FPR64:$src)>; +def : Pat<(v2i32 (bitconvert (f64 FPR64:$src))), (v2i32 FPR64:$src)>; +def : Pat<(v2i32 (bitconvert (v1f64 FPR64:$src))), (v2i32 FPR64:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(v2i32 (bitconvert (v1i64 FPR64:$src))), + (v2i32 (REV64v2i32 FPR64:$src))>; +def : Pat<(v2i32 (bitconvert (v4i16 FPR64:$src))), + (v2i32 (REV32v4i16 FPR64:$src))>; +def : Pat<(v2i32 (bitconvert (v8i8 FPR64:$src))), + (v2i32 (REV32v8i8 FPR64:$src))>; +def : Pat<(v2i32 (bitconvert (f64 FPR64:$src))), + (v2i32 (REV64v2i32 FPR64:$src))>; +def : Pat<(v2i32 (bitconvert (v1f64 FPR64:$src))), + (v2i32 (REV64v2i32 FPR64:$src))>; +} +def : Pat<(v2i32 (bitconvert (v2f32 FPR64:$src))), (v2i32 FPR64:$src)>; + +let Predicates = [IsLE] in { +def : Pat<(v4i16 (bitconvert (v1i64 FPR64:$src))), (v4i16 FPR64:$src)>; +def : Pat<(v4i16 (bitconvert (v2i32 FPR64:$src))), (v4i16 FPR64:$src)>; +def : Pat<(v4i16 (bitconvert (v8i8 FPR64:$src))), (v4i16 FPR64:$src)>; +def : Pat<(v4i16 (bitconvert (f64 FPR64:$src))), (v4i16 FPR64:$src)>; +def : Pat<(v4i16 (bitconvert (v2f32 FPR64:$src))), (v4i16 FPR64:$src)>; +def : Pat<(v4i16 (bitconvert (v1f64 FPR64:$src))), (v4i16 FPR64:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(v4i16 (bitconvert (v1i64 FPR64:$src))), + (v4i16 (REV64v4i16 FPR64:$src))>; +def : Pat<(v4i16 (bitconvert (v2i32 FPR64:$src))), + (v4i16 (REV32v4i16 FPR64:$src))>; +def : Pat<(v4i16 (bitconvert (v8i8 FPR64:$src))), + (v4i16 (REV16v8i8 FPR64:$src))>; +def : Pat<(v4i16 (bitconvert (f64 FPR64:$src))), + (v4i16 (REV64v4i16 FPR64:$src))>; +def : Pat<(v4i16 (bitconvert (v2f32 FPR64:$src))), + (v4i16 (REV32v4i16 FPR64:$src))>; +def : Pat<(v4i16 (bitconvert (v1f64 FPR64:$src))), + (v4i16 (REV64v4i16 FPR64:$src))>; +} + +let Predicates = [IsLE] in 
{ +def : Pat<(v8i8 (bitconvert (v1i64 FPR64:$src))), (v8i8 FPR64:$src)>; +def : Pat<(v8i8 (bitconvert (v2i32 FPR64:$src))), (v8i8 FPR64:$src)>; +def : Pat<(v8i8 (bitconvert (v4i16 FPR64:$src))), (v8i8 FPR64:$src)>; +def : Pat<(v8i8 (bitconvert (f64 FPR64:$src))), (v8i8 FPR64:$src)>; +def : Pat<(v8i8 (bitconvert (v2f32 FPR64:$src))), (v8i8 FPR64:$src)>; +def : Pat<(v8i8 (bitconvert (v1f64 FPR64:$src))), (v8i8 FPR64:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(v8i8 (bitconvert (v1i64 FPR64:$src))), + (v8i8 (REV64v8i8 FPR64:$src))>; +def : Pat<(v8i8 (bitconvert (v2i32 FPR64:$src))), + (v8i8 (REV32v8i8 FPR64:$src))>; +def : Pat<(v8i8 (bitconvert (v4i16 FPR64:$src))), + (v8i8 (REV16v8i8 FPR64:$src))>; +def : Pat<(v8i8 (bitconvert (f64 FPR64:$src))), + (v8i8 (REV64v8i8 FPR64:$src))>; +def : Pat<(v8i8 (bitconvert (v2f32 FPR64:$src))), + (v8i8 (REV32v8i8 FPR64:$src))>; +def : Pat<(v8i8 (bitconvert (v1f64 FPR64:$src))), + (v8i8 (REV64v8i8 FPR64:$src))>; +} + +let Predicates = [IsLE] in { +def : Pat<(f64 (bitconvert (v2i32 FPR64:$src))), (f64 FPR64:$src)>; +def : Pat<(f64 (bitconvert (v4i16 FPR64:$src))), (f64 FPR64:$src)>; +def : Pat<(f64 (bitconvert (v2f32 FPR64:$src))), (f64 FPR64:$src)>; +def : Pat<(f64 (bitconvert (v8i8 FPR64:$src))), (f64 FPR64:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(f64 (bitconvert (v2i32 FPR64:$src))), + (f64 (REV64v2i32 FPR64:$src))>; +def : Pat<(f64 (bitconvert (v4i16 FPR64:$src))), + (f64 (REV64v4i16 FPR64:$src))>; +def : Pat<(f64 (bitconvert (v2f32 FPR64:$src))), + (f64 (REV64v2i32 FPR64:$src))>; +def : Pat<(f64 (bitconvert (v8i8 FPR64:$src))), + (f64 (REV64v8i8 FPR64:$src))>; +} +def : Pat<(f64 (bitconvert (v1i64 FPR64:$src))), (f64 FPR64:$src)>; +def : Pat<(f64 (bitconvert (v1f64 FPR64:$src))), (f64 FPR64:$src)>; + +let Predicates = [IsLE] in { +def : Pat<(v1f64 (bitconvert (v2i32 FPR64:$src))), (v1f64 FPR64:$src)>; +def : Pat<(v1f64 (bitconvert (v4i16 FPR64:$src))), (v1f64 FPR64:$src)>; +def : Pat<(v1f64 (bitconvert (v8i8 FPR64:$src))), (v1f64 FPR64:$src)>; +def : Pat<(v1f64 (bitconvert (v2f32 FPR64:$src))), (v1f64 FPR64:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(v1f64 (bitconvert (v2i32 FPR64:$src))), + (v1f64 (REV64v2i32 FPR64:$src))>; +def : Pat<(v1f64 (bitconvert (v4i16 FPR64:$src))), + (v1f64 (REV64v4i16 FPR64:$src))>; +def : Pat<(v1f64 (bitconvert (v8i8 FPR64:$src))), + (v1f64 (REV64v8i8 FPR64:$src))>; +def : Pat<(v1f64 (bitconvert (v2f32 FPR64:$src))), + (v1f64 (REV64v2i32 FPR64:$src))>; +} +def : Pat<(v1f64 (bitconvert (v1i64 FPR64:$src))), (v1f64 FPR64:$src)>; +def : Pat<(v1f64 (bitconvert (f64 FPR64:$src))), (v1f64 FPR64:$src)>; + +let Predicates = [IsLE] in { +def : Pat<(v2f32 (bitconvert (v1i64 FPR64:$src))), (v2f32 FPR64:$src)>; +def : Pat<(v2f32 (bitconvert (v4i16 FPR64:$src))), (v2f32 FPR64:$src)>; +def : Pat<(v2f32 (bitconvert (v8i8 FPR64:$src))), (v2f32 FPR64:$src)>; +def : Pat<(v2f32 (bitconvert (v1f64 FPR64:$src))), (v2f32 FPR64:$src)>; +def : Pat<(v2f32 (bitconvert (f64 FPR64:$src))), (v2f32 FPR64:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(v2f32 (bitconvert (v1i64 FPR64:$src))), + (v2f32 (REV64v2i32 FPR64:$src))>; +def : Pat<(v2f32 (bitconvert (v4i16 FPR64:$src))), + (v2f32 (REV32v4i16 FPR64:$src))>; +def : Pat<(v2f32 (bitconvert (v8i8 FPR64:$src))), + (v2f32 (REV32v8i8 FPR64:$src))>; +def : Pat<(v2f32 (bitconvert (v1f64 FPR64:$src))), + (v2f32 (REV64v2i32 FPR64:$src))>; +def : Pat<(v2f32 (bitconvert (f64 FPR64:$src))), + (v2f32 (REV64v2i32 FPR64:$src))>; +} +def : Pat<(v2f32 (bitconvert (v2i32 
FPR64:$src))), (v2f32 FPR64:$src)>; + +let Predicates = [IsLE] in { +def : Pat<(f128 (bitconvert (v2i64 FPR128:$src))), (f128 FPR128:$src)>; +def : Pat<(f128 (bitconvert (v4i32 FPR128:$src))), (f128 FPR128:$src)>; +def : Pat<(f128 (bitconvert (v8i16 FPR128:$src))), (f128 FPR128:$src)>; +def : Pat<(f128 (bitconvert (v2f64 FPR128:$src))), (f128 FPR128:$src)>; +def : Pat<(f128 (bitconvert (v4f32 FPR128:$src))), (f128 FPR128:$src)>; +def : Pat<(f128 (bitconvert (v16i8 FPR128:$src))), (f128 FPR128:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(f128 (bitconvert (v2i64 FPR128:$src))), + (f128 (EXTv16i8 FPR128:$src, FPR128:$src, (i32 8)))>; +def : Pat<(f128 (bitconvert (v4i32 FPR128:$src))), + (f128 (EXTv16i8 (REV64v4i32 FPR128:$src), + (REV64v4i32 FPR128:$src), (i32 8)))>; +def : Pat<(f128 (bitconvert (v8i16 FPR128:$src))), + (f128 (EXTv16i8 (REV64v8i16 FPR128:$src), + (REV64v8i16 FPR128:$src), (i32 8)))>; +def : Pat<(f128 (bitconvert (v2f64 FPR128:$src))), + (f128 (EXTv16i8 FPR128:$src, FPR128:$src, (i32 8)))>; +def : Pat<(f128 (bitconvert (v4f32 FPR128:$src))), + (f128 (EXTv16i8 (REV64v4i32 FPR128:$src), + (REV64v4i32 FPR128:$src), (i32 8)))>; +def : Pat<(f128 (bitconvert (v16i8 FPR128:$src))), + (f128 (EXTv16i8 (REV64v16i8 FPR128:$src), + (REV64v16i8 FPR128:$src), (i32 8)))>; +} + +let Predicates = [IsLE] in { +def : Pat<(v2f64 (bitconvert (f128 FPR128:$src))), (v2f64 FPR128:$src)>; +def : Pat<(v2f64 (bitconvert (v4i32 FPR128:$src))), (v2f64 FPR128:$src)>; +def : Pat<(v2f64 (bitconvert (v8i16 FPR128:$src))), (v2f64 FPR128:$src)>; +def : Pat<(v2f64 (bitconvert (v16i8 FPR128:$src))), (v2f64 FPR128:$src)>; +def : Pat<(v2f64 (bitconvert (v4f32 FPR128:$src))), (v2f64 FPR128:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(v2f64 (bitconvert (f128 FPR128:$src))), + (v2f64 (EXTv16i8 FPR128:$src, + FPR128:$src, (i32 8)))>; +def : Pat<(v2f64 (bitconvert (v4i32 FPR128:$src))), + (v2f64 (REV64v4i32 FPR128:$src))>; +def : Pat<(v2f64 (bitconvert (v8i16 FPR128:$src))), + (v2f64 (REV64v8i16 FPR128:$src))>; +def : Pat<(v2f64 (bitconvert (v16i8 FPR128:$src))), + (v2f64 (REV64v16i8 FPR128:$src))>; +def : Pat<(v2f64 (bitconvert (v4f32 FPR128:$src))), + (v2f64 (REV64v4i32 FPR128:$src))>; +} +def : Pat<(v2f64 (bitconvert (v2i64 FPR128:$src))), (v2f64 FPR128:$src)>; + +let Predicates = [IsLE] in { +def : Pat<(v4f32 (bitconvert (f128 FPR128:$src))), (v4f32 FPR128:$src)>; +def : Pat<(v4f32 (bitconvert (v8i16 FPR128:$src))), (v4f32 FPR128:$src)>; +def : Pat<(v4f32 (bitconvert (v16i8 FPR128:$src))), (v4f32 FPR128:$src)>; +def : Pat<(v4f32 (bitconvert (v2i64 FPR128:$src))), (v4f32 FPR128:$src)>; +def : Pat<(v4f32 (bitconvert (v2f64 FPR128:$src))), (v4f32 FPR128:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(v4f32 (bitconvert (f128 FPR128:$src))), + (v4f32 (EXTv16i8 (REV64v4i32 FPR128:$src), + (REV64v4i32 FPR128:$src), (i32 8)))>; +def : Pat<(v4f32 (bitconvert (v8i16 FPR128:$src))), + (v4f32 (REV32v8i16 FPR128:$src))>; +def : Pat<(v4f32 (bitconvert (v16i8 FPR128:$src))), + (v4f32 (REV32v16i8 FPR128:$src))>; +def : Pat<(v4f32 (bitconvert (v2i64 FPR128:$src))), + (v4f32 (REV64v4i32 FPR128:$src))>; +def : Pat<(v4f32 (bitconvert (v2f64 FPR128:$src))), + (v4f32 (REV64v4i32 FPR128:$src))>; +} +def : Pat<(v4f32 (bitconvert (v4i32 FPR128:$src))), (v4f32 FPR128:$src)>; + +let Predicates = [IsLE] in { +def : Pat<(v2i64 (bitconvert (f128 FPR128:$src))), (v2i64 FPR128:$src)>; +def : Pat<(v2i64 (bitconvert (v4i32 FPR128:$src))), (v2i64 FPR128:$src)>; +def : Pat<(v2i64 (bitconvert (v8i16 FPR128:$src))), (v2i64 
FPR128:$src)>; +def : Pat<(v2i64 (bitconvert (v16i8 FPR128:$src))), (v2i64 FPR128:$src)>; +def : Pat<(v2i64 (bitconvert (v4f32 FPR128:$src))), (v2i64 FPR128:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(v2i64 (bitconvert (f128 FPR128:$src))), + (v2i64 (EXTv16i8 FPR128:$src, + FPR128:$src, (i32 8)))>; +def : Pat<(v2i64 (bitconvert (v4i32 FPR128:$src))), + (v2i64 (REV64v4i32 FPR128:$src))>; +def : Pat<(v2i64 (bitconvert (v8i16 FPR128:$src))), + (v2i64 (REV64v8i16 FPR128:$src))>; +def : Pat<(v2i64 (bitconvert (v16i8 FPR128:$src))), + (v2i64 (REV64v16i8 FPR128:$src))>; +def : Pat<(v2i64 (bitconvert (v4f32 FPR128:$src))), + (v2i64 (REV64v4i32 FPR128:$src))>; +} +def : Pat<(v2i64 (bitconvert (v2f64 FPR128:$src))), (v2i64 FPR128:$src)>; + +let Predicates = [IsLE] in { +def : Pat<(v4i32 (bitconvert (f128 FPR128:$src))), (v4i32 FPR128:$src)>; +def : Pat<(v4i32 (bitconvert (v2i64 FPR128:$src))), (v4i32 FPR128:$src)>; +def : Pat<(v4i32 (bitconvert (v8i16 FPR128:$src))), (v4i32 FPR128:$src)>; +def : Pat<(v4i32 (bitconvert (v16i8 FPR128:$src))), (v4i32 FPR128:$src)>; +def : Pat<(v4i32 (bitconvert (v2f64 FPR128:$src))), (v4i32 FPR128:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(v4i32 (bitconvert (f128 FPR128:$src))), + (v4i32 (EXTv16i8 (REV64v4i32 FPR128:$src), + (REV64v4i32 FPR128:$src), + (i32 8)))>; +def : Pat<(v4i32 (bitconvert (v2i64 FPR128:$src))), + (v4i32 (REV64v4i32 FPR128:$src))>; +def : Pat<(v4i32 (bitconvert (v8i16 FPR128:$src))), + (v4i32 (REV32v8i16 FPR128:$src))>; +def : Pat<(v4i32 (bitconvert (v16i8 FPR128:$src))), + (v4i32 (REV32v16i8 FPR128:$src))>; +def : Pat<(v4i32 (bitconvert (v2f64 FPR128:$src))), + (v4i32 (REV64v4i32 FPR128:$src))>; +} +def : Pat<(v4i32 (bitconvert (v4f32 FPR128:$src))), (v4i32 FPR128:$src)>; + +let Predicates = [IsLE] in { +def : Pat<(v8i16 (bitconvert (f128 FPR128:$src))), (v8i16 FPR128:$src)>; +def : Pat<(v8i16 (bitconvert (v2i64 FPR128:$src))), (v8i16 FPR128:$src)>; +def : Pat<(v8i16 (bitconvert (v4i32 FPR128:$src))), (v8i16 FPR128:$src)>; +def : Pat<(v8i16 (bitconvert (v16i8 FPR128:$src))), (v8i16 FPR128:$src)>; +def : Pat<(v8i16 (bitconvert (v2f64 FPR128:$src))), (v8i16 FPR128:$src)>; +def : Pat<(v8i16 (bitconvert (v4f32 FPR128:$src))), (v8i16 FPR128:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(v8i16 (bitconvert (f128 FPR128:$src))), + (v8i16 (EXTv16i8 (REV64v8i16 FPR128:$src), + (REV64v8i16 FPR128:$src), + (i32 8)))>; +def : Pat<(v8i16 (bitconvert (v2i64 FPR128:$src))), + (v8i16 (REV64v8i16 FPR128:$src))>; +def : Pat<(v8i16 (bitconvert (v4i32 FPR128:$src))), + (v8i16 (REV32v8i16 FPR128:$src))>; +def : Pat<(v8i16 (bitconvert (v16i8 FPR128:$src))), + (v8i16 (REV16v16i8 FPR128:$src))>; +def : Pat<(v8i16 (bitconvert (v2f64 FPR128:$src))), + (v8i16 (REV64v8i16 FPR128:$src))>; +def : Pat<(v8i16 (bitconvert (v4f32 FPR128:$src))), + (v8i16 (REV32v8i16 FPR128:$src))>; +} + +let Predicates = [IsLE] in { +def : Pat<(v16i8 (bitconvert (f128 FPR128:$src))), (v16i8 FPR128:$src)>; +def : Pat<(v16i8 (bitconvert (v2i64 FPR128:$src))), (v16i8 FPR128:$src)>; +def : Pat<(v16i8 (bitconvert (v4i32 FPR128:$src))), (v16i8 FPR128:$src)>; +def : Pat<(v16i8 (bitconvert (v8i16 FPR128:$src))), (v16i8 FPR128:$src)>; +def : Pat<(v16i8 (bitconvert (v2f64 FPR128:$src))), (v16i8 FPR128:$src)>; +def : Pat<(v16i8 (bitconvert (v4f32 FPR128:$src))), (v16i8 FPR128:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(v16i8 (bitconvert (f128 FPR128:$src))), + (v16i8 (EXTv16i8 (REV64v16i8 FPR128:$src), + (REV64v16i8 FPR128:$src), + (i32 8)))>; +def : Pat<(v16i8 (bitconvert 
(v2i64 FPR128:$src))), + (v16i8 (REV64v16i8 FPR128:$src))>; +def : Pat<(v16i8 (bitconvert (v4i32 FPR128:$src))), + (v16i8 (REV32v16i8 FPR128:$src))>; +def : Pat<(v16i8 (bitconvert (v8i16 FPR128:$src))), + (v16i8 (REV16v16i8 FPR128:$src))>; +def : Pat<(v16i8 (bitconvert (v2f64 FPR128:$src))), + (v16i8 (REV64v16i8 FPR128:$src))>; +def : Pat<(v16i8 (bitconvert (v4f32 FPR128:$src))), + (v16i8 (REV32v16i8 FPR128:$src))>; +} + +def : Pat<(v8i8 (extract_subvector (v16i8 FPR128:$Rn), (i64 1))), + (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>; +def : Pat<(v4i16 (extract_subvector (v8i16 FPR128:$Rn), (i64 1))), + (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>; +def : Pat<(v2i32 (extract_subvector (v4i32 FPR128:$Rn), (i64 1))), + (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>; +def : Pat<(v1i64 (extract_subvector (v2i64 FPR128:$Rn), (i64 1))), + (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>; + +// A 64-bit subvector insert to the first 128-bit vector position +// is a subregister copy that needs no instruction. +def : Pat<(insert_subvector undef, (v1i64 FPR64:$src), (i32 0)), + (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), FPR64:$src, dsub)>; +def : Pat<(insert_subvector undef, (v1f64 FPR64:$src), (i32 0)), + (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$src, dsub)>; +def : Pat<(insert_subvector undef, (v2i32 FPR64:$src), (i32 0)), + (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$src, dsub)>; +def : Pat<(insert_subvector undef, (v2f32 FPR64:$src), (i32 0)), + (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR64:$src, dsub)>; +def : Pat<(insert_subvector undef, (v4i16 FPR64:$src), (i32 0)), + (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR64:$src, dsub)>; +def : Pat<(insert_subvector undef, (v8i8 FPR64:$src), (i32 0)), + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), FPR64:$src, dsub)>; + +// Use pair-wise add instructions when summing up the lanes for v2f64, v2i64 +// or v2f32. +def : Pat<(i64 (add (vector_extract (v2i64 FPR128:$Rn), (i64 0)), + (vector_extract (v2i64 FPR128:$Rn), (i64 1)))), + (i64 (ADDPv2i64p (v2i64 FPR128:$Rn)))>; +def : Pat<(f64 (fadd (vector_extract (v2f64 FPR128:$Rn), (i64 0)), + (vector_extract (v2f64 FPR128:$Rn), (i64 1)))), + (f64 (FADDPv2i64p (v2f64 FPR128:$Rn)))>; + // vector_extract on 64-bit vectors gets promoted to a 128 bit vector, + // so we match on v4f32 here, not v2f32. This will also catch adding + // the low two lanes of a true v4f32 vector. +def : Pat<(fadd (vector_extract (v4f32 FPR128:$Rn), (i64 0)), + (vector_extract (v4f32 FPR128:$Rn), (i64 1))), + (f32 (FADDPv2i32p (EXTRACT_SUBREG FPR128:$Rn, dsub)))>; + +// Scalar 64-bit shifts in FPR64 registers. +def : Pat<(i64 (int_aarch64_neon_sshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))), + (SSHLv1i64 FPR64:$Rn, FPR64:$Rm)>; +def : Pat<(i64 (int_aarch64_neon_ushl (i64 FPR64:$Rn), (i64 FPR64:$Rm))), + (USHLv1i64 FPR64:$Rn, FPR64:$Rm)>; +def : Pat<(i64 (int_aarch64_neon_srshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))), + (SRSHLv1i64 FPR64:$Rn, FPR64:$Rm)>; +def : Pat<(i64 (int_aarch64_neon_urshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))), + (URSHLv1i64 FPR64:$Rn, FPR64:$Rm)>; + +// Tail call return handling. These are all compiler pseudo-instructions, +// so no encoding information or anything like that. 
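+// For illustration (a sketch, not from the patch): these pseudos cover sibling
+// calls such as
+//   int callee(int);
+//   int caller(int x) { return callee(x + 1); }
+// where, when tail call optimisation applies, the call is emitted as a plain
+// "b callee"; TCRETURNdi handles the direct (symbol) case and TCRETURNri the
+// case where the target address is in a register.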
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in {
+ def TCRETURNdi : Pseudo<(outs), (ins i64imm:$dst, i32imm:$FPDiff),[]>;
+ def TCRETURNri : Pseudo<(outs), (ins tcGPR64:$dst, i32imm:$FPDiff), []>;
+}
+
+def : Pat<(AArch64tcret tcGPR64:$dst, (i32 timm:$FPDiff)),
+          (TCRETURNri tcGPR64:$dst, imm:$FPDiff)>;
+def : Pat<(AArch64tcret tglobaladdr:$dst, (i32 timm:$FPDiff)),
+          (TCRETURNdi texternalsym:$dst, imm:$FPDiff)>;
+def : Pat<(AArch64tcret texternalsym:$dst, (i32 timm:$FPDiff)),
+          (TCRETURNdi texternalsym:$dst, imm:$FPDiff)>;
+
+include "AArch64InstrAtomics.td"
diff --git a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
new file mode 100644
index 00000000000..e7454be125b
--- /dev/null
+++ b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -0,0 +1,942 @@
+//=- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that performs load / store related peephole
+// optimizations. This pass should be run after register allocation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64InstrInfo.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/Statistic.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-ldst-opt"
+
+/// AArch64LoadStoreOpt - Post-register allocation pass to combine
+/// load / store instructions to form ldp / stp instructions.
+
+STATISTIC(NumPairCreated, "Number of load/store pair instructions generated");
+STATISTIC(NumPostFolded, "Number of post-index updates folded");
+STATISTIC(NumPreFolded, "Number of pre-index updates folded");
+STATISTIC(NumUnscaledPairCreated,
+          "Number of load/store from unscaled generated");
+
+static cl::opt<unsigned> ScanLimit("aarch64-load-store-scan-limit", cl::init(20),
+                                   cl::Hidden);
+
+// Placeholder while testing unscaled load/store combining
+static cl::opt<bool>
+EnableAArch64UnscaledMemOp("aarch64-unscaled-mem-op", cl::Hidden,
+                           cl::desc("Allow AArch64 unscaled load/store combining"),
+                           cl::init(true));
+
+namespace {
+struct AArch64LoadStoreOpt : public MachineFunctionPass {
+  static char ID;
+  AArch64LoadStoreOpt() : MachineFunctionPass(ID) {}
+
+  const AArch64InstrInfo *TII;
+  const TargetRegisterInfo *TRI;
+
+  // Scan the instructions looking for a load/store that can be combined
+  // with the current instruction into a load/store pair.
+  // Return the matching instruction if one is found, else MBB->end().
+  // If a matching instruction is found, mergeForward is set to true if the
+  // merge is to remove the first instruction and replace the second with
+  // a pair-wise insn, and false if the reverse is true.
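+  // For illustration (a sketch, not from the original patch): given
+  //   ldr x3, [x2]        <- current instruction I
+  //   add x5, x4, x3
+  //   ldr x6, [x2, #8]    <- iterator returned, mergeForward == false
+  // the two loads can later be rewritten as "ldp x3, x6, [x2]" at the
+  // position of the first load.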
+ MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I, + bool &mergeForward, + unsigned Limit); + // Merge the two instructions indicated into a single pair-wise instruction. + // If mergeForward is true, erase the first instruction and fold its + // operation into the second. If false, the reverse. Return the instruction + // following the first instruction (which may change during processing). + MachineBasicBlock::iterator + mergePairedInsns(MachineBasicBlock::iterator I, + MachineBasicBlock::iterator Paired, bool mergeForward); + + // Scan the instruction list to find a base register update that can + // be combined with the current instruction (a load or store) using + // pre or post indexed addressing with writeback. Scan forwards. + MachineBasicBlock::iterator + findMatchingUpdateInsnForward(MachineBasicBlock::iterator I, unsigned Limit, + int Value); + + // Scan the instruction list to find a base register update that can + // be combined with the current instruction (a load or store) using + // pre or post indexed addressing with writeback. Scan backwards. + MachineBasicBlock::iterator + findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit); + + // Merge a pre-index base register update into a ld/st instruction. + MachineBasicBlock::iterator + mergePreIdxUpdateInsn(MachineBasicBlock::iterator I, + MachineBasicBlock::iterator Update); + + // Merge a post-index base register update into a ld/st instruction. + MachineBasicBlock::iterator + mergePostIdxUpdateInsn(MachineBasicBlock::iterator I, + MachineBasicBlock::iterator Update); + + bool optimizeBlock(MachineBasicBlock &MBB); + + bool runOnMachineFunction(MachineFunction &Fn) override; + + const char *getPassName() const override { + return "AArch64 load / store optimization pass"; + } + +private: + int getMemSize(MachineInstr *MemMI); +}; +char AArch64LoadStoreOpt::ID = 0; +} + +static bool isUnscaledLdst(unsigned Opc) { + switch (Opc) { + default: + return false; + case AArch64::STURSi: + return true; + case AArch64::STURDi: + return true; + case AArch64::STURQi: + return true; + case AArch64::STURWi: + return true; + case AArch64::STURXi: + return true; + case AArch64::LDURSi: + return true; + case AArch64::LDURDi: + return true; + case AArch64::LDURQi: + return true; + case AArch64::LDURWi: + return true; + case AArch64::LDURXi: + return true; + } +} + +// Size in bytes of the data moved by an unscaled load or store +int AArch64LoadStoreOpt::getMemSize(MachineInstr *MemMI) { + switch (MemMI->getOpcode()) { + default: + llvm_unreachable("Opcode has has unknown size!"); + case AArch64::STRSui: + case AArch64::STURSi: + return 4; + case AArch64::STRDui: + case AArch64::STURDi: + return 8; + case AArch64::STRQui: + case AArch64::STURQi: + return 16; + case AArch64::STRWui: + case AArch64::STURWi: + return 4; + case AArch64::STRXui: + case AArch64::STURXi: + return 8; + case AArch64::LDRSui: + case AArch64::LDURSi: + return 4; + case AArch64::LDRDui: + case AArch64::LDURDi: + return 8; + case AArch64::LDRQui: + case AArch64::LDURQi: + return 16; + case AArch64::LDRWui: + case AArch64::LDURWi: + return 4; + case AArch64::LDRXui: + case AArch64::LDURXi: + return 8; + } +} + +static unsigned getMatchingPairOpcode(unsigned Opc) { + switch (Opc) { + default: + llvm_unreachable("Opcode has no pairwise equivalent!"); + case AArch64::STRSui: + case AArch64::STURSi: + return AArch64::STPSi; + case AArch64::STRDui: + case AArch64::STURDi: + return AArch64::STPDi; + case AArch64::STRQui: + case 
AArch64::STURQi: + return AArch64::STPQi; + case AArch64::STRWui: + case AArch64::STURWi: + return AArch64::STPWi; + case AArch64::STRXui: + case AArch64::STURXi: + return AArch64::STPXi; + case AArch64::LDRSui: + case AArch64::LDURSi: + return AArch64::LDPSi; + case AArch64::LDRDui: + case AArch64::LDURDi: + return AArch64::LDPDi; + case AArch64::LDRQui: + case AArch64::LDURQi: + return AArch64::LDPQi; + case AArch64::LDRWui: + case AArch64::LDURWi: + return AArch64::LDPWi; + case AArch64::LDRXui: + case AArch64::LDURXi: + return AArch64::LDPXi; + } +} + +static unsigned getPreIndexedOpcode(unsigned Opc) { + switch (Opc) { + default: + llvm_unreachable("Opcode has no pre-indexed equivalent!"); + case AArch64::STRSui: return AArch64::STRSpre; + case AArch64::STRDui: return AArch64::STRDpre; + case AArch64::STRQui: return AArch64::STRQpre; + case AArch64::STRWui: return AArch64::STRWpre; + case AArch64::STRXui: return AArch64::STRXpre; + case AArch64::LDRSui: return AArch64::LDRSpre; + case AArch64::LDRDui: return AArch64::LDRDpre; + case AArch64::LDRQui: return AArch64::LDRQpre; + case AArch64::LDRWui: return AArch64::LDRWpre; + case AArch64::LDRXui: return AArch64::LDRXpre; + } +} + +static unsigned getPostIndexedOpcode(unsigned Opc) { + switch (Opc) { + default: + llvm_unreachable("Opcode has no post-indexed wise equivalent!"); + case AArch64::STRSui: + return AArch64::STRSpost; + case AArch64::STRDui: + return AArch64::STRDpost; + case AArch64::STRQui: + return AArch64::STRQpost; + case AArch64::STRWui: + return AArch64::STRWpost; + case AArch64::STRXui: + return AArch64::STRXpost; + case AArch64::LDRSui: + return AArch64::LDRSpost; + case AArch64::LDRDui: + return AArch64::LDRDpost; + case AArch64::LDRQui: + return AArch64::LDRQpost; + case AArch64::LDRWui: + return AArch64::LDRWpost; + case AArch64::LDRXui: + return AArch64::LDRXpost; + } +} + +MachineBasicBlock::iterator +AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, + MachineBasicBlock::iterator Paired, + bool mergeForward) { + MachineBasicBlock::iterator NextI = I; + ++NextI; + // If NextI is the second of the two instructions to be merged, we need + // to skip one further. Either way we merge will invalidate the iterator, + // and we don't need to scan the new instruction, as it's a pairwise + // instruction, which we're not considering for further action anyway. + if (NextI == Paired) + ++NextI; + + bool IsUnscaled = isUnscaledLdst(I->getOpcode()); + int OffsetStride = + IsUnscaled && EnableAArch64UnscaledMemOp ? getMemSize(I) : 1; + + unsigned NewOpc = getMatchingPairOpcode(I->getOpcode()); + // Insert our new paired instruction after whichever of the paired + // instructions mergeForward indicates. + MachineBasicBlock::iterator InsertionPoint = mergeForward ? Paired : I; + // Also based on mergeForward is from where we copy the base register operand + // so we get the flags compatible with the input code. + MachineOperand &BaseRegOp = + mergeForward ? Paired->getOperand(1) : I->getOperand(1); + + // Which register is Rt and which is Rt2 depends on the offset order. + MachineInstr *RtMI, *Rt2MI; + if (I->getOperand(2).getImm() == + Paired->getOperand(2).getImm() + OffsetStride) { + RtMI = Paired; + Rt2MI = I; + } else { + RtMI = I; + Rt2MI = Paired; + } + // Handle Unscaled + int OffsetImm = RtMI->getOperand(2).getImm(); + if (IsUnscaled && EnableAArch64UnscaledMemOp) + OffsetImm /= OffsetStride; + + // Construct the new instruction. 
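+  // As a concrete sketch (illustrative only): pairing
+  //   ldr w1, [x0, #4]
+  //   ldr w2, [x0, #8]
+  // builds "ldp w1, w2, [x0, #4]", with Rt/Rt2 ordered by ascending offset as
+  // computed above.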
+ MachineInstrBuilder MIB = BuildMI(*I->getParent(), InsertionPoint, + I->getDebugLoc(), TII->get(NewOpc)) + .addOperand(RtMI->getOperand(0)) + .addOperand(Rt2MI->getOperand(0)) + .addOperand(BaseRegOp) + .addImm(OffsetImm); + (void)MIB; + + // FIXME: Do we need/want to copy the mem operands from the source + // instructions? Probably. What uses them after this? + + DEBUG(dbgs() << "Creating pair load/store. Replacing instructions:\n "); + DEBUG(I->print(dbgs())); + DEBUG(dbgs() << " "); + DEBUG(Paired->print(dbgs())); + DEBUG(dbgs() << " with instruction:\n "); + DEBUG(((MachineInstr *)MIB)->print(dbgs())); + DEBUG(dbgs() << "\n"); + + // Erase the old instructions. + I->eraseFromParent(); + Paired->eraseFromParent(); + + return NextI; +} + +/// trackRegDefsUses - Remember what registers the specified instruction uses +/// and modifies. +static void trackRegDefsUses(MachineInstr *MI, BitVector &ModifiedRegs, + BitVector &UsedRegs, + const TargetRegisterInfo *TRI) { + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + MachineOperand &MO = MI->getOperand(i); + if (MO.isRegMask()) + ModifiedRegs.setBitsNotInMask(MO.getRegMask()); + + if (!MO.isReg()) + continue; + unsigned Reg = MO.getReg(); + if (MO.isDef()) { + for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) + ModifiedRegs.set(*AI); + } else { + assert(MO.isUse() && "Reg operand not a def and not a use?!?"); + for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) + UsedRegs.set(*AI); + } + } +} + +static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) { + if (!IsUnscaled && (Offset > 63 || Offset < -64)) + return false; + if (IsUnscaled) { + // Convert the byte-offset used by unscaled into an "element" offset used + // by the scaled pair load/store instructions. + int elemOffset = Offset / OffsetStride; + if (elemOffset > 63 || elemOffset < -64) + return false; + } + return true; +} + +// Do alignment, specialized to power of 2 and for signed ints, +// avoiding having to do a C-style cast from uint_64t to int when +// using RoundUpToAlignment from include/llvm/Support/MathExtras.h. +// FIXME: Move this function to include/MathExtras.h? +static int alignTo(int Num, int PowOf2) { + return (Num + PowOf2 - 1) & ~(PowOf2 - 1); +} + +/// findMatchingInsn - Scan the instructions looking for a load/store that can +/// be combined with the current instruction into a load/store pair. +MachineBasicBlock::iterator +AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, + bool &mergeForward, unsigned Limit) { + MachineBasicBlock::iterator E = I->getParent()->end(); + MachineBasicBlock::iterator MBBI = I; + MachineInstr *FirstMI = I; + ++MBBI; + + int Opc = FirstMI->getOpcode(); + bool mayLoad = FirstMI->mayLoad(); + bool IsUnscaled = isUnscaledLdst(Opc); + unsigned Reg = FirstMI->getOperand(0).getReg(); + unsigned BaseReg = FirstMI->getOperand(1).getReg(); + int Offset = FirstMI->getOperand(2).getImm(); + + // Early exit if the first instruction modifies the base register. + // e.g., ldr x0, [x0] + // Early exit if the offset if not possible to match. (6 bits of positive + // range, plus allow an extra one in case we find a later insn that matches + // with Offset-1 + if (FirstMI->modifiesRegister(BaseReg, TRI)) + return E; + int OffsetStride = + IsUnscaled && EnableAArch64UnscaledMemOp ? 
getMemSize(FirstMI) : 1; + if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride)) + return E; + + // Track which registers have been modified and used between the first insn + // (inclusive) and the second insn. + BitVector ModifiedRegs, UsedRegs; + ModifiedRegs.resize(TRI->getNumRegs()); + UsedRegs.resize(TRI->getNumRegs()); + for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) { + MachineInstr *MI = MBBI; + // Skip DBG_VALUE instructions. Otherwise debug info can affect the + // optimization by changing how far we scan. + if (MI->isDebugValue()) + continue; + + // Now that we know this is a real instruction, count it. + ++Count; + + if (Opc == MI->getOpcode() && MI->getOperand(2).isImm()) { + // If we've found another instruction with the same opcode, check to see + // if the base and offset are compatible with our starting instruction. + // These instructions all have scaled immediate operands, so we just + // check for +1/-1. Make sure to check the new instruction offset is + // actually an immediate and not a symbolic reference destined for + // a relocation. + // + // Pairwise instructions have a 7-bit signed offset field. Single insns + // have a 12-bit unsigned offset field. To be a valid combine, the + // final offset must be in range. + unsigned MIBaseReg = MI->getOperand(1).getReg(); + int MIOffset = MI->getOperand(2).getImm(); + if (BaseReg == MIBaseReg && ((Offset == MIOffset + OffsetStride) || + (Offset + OffsetStride == MIOffset))) { + int MinOffset = Offset < MIOffset ? Offset : MIOffset; + // If this is a volatile load/store that otherwise matched, stop looking + // as something is going on that we don't have enough information to + // safely transform. Similarly, stop if we see a hint to avoid pairs. + if (MI->hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI)) + return E; + // If the resultant immediate offset of merging these instructions + // is out of range for a pairwise instruction, bail and keep looking. + bool MIIsUnscaled = isUnscaledLdst(MI->getOpcode()); + if (!inBoundsForPair(MIIsUnscaled, MinOffset, OffsetStride)) { + trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); + continue; + } + // If the alignment requirements of the paired (scaled) instruction + // can't express the offset of the unscaled input, bail and keep + // looking. + if (IsUnscaled && EnableAArch64UnscaledMemOp && + (alignTo(MinOffset, OffsetStride) != MinOffset)) { + trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); + continue; + } + // If the destination register of the loads is the same register, bail + // and keep looking. A load-pair instruction with both destination + // registers the same is UNPREDICTABLE and will result in an exception. + if (mayLoad && Reg == MI->getOperand(0).getReg()) { + trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); + continue; + } + + // If the Rt of the second instruction was not modified or used between + // the two instructions, we can combine the second into the first. + if (!ModifiedRegs[MI->getOperand(0).getReg()] && + !UsedRegs[MI->getOperand(0).getReg()]) { + mergeForward = false; + return MBBI; + } + + // Likewise, if the Rt of the first instruction is not modified or used + // between the two instructions, we can combine the first into the + // second. + if (!ModifiedRegs[FirstMI->getOperand(0).getReg()] && + !UsedRegs[FirstMI->getOperand(0).getReg()]) { + mergeForward = true; + return MBBI; + } + // Unable to combine these instructions due to interference in between. + // Keep looking. 
+ } + } + + // If the instruction wasn't a matching load or store, but does (or can) + // modify memory, stop searching, as we don't have alias analysis or + // anything like that to tell us whether the access is tromping on the + // locations we care about. The big one we want to catch is calls. + // + // FIXME: Theoretically, we can do better than that for SP and FP based + // references since we can effectively know where those are touching. It's + // unclear if it's worth the extra code, though. Most paired instructions + // will be sequential, perhaps with a few intervening non-memory related + // instructions. + if (MI->mayStore() || MI->isCall()) + return E; + // Likewise, if we're matching a store instruction, we don't want to + // move across a load, as it may be reading the same location. + if (FirstMI->mayStore() && MI->mayLoad()) + return E; + + // Update modified / uses register lists. + trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); + + // Otherwise, if the base register is modified, we have no match, so + // return early. + if (ModifiedRegs[BaseReg]) + return E; + } + return E; +} + +MachineBasicBlock::iterator +AArch64LoadStoreOpt::mergePreIdxUpdateInsn(MachineBasicBlock::iterator I, + MachineBasicBlock::iterator Update) { + assert((Update->getOpcode() == AArch64::ADDXri || + Update->getOpcode() == AArch64::SUBXri) && + "Unexpected base register update instruction to merge!"); + MachineBasicBlock::iterator NextI = I; + // Return the instruction following the merged instruction, which is + // the instruction following our unmerged load. Unless that's the add/sub + // instruction we're merging, in which case it's the one after that. + if (++NextI == Update) + ++NextI; + + int Value = Update->getOperand(2).getImm(); + assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 && + "Can't merge 1 << 12 offset into pre-indexed load / store"); + if (Update->getOpcode() == AArch64::SUBXri) + Value = -Value; + + unsigned NewOpc = getPreIndexedOpcode(I->getOpcode()); + MachineInstrBuilder MIB = + BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) + .addOperand(Update->getOperand(0)) + .addOperand(I->getOperand(0)) + .addOperand(I->getOperand(1)) + .addImm(Value); + (void)MIB; + + DEBUG(dbgs() << "Creating pre-indexed load/store."); + DEBUG(dbgs() << " Replacing instructions:\n "); + DEBUG(I->print(dbgs())); + DEBUG(dbgs() << " "); + DEBUG(Update->print(dbgs())); + DEBUG(dbgs() << " with instruction:\n "); + DEBUG(((MachineInstr *)MIB)->print(dbgs())); + DEBUG(dbgs() << "\n"); + + // Erase the old instructions for the block. + I->eraseFromParent(); + Update->eraseFromParent(); + + return NextI; +} + +MachineBasicBlock::iterator AArch64LoadStoreOpt::mergePostIdxUpdateInsn( + MachineBasicBlock::iterator I, MachineBasicBlock::iterator Update) { + assert((Update->getOpcode() == AArch64::ADDXri || + Update->getOpcode() == AArch64::SUBXri) && + "Unexpected base register update instruction to merge!"); + MachineBasicBlock::iterator NextI = I; + // Return the instruction following the merged instruction, which is + // the instruction following our unmerged load. Unless that's the add/sub + // instruction we're merging, in which case it's the one after that. 
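+  // Sketch of the rewrite this routine performs (cf. the example in
+  // optimizeBlock):
+  //   ldr x0, [x2]
+  //   add x2, x2, #4
+  // becomes the post-indexed form
+  //   ldr x0, [x2], #4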
+ if (++NextI == Update) + ++NextI; + + int Value = Update->getOperand(2).getImm(); + assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 && + "Can't merge 1 << 12 offset into post-indexed load / store"); + if (Update->getOpcode() == AArch64::SUBXri) + Value = -Value; + + unsigned NewOpc = getPostIndexedOpcode(I->getOpcode()); + MachineInstrBuilder MIB = + BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) + .addOperand(Update->getOperand(0)) + .addOperand(I->getOperand(0)) + .addOperand(I->getOperand(1)) + .addImm(Value); + (void)MIB; + + DEBUG(dbgs() << "Creating post-indexed load/store."); + DEBUG(dbgs() << " Replacing instructions:\n "); + DEBUG(I->print(dbgs())); + DEBUG(dbgs() << " "); + DEBUG(Update->print(dbgs())); + DEBUG(dbgs() << " with instruction:\n "); + DEBUG(((MachineInstr *)MIB)->print(dbgs())); + DEBUG(dbgs() << "\n"); + + // Erase the old instructions for the block. + I->eraseFromParent(); + Update->eraseFromParent(); + + return NextI; +} + +static bool isMatchingUpdateInsn(MachineInstr *MI, unsigned BaseReg, + int Offset) { + switch (MI->getOpcode()) { + default: + break; + case AArch64::SUBXri: + // Negate the offset for a SUB instruction. + Offset *= -1; + // FALLTHROUGH + case AArch64::ADDXri: + // Make sure it's a vanilla immediate operand, not a relocation or + // anything else we can't handle. + if (!MI->getOperand(2).isImm()) + break; + // Watch out for 1 << 12 shifted value. + if (AArch64_AM::getShiftValue(MI->getOperand(3).getImm())) + break; + // If the instruction has the base register as source and dest and the + // immediate will fit in a signed 9-bit integer, then we have a match. + if (MI->getOperand(0).getReg() == BaseReg && + MI->getOperand(1).getReg() == BaseReg && + MI->getOperand(2).getImm() <= 255 && + MI->getOperand(2).getImm() >= -256) { + // If we have a non-zero Offset, we check that it matches the amount + // we're adding to the register. + if (!Offset || Offset == MI->getOperand(2).getImm()) + return true; + } + break; + } + return false; +} + +MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward( + MachineBasicBlock::iterator I, unsigned Limit, int Value) { + MachineBasicBlock::iterator E = I->getParent()->end(); + MachineInstr *MemMI = I; + MachineBasicBlock::iterator MBBI = I; + const MachineFunction &MF = *MemMI->getParent()->getParent(); + + unsigned DestReg = MemMI->getOperand(0).getReg(); + unsigned BaseReg = MemMI->getOperand(1).getReg(); + int Offset = MemMI->getOperand(2).getImm() * + TII->getRegClass(MemMI->getDesc(), 0, TRI, MF)->getSize(); + + // If the base register overlaps the destination register, we can't + // merge the update. + if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg)) + return E; + + // Scan forward looking for post-index opportunities. + // Updating instructions can't be formed if the memory insn already + // has an offset other than the value we're looking for. + if (Offset != Value) + return E; + + // Track which registers have been modified and used between the first insn + // (inclusive) and the second insn. + BitVector ModifiedRegs, UsedRegs; + ModifiedRegs.resize(TRI->getNumRegs()); + UsedRegs.resize(TRI->getNumRegs()); + ++MBBI; + for (unsigned Count = 0; MBBI != E; ++MBBI) { + MachineInstr *MI = MBBI; + // Skip DBG_VALUE instructions. Otherwise debug info can affect the + // optimization by changing how far we scan. + if (MI->isDebugValue()) + continue; + + // Now that we know this is a real instruction, count it. 
+ ++Count; + + // If we found a match, return it. + if (isMatchingUpdateInsn(MI, BaseReg, Value)) + return MBBI; + + // Update the status of what the instruction clobbered and used. + trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); + + // Otherwise, if the base register is used or modified, we have no match, so + // return early. + if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg]) + return E; + } + return E; +} + +MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward( + MachineBasicBlock::iterator I, unsigned Limit) { + MachineBasicBlock::iterator B = I->getParent()->begin(); + MachineBasicBlock::iterator E = I->getParent()->end(); + MachineInstr *MemMI = I; + MachineBasicBlock::iterator MBBI = I; + const MachineFunction &MF = *MemMI->getParent()->getParent(); + + unsigned DestReg = MemMI->getOperand(0).getReg(); + unsigned BaseReg = MemMI->getOperand(1).getReg(); + int Offset = MemMI->getOperand(2).getImm(); + unsigned RegSize = TII->getRegClass(MemMI->getDesc(), 0, TRI, MF)->getSize(); + + // If the load/store is the first instruction in the block, there's obviously + // not any matching update. Ditto if the memory offset isn't zero. + if (MBBI == B || Offset != 0) + return E; + // If the base register overlaps the destination register, we can't + // merge the update. + if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg)) + return E; + + // Track which registers have been modified and used between the first insn + // (inclusive) and the second insn. + BitVector ModifiedRegs, UsedRegs; + ModifiedRegs.resize(TRI->getNumRegs()); + UsedRegs.resize(TRI->getNumRegs()); + --MBBI; + for (unsigned Count = 0; MBBI != B; --MBBI) { + MachineInstr *MI = MBBI; + // Skip DBG_VALUE instructions. Otherwise debug info can affect the + // optimization by changing how far we scan. + if (MI->isDebugValue()) + continue; + + // Now that we know this is a real instruction, count it. + ++Count; + + // If we found a match, return it. + if (isMatchingUpdateInsn(MI, BaseReg, RegSize)) + return MBBI; + + // Update the status of what the instruction clobbered and used. + trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); + + // Otherwise, if the base register is used or modified, we have no match, so + // return early. + if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg]) + return E; + } + return E; +} + +bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) { + bool Modified = false; + // Two tranformations to do here: + // 1) Find loads and stores that can be merged into a single load or store + // pair instruction. + // e.g., + // ldr x0, [x2] + // ldr x1, [x2, #8] + // ; becomes + // ldp x0, x1, [x2] + // 2) Find base register updates that can be merged into the load or store + // as a base-reg writeback. + // e.g., + // ldr x0, [x2] + // add x2, x2, #4 + // ; becomes + // ldr x0, [x2], #4 + + for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); + MBBI != E;) { + MachineInstr *MI = MBBI; + switch (MI->getOpcode()) { + default: + // Just move on to the next instruction. 
+ ++MBBI; + break; + case AArch64::STRSui: + case AArch64::STRDui: + case AArch64::STRQui: + case AArch64::STRXui: + case AArch64::STRWui: + case AArch64::LDRSui: + case AArch64::LDRDui: + case AArch64::LDRQui: + case AArch64::LDRXui: + case AArch64::LDRWui: + // do the unscaled versions as well + case AArch64::STURSi: + case AArch64::STURDi: + case AArch64::STURQi: + case AArch64::STURWi: + case AArch64::STURXi: + case AArch64::LDURSi: + case AArch64::LDURDi: + case AArch64::LDURQi: + case AArch64::LDURWi: + case AArch64::LDURXi: { + // If this is a volatile load/store, don't mess with it. + if (MI->hasOrderedMemoryRef()) { + ++MBBI; + break; + } + // Make sure this is a reg+imm (as opposed to an address reloc). + if (!MI->getOperand(2).isImm()) { + ++MBBI; + break; + } + // Check if this load/store has a hint to avoid pair formation. + // MachineMemOperands hints are set by the AArch64StorePairSuppress pass. + if (TII->isLdStPairSuppressed(MI)) { + ++MBBI; + break; + } + // Look ahead up to ScanLimit instructions for a pairable instruction. + bool mergeForward = false; + MachineBasicBlock::iterator Paired = + findMatchingInsn(MBBI, mergeForward, ScanLimit); + if (Paired != E) { + // Merge the loads into a pair. Keeping the iterator straight is a + // pain, so we let the merge routine tell us what the next instruction + // is after it's done mucking about. + MBBI = mergePairedInsns(MBBI, Paired, mergeForward); + + Modified = true; + ++NumPairCreated; + if (isUnscaledLdst(MI->getOpcode())) + ++NumUnscaledPairCreated; + break; + } + ++MBBI; + break; + } + // FIXME: Do the other instructions. + } + } + + for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); + MBBI != E;) { + MachineInstr *MI = MBBI; + // Do update merging. It's simpler to keep this separate from the above + // switch, though not strictly necessary. + int Opc = MI->getOpcode(); + switch (Opc) { + default: + // Just move on to the next instruction. + ++MBBI; + break; + case AArch64::STRSui: + case AArch64::STRDui: + case AArch64::STRQui: + case AArch64::STRXui: + case AArch64::STRWui: + case AArch64::LDRSui: + case AArch64::LDRDui: + case AArch64::LDRQui: + case AArch64::LDRXui: + case AArch64::LDRWui: + // do the unscaled versions as well + case AArch64::STURSi: + case AArch64::STURDi: + case AArch64::STURQi: + case AArch64::STURWi: + case AArch64::STURXi: + case AArch64::LDURSi: + case AArch64::LDURDi: + case AArch64::LDURQi: + case AArch64::LDURWi: + case AArch64::LDURXi: { + // Make sure this is a reg+imm (as opposed to an address reloc). + if (!MI->getOperand(2).isImm()) { + ++MBBI; + break; + } + // Look ahead up to ScanLimit instructions for a mergable instruction. + MachineBasicBlock::iterator Update = + findMatchingUpdateInsnForward(MBBI, ScanLimit, 0); + if (Update != E) { + // Merge the update into the ld/st. + MBBI = mergePostIdxUpdateInsn(MBBI, Update); + Modified = true; + ++NumPostFolded; + break; + } + // Don't know how to handle pre/post-index versions, so move to the next + // instruction. + if (isUnscaledLdst(Opc)) { + ++MBBI; + break; + } + + // Look back to try to find a pre-index instruction. For example, + // add x0, x0, #8 + // ldr x1, [x0] + // merged into: + // ldr x1, [x0, #8]! + Update = findMatchingUpdateInsnBackward(MBBI, ScanLimit); + if (Update != E) { + // Merge the update into the ld/st. + MBBI = mergePreIdxUpdateInsn(MBBI, Update); + Modified = true; + ++NumPreFolded; + break; + } + + // Look forward to try to find a post-index instruction. 
For example,
+      // ldr x1, [x0, #64]
+      // add x0, x0, #64
+      // merged into:
+      // ldr x1, [x0, #64]!
+
+      // The immediate in the load/store is scaled by the size of the register
+      // being loaded. The immediate in the add we're looking for,
+      // however, is not, so adjust here.
+      int Value = MI->getOperand(2).getImm() *
+                  TII->getRegClass(MI->getDesc(), 0, TRI, *(MBB.getParent()))
+                      ->getSize();
+      Update = findMatchingUpdateInsnForward(MBBI, ScanLimit, Value);
+      if (Update != E) {
+        // Merge the update into the ld/st.
+        MBBI = mergePreIdxUpdateInsn(MBBI, Update);
+        Modified = true;
+        ++NumPreFolded;
+        break;
+      }
+
+      // Nothing found. Just move to the next instruction.
+      ++MBBI;
+      break;
+    }
+    // FIXME: Do the other instructions.
+    }
+  }
+
+  return Modified;
+}
+
+bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
+  const TargetMachine &TM = Fn.getTarget();
+  TII = static_cast<const AArch64InstrInfo *>(TM.getInstrInfo());
+  TRI = TM.getRegisterInfo();
+
+  bool Modified = false;
+  for (auto &MBB : Fn)
+    Modified |= optimizeBlock(MBB);
+
+  return Modified;
+}
+
+// FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep
+// loads and stores near one another?
+
+/// createAArch64LoadStoreOptimizationPass - returns an instance of the
+/// load / store optimization pass.
+FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() {
+  return new AArch64LoadStoreOpt();
+}
diff --git a/lib/Target/AArch64/AArch64MCInstLower.cpp b/lib/Target/AArch64/AArch64MCInstLower.cpp
new file mode 100644
index 00000000000..ab6d37532a7
--- /dev/null
+++ b/lib/Target/AArch64/AArch64MCInstLower.cpp
@@ -0,0 +1,202 @@
+//==-- AArch64MCInstLower.cpp - Convert AArch64 MachineInstr to an MCInst --==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains code to lower AArch64 MachineInstrs to their corresponding
+// MCInst records.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64MCInstLower.h"
+#include "MCTargetDesc/AArch64MCExpr.h"
+#include "Utils/AArch64BaseInfo.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+AArch64MCInstLower::AArch64MCInstLower(MCContext &ctx, Mangler &mang,
+                                       AsmPrinter &printer)
+    : Ctx(ctx), Printer(printer), TargetTriple(printer.getTargetTriple()) {}
+
+MCSymbol *
+AArch64MCInstLower::GetGlobalAddressSymbol(const MachineOperand &MO) const {
+  return Printer.getSymbol(MO.getGlobal());
+}
+
+MCSymbol *
+AArch64MCInstLower::GetExternalSymbolSymbol(const MachineOperand &MO) const {
+  return Printer.GetExternalSymbolSymbol(MO.getSymbolName());
+}
+
+MCOperand AArch64MCInstLower::lowerSymbolOperandDarwin(const MachineOperand &MO,
+                                                       MCSymbol *Sym) const {
+  // FIXME: We would like an efficient form for this, so we don't have to do a
+  // lot of extra uniquing.
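+  // For illustration (a sketch): a Darwin GOT access such as
+  //   adrp x0, _var@GOTPAGE
+  //   ldr  x0, [x0, _var@GOTPAGEOFF]
+  // corresponds to MO_GOT|MO_PAGE -> VK_GOTPAGE and MO_GOT|MO_PAGEOFF ->
+  // VK_GOTPAGEOFF in the mapping below.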
+ MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_None; + if ((MO.getTargetFlags() & AArch64II::MO_GOT) != 0) { + if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_PAGE) + RefKind = MCSymbolRefExpr::VK_GOTPAGE; + else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == + AArch64II::MO_PAGEOFF) + RefKind = MCSymbolRefExpr::VK_GOTPAGEOFF; + else + assert(0 && "Unexpected target flags with MO_GOT on GV operand"); + } else if ((MO.getTargetFlags() & AArch64II::MO_TLS) != 0) { + if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_PAGE) + RefKind = MCSymbolRefExpr::VK_TLVPPAGE; + else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == + AArch64II::MO_PAGEOFF) + RefKind = MCSymbolRefExpr::VK_TLVPPAGEOFF; + else + llvm_unreachable("Unexpected target flags with MO_TLS on GV operand"); + } else { + if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_PAGE) + RefKind = MCSymbolRefExpr::VK_PAGE; + else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == + AArch64II::MO_PAGEOFF) + RefKind = MCSymbolRefExpr::VK_PAGEOFF; + } + const MCExpr *Expr = MCSymbolRefExpr::Create(Sym, RefKind, Ctx); + if (!MO.isJTI() && MO.getOffset()) + Expr = MCBinaryExpr::CreateAdd( + Expr, MCConstantExpr::Create(MO.getOffset(), Ctx), Ctx); + return MCOperand::CreateExpr(Expr); +} + +MCOperand AArch64MCInstLower::lowerSymbolOperandELF(const MachineOperand &MO, + MCSymbol *Sym) const { + uint32_t RefFlags = 0; + + if (MO.getTargetFlags() & AArch64II::MO_GOT) + RefFlags |= AArch64MCExpr::VK_GOT; + else if (MO.getTargetFlags() & AArch64II::MO_TLS) { + TLSModel::Model Model; + if (MO.isGlobal()) { + const GlobalValue *GV = MO.getGlobal(); + Model = Printer.TM.getTLSModel(GV); + } else { + assert(MO.isSymbol() && + StringRef(MO.getSymbolName()) == "_TLS_MODULE_BASE_" && + "unexpected external TLS symbol"); + Model = TLSModel::GeneralDynamic; + } + switch (Model) { + case TLSModel::InitialExec: + RefFlags |= AArch64MCExpr::VK_GOTTPREL; + break; + case TLSModel::LocalExec: + RefFlags |= AArch64MCExpr::VK_TPREL; + break; + case TLSModel::LocalDynamic: + RefFlags |= AArch64MCExpr::VK_DTPREL; + break; + case TLSModel::GeneralDynamic: + RefFlags |= AArch64MCExpr::VK_TLSDESC; + break; + } + } else { + // No modifier means this is a generic reference, classified as absolute for + // the cases where it matters (:abs_g0: etc). 
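+    // For example (a sketch): a large-code-model materialisation such as
+    //   movz x0, #:abs_g3:var
+    //   movk x0, #:abs_g2_nc:var
+    // combines this VK_ABS with the VK_G3 / VK_G2|VK_NC bits set below.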
+ RefFlags |= AArch64MCExpr::VK_ABS; + } + + if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_PAGE) + RefFlags |= AArch64MCExpr::VK_PAGE; + else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == + AArch64II::MO_PAGEOFF) + RefFlags |= AArch64MCExpr::VK_PAGEOFF; + else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_G3) + RefFlags |= AArch64MCExpr::VK_G3; + else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_G2) + RefFlags |= AArch64MCExpr::VK_G2; + else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_G1) + RefFlags |= AArch64MCExpr::VK_G1; + else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_G0) + RefFlags |= AArch64MCExpr::VK_G0; + + if (MO.getTargetFlags() & AArch64II::MO_NC) + RefFlags |= AArch64MCExpr::VK_NC; + + const MCExpr *Expr = + MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_None, Ctx); + if (!MO.isJTI() && MO.getOffset()) + Expr = MCBinaryExpr::CreateAdd( + Expr, MCConstantExpr::Create(MO.getOffset(), Ctx), Ctx); + + AArch64MCExpr::VariantKind RefKind; + RefKind = static_cast(RefFlags); + Expr = AArch64MCExpr::Create(Expr, RefKind, Ctx); + + return MCOperand::CreateExpr(Expr); +} + +MCOperand AArch64MCInstLower::LowerSymbolOperand(const MachineOperand &MO, + MCSymbol *Sym) const { + if (TargetTriple.isOSDarwin()) + return lowerSymbolOperandDarwin(MO, Sym); + + assert(TargetTriple.isOSBinFormatELF() && "Expect Darwin or ELF target"); + return lowerSymbolOperandELF(MO, Sym); +} + +bool AArch64MCInstLower::lowerOperand(const MachineOperand &MO, + MCOperand &MCOp) const { + switch (MO.getType()) { + default: + assert(0 && "unknown operand type"); + case MachineOperand::MO_Register: + // Ignore all implicit register operands. + if (MO.isImplicit()) + return false; + MCOp = MCOperand::CreateReg(MO.getReg()); + break; + case MachineOperand::MO_RegisterMask: + // Regmasks are like implicit defs. + return false; + case MachineOperand::MO_Immediate: + MCOp = MCOperand::CreateImm(MO.getImm()); + break; + case MachineOperand::MO_MachineBasicBlock: + MCOp = MCOperand::CreateExpr( + MCSymbolRefExpr::Create(MO.getMBB()->getSymbol(), Ctx)); + break; + case MachineOperand::MO_GlobalAddress: + MCOp = LowerSymbolOperand(MO, GetGlobalAddressSymbol(MO)); + break; + case MachineOperand::MO_ExternalSymbol: + MCOp = LowerSymbolOperand(MO, GetExternalSymbolSymbol(MO)); + break; + case MachineOperand::MO_JumpTableIndex: + MCOp = LowerSymbolOperand(MO, Printer.GetJTISymbol(MO.getIndex())); + break; + case MachineOperand::MO_ConstantPoolIndex: + MCOp = LowerSymbolOperand(MO, Printer.GetCPISymbol(MO.getIndex())); + break; + case MachineOperand::MO_BlockAddress: + MCOp = LowerSymbolOperand( + MO, Printer.GetBlockAddressSymbol(MO.getBlockAddress())); + break; + } + return true; +} + +void AArch64MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { + OutMI.setOpcode(MI->getOpcode()); + + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + MCOperand MCOp; + if (lowerOperand(MI->getOperand(i), MCOp)) + OutMI.addOperand(MCOp); + } +} diff --git a/lib/Target/AArch64/AArch64MCInstLower.h b/lib/Target/AArch64/AArch64MCInstLower.h new file mode 100644 index 00000000000..ba50ba9e2fe --- /dev/null +++ b/lib/Target/AArch64/AArch64MCInstLower.h @@ -0,0 +1,52 @@ +//===-- AArch64MCInstLower.h - Lower MachineInstr to MCInst ---------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. 
See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef AArch64_MCINSTLOWER_H +#define AArch64_MCINSTLOWER_H + +#include "llvm/ADT/Triple.h" +#include "llvm/Support/Compiler.h" + +namespace llvm { +class AsmPrinter; +class MCAsmInfo; +class MCContext; +class MCInst; +class MCOperand; +class MCSymbol; +class MachineInstr; +class MachineModuleInfoMachO; +class MachineOperand; +class Mangler; + +/// AArch64MCInstLower - This class is used to lower an MachineInstr +/// into an MCInst. +class LLVM_LIBRARY_VISIBILITY AArch64MCInstLower { + MCContext &Ctx; + AsmPrinter &Printer; + Triple TargetTriple; + +public: + AArch64MCInstLower(MCContext &ctx, Mangler &mang, AsmPrinter &printer); + + bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const; + void Lower(const MachineInstr *MI, MCInst &OutMI) const; + + MCOperand lowerSymbolOperandDarwin(const MachineOperand &MO, + MCSymbol *Sym) const; + MCOperand lowerSymbolOperandELF(const MachineOperand &MO, + MCSymbol *Sym) const; + MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const; + + MCSymbol *GetGlobalAddressSymbol(const MachineOperand &MO) const; + MCSymbol *GetExternalSymbolSymbol(const MachineOperand &MO) const; +}; +} + +#endif diff --git a/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/lib/Target/AArch64/AArch64MachineFunctionInfo.h new file mode 100644 index 00000000000..7c257ba9116 --- /dev/null +++ b/lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -0,0 +1,163 @@ +//=- AArch64MachineFuctionInfo.h - AArch64 machine function info --*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares AArch64-specific per-machine-function information. +// +//===----------------------------------------------------------------------===// + +#ifndef AArch64MACHINEFUNCTIONINFO_H +#define AArch64MACHINEFUNCTIONINFO_H + +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/MC/MCLinkerOptimizationHint.h" + +namespace llvm { + +/// AArch64FunctionInfo - This class is derived from MachineFunctionInfo and +/// contains private AArch64-specific information for each MachineFunction. +class AArch64FunctionInfo : public MachineFunctionInfo { + + /// Number of bytes of arguments this function has on the stack. If the callee + /// is expected to restore the argument stack this should be a multiple of 16, + /// all usable during a tail call. + /// + /// The alternative would forbid tail call optimisation in some cases: if we + /// want to transfer control from a function with 8-bytes of stack-argument + /// space to a function with 16-bytes then misalignment of this value would + /// make a stack adjustment necessary, which could not be undone by the + /// callee. + unsigned BytesInStackArgArea; + + /// The number of bytes to restore to deallocate space for incoming + /// arguments. Canonically 0 in the C calling convention, but non-zero when + /// callee is expected to pop the args. + unsigned ArgumentStackToRestore; + + /// HasStackFrame - True if this function has a stack frame. Set by + /// processFunctionBeforeCalleeSavedScan(). + bool HasStackFrame; + + /// \brief Amount of stack frame size, not including callee-saved registers. 
+ unsigned LocalStackSize; + + /// \brief Number of TLS accesses using the special (combinable) + /// _TLS_MODULE_BASE_ symbol. + unsigned NumLocalDynamicTLSAccesses; + + /// \brief FrameIndex for start of varargs area for arguments passed on the + /// stack. + int VarArgsStackIndex; + + /// \brief FrameIndex for start of varargs area for arguments passed in + /// general purpose registers. + int VarArgsGPRIndex; + + /// \brief Size of the varargs area for arguments passed in general purpose + /// registers. + unsigned VarArgsGPRSize; + + /// \brief FrameIndex for start of varargs area for arguments passed in + /// floating-point registers. + int VarArgsFPRIndex; + + /// \brief Size of the varargs area for arguments passed in floating-point + /// registers. + unsigned VarArgsFPRSize; + +public: + AArch64FunctionInfo() + : BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false), + NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0), + VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0) {} + + explicit AArch64FunctionInfo(MachineFunction &MF) + : BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false), + NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0), + VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0) { + (void)MF; + } + + unsigned getBytesInStackArgArea() const { return BytesInStackArgArea; } + void setBytesInStackArgArea(unsigned bytes) { BytesInStackArgArea = bytes; } + + unsigned getArgumentStackToRestore() const { return ArgumentStackToRestore; } + void setArgumentStackToRestore(unsigned bytes) { + ArgumentStackToRestore = bytes; + } + + bool hasStackFrame() const { return HasStackFrame; } + void setHasStackFrame(bool s) { HasStackFrame = s; } + + void setLocalStackSize(unsigned Size) { LocalStackSize = Size; } + unsigned getLocalStackSize() const { return LocalStackSize; } + + void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamicTLSAccesses; } + unsigned getNumLocalDynamicTLSAccesses() const { + return NumLocalDynamicTLSAccesses; + } + + int getVarArgsStackIndex() const { return VarArgsStackIndex; } + void setVarArgsStackIndex(int Index) { VarArgsStackIndex = Index; } + + int getVarArgsGPRIndex() const { return VarArgsGPRIndex; } + void setVarArgsGPRIndex(int Index) { VarArgsGPRIndex = Index; } + + unsigned getVarArgsGPRSize() const { return VarArgsGPRSize; } + void setVarArgsGPRSize(unsigned Size) { VarArgsGPRSize = Size; } + + int getVarArgsFPRIndex() const { return VarArgsFPRIndex; } + void setVarArgsFPRIndex(int Index) { VarArgsFPRIndex = Index; } + + unsigned getVarArgsFPRSize() const { return VarArgsFPRSize; } + void setVarArgsFPRSize(unsigned Size) { VarArgsFPRSize = Size; } + + typedef SmallPtrSet SetOfInstructions; + + const SetOfInstructions &getLOHRelated() const { return LOHRelated; } + + // Shortcuts for LOH related types. + class MILOHDirective { + MCLOHType Kind; + + /// Arguments of this directive. Order matters. + SmallVector Args; + + public: + typedef SmallVectorImpl LOHArgs; + + MILOHDirective(MCLOHType Kind, const LOHArgs &Args) + : Kind(Kind), Args(Args.begin(), Args.end()) { + assert(isValidMCLOHType(Kind) && "Invalid LOH directive type!"); + } + + MCLOHType getKind() const { return Kind; } + const LOHArgs &getArgs() const { return Args; } + }; + + typedef MILOHDirective::LOHArgs MILOHArgs; + typedef SmallVector MILOHContainer; + + const MILOHContainer &getLOHContainer() const { return LOHContainerSet; } + + /// Add a LOH directive of this @p Kind and this @p Args. 
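///
/// Illustrative usage (editorial sketch, not from this commit; `Adrp`, `Add`
/// and `FuncInfo` are hypothetical names for the ADRP/ADD pair being recorded
/// and for the function's AArch64FunctionInfo):
/// \code
///   SmallVector<const MachineInstr *, 2> Args;
///   Args.push_back(Adrp);
///   Args.push_back(Add);
///   FuncInfo->addLOHDirective(MCLOH_AdrpAdd, Args);
/// \endcode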
+ void addLOHDirective(MCLOHType Kind, const MILOHArgs &Args) { + LOHContainerSet.push_back(MILOHDirective(Kind, Args)); + LOHRelated.insert(Args.begin(), Args.end()); + } + +private: + // Hold the lists of LOHs. + MILOHContainer LOHContainerSet; + SetOfInstructions LOHRelated; +}; +} // End llvm namespace + +#endif // AArch64MACHINEFUNCTIONINFO_H diff --git a/lib/Target/AArch64/AArch64PerfectShuffle.h b/lib/Target/AArch64/AArch64PerfectShuffle.h new file mode 100644 index 00000000000..b22fa2424d5 --- /dev/null +++ b/lib/Target/AArch64/AArch64PerfectShuffle.h @@ -0,0 +1,6586 @@ +//===-- AArch64PerfectShuffle.h - AdvSIMD Perfect Shuffle Table -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file, which was autogenerated by llvm-PerfectShuffle, contains data +// for the optimal way to build a perfect shuffle using AdvSIMD instructions. +// +//===----------------------------------------------------------------------===// + +// 31 entries have cost 0 +// 242 entries have cost 1 +// 1447 entries have cost 2 +// 3602 entries have cost 3 +// 1237 entries have cost 4 +// 2 entries have cost 5 + +// This table is 6561*4 = 26244 bytes in size. +static const unsigned PerfectShuffleTable[6561+1] = { + 135053414U, // <0,0,0,0>: Cost 1 vdup0 LHS + 1543503974U, // <0,0,0,1>: Cost 2 vext2 <0,0,0,0>, LHS + 2618572962U, // <0,0,0,2>: Cost 3 vext2 <0,2,0,0>, <0,2,0,0> + 2568054923U, // <0,0,0,3>: Cost 3 vext1 <3,0,0,0>, <3,0,0,0> + 1476398390U, // <0,0,0,4>: Cost 2 vext1 <0,0,0,0>, RHS + 2550140624U, // <0,0,0,5>: Cost 3 vext1 <0,0,0,0>, <5,1,7,3> + 2550141434U, // <0,0,0,6>: Cost 3 vext1 <0,0,0,0>, <6,2,7,3> + 2591945711U, // <0,0,0,7>: Cost 3 vext1 <7,0,0,0>, <7,0,0,0> + 135053414U, // <0,0,0,u>: Cost 1 vdup0 LHS + 2886516736U, // <0,0,1,0>: Cost 3 vzipl LHS, <0,0,0,0> + 1812775014U, // <0,0,1,1>: Cost 2 vzipl LHS, LHS + 1618133094U, // <0,0,1,2>: Cost 2 vext3 <1,2,3,0>, LHS + 2625209292U, // <0,0,1,3>: Cost 3 vext2 <1,3,0,0>, <1,3,0,0> + 2886558034U, // <0,0,1,4>: Cost 3 vzipl LHS, <0,4,1,5> + 2617246864U, // <0,0,1,5>: Cost 3 vext2 <0,0,0,0>, <1,5,3,7> + 3659723031U, // <0,0,1,6>: Cost 4 vext1 <6,0,0,1>, <6,0,0,1> + 2591953904U, // <0,0,1,7>: Cost 3 vext1 <7,0,0,1>, <7,0,0,1> + 1812775581U, // <0,0,1,u>: Cost 2 vzipl LHS, LHS + 3020734464U, // <0,0,2,0>: Cost 3 vtrnl LHS, <0,0,0,0> + 3020734474U, // <0,0,2,1>: Cost 3 vtrnl LHS, <0,0,1,1> + 1946992742U, // <0,0,2,2>: Cost 2 vtrnl LHS, LHS + 2631181989U, // <0,0,2,3>: Cost 3 vext2 <2,3,0,0>, <2,3,0,0> + 3020734668U, // <0,0,2,4>: Cost 3 vtrnl LHS, <0,2,4,6> + 3826550569U, // <0,0,2,5>: Cost 4 vuzpl <0,2,0,2>, <2,4,5,6> + 2617247674U, // <0,0,2,6>: Cost 3 vext2 <0,0,0,0>, <2,6,3,7> + 2591962097U, // <0,0,2,7>: Cost 3 vext1 <7,0,0,2>, <7,0,0,2> + 1946992796U, // <0,0,2,u>: Cost 2 vtrnl LHS, LHS + 2635163787U, // <0,0,3,0>: Cost 3 vext2 <3,0,0,0>, <3,0,0,0> + 2686419196U, // <0,0,3,1>: Cost 3 vext3 <0,3,1,0>, <0,3,1,0> + 2686492933U, // <0,0,3,2>: Cost 3 vext3 <0,3,2,0>, <0,3,2,0> + 2617248156U, // <0,0,3,3>: Cost 3 vext2 <0,0,0,0>, <3,3,3,3> + 2617248258U, // <0,0,3,4>: Cost 3 vext2 <0,0,0,0>, <3,4,5,6> + 3826551298U, // <0,0,3,5>: Cost 4 vuzpl <0,2,0,2>, <3,4,5,6> + 3690990200U, // <0,0,3,6>: Cost 4 vext2 <0,0,0,0>, <3,6,0,7> + 3713551042U, // <0,0,3,7>: Cost 4 vext2 <3,7,0,0>, <3,7,0,0> + 2635163787U, // <0,0,3,u>: Cost 3 vext2 
<3,0,0,0>, <3,0,0,0> + 2617248658U, // <0,0,4,0>: Cost 3 vext2 <0,0,0,0>, <4,0,5,1> + 2888450150U, // <0,0,4,1>: Cost 3 vzipl <0,4,1,5>, LHS + 3021570150U, // <0,0,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS + 3641829519U, // <0,0,4,3>: Cost 4 vext1 <3,0,0,4>, <3,0,0,4> + 3021570252U, // <0,0,4,4>: Cost 3 vtrnl <0,2,4,6>, <0,2,4,6> + 1543507254U, // <0,0,4,5>: Cost 2 vext2 <0,0,0,0>, RHS + 2752810294U, // <0,0,4,6>: Cost 3 vuzpl <0,2,0,2>, RHS + 3786998152U, // <0,0,4,7>: Cost 4 vext3 <4,7,5,0>, <0,4,7,5> + 1543507497U, // <0,0,4,u>: Cost 2 vext2 <0,0,0,0>, RHS + 2684354972U, // <0,0,5,0>: Cost 3 vext3 <0,0,0,0>, <0,5,0,7> + 2617249488U, // <0,0,5,1>: Cost 3 vext2 <0,0,0,0>, <5,1,7,3> + 3765617070U, // <0,0,5,2>: Cost 4 vext3 <1,2,3,0>, <0,5,2,7> + 3635865780U, // <0,0,5,3>: Cost 4 vext1 <2,0,0,5>, <3,0,4,5> + 2617249734U, // <0,0,5,4>: Cost 3 vext2 <0,0,0,0>, <5,4,7,6> + 2617249796U, // <0,0,5,5>: Cost 3 vext2 <0,0,0,0>, <5,5,5,5> + 2718712274U, // <0,0,5,6>: Cost 3 vext3 <5,6,7,0>, <0,5,6,7> + 2617249960U, // <0,0,5,7>: Cost 3 vext2 <0,0,0,0>, <5,7,5,7> + 2720039396U, // <0,0,5,u>: Cost 3 vext3 <5,u,7,0>, <0,5,u,7> + 2684355053U, // <0,0,6,0>: Cost 3 vext3 <0,0,0,0>, <0,6,0,7> + 3963609190U, // <0,0,6,1>: Cost 4 vzipl <0,6,2,7>, LHS + 2617250298U, // <0,0,6,2>: Cost 3 vext2 <0,0,0,0>, <6,2,7,3> + 3796435464U, // <0,0,6,3>: Cost 4 vext3 <6,3,7,0>, <0,6,3,7> + 3659762998U, // <0,0,6,4>: Cost 4 vext1 <6,0,0,6>, RHS + 3659763810U, // <0,0,6,5>: Cost 4 vext1 <6,0,0,6>, <5,6,7,0> + 2617250616U, // <0,0,6,6>: Cost 3 vext2 <0,0,0,0>, <6,6,6,6> + 2657727309U, // <0,0,6,7>: Cost 3 vext2 <6,7,0,0>, <6,7,0,0> + 2658390942U, // <0,0,6,u>: Cost 3 vext2 <6,u,0,0>, <6,u,0,0> + 2659054575U, // <0,0,7,0>: Cost 3 vext2 <7,0,0,0>, <7,0,0,0> + 3635880854U, // <0,0,7,1>: Cost 4 vext1 <2,0,0,7>, <1,2,3,0> + 3635881401U, // <0,0,7,2>: Cost 4 vext1 <2,0,0,7>, <2,0,0,7> + 3734787298U, // <0,0,7,3>: Cost 4 vext2 <7,3,0,0>, <7,3,0,0> + 2617251174U, // <0,0,7,4>: Cost 3 vext2 <0,0,0,0>, <7,4,5,6> + 3659772002U, // <0,0,7,5>: Cost 4 vext1 <6,0,0,7>, <5,6,7,0> + 3659772189U, // <0,0,7,6>: Cost 4 vext1 <6,0,0,7>, <6,0,0,7> + 2617251436U, // <0,0,7,7>: Cost 3 vext2 <0,0,0,0>, <7,7,7,7> + 2659054575U, // <0,0,7,u>: Cost 3 vext2 <7,0,0,0>, <7,0,0,0> + 135053414U, // <0,0,u,0>: Cost 1 vdup0 LHS + 1817419878U, // <0,0,u,1>: Cost 2 vzipl LHS, LHS + 1947435110U, // <0,0,u,2>: Cost 2 vtrnl LHS, LHS + 2568120467U, // <0,0,u,3>: Cost 3 vext1 <3,0,0,u>, <3,0,0,u> + 1476463926U, // <0,0,u,4>: Cost 2 vext1 <0,0,0,u>, RHS + 1543510170U, // <0,0,u,5>: Cost 2 vext2 <0,0,0,0>, RHS + 2752813210U, // <0,0,u,6>: Cost 3 vuzpl <0,2,0,2>, RHS + 2592011255U, // <0,0,u,7>: Cost 3 vext1 <7,0,0,u>, <7,0,0,u> + 135053414U, // <0,0,u,u>: Cost 1 vdup0 LHS + 2618581002U, // <0,1,0,0>: Cost 3 vext2 <0,2,0,1>, <0,0,1,1> + 1557446758U, // <0,1,0,1>: Cost 2 vext2 <2,3,0,1>, LHS + 2618581155U, // <0,1,0,2>: Cost 3 vext2 <0,2,0,1>, <0,2,0,1> + 2690548468U, // <0,1,0,3>: Cost 3 vext3 <1,0,3,0>, <1,0,3,0> + 2626543954U, // <0,1,0,4>: Cost 3 vext2 <1,5,0,1>, <0,4,1,5> + 4094985216U, // <0,1,0,5>: Cost 4 vtrnl <0,2,0,2>, <1,3,5,7> + 2592019278U, // <0,1,0,6>: Cost 3 vext1 <7,0,1,0>, <6,7,0,1> + 2592019448U, // <0,1,0,7>: Cost 3 vext1 <7,0,1,0>, <7,0,1,0> + 1557447325U, // <0,1,0,u>: Cost 2 vext2 <2,3,0,1>, LHS + 1476476938U, // <0,1,1,0>: Cost 2 vext1 <0,0,1,1>, <0,0,1,1> + 2886517556U, // <0,1,1,1>: Cost 3 vzipl LHS, <1,1,1,1> + 2886517654U, // <0,1,1,2>: Cost 3 vzipl LHS, <1,2,3,0> + 2886517720U, // <0,1,1,3>: Cost 3 vzipl LHS, <1,3,1,3> + 1476480310U, // <0,1,1,4>: 
Cost 2 vext1 <0,0,1,1>, RHS + 2886558864U, // <0,1,1,5>: Cost 3 vzipl LHS, <1,5,3,7> + 2550223354U, // <0,1,1,6>: Cost 3 vext1 <0,0,1,1>, <6,2,7,3> + 2550223856U, // <0,1,1,7>: Cost 3 vext1 <0,0,1,1>, <7,0,0,1> + 1476482862U, // <0,1,1,u>: Cost 2 vext1 <0,0,1,1>, LHS + 1494401126U, // <0,1,2,0>: Cost 2 vext1 <3,0,1,2>, LHS + 3020735284U, // <0,1,2,1>: Cost 3 vtrnl LHS, <1,1,1,1> + 2562172349U, // <0,1,2,2>: Cost 3 vext1 <2,0,1,2>, <2,0,1,2> + 835584U, // <0,1,2,3>: Cost 0 copy LHS + 1494404406U, // <0,1,2,4>: Cost 2 vext1 <3,0,1,2>, RHS + 3020735488U, // <0,1,2,5>: Cost 3 vtrnl LHS, <1,3,5,7> + 2631190458U, // <0,1,2,6>: Cost 3 vext2 <2,3,0,1>, <2,6,3,7> + 1518294010U, // <0,1,2,7>: Cost 2 vext1 <7,0,1,2>, <7,0,1,2> + 835584U, // <0,1,2,u>: Cost 0 copy LHS + 2692318156U, // <0,1,3,0>: Cost 3 vext3 <1,3,0,0>, <1,3,0,0> + 2691875800U, // <0,1,3,1>: Cost 3 vext3 <1,2,3,0>, <1,3,1,3> + 2691875806U, // <0,1,3,2>: Cost 3 vext3 <1,2,3,0>, <1,3,2,0> + 2692539367U, // <0,1,3,3>: Cost 3 vext3 <1,3,3,0>, <1,3,3,0> + 2562182454U, // <0,1,3,4>: Cost 3 vext1 <2,0,1,3>, RHS + 2691875840U, // <0,1,3,5>: Cost 3 vext3 <1,2,3,0>, <1,3,5,7> + 2692760578U, // <0,1,3,6>: Cost 3 vext3 <1,3,6,0>, <1,3,6,0> + 2639817411U, // <0,1,3,7>: Cost 3 vext2 <3,7,0,1>, <3,7,0,1> + 2691875863U, // <0,1,3,u>: Cost 3 vext3 <1,2,3,0>, <1,3,u,3> + 2568159334U, // <0,1,4,0>: Cost 3 vext1 <3,0,1,4>, LHS + 4095312692U, // <0,1,4,1>: Cost 4 vtrnl <0,2,4,6>, <1,1,1,1> + 2568160934U, // <0,1,4,2>: Cost 3 vext1 <3,0,1,4>, <2,3,0,1> + 2568161432U, // <0,1,4,3>: Cost 3 vext1 <3,0,1,4>, <3,0,1,4> + 2568162614U, // <0,1,4,4>: Cost 3 vext1 <3,0,1,4>, RHS + 1557450038U, // <0,1,4,5>: Cost 2 vext2 <2,3,0,1>, RHS + 2754235702U, // <0,1,4,6>: Cost 3 vuzpl <0,4,1,5>, RHS + 2592052220U, // <0,1,4,7>: Cost 3 vext1 <7,0,1,4>, <7,0,1,4> + 1557450281U, // <0,1,4,u>: Cost 2 vext2 <2,3,0,1>, RHS + 3765617775U, // <0,1,5,0>: Cost 4 vext3 <1,2,3,0>, <1,5,0,1> + 2647781007U, // <0,1,5,1>: Cost 3 vext2 <5,1,0,1>, <5,1,0,1> + 3704934138U, // <0,1,5,2>: Cost 4 vext2 <2,3,0,1>, <5,2,3,0> + 2691875984U, // <0,1,5,3>: Cost 3 vext3 <1,2,3,0>, <1,5,3,7> + 2657734598U, // <0,1,5,4>: Cost 3 vext2 <6,7,0,1>, <5,4,7,6> + 2650435539U, // <0,1,5,5>: Cost 3 vext2 <5,5,0,1>, <5,5,0,1> + 2651099172U, // <0,1,5,6>: Cost 3 vext2 <5,6,0,1>, <5,6,0,1> + 2651762805U, // <0,1,5,7>: Cost 3 vext2 <5,7,0,1>, <5,7,0,1> + 2691876029U, // <0,1,5,u>: Cost 3 vext3 <1,2,3,0>, <1,5,u,7> + 2592063590U, // <0,1,6,0>: Cost 3 vext1 <7,0,1,6>, LHS + 3765617871U, // <0,1,6,1>: Cost 4 vext3 <1,2,3,0>, <1,6,1,7> + 2654417337U, // <0,1,6,2>: Cost 3 vext2 <6,2,0,1>, <6,2,0,1> + 3765617889U, // <0,1,6,3>: Cost 4 vext3 <1,2,3,0>, <1,6,3,7> + 2592066870U, // <0,1,6,4>: Cost 3 vext1 <7,0,1,6>, RHS + 3765617907U, // <0,1,6,5>: Cost 4 vext3 <1,2,3,0>, <1,6,5,7> + 2657071869U, // <0,1,6,6>: Cost 3 vext2 <6,6,0,1>, <6,6,0,1> + 1583993678U, // <0,1,6,7>: Cost 2 vext2 <6,7,0,1>, <6,7,0,1> + 1584657311U, // <0,1,6,u>: Cost 2 vext2 <6,u,0,1>, <6,u,0,1> + 2657735672U, // <0,1,7,0>: Cost 3 vext2 <6,7,0,1>, <7,0,1,0> + 2657735808U, // <0,1,7,1>: Cost 3 vext2 <6,7,0,1>, <7,1,7,1> + 2631193772U, // <0,1,7,2>: Cost 3 vext2 <2,3,0,1>, <7,2,3,0> + 2661053667U, // <0,1,7,3>: Cost 3 vext2 <7,3,0,1>, <7,3,0,1> + 2657736038U, // <0,1,7,4>: Cost 3 vext2 <6,7,0,1>, <7,4,5,6> + 3721524621U, // <0,1,7,5>: Cost 4 vext2 <5,1,0,1>, <7,5,1,0> + 2657736158U, // <0,1,7,6>: Cost 3 vext2 <6,7,0,1>, <7,6,1,0> + 2657736300U, // <0,1,7,7>: Cost 3 vext2 <6,7,0,1>, <7,7,7,7> + 2657736322U, // <0,1,7,u>: Cost 3 vext2 <6,7,0,1>, <7,u,1,2> 
+ 1494450278U, // <0,1,u,0>: Cost 2 vext1 <3,0,1,u>, LHS + 1557452590U, // <0,1,u,1>: Cost 2 vext2 <2,3,0,1>, LHS + 2754238254U, // <0,1,u,2>: Cost 3 vuzpl <0,4,1,5>, LHS + 835584U, // <0,1,u,3>: Cost 0 copy LHS + 1494453558U, // <0,1,u,4>: Cost 2 vext1 <3,0,1,u>, RHS + 1557452954U, // <0,1,u,5>: Cost 2 vext2 <2,3,0,1>, RHS + 2754238618U, // <0,1,u,6>: Cost 3 vuzpl <0,4,1,5>, RHS + 1518343168U, // <0,1,u,7>: Cost 2 vext1 <7,0,1,u>, <7,0,1,u> + 835584U, // <0,1,u,u>: Cost 0 copy LHS + 2752299008U, // <0,2,0,0>: Cost 3 vuzpl LHS, <0,0,0,0> + 1544847462U, // <0,2,0,1>: Cost 2 vext2 <0,2,0,2>, LHS + 1678557286U, // <0,2,0,2>: Cost 2 vuzpl LHS, LHS + 2696521165U, // <0,2,0,3>: Cost 3 vext3 <2,0,3,0>, <2,0,3,0> + 2752340172U, // <0,2,0,4>: Cost 3 vuzpl LHS, <0,2,4,6> + 2691876326U, // <0,2,0,5>: Cost 3 vext3 <1,2,3,0>, <2,0,5,7> + 2618589695U, // <0,2,0,6>: Cost 3 vext2 <0,2,0,2>, <0,6,2,7> + 2592093185U, // <0,2,0,7>: Cost 3 vext1 <7,0,2,0>, <7,0,2,0> + 1678557340U, // <0,2,0,u>: Cost 2 vuzpl LHS, LHS + 2618589942U, // <0,2,1,0>: Cost 3 vext2 <0,2,0,2>, <1,0,3,2> + 2752299828U, // <0,2,1,1>: Cost 3 vuzpl LHS, <1,1,1,1> + 2886518376U, // <0,2,1,2>: Cost 3 vzipl LHS, <2,2,2,2> + 2752299766U, // <0,2,1,3>: Cost 3 vuzpl LHS, <1,0,3,2> + 2550295862U, // <0,2,1,4>: Cost 3 vext1 <0,0,2,1>, RHS + 2752340992U, // <0,2,1,5>: Cost 3 vuzpl LHS, <1,3,5,7> + 2886559674U, // <0,2,1,6>: Cost 3 vzipl LHS, <2,6,3,7> + 3934208106U, // <0,2,1,7>: Cost 4 vuzpr <7,0,1,2>, <0,1,2,7> + 2752340771U, // <0,2,1,u>: Cost 3 vuzpl LHS, <1,0,u,2> + 1476558868U, // <0,2,2,0>: Cost 2 vext1 <0,0,2,2>, <0,0,2,2> + 2226628029U, // <0,2,2,1>: Cost 3 vrev <2,0,1,2> + 2752300648U, // <0,2,2,2>: Cost 3 vuzpl LHS, <2,2,2,2> + 3020736114U, // <0,2,2,3>: Cost 3 vtrnl LHS, <2,2,3,3> + 1476562230U, // <0,2,2,4>: Cost 2 vext1 <0,0,2,2>, RHS + 2550304464U, // <0,2,2,5>: Cost 3 vext1 <0,0,2,2>, <5,1,7,3> + 2618591162U, // <0,2,2,6>: Cost 3 vext2 <0,2,0,2>, <2,6,3,7> + 2550305777U, // <0,2,2,7>: Cost 3 vext1 <0,0,2,2>, <7,0,0,2> + 1476564782U, // <0,2,2,u>: Cost 2 vext1 <0,0,2,2>, LHS + 2618591382U, // <0,2,3,0>: Cost 3 vext2 <0,2,0,2>, <3,0,1,2> + 2752301206U, // <0,2,3,1>: Cost 3 vuzpl LHS, <3,0,1,2> + 3826043121U, // <0,2,3,2>: Cost 4 vuzpl LHS, <3,1,2,3> + 2752301468U, // <0,2,3,3>: Cost 3 vuzpl LHS, <3,3,3,3> + 2618591746U, // <0,2,3,4>: Cost 3 vext2 <0,2,0,2>, <3,4,5,6> + 2752301570U, // <0,2,3,5>: Cost 3 vuzpl LHS, <3,4,5,6> + 3830688102U, // <0,2,3,6>: Cost 4 vuzpl LHS, <3,2,6,3> + 2698807012U, // <0,2,3,7>: Cost 3 vext3 <2,3,7,0>, <2,3,7,0> + 2752301269U, // <0,2,3,u>: Cost 3 vuzpl LHS, <3,0,u,2> + 2562261094U, // <0,2,4,0>: Cost 3 vext1 <2,0,2,4>, LHS + 4095313828U, // <0,2,4,1>: Cost 4 vtrnl <0,2,4,6>, <2,6,1,3> + 2226718152U, // <0,2,4,2>: Cost 3 vrev <2,0,2,4> + 2568235169U, // <0,2,4,3>: Cost 3 vext1 <3,0,2,4>, <3,0,2,4> + 2562264374U, // <0,2,4,4>: Cost 3 vext1 <2,0,2,4>, RHS + 1544850742U, // <0,2,4,5>: Cost 2 vext2 <0,2,0,2>, RHS + 1678560566U, // <0,2,4,6>: Cost 2 vuzpl LHS, RHS + 2592125957U, // <0,2,4,7>: Cost 3 vext1 <7,0,2,4>, <7,0,2,4> + 1678560584U, // <0,2,4,u>: Cost 2 vuzpl LHS, RHS + 2691876686U, // <0,2,5,0>: Cost 3 vext3 <1,2,3,0>, <2,5,0,7> + 2618592976U, // <0,2,5,1>: Cost 3 vext2 <0,2,0,2>, <5,1,7,3> + 3765618528U, // <0,2,5,2>: Cost 4 vext3 <1,2,3,0>, <2,5,2,7> + 3765618536U, // <0,2,5,3>: Cost 4 vext3 <1,2,3,0>, <2,5,3,6> + 2618593222U, // <0,2,5,4>: Cost 3 vext2 <0,2,0,2>, <5,4,7,6> + 2752303108U, // <0,2,5,5>: Cost 3 vuzpl LHS, <5,5,5,5> + 2618593378U, // <0,2,5,6>: Cost 3 vext2 <0,2,0,2>, <5,6,7,0> + 
2824785206U, // <0,2,5,7>: Cost 3 vuzpr <1,0,3,2>, RHS + 2824785207U, // <0,2,5,u>: Cost 3 vuzpr <1,0,3,2>, RHS + 2752303950U, // <0,2,6,0>: Cost 3 vuzpl LHS, <6,7,0,1> + 3830690081U, // <0,2,6,1>: Cost 4 vuzpl LHS, <6,0,1,2> + 2618593786U, // <0,2,6,2>: Cost 3 vext2 <0,2,0,2>, <6,2,7,3> + 2691876794U, // <0,2,6,3>: Cost 3 vext3 <1,2,3,0>, <2,6,3,7> + 2752303990U, // <0,2,6,4>: Cost 3 vuzpl LHS, <6,7,4,5> + 3830690445U, // <0,2,6,5>: Cost 4 vuzpl LHS, <6,4,5,6> + 2752303928U, // <0,2,6,6>: Cost 3 vuzpl LHS, <6,6,6,6> + 2657743695U, // <0,2,6,7>: Cost 3 vext2 <6,7,0,2>, <6,7,0,2> + 2691876839U, // <0,2,6,u>: Cost 3 vext3 <1,2,3,0>, <2,6,u,7> + 2659070961U, // <0,2,7,0>: Cost 3 vext2 <7,0,0,2>, <7,0,0,2> + 2659734594U, // <0,2,7,1>: Cost 3 vext2 <7,1,0,2>, <7,1,0,2> + 3734140051U, // <0,2,7,2>: Cost 4 vext2 <7,2,0,2>, <7,2,0,2> + 2701166596U, // <0,2,7,3>: Cost 3 vext3 <2,7,3,0>, <2,7,3,0> + 2662389094U, // <0,2,7,4>: Cost 3 vext2 <7,5,0,2>, <7,4,5,6> + 2662389126U, // <0,2,7,5>: Cost 3 vext2 <7,5,0,2>, <7,5,0,2> + 3736794583U, // <0,2,7,6>: Cost 4 vext2 <7,6,0,2>, <7,6,0,2> + 2752304748U, // <0,2,7,7>: Cost 3 vuzpl LHS, <7,7,7,7> + 2659070961U, // <0,2,7,u>: Cost 3 vext2 <7,0,0,2>, <7,0,0,2> + 1476608026U, // <0,2,u,0>: Cost 2 vext1 <0,0,2,u>, <0,0,2,u> + 1544853294U, // <0,2,u,1>: Cost 2 vext2 <0,2,0,2>, LHS + 1678563118U, // <0,2,u,2>: Cost 2 vuzpl LHS, LHS + 3021178482U, // <0,2,u,3>: Cost 3 vtrnl LHS, <2,2,3,3> + 1476611382U, // <0,2,u,4>: Cost 2 vext1 <0,0,2,u>, RHS + 1544853658U, // <0,2,u,5>: Cost 2 vext2 <0,2,0,2>, RHS + 1678563482U, // <0,2,u,6>: Cost 2 vuzpl LHS, RHS + 2824785449U, // <0,2,u,7>: Cost 3 vuzpr <1,0,3,2>, RHS + 1678563172U, // <0,2,u,u>: Cost 2 vuzpl LHS, LHS + 2556329984U, // <0,3,0,0>: Cost 3 vext1 <1,0,3,0>, <0,0,0,0> + 2686421142U, // <0,3,0,1>: Cost 3 vext3 <0,3,1,0>, <3,0,1,2> + 2562303437U, // <0,3,0,2>: Cost 3 vext1 <2,0,3,0>, <2,0,3,0> + 4094986652U, // <0,3,0,3>: Cost 4 vtrnl <0,2,0,2>, <3,3,3,3> + 2556333366U, // <0,3,0,4>: Cost 3 vext1 <1,0,3,0>, RHS + 4094986754U, // <0,3,0,5>: Cost 4 vtrnl <0,2,0,2>, <3,4,5,6> + 3798796488U, // <0,3,0,6>: Cost 4 vext3 <6,7,3,0>, <3,0,6,7> + 3776530634U, // <0,3,0,7>: Cost 4 vext3 <3,0,7,0>, <3,0,7,0> + 2556335918U, // <0,3,0,u>: Cost 3 vext1 <1,0,3,0>, LHS + 2886518934U, // <0,3,1,0>: Cost 3 vzipl LHS, <3,0,1,2> + 2556338933U, // <0,3,1,1>: Cost 3 vext1 <1,0,3,1>, <1,0,3,1> + 2691877105U, // <0,3,1,2>: Cost 3 vext3 <1,2,3,0>, <3,1,2,3> + 2886519196U, // <0,3,1,3>: Cost 3 vzipl LHS, <3,3,3,3> + 2886519298U, // <0,3,1,4>: Cost 3 vzipl LHS, <3,4,5,6> + 4095740418U, // <0,3,1,5>: Cost 4 vtrnl <0,3,1,4>, <3,4,5,6> + 3659944242U, // <0,3,1,6>: Cost 4 vext1 <6,0,3,1>, <6,0,3,1> + 3769600286U, // <0,3,1,7>: Cost 4 vext3 <1,u,3,0>, <3,1,7,3> + 2886519582U, // <0,3,1,u>: Cost 3 vzipl LHS, <3,u,1,2> + 1482604646U, // <0,3,2,0>: Cost 2 vext1 <1,0,3,2>, LHS + 1482605302U, // <0,3,2,1>: Cost 2 vext1 <1,0,3,2>, <1,0,3,2> + 2556348008U, // <0,3,2,2>: Cost 3 vext1 <1,0,3,2>, <2,2,2,2> + 3020736924U, // <0,3,2,3>: Cost 3 vtrnl LHS, <3,3,3,3> + 1482607926U, // <0,3,2,4>: Cost 2 vext1 <1,0,3,2>, RHS + 3020737026U, // <0,3,2,5>: Cost 3 vtrnl LHS, <3,4,5,6> + 2598154746U, // <0,3,2,6>: Cost 3 vext1 , <6,2,7,3> + 2598155258U, // <0,3,2,7>: Cost 3 vext1 , <7,0,1,2> + 1482610478U, // <0,3,2,u>: Cost 2 vext1 <1,0,3,2>, LHS + 3692341398U, // <0,3,3,0>: Cost 4 vext2 <0,2,0,3>, <3,0,1,2> + 2635851999U, // <0,3,3,1>: Cost 3 vext2 <3,1,0,3>, <3,1,0,3> + 3636069840U, // <0,3,3,2>: Cost 4 vext1 <2,0,3,3>, <2,0,3,3> + 2691877276U, // <0,3,3,3>: Cost 3 
vext3 <1,2,3,0>, <3,3,3,3> + 3961522690U, // <0,3,3,4>: Cost 4 vzipl <0,3,1,4>, <3,4,5,6> + 3826797058U, // <0,3,3,5>: Cost 4 vuzpl <0,2,3,5>, <3,4,5,6> + 3703622282U, // <0,3,3,6>: Cost 4 vext2 <2,1,0,3>, <3,6,2,7> + 3769600452U, // <0,3,3,7>: Cost 4 vext3 <1,u,3,0>, <3,3,7,7> + 2640497430U, // <0,3,3,u>: Cost 3 vext2 <3,u,0,3>, <3,u,0,3> + 3962194070U, // <0,3,4,0>: Cost 4 vzipl <0,4,1,5>, <3,0,1,2> + 2232617112U, // <0,3,4,1>: Cost 3 vrev <3,0,1,4> + 2232690849U, // <0,3,4,2>: Cost 3 vrev <3,0,2,4> + 4095314332U, // <0,3,4,3>: Cost 4 vtrnl <0,2,4,6>, <3,3,3,3> + 3962194434U, // <0,3,4,4>: Cost 4 vzipl <0,4,1,5>, <3,4,5,6> + 2691877378U, // <0,3,4,5>: Cost 3 vext3 <1,2,3,0>, <3,4,5,6> + 3826765110U, // <0,3,4,6>: Cost 4 vuzpl <0,2,3,1>, RHS + 3665941518U, // <0,3,4,7>: Cost 4 vext1 <7,0,3,4>, <7,0,3,4> + 2691877405U, // <0,3,4,u>: Cost 3 vext3 <1,2,3,0>, <3,4,u,6> + 3630112870U, // <0,3,5,0>: Cost 4 vext1 <1,0,3,5>, LHS + 3630113526U, // <0,3,5,1>: Cost 4 vext1 <1,0,3,5>, <1,0,3,2> + 4035199734U, // <0,3,5,2>: Cost 4 vzipr <1,4,0,5>, <1,0,3,2> + 3769600578U, // <0,3,5,3>: Cost 4 vext3 <1,u,3,0>, <3,5,3,7> + 2232846516U, // <0,3,5,4>: Cost 3 vrev <3,0,4,5> + 3779037780U, // <0,3,5,5>: Cost 4 vext3 <3,4,5,0>, <3,5,5,7> + 2718714461U, // <0,3,5,6>: Cost 3 vext3 <5,6,7,0>, <3,5,6,7> + 2706106975U, // <0,3,5,7>: Cost 3 vext3 <3,5,7,0>, <3,5,7,0> + 2233141464U, // <0,3,5,u>: Cost 3 vrev <3,0,u,5> + 2691877496U, // <0,3,6,0>: Cost 3 vext3 <1,2,3,0>, <3,6,0,7> + 3727511914U, // <0,3,6,1>: Cost 4 vext2 <6,1,0,3>, <6,1,0,3> + 3765619338U, // <0,3,6,2>: Cost 4 vext3 <1,2,3,0>, <3,6,2,7> + 3765619347U, // <0,3,6,3>: Cost 4 vext3 <1,2,3,0>, <3,6,3,7> + 3765987996U, // <0,3,6,4>: Cost 4 vext3 <1,2,u,0>, <3,6,4,7> + 3306670270U, // <0,3,6,5>: Cost 4 vrev <3,0,5,6> + 3792456365U, // <0,3,6,6>: Cost 4 vext3 <5,6,7,0>, <3,6,6,6> + 2706770608U, // <0,3,6,7>: Cost 3 vext3 <3,6,7,0>, <3,6,7,0> + 2706844345U, // <0,3,6,u>: Cost 3 vext3 <3,6,u,0>, <3,6,u,0> + 3769600707U, // <0,3,7,0>: Cost 4 vext3 <1,u,3,0>, <3,7,0,1> + 2659742787U, // <0,3,7,1>: Cost 3 vext2 <7,1,0,3>, <7,1,0,3> + 3636102612U, // <0,3,7,2>: Cost 4 vext1 <2,0,3,7>, <2,0,3,7> + 3769600740U, // <0,3,7,3>: Cost 4 vext3 <1,u,3,0>, <3,7,3,7> + 3769600747U, // <0,3,7,4>: Cost 4 vext3 <1,u,3,0>, <3,7,4,5> + 3769600758U, // <0,3,7,5>: Cost 4 vext3 <1,u,3,0>, <3,7,5,7> + 3659993400U, // <0,3,7,6>: Cost 4 vext1 <6,0,3,7>, <6,0,3,7> + 3781176065U, // <0,3,7,7>: Cost 4 vext3 <3,7,7,0>, <3,7,7,0> + 2664388218U, // <0,3,7,u>: Cost 3 vext2 <7,u,0,3>, <7,u,0,3> + 1482653798U, // <0,3,u,0>: Cost 2 vext1 <1,0,3,u>, LHS + 1482654460U, // <0,3,u,1>: Cost 2 vext1 <1,0,3,u>, <1,0,3,u> + 2556397160U, // <0,3,u,2>: Cost 3 vext1 <1,0,3,u>, <2,2,2,2> + 3021179292U, // <0,3,u,3>: Cost 3 vtrnl LHS, <3,3,3,3> + 1482657078U, // <0,3,u,4>: Cost 2 vext1 <1,0,3,u>, RHS + 3021179394U, // <0,3,u,5>: Cost 3 vtrnl LHS, <3,4,5,6> + 2598203898U, // <0,3,u,6>: Cost 3 vext1 , <6,2,7,3> + 2708097874U, // <0,3,u,7>: Cost 3 vext3 <3,u,7,0>, <3,u,7,0> + 1482659630U, // <0,3,u,u>: Cost 2 vext1 <1,0,3,u>, LHS + 2617278468U, // <0,4,0,0>: Cost 3 vext2 <0,0,0,4>, <0,0,0,4> + 2618605670U, // <0,4,0,1>: Cost 3 vext2 <0,2,0,4>, LHS + 2618605734U, // <0,4,0,2>: Cost 3 vext2 <0,2,0,4>, <0,2,0,4> + 3642091695U, // <0,4,0,3>: Cost 4 vext1 <3,0,4,0>, <3,0,4,0> + 2753134796U, // <0,4,0,4>: Cost 3 vuzpl <0,2,4,6>, <0,2,4,6> + 2718714770U, // <0,4,0,5>: Cost 3 vext3 <5,6,7,0>, <4,0,5,1> + 3021245750U, // <0,4,0,6>: Cost 3 vtrnl <0,2,0,2>, RHS + 3665982483U, // <0,4,0,7>: Cost 4 vext1 <7,0,4,0>, 
<7,0,4,0> + 3021245768U, // <0,4,0,u>: Cost 3 vtrnl <0,2,0,2>, RHS + 2568355942U, // <0,4,1,0>: Cost 3 vext1 <3,0,4,1>, LHS + 3692348212U, // <0,4,1,1>: Cost 4 vext2 <0,2,0,4>, <1,1,1,1> + 3692348310U, // <0,4,1,2>: Cost 4 vext2 <0,2,0,4>, <1,2,3,0> + 2568358064U, // <0,4,1,3>: Cost 3 vext1 <3,0,4,1>, <3,0,4,1> + 2568359222U, // <0,4,1,4>: Cost 3 vext1 <3,0,4,1>, RHS + 1812778294U, // <0,4,1,5>: Cost 2 vzipl LHS, RHS + 3022671158U, // <0,4,1,6>: Cost 3 vtrnl <0,4,1,5>, RHS + 2592248852U, // <0,4,1,7>: Cost 3 vext1 <7,0,4,1>, <7,0,4,1> + 1812778537U, // <0,4,1,u>: Cost 2 vzipl LHS, RHS + 2568364134U, // <0,4,2,0>: Cost 3 vext1 <3,0,4,2>, LHS + 2238573423U, // <0,4,2,1>: Cost 3 vrev <4,0,1,2> + 3692349032U, // <0,4,2,2>: Cost 4 vext2 <0,2,0,4>, <2,2,2,2> + 2631214761U, // <0,4,2,3>: Cost 3 vext2 <2,3,0,4>, <2,3,0,4> + 2568367414U, // <0,4,2,4>: Cost 3 vext1 <3,0,4,2>, RHS + 2887028022U, // <0,4,2,5>: Cost 3 vzipl <0,2,0,2>, RHS + 1946996022U, // <0,4,2,6>: Cost 2 vtrnl LHS, RHS + 2592257045U, // <0,4,2,7>: Cost 3 vext1 <7,0,4,2>, <7,0,4,2> + 1946996040U, // <0,4,2,u>: Cost 2 vtrnl LHS, RHS + 3692349590U, // <0,4,3,0>: Cost 4 vext2 <0,2,0,4>, <3,0,1,2> + 3826878614U, // <0,4,3,1>: Cost 4 vuzpl <0,2,4,6>, <3,0,1,2> + 3826878625U, // <0,4,3,2>: Cost 4 vuzpl <0,2,4,6>, <3,0,2,4> + 3692349852U, // <0,4,3,3>: Cost 4 vext2 <0,2,0,4>, <3,3,3,3> + 3692349954U, // <0,4,3,4>: Cost 4 vext2 <0,2,0,4>, <3,4,5,6> + 3826878978U, // <0,4,3,5>: Cost 4 vuzpl <0,2,4,6>, <3,4,5,6> + 4095200566U, // <0,4,3,6>: Cost 4 vtrnl <0,2,3,1>, RHS + 3713583814U, // <0,4,3,7>: Cost 4 vext2 <3,7,0,4>, <3,7,0,4> + 3692350238U, // <0,4,3,u>: Cost 4 vext2 <0,2,0,4>, <3,u,1,2> + 2550464552U, // <0,4,4,0>: Cost 3 vext1 <0,0,4,4>, <0,0,4,4> + 3962194914U, // <0,4,4,1>: Cost 4 vzipl <0,4,1,5>, <4,1,5,0> + 3693677631U, // <0,4,4,2>: Cost 4 vext2 <0,4,0,4>, <4,2,6,3> + 3642124467U, // <0,4,4,3>: Cost 4 vext1 <3,0,4,4>, <3,0,4,4> + 2718715088U, // <0,4,4,4>: Cost 3 vext3 <5,6,7,0>, <4,4,4,4> + 2618608950U, // <0,4,4,5>: Cost 3 vext2 <0,2,0,4>, RHS + 2753137974U, // <0,4,4,6>: Cost 3 vuzpl <0,2,4,6>, RHS + 3666015255U, // <0,4,4,7>: Cost 4 vext1 <7,0,4,4>, <7,0,4,4> + 2618609193U, // <0,4,4,u>: Cost 3 vext2 <0,2,0,4>, RHS + 2568388710U, // <0,4,5,0>: Cost 3 vext1 <3,0,4,5>, LHS + 2568389526U, // <0,4,5,1>: Cost 3 vext1 <3,0,4,5>, <1,2,3,0> + 3636159963U, // <0,4,5,2>: Cost 4 vext1 <2,0,4,5>, <2,0,4,5> + 2568390836U, // <0,4,5,3>: Cost 3 vext1 <3,0,4,5>, <3,0,4,5> + 2568391990U, // <0,4,5,4>: Cost 3 vext1 <3,0,4,5>, RHS + 2718715180U, // <0,4,5,5>: Cost 3 vext3 <5,6,7,0>, <4,5,5,6> + 1618136374U, // <0,4,5,6>: Cost 2 vext3 <1,2,3,0>, RHS + 2592281624U, // <0,4,5,7>: Cost 3 vext1 <7,0,4,5>, <7,0,4,5> + 1618136392U, // <0,4,5,u>: Cost 2 vext3 <1,2,3,0>, RHS + 2550480938U, // <0,4,6,0>: Cost 3 vext1 <0,0,4,6>, <0,0,4,6> + 3826880801U, // <0,4,6,1>: Cost 4 vuzpl <0,2,4,6>, <6,0,1,2> + 2562426332U, // <0,4,6,2>: Cost 3 vext1 <2,0,4,6>, <2,0,4,6> + 3786190181U, // <0,4,6,3>: Cost 4 vext3 <4,6,3,0>, <4,6,3,0> + 2718715252U, // <0,4,6,4>: Cost 3 vext3 <5,6,7,0>, <4,6,4,6> + 3826881165U, // <0,4,6,5>: Cost 4 vuzpl <0,2,4,6>, <6,4,5,6> + 2712669568U, // <0,4,6,6>: Cost 3 vext3 <4,6,6,0>, <4,6,6,0> + 2657760081U, // <0,4,6,7>: Cost 3 vext2 <6,7,0,4>, <6,7,0,4> + 2718715284U, // <0,4,6,u>: Cost 3 vext3 <5,6,7,0>, <4,6,u,2> + 3654090854U, // <0,4,7,0>: Cost 4 vext1 <5,0,4,7>, LHS + 3934229326U, // <0,4,7,1>: Cost 4 vuzpr <7,0,1,4>, <6,7,0,1> + 3734156437U, // <0,4,7,2>: Cost 4 vext2 <7,2,0,4>, <7,2,0,4> + 3734820070U, // <0,4,7,3>: Cost 4 vext2 
<7,3,0,4>, <7,3,0,4> + 3654094134U, // <0,4,7,4>: Cost 4 vext1 <5,0,4,7>, RHS + 2713259464U, // <0,4,7,5>: Cost 3 vext3 <4,7,5,0>, <4,7,5,0> + 2713333201U, // <0,4,7,6>: Cost 3 vext3 <4,7,6,0>, <4,7,6,0> + 3654095866U, // <0,4,7,7>: Cost 4 vext1 <5,0,4,7>, <7,0,1,2> + 2713259464U, // <0,4,7,u>: Cost 3 vext3 <4,7,5,0>, <4,7,5,0> + 2568413286U, // <0,4,u,0>: Cost 3 vext1 <3,0,4,u>, LHS + 2618611502U, // <0,4,u,1>: Cost 3 vext2 <0,2,0,4>, LHS + 2753140526U, // <0,4,u,2>: Cost 3 vuzpl <0,2,4,6>, LHS + 2568415415U, // <0,4,u,3>: Cost 3 vext1 <3,0,4,u>, <3,0,4,u> + 2568416566U, // <0,4,u,4>: Cost 3 vext1 <3,0,4,u>, RHS + 1817423158U, // <0,4,u,5>: Cost 2 vzipl LHS, RHS + 1947438390U, // <0,4,u,6>: Cost 2 vtrnl LHS, RHS + 2592306203U, // <0,4,u,7>: Cost 3 vext1 <7,0,4,u>, <7,0,4,u> + 1947438408U, // <0,4,u,u>: Cost 2 vtrnl LHS, RHS + 3630219264U, // <0,5,0,0>: Cost 4 vext1 <1,0,5,0>, <0,0,0,0> + 2625912934U, // <0,5,0,1>: Cost 3 vext2 <1,4,0,5>, LHS + 3692355748U, // <0,5,0,2>: Cost 4 vext2 <0,2,0,5>, <0,2,0,2> + 3693019384U, // <0,5,0,3>: Cost 4 vext2 <0,3,0,5>, <0,3,0,5> + 3630222646U, // <0,5,0,4>: Cost 4 vext1 <1,0,5,0>, RHS + 3699655062U, // <0,5,0,5>: Cost 4 vext2 <1,4,0,5>, <0,5,0,1> + 2718715508U, // <0,5,0,6>: Cost 3 vext3 <5,6,7,0>, <5,0,6,1> + 3087011126U, // <0,5,0,7>: Cost 3 vtrnr <0,0,0,0>, RHS + 2625913501U, // <0,5,0,u>: Cost 3 vext2 <1,4,0,5>, LHS + 1500659814U, // <0,5,1,0>: Cost 2 vext1 <4,0,5,1>, LHS + 2886520528U, // <0,5,1,1>: Cost 3 vzipl LHS, <5,1,7,3> + 2574403176U, // <0,5,1,2>: Cost 3 vext1 <4,0,5,1>, <2,2,2,2> + 2574403734U, // <0,5,1,3>: Cost 3 vext1 <4,0,5,1>, <3,0,1,2> + 1500662674U, // <0,5,1,4>: Cost 2 vext1 <4,0,5,1>, <4,0,5,1> + 2886520836U, // <0,5,1,5>: Cost 3 vzipl LHS, <5,5,5,5> + 2886520930U, // <0,5,1,6>: Cost 3 vzipl LHS, <5,6,7,0> + 2718715600U, // <0,5,1,7>: Cost 3 vext3 <5,6,7,0>, <5,1,7,3> + 1500665646U, // <0,5,1,u>: Cost 2 vext1 <4,0,5,1>, LHS + 2556493926U, // <0,5,2,0>: Cost 3 vext1 <1,0,5,2>, LHS + 2244546120U, // <0,5,2,1>: Cost 3 vrev <5,0,1,2> + 3692357256U, // <0,5,2,2>: Cost 4 vext2 <0,2,0,5>, <2,2,5,7> + 2568439994U, // <0,5,2,3>: Cost 3 vext1 <3,0,5,2>, <3,0,5,2> + 2556497206U, // <0,5,2,4>: Cost 3 vext1 <1,0,5,2>, RHS + 3020738564U, // <0,5,2,5>: Cost 3 vtrnl LHS, <5,5,5,5> + 4027877161U, // <0,5,2,6>: Cost 4 vzipr <0,2,0,2>, <2,4,5,6> + 3093220662U, // <0,5,2,7>: Cost 3 vtrnr <1,0,3,2>, RHS + 3093220663U, // <0,5,2,u>: Cost 3 vtrnr <1,0,3,2>, RHS + 3699656854U, // <0,5,3,0>: Cost 4 vext2 <1,4,0,5>, <3,0,1,2> + 3699656927U, // <0,5,3,1>: Cost 4 vext2 <1,4,0,5>, <3,1,0,3> + 3699657006U, // <0,5,3,2>: Cost 4 vext2 <1,4,0,5>, <3,2,0,1> + 3699657116U, // <0,5,3,3>: Cost 4 vext2 <1,4,0,5>, <3,3,3,3> + 2637859284U, // <0,5,3,4>: Cost 3 vext2 <3,4,0,5>, <3,4,0,5> + 3790319453U, // <0,5,3,5>: Cost 4 vext3 <5,3,5,0>, <5,3,5,0> + 3699657354U, // <0,5,3,6>: Cost 4 vext2 <1,4,0,5>, <3,6,2,7> + 2716725103U, // <0,5,3,7>: Cost 3 vext3 <5,3,7,0>, <5,3,7,0> + 2716798840U, // <0,5,3,u>: Cost 3 vext3 <5,3,u,0>, <5,3,u,0> + 2661747602U, // <0,5,4,0>: Cost 3 vext2 <7,4,0,5>, <4,0,5,1> + 3630252810U, // <0,5,4,1>: Cost 4 vext1 <1,0,5,4>, <1,0,5,4> + 3636225507U, // <0,5,4,2>: Cost 4 vext1 <2,0,5,4>, <2,0,5,4> + 3716910172U, // <0,5,4,3>: Cost 4 vext2 <4,3,0,5>, <4,3,0,5> + 3962195892U, // <0,5,4,4>: Cost 4 vzipl <0,4,1,5>, <5,4,5,6> + 2625916214U, // <0,5,4,5>: Cost 3 vext2 <1,4,0,5>, RHS + 3718901071U, // <0,5,4,6>: Cost 4 vext2 <4,6,0,5>, <4,6,0,5> + 2718715846U, // <0,5,4,7>: Cost 3 vext3 <5,6,7,0>, <5,4,7,6> + 2625916457U, // <0,5,4,u>: Cost 3 vext2 
<1,4,0,5>, RHS + 3791278034U, // <0,5,5,0>: Cost 4 vext3 <5,5,0,0>, <5,5,0,0> + 3791351771U, // <0,5,5,1>: Cost 4 vext3 <5,5,1,0>, <5,5,1,0> + 3318386260U, // <0,5,5,2>: Cost 4 vrev <5,0,2,5> + 3791499245U, // <0,5,5,3>: Cost 4 vext3 <5,5,3,0>, <5,5,3,0> + 3318533734U, // <0,5,5,4>: Cost 4 vrev <5,0,4,5> + 2718715908U, // <0,5,5,5>: Cost 3 vext3 <5,6,7,0>, <5,5,5,5> + 2657767522U, // <0,5,5,6>: Cost 3 vext2 <6,7,0,5>, <5,6,7,0> + 2718715928U, // <0,5,5,7>: Cost 3 vext3 <5,6,7,0>, <5,5,7,7> + 2718715937U, // <0,5,5,u>: Cost 3 vext3 <5,6,7,0>, <5,5,u,7> + 2592358502U, // <0,5,6,0>: Cost 3 vext1 <7,0,5,6>, LHS + 3792015404U, // <0,5,6,1>: Cost 4 vext3 <5,6,1,0>, <5,6,1,0> + 3731509754U, // <0,5,6,2>: Cost 4 vext2 <6,7,0,5>, <6,2,7,3> + 3785748546U, // <0,5,6,3>: Cost 4 vext3 <4,5,6,0>, <5,6,3,4> + 2592361782U, // <0,5,6,4>: Cost 3 vext1 <7,0,5,6>, RHS + 2592362594U, // <0,5,6,5>: Cost 3 vext1 <7,0,5,6>, <5,6,7,0> + 3785748576U, // <0,5,6,6>: Cost 4 vext3 <4,5,6,0>, <5,6,6,7> + 1644974178U, // <0,5,6,7>: Cost 2 vext3 <5,6,7,0>, <5,6,7,0> + 1645047915U, // <0,5,6,u>: Cost 2 vext3 <5,6,u,0>, <5,6,u,0> + 2562506854U, // <0,5,7,0>: Cost 3 vext1 <2,0,5,7>, LHS + 2562507670U, // <0,5,7,1>: Cost 3 vext1 <2,0,5,7>, <1,2,3,0> + 2562508262U, // <0,5,7,2>: Cost 3 vext1 <2,0,5,7>, <2,0,5,7> + 3636250774U, // <0,5,7,3>: Cost 4 vext1 <2,0,5,7>, <3,0,1,2> + 2562510134U, // <0,5,7,4>: Cost 3 vext1 <2,0,5,7>, RHS + 2718716072U, // <0,5,7,5>: Cost 3 vext3 <5,6,7,0>, <5,7,5,7> + 2718716074U, // <0,5,7,6>: Cost 3 vext3 <5,6,7,0>, <5,7,6,0> + 2719379635U, // <0,5,7,7>: Cost 3 vext3 <5,7,7,0>, <5,7,7,0> + 2562512686U, // <0,5,7,u>: Cost 3 vext1 <2,0,5,7>, LHS + 1500717158U, // <0,5,u,0>: Cost 2 vext1 <4,0,5,u>, LHS + 2625918766U, // <0,5,u,1>: Cost 3 vext2 <1,4,0,5>, LHS + 2719674583U, // <0,5,u,2>: Cost 3 vext3 <5,u,2,0>, <5,u,2,0> + 2568489152U, // <0,5,u,3>: Cost 3 vext1 <3,0,5,u>, <3,0,5,u> + 1500720025U, // <0,5,u,4>: Cost 2 vext1 <4,0,5,u>, <4,0,5,u> + 2625919130U, // <0,5,u,5>: Cost 3 vext2 <1,4,0,5>, RHS + 2586407243U, // <0,5,u,6>: Cost 3 vext1 <6,0,5,u>, <6,0,5,u> + 1646301444U, // <0,5,u,7>: Cost 2 vext3 <5,u,7,0>, <5,u,7,0> + 1646375181U, // <0,5,u,u>: Cost 2 vext3 <5,u,u,0>, <5,u,u,0> + 2586411110U, // <0,6,0,0>: Cost 3 vext1 <6,0,6,0>, LHS + 2619949158U, // <0,6,0,1>: Cost 3 vext2 <0,4,0,6>, LHS + 2619949220U, // <0,6,0,2>: Cost 3 vext2 <0,4,0,6>, <0,2,0,2> + 3785748789U, // <0,6,0,3>: Cost 4 vext3 <4,5,6,0>, <6,0,3,4> + 2619949386U, // <0,6,0,4>: Cost 3 vext2 <0,4,0,6>, <0,4,0,6> + 2586415202U, // <0,6,0,5>: Cost 3 vext1 <6,0,6,0>, <5,6,7,0> + 2586415436U, // <0,6,0,6>: Cost 3 vext1 <6,0,6,0>, <6,0,6,0> + 2952793398U, // <0,6,0,7>: Cost 3 vzipr <0,0,0,0>, RHS + 2619949725U, // <0,6,0,u>: Cost 3 vext2 <0,4,0,6>, LHS + 2562531430U, // <0,6,1,0>: Cost 3 vext1 <2,0,6,1>, LHS + 3693691700U, // <0,6,1,1>: Cost 4 vext2 <0,4,0,6>, <1,1,1,1> + 2886521338U, // <0,6,1,2>: Cost 3 vzipl LHS, <6,2,7,3> + 3693691864U, // <0,6,1,3>: Cost 4 vext2 <0,4,0,6>, <1,3,1,3> + 2562534710U, // <0,6,1,4>: Cost 3 vext1 <2,0,6,1>, RHS + 2580450932U, // <0,6,1,5>: Cost 3 vext1 <5,0,6,1>, <5,0,6,1> + 2886521656U, // <0,6,1,6>: Cost 3 vzipl LHS, <6,6,6,6> + 2966736182U, // <0,6,1,7>: Cost 3 vzipr <2,3,0,1>, RHS + 2966736183U, // <0,6,1,u>: Cost 3 vzipr <2,3,0,1>, RHS + 1500741734U, // <0,6,2,0>: Cost 2 vext1 <4,0,6,2>, LHS + 2250518817U, // <0,6,2,1>: Cost 3 vrev <6,0,1,2> + 2574485096U, // <0,6,2,2>: Cost 3 vext1 <4,0,6,2>, <2,2,2,2> + 2631894694U, // <0,6,2,3>: Cost 3 vext2 <2,4,0,6>, <2,3,0,1> + 1500744604U, // <0,6,2,4>: Cost 2 
vext1 <4,0,6,2>, <4,0,6,2> + 2574487248U, // <0,6,2,5>: Cost 3 vext1 <4,0,6,2>, <5,1,7,3> + 3020739384U, // <0,6,2,6>: Cost 3 vtrnl LHS, <6,6,6,6> + 2954136886U, // <0,6,2,7>: Cost 3 vzipr <0,2,0,2>, RHS + 1500747566U, // <0,6,2,u>: Cost 2 vext1 <4,0,6,2>, LHS + 3693693078U, // <0,6,3,0>: Cost 4 vext2 <0,4,0,6>, <3,0,1,2> + 3705637136U, // <0,6,3,1>: Cost 4 vext2 <2,4,0,6>, <3,1,5,7> + 3705637192U, // <0,6,3,2>: Cost 4 vext2 <2,4,0,6>, <3,2,3,0> + 3693693340U, // <0,6,3,3>: Cost 4 vext2 <0,4,0,6>, <3,3,3,3> + 2637867477U, // <0,6,3,4>: Cost 3 vext2 <3,4,0,6>, <3,4,0,6> + 3705637424U, // <0,6,3,5>: Cost 4 vext2 <2,4,0,6>, <3,5,1,7> + 3666154056U, // <0,6,3,6>: Cost 4 vext1 <7,0,6,3>, <6,3,7,0> + 2722697800U, // <0,6,3,7>: Cost 3 vext3 <6,3,7,0>, <6,3,7,0> + 2722771537U, // <0,6,3,u>: Cost 3 vext3 <6,3,u,0>, <6,3,u,0> + 2562556006U, // <0,6,4,0>: Cost 3 vext1 <2,0,6,4>, LHS + 4095316257U, // <0,6,4,1>: Cost 4 vtrnl <0,2,4,6>, <6,0,1,2> + 2562557420U, // <0,6,4,2>: Cost 3 vext1 <2,0,6,4>, <2,0,6,4> + 3636299926U, // <0,6,4,3>: Cost 4 vext1 <2,0,6,4>, <3,0,1,2> + 2562559286U, // <0,6,4,4>: Cost 3 vext1 <2,0,6,4>, RHS + 2619952438U, // <0,6,4,5>: Cost 3 vext2 <0,4,0,6>, RHS + 2723287696U, // <0,6,4,6>: Cost 3 vext3 <6,4,6,0>, <6,4,6,0> + 4027895094U, // <0,6,4,7>: Cost 4 vzipr <0,2,0,4>, RHS + 2619952681U, // <0,6,4,u>: Cost 3 vext2 <0,4,0,6>, RHS + 2718716594U, // <0,6,5,0>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7> + 3648250774U, // <0,6,5,1>: Cost 4 vext1 <4,0,6,5>, <1,2,3,0> + 3792458436U, // <0,6,5,2>: Cost 4 vext3 <5,6,7,0>, <6,5,2,7> + 3705638767U, // <0,6,5,3>: Cost 5 vext2 <2,4,0,6>, <5,3,7,0> + 3648252831U, // <0,6,5,4>: Cost 4 vext1 <4,0,6,5>, <4,0,6,5> + 3797619416U, // <0,6,5,5>: Cost 4 vext3 <6,5,5,0>, <6,5,5,0> + 3792458472U, // <0,6,5,6>: Cost 4 vext3 <5,6,7,0>, <6,5,6,7> + 4035202358U, // <0,6,5,7>: Cost 4 vzipr <1,4,0,5>, RHS + 2718716594U, // <0,6,5,u>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7> + 3786412796U, // <0,6,6,0>: Cost 4 vext3 <4,6,6,0>, <6,6,0,0> + 3792458504U, // <0,6,6,1>: Cost 4 vext3 <5,6,7,0>, <6,6,1,3> + 3728200126U, // <0,6,6,2>: Cost 4 vext2 <6,2,0,6>, <6,2,0,6> + 3798135575U, // <0,6,6,3>: Cost 4 vext3 <6,6,3,0>, <6,6,3,0> + 3786412836U, // <0,6,6,4>: Cost 4 vext3 <4,6,6,0>, <6,6,4,4> + 3792458543U, // <0,6,6,5>: Cost 4 vext3 <5,6,7,0>, <6,6,5,6> + 2718716728U, // <0,6,6,6>: Cost 3 vext3 <5,6,7,0>, <6,6,6,6> + 2718716738U, // <0,6,6,7>: Cost 3 vext3 <5,6,7,0>, <6,6,7,7> + 2718716747U, // <0,6,6,u>: Cost 3 vext3 <5,6,7,0>, <6,6,u,7> + 2718716750U, // <0,6,7,0>: Cost 3 vext3 <5,6,7,0>, <6,7,0,1> + 2724909910U, // <0,6,7,1>: Cost 3 vext3 <6,7,1,0>, <6,7,1,0> + 3636323823U, // <0,6,7,2>: Cost 4 vext1 <2,0,6,7>, <2,0,6,7> + 2725057384U, // <0,6,7,3>: Cost 3 vext3 <6,7,3,0>, <6,7,3,0> + 2718716790U, // <0,6,7,4>: Cost 3 vext3 <5,6,7,0>, <6,7,4,5> + 2718716800U, // <0,6,7,5>: Cost 3 vext3 <5,6,7,0>, <6,7,5,6> + 3792458629U, // <0,6,7,6>: Cost 4 vext3 <5,6,7,0>, <6,7,6,2> + 2725352332U, // <0,6,7,7>: Cost 3 vext3 <6,7,7,0>, <6,7,7,0> + 2718716822U, // <0,6,7,u>: Cost 3 vext3 <5,6,7,0>, <6,7,u,1> + 1500790886U, // <0,6,u,0>: Cost 2 vext1 <4,0,6,u>, LHS + 2619954990U, // <0,6,u,1>: Cost 3 vext2 <0,4,0,6>, LHS + 2562590192U, // <0,6,u,2>: Cost 3 vext1 <2,0,6,u>, <2,0,6,u> + 2725721017U, // <0,6,u,3>: Cost 3 vext3 <6,u,3,0>, <6,u,3,0> + 1500793762U, // <0,6,u,4>: Cost 2 vext1 <4,0,6,u>, <4,0,6,u> + 2619955354U, // <0,6,u,5>: Cost 3 vext2 <0,4,0,6>, RHS + 2725942228U, // <0,6,u,6>: Cost 3 vext3 <6,u,6,0>, <6,u,6,0> + 2954186038U, // <0,6,u,7>: Cost 3 vzipr <0,2,0,u>, RHS + 1500796718U, 
// <0,6,u,u>: Cost 2 vext1 <4,0,6,u>, LHS + 2256401391U, // <0,7,0,0>: Cost 3 vrev <7,0,0,0> + 2632564838U, // <0,7,0,1>: Cost 3 vext2 <2,5,0,7>, LHS + 2256548865U, // <0,7,0,2>: Cost 3 vrev <7,0,2,0> + 3700998396U, // <0,7,0,3>: Cost 4 vext2 <1,6,0,7>, <0,3,1,0> + 2718716952U, // <0,7,0,4>: Cost 3 vext3 <5,6,7,0>, <7,0,4,5> + 2718716962U, // <0,7,0,5>: Cost 3 vext3 <5,6,7,0>, <7,0,5,6> + 2621284845U, // <0,7,0,6>: Cost 3 vext2 <0,6,0,7>, <0,6,0,7> + 3904685542U, // <0,7,0,7>: Cost 4 vuzpr <2,0,5,7>, <2,0,5,7> + 2632565405U, // <0,7,0,u>: Cost 3 vext2 <2,5,0,7>, LHS + 2256409584U, // <0,7,1,0>: Cost 3 vrev <7,0,0,1> + 3706307380U, // <0,7,1,1>: Cost 4 vext2 <2,5,0,7>, <1,1,1,1> + 2632565654U, // <0,7,1,2>: Cost 3 vext2 <2,5,0,7>, <1,2,3,0> + 3769603168U, // <0,7,1,3>: Cost 4 vext3 <1,u,3,0>, <7,1,3,5> + 2256704532U, // <0,7,1,4>: Cost 3 vrev <7,0,4,1> + 3769603184U, // <0,7,1,5>: Cost 4 vext3 <1,u,3,0>, <7,1,5,3> + 3700999366U, // <0,7,1,6>: Cost 4 vext2 <1,6,0,7>, <1,6,0,7> + 2886522476U, // <0,7,1,7>: Cost 3 vzipl LHS, <7,7,7,7> + 2256999480U, // <0,7,1,u>: Cost 3 vrev <7,0,u,1> + 2586501222U, // <0,7,2,0>: Cost 3 vext1 <6,0,7,2>, LHS + 1182749690U, // <0,7,2,1>: Cost 2 vrev <7,0,1,2> + 3636356595U, // <0,7,2,2>: Cost 4 vext1 <2,0,7,2>, <2,0,7,2> + 2727711916U, // <0,7,2,3>: Cost 3 vext3 <7,2,3,0>, <7,2,3,0> + 2586504502U, // <0,7,2,4>: Cost 3 vext1 <6,0,7,2>, RHS + 2632566606U, // <0,7,2,5>: Cost 3 vext2 <2,5,0,7>, <2,5,0,7> + 2586505559U, // <0,7,2,6>: Cost 3 vext1 <6,0,7,2>, <6,0,7,2> + 3020740204U, // <0,7,2,7>: Cost 3 vtrnl LHS, <7,7,7,7> + 1183265849U, // <0,7,2,u>: Cost 2 vrev <7,0,u,2> + 3701000342U, // <0,7,3,0>: Cost 4 vext2 <1,6,0,7>, <3,0,1,2> + 3706308849U, // <0,7,3,1>: Cost 4 vext2 <2,5,0,7>, <3,1,2,3> + 3330315268U, // <0,7,3,2>: Cost 4 vrev <7,0,2,3> + 3706309020U, // <0,7,3,3>: Cost 4 vext2 <2,5,0,7>, <3,3,3,3> + 3706309122U, // <0,7,3,4>: Cost 4 vext2 <2,5,0,7>, <3,4,5,6> + 3712281127U, // <0,7,3,5>: Cost 4 vext2 <3,5,0,7>, <3,5,0,7> + 2639202936U, // <0,7,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7> + 3802412321U, // <0,7,3,7>: Cost 4 vext3 <7,3,7,0>, <7,3,7,0> + 2640530202U, // <0,7,3,u>: Cost 3 vext2 <3,u,0,7>, <3,u,0,7> + 3654287462U, // <0,7,4,0>: Cost 4 vext1 <5,0,7,4>, LHS + 2256507900U, // <0,7,4,1>: Cost 3 vrev <7,0,1,4> + 2256581637U, // <0,7,4,2>: Cost 3 vrev <7,0,2,4> + 3660262008U, // <0,7,4,3>: Cost 4 vext1 <6,0,7,4>, <3,6,0,7> + 3786413405U, // <0,7,4,4>: Cost 4 vext3 <4,6,6,0>, <7,4,4,6> + 2632568118U, // <0,7,4,5>: Cost 3 vext2 <2,5,0,7>, RHS + 3718917457U, // <0,7,4,6>: Cost 4 vext2 <4,6,0,7>, <4,6,0,7> + 3787003255U, // <0,7,4,7>: Cost 4 vext3 <4,7,5,0>, <7,4,7,5> + 2632568361U, // <0,7,4,u>: Cost 3 vext2 <2,5,0,7>, RHS + 3706310268U, // <0,7,5,0>: Cost 4 vext2 <2,5,0,7>, <5,0,7,0> + 3792459156U, // <0,7,5,1>: Cost 4 vext3 <5,6,7,0>, <7,5,1,7> + 3330331654U, // <0,7,5,2>: Cost 4 vrev <7,0,2,5> + 3722899255U, // <0,7,5,3>: Cost 4 vext2 <5,3,0,7>, <5,3,0,7> + 2256737304U, // <0,7,5,4>: Cost 3 vrev <7,0,4,5> + 3724226521U, // <0,7,5,5>: Cost 4 vext2 <5,5,0,7>, <5,5,0,7> + 2718717377U, // <0,7,5,6>: Cost 3 vext3 <5,6,7,0>, <7,5,6,7> + 2729997763U, // <0,7,5,7>: Cost 3 vext3 <7,5,7,0>, <7,5,7,0> + 2720044499U, // <0,7,5,u>: Cost 3 vext3 <5,u,7,0>, <7,5,u,7> + 3712946517U, // <0,7,6,0>: Cost 4 vext2 <3,6,0,7>, <6,0,7,0> + 2256524286U, // <0,7,6,1>: Cost 3 vrev <7,0,1,6> + 3792459246U, // <0,7,6,2>: Cost 4 vext3 <5,6,7,0>, <7,6,2,7> + 3796440567U, // <0,7,6,3>: Cost 4 vext3 <6,3,7,0>, <7,6,3,7> + 3654307126U, // <0,7,6,4>: Cost 4 vext1 <5,0,7,6>, RHS + 
2656457394U, // <0,7,6,5>: Cost 3 vext2 <6,5,0,7>, <6,5,0,7> + 3792459281U, // <0,7,6,6>: Cost 4 vext3 <5,6,7,0>, <7,6,6,6> + 2730661396U, // <0,7,6,7>: Cost 3 vext3 <7,6,7,0>, <7,6,7,0> + 2658448293U, // <0,7,6,u>: Cost 3 vext2 <6,u,0,7>, <6,u,0,7> + 3787003431U, // <0,7,7,0>: Cost 4 vext3 <4,7,5,0>, <7,7,0,1> + 3654312854U, // <0,7,7,1>: Cost 4 vext1 <5,0,7,7>, <1,2,3,0> + 3654313446U, // <0,7,7,2>: Cost 4 vext1 <5,0,7,7>, <2,0,5,7> + 3804771905U, // <0,7,7,3>: Cost 4 vext3 <7,7,3,0>, <7,7,3,0> + 3654315318U, // <0,7,7,4>: Cost 4 vext1 <5,0,7,7>, RHS + 3654315651U, // <0,7,7,5>: Cost 4 vext1 <5,0,7,7>, <5,0,7,7> + 3660288348U, // <0,7,7,6>: Cost 4 vext1 <6,0,7,7>, <6,0,7,7> + 2718717548U, // <0,7,7,7>: Cost 3 vext3 <5,6,7,0>, <7,7,7,7> + 2664420990U, // <0,7,7,u>: Cost 3 vext2 <7,u,0,7>, <7,u,0,7> + 2256466935U, // <0,7,u,0>: Cost 3 vrev <7,0,0,u> + 1182798848U, // <0,7,u,1>: Cost 2 vrev <7,0,1,u> + 2256614409U, // <0,7,u,2>: Cost 3 vrev <7,0,2,u> + 2731693714U, // <0,7,u,3>: Cost 3 vext3 <7,u,3,0>, <7,u,3,0> + 2256761883U, // <0,7,u,4>: Cost 3 vrev <7,0,4,u> + 2632571034U, // <0,7,u,5>: Cost 3 vext2 <2,5,0,7>, RHS + 2669066421U, // <0,7,u,6>: Cost 3 vext2 , + 2731988662U, // <0,7,u,7>: Cost 3 vext3 <7,u,7,0>, <7,u,7,0> + 1183315007U, // <0,7,u,u>: Cost 2 vrev <7,0,u,u> + 135053414U, // <0,u,0,0>: Cost 1 vdup0 LHS + 1544896614U, // <0,u,0,1>: Cost 2 vext2 <0,2,0,u>, LHS + 1678999654U, // <0,u,0,2>: Cost 2 vuzpl LHS, LHS + 2691880677U, // <0,u,0,3>: Cost 3 vext3 <1,2,3,0>, + 1476988214U, // <0,u,0,4>: Cost 2 vext1 <0,0,u,0>, RHS + 2718791419U, // <0,u,0,5>: Cost 3 vext3 <5,6,u,0>, + 3021248666U, // <0,u,0,6>: Cost 3 vtrnl <0,2,0,2>, RHS + 2592535607U, // <0,u,0,7>: Cost 3 vext1 <7,0,u,0>, <7,0,u,0> + 135053414U, // <0,u,0,u>: Cost 1 vdup0 LHS + 1476993097U, // <0,u,1,0>: Cost 2 vext1 <0,0,u,1>, <0,0,u,1> + 1812780846U, // <0,u,1,1>: Cost 2 vzipl LHS, LHS + 1618138926U, // <0,u,1,2>: Cost 2 vext3 <1,2,3,0>, LHS + 2752742134U, // <0,u,1,3>: Cost 3 vuzpl LHS, <1,0,3,2> + 1476996406U, // <0,u,1,4>: Cost 2 vext1 <0,0,u,1>, RHS + 1812781210U, // <0,u,1,5>: Cost 2 vzipl LHS, RHS + 2887006416U, // <0,u,1,6>: Cost 3 vzipl LHS, + 2966736200U, // <0,u,1,7>: Cost 3 vzipr <2,3,0,1>, RHS + 1812781413U, // <0,u,1,u>: Cost 2 vzipl LHS, LHS + 1482973286U, // <0,u,2,0>: Cost 2 vext1 <1,0,u,2>, LHS + 1482973987U, // <0,u,2,1>: Cost 2 vext1 <1,0,u,2>, <1,0,u,2> + 1946998574U, // <0,u,2,2>: Cost 2 vtrnl LHS, LHS + 835584U, // <0,u,2,3>: Cost 0 copy LHS + 1482976566U, // <0,u,2,4>: Cost 2 vext1 <1,0,u,2>, RHS + 3020781631U, // <0,u,2,5>: Cost 3 vtrnl LHS, + 1946998938U, // <0,u,2,6>: Cost 2 vtrnl LHS, RHS + 1518810169U, // <0,u,2,7>: Cost 2 vext1 <7,0,u,2>, <7,0,u,2> + 835584U, // <0,u,2,u>: Cost 0 copy LHS + 2618640534U, // <0,u,3,0>: Cost 3 vext2 <0,2,0,u>, <3,0,1,2> + 2752743574U, // <0,u,3,1>: Cost 3 vuzpl LHS, <3,0,1,2> + 2636556597U, // <0,u,3,2>: Cost 3 vext2 <3,2,0,u>, <3,2,0,u> + 2752743836U, // <0,u,3,3>: Cost 3 vuzpl LHS, <3,3,3,3> + 2618640898U, // <0,u,3,4>: Cost 3 vext2 <0,2,0,u>, <3,4,5,6> + 2752743938U, // <0,u,3,5>: Cost 3 vuzpl LHS, <3,4,5,6> + 2639202936U, // <0,u,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7> + 2639874762U, // <0,u,3,7>: Cost 3 vext2 <3,7,0,u>, <3,7,0,u> + 2752743637U, // <0,u,3,u>: Cost 3 vuzpl LHS, <3,0,u,2> + 2562703462U, // <0,u,4,0>: Cost 3 vext1 <2,0,u,4>, LHS + 2888455982U, // <0,u,4,1>: Cost 3 vzipl <0,4,1,5>, LHS + 3021575982U, // <0,u,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS + 2568677591U, // <0,u,4,3>: Cost 3 vext1 <3,0,u,4>, <3,0,u,4> + 2562706742U, // <0,u,4,4>: Cost 3 vext1 
<2,0,u,4>, RHS + 1544899894U, // <0,u,4,5>: Cost 2 vext2 <0,2,0,u>, RHS + 1679002934U, // <0,u,4,6>: Cost 2 vuzpl LHS, RHS + 2718718033U, // <0,u,4,7>: Cost 3 vext3 <5,6,7,0>, + 1679002952U, // <0,u,4,u>: Cost 2 vuzpl LHS, RHS + 2568683622U, // <0,u,5,0>: Cost 3 vext1 <3,0,u,5>, LHS + 2568684438U, // <0,u,5,1>: Cost 3 vext1 <3,0,u,5>, <1,2,3,0> + 3765622902U, // <0,u,5,2>: Cost 4 vext3 <1,2,3,0>, + 2691881087U, // <0,u,5,3>: Cost 3 vext3 <1,2,3,0>, + 2568686902U, // <0,u,5,4>: Cost 3 vext1 <3,0,u,5>, RHS + 2650492890U, // <0,u,5,5>: Cost 3 vext2 <5,5,0,u>, <5,5,0,u> + 1618139290U, // <0,u,5,6>: Cost 2 vext3 <1,2,3,0>, RHS + 2824834358U, // <0,u,5,7>: Cost 3 vuzpr <1,0,3,u>, RHS + 1618139308U, // <0,u,5,u>: Cost 2 vext3 <1,2,3,0>, RHS + 2592579686U, // <0,u,6,0>: Cost 3 vext1 <7,0,u,6>, LHS + 2262496983U, // <0,u,6,1>: Cost 3 vrev + 2654474688U, // <0,u,6,2>: Cost 3 vext2 <6,2,0,u>, <6,2,0,u> + 2691881168U, // <0,u,6,3>: Cost 3 vext3 <1,2,3,0>, + 2592582966U, // <0,u,6,4>: Cost 3 vext1 <7,0,u,6>, RHS + 2656465587U, // <0,u,6,5>: Cost 3 vext2 <6,5,0,u>, <6,5,0,u> + 2657129220U, // <0,u,6,6>: Cost 3 vext2 <6,6,0,u>, <6,6,0,u> + 1584051029U, // <0,u,6,7>: Cost 2 vext2 <6,7,0,u>, <6,7,0,u> + 1584714662U, // <0,u,6,u>: Cost 2 vext2 <6,u,0,u>, <6,u,0,u> + 2562728038U, // <0,u,7,0>: Cost 3 vext1 <2,0,u,7>, LHS + 2562728854U, // <0,u,7,1>: Cost 3 vext1 <2,0,u,7>, <1,2,3,0> + 2562729473U, // <0,u,7,2>: Cost 3 vext1 <2,0,u,7>, <2,0,u,7> + 2661111018U, // <0,u,7,3>: Cost 3 vext2 <7,3,0,u>, <7,3,0,u> + 2562731318U, // <0,u,7,4>: Cost 3 vext1 <2,0,u,7>, RHS + 2718718258U, // <0,u,7,5>: Cost 3 vext3 <5,6,7,0>, + 2586620261U, // <0,u,7,6>: Cost 3 vext1 <6,0,u,7>, <6,0,u,7> + 2657793644U, // <0,u,7,7>: Cost 3 vext2 <6,7,0,u>, <7,7,7,7> + 2562733870U, // <0,u,7,u>: Cost 3 vext1 <2,0,u,7>, LHS + 135053414U, // <0,u,u,0>: Cost 1 vdup0 LHS + 1544902446U, // <0,u,u,1>: Cost 2 vext2 <0,2,0,u>, LHS + 1679005486U, // <0,u,u,2>: Cost 2 vuzpl LHS, LHS + 835584U, // <0,u,u,3>: Cost 0 copy LHS + 1483025718U, // <0,u,u,4>: Cost 2 vext1 <1,0,u,u>, RHS + 1544902810U, // <0,u,u,5>: Cost 2 vext2 <0,2,0,u>, RHS + 1679005850U, // <0,u,u,6>: Cost 2 vuzpl LHS, RHS + 1518859327U, // <0,u,u,7>: Cost 2 vext1 <7,0,u,u>, <7,0,u,u> + 835584U, // <0,u,u,u>: Cost 0 copy LHS + 2689744896U, // <1,0,0,0>: Cost 3 vext3 <0,u,1,1>, <0,0,0,0> + 1610694666U, // <1,0,0,1>: Cost 2 vext3 <0,0,1,1>, <0,0,1,1> + 2689744916U, // <1,0,0,2>: Cost 3 vext3 <0,u,1,1>, <0,0,2,2> + 2619310332U, // <1,0,0,3>: Cost 3 vext2 <0,3,1,0>, <0,3,1,0> + 2684657701U, // <1,0,0,4>: Cost 3 vext3 <0,0,4,1>, <0,0,4,1> + 2620637598U, // <1,0,0,5>: Cost 3 vext2 <0,5,1,0>, <0,5,1,0> + 3708977654U, // <1,0,0,6>: Cost 4 vext2 <3,0,1,0>, <0,6,1,7> + 3666351168U, // <1,0,0,7>: Cost 4 vext1 <7,1,0,0>, <7,1,0,0> + 1611210825U, // <1,0,0,u>: Cost 2 vext3 <0,0,u,1>, <0,0,u,1> + 2556780646U, // <1,0,1,0>: Cost 3 vext1 <1,1,0,1>, LHS + 2556781355U, // <1,0,1,1>: Cost 3 vext1 <1,1,0,1>, <1,1,0,1> + 1616003174U, // <1,0,1,2>: Cost 2 vext3 <0,u,1,1>, LHS + 3693052888U, // <1,0,1,3>: Cost 4 vext2 <0,3,1,0>, <1,3,1,3> + 2556783926U, // <1,0,1,4>: Cost 3 vext1 <1,1,0,1>, RHS + 2580672143U, // <1,0,1,5>: Cost 3 vext1 <5,1,0,1>, <5,1,0,1> + 2724839566U, // <1,0,1,6>: Cost 3 vext3 <6,7,0,1>, <0,1,6,7> + 3654415354U, // <1,0,1,7>: Cost 4 vext1 <5,1,0,1>, <7,0,1,2> + 1616003228U, // <1,0,1,u>: Cost 2 vext3 <0,u,1,1>, LHS + 2685690019U, // <1,0,2,0>: Cost 3 vext3 <0,2,0,1>, <0,2,0,1> + 2685763756U, // <1,0,2,1>: Cost 3 vext3 <0,2,1,1>, <0,2,1,1> + 2698297524U, // <1,0,2,2>: Cost 3 vext3 
<2,3,0,1>, <0,2,2,0> + 2685911230U, // <1,0,2,3>: Cost 3 vext3 <0,2,3,1>, <0,2,3,1> + 2689745100U, // <1,0,2,4>: Cost 3 vext3 <0,u,1,1>, <0,2,4,6> + 3764814038U, // <1,0,2,5>: Cost 4 vext3 <1,1,1,1>, <0,2,5,7> + 2724839640U, // <1,0,2,6>: Cost 3 vext3 <6,7,0,1>, <0,2,6,0> + 2592625658U, // <1,0,2,7>: Cost 3 vext1 <7,1,0,2>, <7,0,1,2> + 2686279915U, // <1,0,2,u>: Cost 3 vext3 <0,2,u,1>, <0,2,u,1> + 3087843328U, // <1,0,3,0>: Cost 3 vtrnr LHS, <0,0,0,0> + 3087843338U, // <1,0,3,1>: Cost 3 vtrnr LHS, <0,0,1,1> + 67944550U, // <1,0,3,2>: Cost 1 vrev LHS + 2568743135U, // <1,0,3,3>: Cost 3 vext1 <3,1,0,3>, <3,1,0,3> + 2562772278U, // <1,0,3,4>: Cost 3 vext1 <2,1,0,3>, RHS + 4099850454U, // <1,0,3,5>: Cost 4 vtrnl <1,0,3,2>, <0,2,5,7> + 3704998538U, // <1,0,3,6>: Cost 4 vext2 <2,3,1,0>, <3,6,2,7> + 2592633923U, // <1,0,3,7>: Cost 3 vext1 <7,1,0,3>, <7,1,0,3> + 68386972U, // <1,0,3,u>: Cost 1 vrev LHS + 2620640146U, // <1,0,4,0>: Cost 3 vext2 <0,5,1,0>, <4,0,5,1> + 2689745234U, // <1,0,4,1>: Cost 3 vext3 <0,u,1,1>, <0,4,1,5> + 2689745244U, // <1,0,4,2>: Cost 3 vext3 <0,u,1,1>, <0,4,2,6> + 3760980320U, // <1,0,4,3>: Cost 4 vext3 <0,4,3,1>, <0,4,3,1> + 3761054057U, // <1,0,4,4>: Cost 4 vext3 <0,4,4,1>, <0,4,4,1> + 2619313462U, // <1,0,4,5>: Cost 3 vext2 <0,3,1,0>, RHS + 3761201531U, // <1,0,4,6>: Cost 4 vext3 <0,4,6,1>, <0,4,6,1> + 3666383940U, // <1,0,4,7>: Cost 4 vext1 <7,1,0,4>, <7,1,0,4> + 2619313705U, // <1,0,4,u>: Cost 3 vext2 <0,3,1,0>, RHS + 4029300736U, // <1,0,5,0>: Cost 4 vzipr <0,4,1,5>, <0,0,0,0> + 2895249510U, // <1,0,5,1>: Cost 3 vzipl <1,5,3,7>, LHS + 3028287590U, // <1,0,5,2>: Cost 3 vtrnl <1,3,5,7>, LHS + 3642501345U, // <1,0,5,3>: Cost 4 vext1 <3,1,0,5>, <3,1,0,5> + 2215592058U, // <1,0,5,4>: Cost 3 vrev <0,1,4,5> + 3724242907U, // <1,0,5,5>: Cost 4 vext2 <5,5,1,0>, <5,5,1,0> + 3724906540U, // <1,0,5,6>: Cost 4 vext2 <5,6,1,0>, <5,6,1,0> + 3911118134U, // <1,0,5,7>: Cost 4 vuzpr <3,1,3,0>, RHS + 3028287644U, // <1,0,5,u>: Cost 3 vtrnl <1,3,5,7>, LHS + 3762086375U, // <1,0,6,0>: Cost 4 vext3 <0,6,0,1>, <0,6,0,1> + 2698297846U, // <1,0,6,1>: Cost 3 vext3 <2,3,0,1>, <0,6,1,7> + 3760022015U, // <1,0,6,2>: Cost 4 vext3 <0,2,u,1>, <0,6,2,7> + 3642509538U, // <1,0,6,3>: Cost 4 vext1 <3,1,0,6>, <3,1,0,6> + 3762381323U, // <1,0,6,4>: Cost 4 vext3 <0,6,4,1>, <0,6,4,1> + 3730215604U, // <1,0,6,5>: Cost 4 vext2 <6,5,1,0>, <6,5,1,0> + 3730879237U, // <1,0,6,6>: Cost 4 vext2 <6,6,1,0>, <6,6,1,0> + 2657801046U, // <1,0,6,7>: Cost 3 vext2 <6,7,1,0>, <6,7,1,0> + 2658464679U, // <1,0,6,u>: Cost 3 vext2 <6,u,1,0>, <6,u,1,0> + 2659128312U, // <1,0,7,0>: Cost 3 vext2 <7,0,1,0>, <7,0,1,0> + 4047898278U, // <1,0,7,1>: Cost 4 vzipr <3,5,1,7>, <2,3,0,1> + 2215460970U, // <1,0,7,2>: Cost 3 vrev <0,1,2,7> + 3734861035U, // <1,0,7,3>: Cost 4 vext2 <7,3,1,0>, <7,3,1,0> + 3731543398U, // <1,0,7,4>: Cost 4 vext2 <6,7,1,0>, <7,4,5,6> + 3736188301U, // <1,0,7,5>: Cost 4 vext2 <7,5,1,0>, <7,5,1,0> + 2663110110U, // <1,0,7,6>: Cost 3 vext2 <7,6,1,0>, <7,6,1,0> + 3731543660U, // <1,0,7,7>: Cost 4 vext2 <6,7,1,0>, <7,7,7,7> + 2664437376U, // <1,0,7,u>: Cost 3 vext2 <7,u,1,0>, <7,u,1,0> + 3087884288U, // <1,0,u,0>: Cost 3 vtrnr LHS, <0,0,0,0> + 1616003730U, // <1,0,u,1>: Cost 2 vext3 <0,u,1,1>, <0,u,1,1> + 67985515U, // <1,0,u,2>: Cost 1 vrev LHS + 2689893028U, // <1,0,u,3>: Cost 3 vext3 <0,u,3,1>, <0,u,3,1> + 2689745586U, // <1,0,u,4>: Cost 3 vext3 <0,u,1,1>, <0,u,4,6> + 2619316378U, // <1,0,u,5>: Cost 3 vext2 <0,3,1,0>, RHS + 2669082807U, // <1,0,u,6>: Cost 3 vext2 , + 2592674888U, // <1,0,u,7>: Cost 3 vext1 
<7,1,0,u>, <7,1,0,u> + 68427937U, // <1,0,u,u>: Cost 1 vrev LHS + 1543585802U, // <1,1,0,0>: Cost 2 vext2 <0,0,1,1>, <0,0,1,1> + 1548894310U, // <1,1,0,1>: Cost 2 vext2 <0,u,1,1>, LHS + 2618654892U, // <1,1,0,2>: Cost 3 vext2 <0,2,1,1>, <0,2,1,1> + 2689745654U, // <1,1,0,3>: Cost 3 vext3 <0,u,1,1>, <1,0,3,2> + 2622636370U, // <1,1,0,4>: Cost 3 vext2 <0,u,1,1>, <0,4,1,5> + 2620645791U, // <1,1,0,5>: Cost 3 vext2 <0,5,1,1>, <0,5,1,1> + 3696378367U, // <1,1,0,6>: Cost 4 vext2 <0,u,1,1>, <0,6,2,7> + 3666424905U, // <1,1,0,7>: Cost 4 vext1 <7,1,1,0>, <7,1,1,0> + 1548894866U, // <1,1,0,u>: Cost 2 vext2 <0,u,1,1>, <0,u,1,1> + 1483112550U, // <1,1,1,0>: Cost 2 vext1 <1,1,1,1>, LHS + 202162278U, // <1,1,1,1>: Cost 1 vdup1 LHS + 2622636950U, // <1,1,1,2>: Cost 3 vext2 <0,u,1,1>, <1,2,3,0> + 2622637016U, // <1,1,1,3>: Cost 3 vext2 <0,u,1,1>, <1,3,1,3> + 1483115830U, // <1,1,1,4>: Cost 2 vext1 <1,1,1,1>, RHS + 2622637200U, // <1,1,1,5>: Cost 3 vext2 <0,u,1,1>, <1,5,3,7> + 2622637263U, // <1,1,1,6>: Cost 3 vext2 <0,u,1,1>, <1,6,1,7> + 2592691274U, // <1,1,1,7>: Cost 3 vext1 <7,1,1,1>, <7,1,1,1> + 202162278U, // <1,1,1,u>: Cost 1 vdup1 LHS + 2550890588U, // <1,1,2,0>: Cost 3 vext1 <0,1,1,2>, <0,1,1,2> + 2617329183U, // <1,1,2,1>: Cost 3 vext2 <0,0,1,1>, <2,1,3,1> + 2622637672U, // <1,1,2,2>: Cost 3 vext2 <0,u,1,1>, <2,2,2,2> + 2622637734U, // <1,1,2,3>: Cost 3 vext2 <0,u,1,1>, <2,3,0,1> + 2550893878U, // <1,1,2,4>: Cost 3 vext1 <0,1,1,2>, RHS + 3696379744U, // <1,1,2,5>: Cost 4 vext2 <0,u,1,1>, <2,5,2,7> + 2622638010U, // <1,1,2,6>: Cost 3 vext2 <0,u,1,1>, <2,6,3,7> + 3804554170U, // <1,1,2,7>: Cost 4 vext3 <7,7,0,1>, <1,2,7,0> + 2622638139U, // <1,1,2,u>: Cost 3 vext2 <0,u,1,1>, <2,u,0,1> + 2622638230U, // <1,1,3,0>: Cost 3 vext2 <0,u,1,1>, <3,0,1,2> + 3087844148U, // <1,1,3,1>: Cost 3 vtrnr LHS, <1,1,1,1> + 4161585244U, // <1,1,3,2>: Cost 4 vtrnr LHS, <0,1,1,2> + 2014101606U, // <1,1,3,3>: Cost 2 vtrnr LHS, LHS + 2622638594U, // <1,1,3,4>: Cost 3 vext2 <0,u,1,1>, <3,4,5,6> + 2689745920U, // <1,1,3,5>: Cost 3 vext3 <0,u,1,1>, <1,3,5,7> + 3763487753U, // <1,1,3,6>: Cost 4 vext3 <0,u,1,1>, <1,3,6,7> + 2592707660U, // <1,1,3,7>: Cost 3 vext1 <7,1,1,3>, <7,1,1,3> + 2014101611U, // <1,1,3,u>: Cost 2 vtrnr LHS, LHS + 2556878950U, // <1,1,4,0>: Cost 3 vext1 <1,1,1,4>, LHS + 2221335351U, // <1,1,4,1>: Cost 3 vrev <1,1,1,4> + 3696380988U, // <1,1,4,2>: Cost 4 vext2 <0,u,1,1>, <4,2,6,0> + 3763487805U, // <1,1,4,3>: Cost 4 vext3 <0,u,1,1>, <1,4,3,5> + 2556882230U, // <1,1,4,4>: Cost 3 vext1 <1,1,1,4>, RHS + 1548897590U, // <1,1,4,5>: Cost 2 vext2 <0,u,1,1>, RHS + 2758184246U, // <1,1,4,6>: Cost 3 vuzpl <1,1,1,1>, RHS + 3666457677U, // <1,1,4,7>: Cost 4 vext1 <7,1,1,4>, <7,1,1,4> + 1548897833U, // <1,1,4,u>: Cost 2 vext2 <0,u,1,1>, RHS + 2693653615U, // <1,1,5,0>: Cost 3 vext3 <1,5,0,1>, <1,5,0,1> + 2617331408U, // <1,1,5,1>: Cost 3 vext2 <0,0,1,1>, <5,1,7,3> + 4029302934U, // <1,1,5,2>: Cost 4 vzipr <0,4,1,5>, <3,0,1,2> + 2689746064U, // <1,1,5,3>: Cost 3 vext3 <0,u,1,1>, <1,5,3,7> + 2221564755U, // <1,1,5,4>: Cost 3 vrev <1,1,4,5> + 2955559250U, // <1,1,5,5>: Cost 3 vzipr <0,4,1,5>, <0,4,1,5> + 2617331810U, // <1,1,5,6>: Cost 3 vext2 <0,0,1,1>, <5,6,7,0> + 2825293110U, // <1,1,5,7>: Cost 3 vuzpr <1,1,1,1>, RHS + 2689746109U, // <1,1,5,u>: Cost 3 vext3 <0,u,1,1>, <1,5,u,7> + 3696382241U, // <1,1,6,0>: Cost 4 vext2 <0,u,1,1>, <6,0,1,2> + 2689746127U, // <1,1,6,1>: Cost 3 vext3 <0,u,1,1>, <1,6,1,7> + 2617332218U, // <1,1,6,2>: Cost 3 vext2 <0,0,1,1>, <6,2,7,3> + 3763487969U, // <1,1,6,3>: Cost 4 vext3 <0,u,1,1>, 
<1,6,3,7> + 3696382605U, // <1,1,6,4>: Cost 4 vext2 <0,u,1,1>, <6,4,5,6> + 4029309266U, // <1,1,6,5>: Cost 4 vzipr <0,4,1,6>, <0,4,1,5> + 2617332536U, // <1,1,6,6>: Cost 3 vext2 <0,0,1,1>, <6,6,6,6> + 2724840702U, // <1,1,6,7>: Cost 3 vext3 <6,7,0,1>, <1,6,7,0> + 2725504263U, // <1,1,6,u>: Cost 3 vext3 <6,u,0,1>, <1,6,u,0> + 2617332720U, // <1,1,7,0>: Cost 3 vext2 <0,0,1,1>, <7,0,0,1> + 2659800138U, // <1,1,7,1>: Cost 3 vext2 <7,1,1,1>, <7,1,1,1> + 3691074717U, // <1,1,7,2>: Cost 4 vext2 <0,0,1,1>, <7,2,1,3> + 4167811174U, // <1,1,7,3>: Cost 4 vtrnr <1,1,5,7>, LHS + 2617333094U, // <1,1,7,4>: Cost 3 vext2 <0,0,1,1>, <7,4,5,6> + 3295396702U, // <1,1,7,5>: Cost 4 vrev <1,1,5,7> + 3803891014U, // <1,1,7,6>: Cost 4 vext3 <7,6,0,1>, <1,7,6,0> + 2617333356U, // <1,1,7,7>: Cost 3 vext2 <0,0,1,1>, <7,7,7,7> + 2659800138U, // <1,1,7,u>: Cost 3 vext2 <7,1,1,1>, <7,1,1,1> + 1483112550U, // <1,1,u,0>: Cost 2 vext1 <1,1,1,1>, LHS + 202162278U, // <1,1,u,1>: Cost 1 vdup1 LHS + 2622642056U, // <1,1,u,2>: Cost 3 vext2 <0,u,1,1>, + 2014142566U, // <1,1,u,3>: Cost 2 vtrnr LHS, LHS + 1483115830U, // <1,1,u,4>: Cost 2 vext1 <1,1,1,1>, RHS + 1548900506U, // <1,1,u,5>: Cost 2 vext2 <0,u,1,1>, RHS + 2622642384U, // <1,1,u,6>: Cost 3 vext2 <0,u,1,1>, + 2825293353U, // <1,1,u,7>: Cost 3 vuzpr <1,1,1,1>, RHS + 202162278U, // <1,1,u,u>: Cost 1 vdup1 LHS + 2635251712U, // <1,2,0,0>: Cost 3 vext2 <3,0,1,2>, <0,0,0,0> + 1561509990U, // <1,2,0,1>: Cost 2 vext2 <3,0,1,2>, LHS + 2618663085U, // <1,2,0,2>: Cost 3 vext2 <0,2,1,2>, <0,2,1,2> + 2696529358U, // <1,2,0,3>: Cost 3 vext3 <2,0,3,1>, <2,0,3,1> + 2635252050U, // <1,2,0,4>: Cost 3 vext2 <3,0,1,2>, <0,4,1,5> + 3769533926U, // <1,2,0,5>: Cost 4 vext3 <1,u,2,1>, <2,0,5,7> + 2621317617U, // <1,2,0,6>: Cost 3 vext2 <0,6,1,2>, <0,6,1,2> + 2659140170U, // <1,2,0,7>: Cost 3 vext2 <7,0,1,2>, <0,7,2,1> + 1561510557U, // <1,2,0,u>: Cost 2 vext2 <3,0,1,2>, LHS + 2623308516U, // <1,2,1,0>: Cost 3 vext2 <1,0,1,2>, <1,0,1,2> + 2635252532U, // <1,2,1,1>: Cost 3 vext2 <3,0,1,2>, <1,1,1,1> + 2631271318U, // <1,2,1,2>: Cost 3 vext2 <2,3,1,2>, <1,2,3,0> + 2958180454U, // <1,2,1,3>: Cost 3 vzipr <0,u,1,1>, LHS + 2550959414U, // <1,2,1,4>: Cost 3 vext1 <0,1,2,1>, RHS + 2635252880U, // <1,2,1,5>: Cost 3 vext2 <3,0,1,2>, <1,5,3,7> + 2635252952U, // <1,2,1,6>: Cost 3 vext2 <3,0,1,2>, <1,6,2,7> + 3732882731U, // <1,2,1,7>: Cost 4 vext2 <7,0,1,2>, <1,7,3,0> + 2958180459U, // <1,2,1,u>: Cost 3 vzipr <0,u,1,1>, LHS + 2629281213U, // <1,2,2,0>: Cost 3 vext2 <2,0,1,2>, <2,0,1,2> + 2635253280U, // <1,2,2,1>: Cost 3 vext2 <3,0,1,2>, <2,1,3,2> + 2618664552U, // <1,2,2,2>: Cost 3 vext2 <0,2,1,2>, <2,2,2,2> + 2689746546U, // <1,2,2,3>: Cost 3 vext3 <0,u,1,1>, <2,2,3,3> + 3764815485U, // <1,2,2,4>: Cost 4 vext3 <1,1,1,1>, <2,2,4,5> + 3760023176U, // <1,2,2,5>: Cost 4 vext3 <0,2,u,1>, <2,2,5,7> + 2635253690U, // <1,2,2,6>: Cost 3 vext2 <3,0,1,2>, <2,6,3,7> + 2659141610U, // <1,2,2,7>: Cost 3 vext2 <7,0,1,2>, <2,7,0,1> + 2689746591U, // <1,2,2,u>: Cost 3 vext3 <0,u,1,1>, <2,2,u,3> + 403488870U, // <1,2,3,0>: Cost 1 vext1 LHS, LHS + 1477231350U, // <1,2,3,1>: Cost 2 vext1 LHS, <1,0,3,2> + 1477232232U, // <1,2,3,2>: Cost 2 vext1 LHS, <2,2,2,2> + 1477233052U, // <1,2,3,3>: Cost 2 vext1 LHS, <3,3,3,3> + 403492150U, // <1,2,3,4>: Cost 1 vext1 LHS, RHS + 1525010128U, // <1,2,3,5>: Cost 2 vext1 LHS, <5,1,7,3> + 1525010938U, // <1,2,3,6>: Cost 2 vext1 LHS, <6,2,7,3> + 1525011450U, // <1,2,3,7>: Cost 2 vext1 LHS, <7,0,1,2> + 403494702U, // <1,2,3,u>: Cost 1 vext1 LHS, LHS + 2641226607U, // <1,2,4,0>: Cost 3 vext2 
<4,0,1,2>, <4,0,1,2> + 3624723446U, // <1,2,4,1>: Cost 4 vext1 <0,1,2,4>, <1,3,4,6> + 3301123609U, // <1,2,4,2>: Cost 4 vrev <2,1,2,4> + 2598759198U, // <1,2,4,3>: Cost 3 vext1 , <3,u,1,2> + 2659142864U, // <1,2,4,4>: Cost 3 vext2 <7,0,1,2>, <4,4,4,4> + 1561513270U, // <1,2,4,5>: Cost 2 vext2 <3,0,1,2>, RHS + 2659143028U, // <1,2,4,6>: Cost 3 vext2 <7,0,1,2>, <4,6,4,6> + 2659143112U, // <1,2,4,7>: Cost 3 vext2 <7,0,1,2>, <4,7,5,0> + 1561513513U, // <1,2,4,u>: Cost 2 vext2 <3,0,1,2>, RHS + 2550988902U, // <1,2,5,0>: Cost 3 vext1 <0,1,2,5>, LHS + 2550989824U, // <1,2,5,1>: Cost 3 vext1 <0,1,2,5>, <1,3,5,7> + 3624732264U, // <1,2,5,2>: Cost 4 vext1 <0,1,2,5>, <2,2,2,2> + 2955559014U, // <1,2,5,3>: Cost 3 vzipr <0,4,1,5>, LHS + 2550992182U, // <1,2,5,4>: Cost 3 vext1 <0,1,2,5>, RHS + 2659143684U, // <1,2,5,5>: Cost 3 vext2 <7,0,1,2>, <5,5,5,5> + 2659143778U, // <1,2,5,6>: Cost 3 vext2 <7,0,1,2>, <5,6,7,0> + 2659143848U, // <1,2,5,7>: Cost 3 vext2 <7,0,1,2>, <5,7,5,7> + 2550994734U, // <1,2,5,u>: Cost 3 vext1 <0,1,2,5>, LHS + 2700289945U, // <1,2,6,0>: Cost 3 vext3 <2,6,0,1>, <2,6,0,1> + 2635256232U, // <1,2,6,1>: Cost 3 vext2 <3,0,1,2>, <6,1,7,2> + 2659144186U, // <1,2,6,2>: Cost 3 vext2 <7,0,1,2>, <6,2,7,3> + 2689746874U, // <1,2,6,3>: Cost 3 vext3 <0,u,1,1>, <2,6,3,7> + 3763488705U, // <1,2,6,4>: Cost 4 vext3 <0,u,1,1>, <2,6,4,5> + 3763488716U, // <1,2,6,5>: Cost 4 vext3 <0,u,1,1>, <2,6,5,7> + 2659144504U, // <1,2,6,6>: Cost 3 vext2 <7,0,1,2>, <6,6,6,6> + 2657817432U, // <1,2,6,7>: Cost 3 vext2 <6,7,1,2>, <6,7,1,2> + 2689746919U, // <1,2,6,u>: Cost 3 vext3 <0,u,1,1>, <2,6,u,7> + 1585402874U, // <1,2,7,0>: Cost 2 vext2 <7,0,1,2>, <7,0,1,2> + 2659144770U, // <1,2,7,1>: Cost 3 vext2 <7,0,1,2>, <7,1,0,2> + 3708998858U, // <1,2,7,2>: Cost 4 vext2 <3,0,1,2>, <7,2,6,3> + 2635257059U, // <1,2,7,3>: Cost 3 vext2 <3,0,1,2>, <7,3,0,1> + 2659145062U, // <1,2,7,4>: Cost 3 vext2 <7,0,1,2>, <7,4,5,6> + 3732886916U, // <1,2,7,5>: Cost 4 vext2 <7,0,1,2>, <7,5,0,0> + 3732886998U, // <1,2,7,6>: Cost 4 vext2 <7,0,1,2>, <7,6,0,1> + 2659145255U, // <1,2,7,7>: Cost 3 vext2 <7,0,1,2>, <7,7,0,1> + 1590711938U, // <1,2,7,u>: Cost 2 vext2 <7,u,1,2>, <7,u,1,2> + 403529835U, // <1,2,u,0>: Cost 1 vext1 LHS, LHS + 1477272310U, // <1,2,u,1>: Cost 2 vext1 LHS, <1,0,3,2> + 1477273192U, // <1,2,u,2>: Cost 2 vext1 LHS, <2,2,2,2> + 1477273750U, // <1,2,u,3>: Cost 2 vext1 LHS, <3,0,1,2> + 403533110U, // <1,2,u,4>: Cost 1 vext1 LHS, RHS + 1561516186U, // <1,2,u,5>: Cost 2 vext2 <3,0,1,2>, RHS + 1525051898U, // <1,2,u,6>: Cost 2 vext1 LHS, <6,2,7,3> + 1525052410U, // <1,2,u,7>: Cost 2 vext1 LHS, <7,0,1,2> + 403535662U, // <1,2,u,u>: Cost 1 vext1 LHS, LHS + 2819407872U, // <1,3,0,0>: Cost 3 vuzpr LHS, <0,0,0,0> + 1551564902U, // <1,3,0,1>: Cost 2 vext2 <1,3,1,3>, LHS + 2819408630U, // <1,3,0,2>: Cost 3 vuzpr LHS, <1,0,3,2> + 2619334911U, // <1,3,0,3>: Cost 3 vext2 <0,3,1,3>, <0,3,1,3> + 2625306962U, // <1,3,0,4>: Cost 3 vext2 <1,3,1,3>, <0,4,1,5> + 3832725879U, // <1,3,0,5>: Cost 4 vuzpl <1,2,3,0>, <0,4,5,6> + 3699048959U, // <1,3,0,6>: Cost 4 vext2 <1,3,1,3>, <0,6,2,7> + 3776538827U, // <1,3,0,7>: Cost 4 vext3 <3,0,7,1>, <3,0,7,1> + 1551565469U, // <1,3,0,u>: Cost 2 vext2 <1,3,1,3>, LHS + 2618671862U, // <1,3,1,0>: Cost 3 vext2 <0,2,1,3>, <1,0,3,2> + 2819408692U, // <1,3,1,1>: Cost 3 vuzpr LHS, <1,1,1,1> + 2624643975U, // <1,3,1,2>: Cost 3 vext2 <1,2,1,3>, <1,2,1,3> + 1745666150U, // <1,3,1,3>: Cost 2 vuzpr LHS, LHS + 2557005110U, // <1,3,1,4>: Cost 3 vext1 <1,1,3,1>, RHS + 2625307792U, // <1,3,1,5>: Cost 3 vext2 <1,3,1,3>, 
<1,5,3,7> + 3698386127U, // <1,3,1,6>: Cost 4 vext2 <1,2,1,3>, <1,6,1,7> + 2592838748U, // <1,3,1,7>: Cost 3 vext1 <7,1,3,1>, <7,1,3,1> + 1745666155U, // <1,3,1,u>: Cost 2 vuzpr LHS, LHS + 2819408790U, // <1,3,2,0>: Cost 3 vuzpr LHS, <1,2,3,0> + 2625308193U, // <1,3,2,1>: Cost 3 vext2 <1,3,1,3>, <2,1,3,3> + 2819408036U, // <1,3,2,2>: Cost 3 vuzpr LHS, <0,2,0,2> + 2819851890U, // <1,3,2,3>: Cost 3 vuzpr LHS, <2,2,3,3> + 2819408794U, // <1,3,2,4>: Cost 3 vuzpr LHS, <1,2,3,4> + 3893149890U, // <1,3,2,5>: Cost 4 vuzpr LHS, <0,2,3,5> + 2819408076U, // <1,3,2,6>: Cost 3 vuzpr LHS, <0,2,4,6> + 3772041583U, // <1,3,2,7>: Cost 4 vext3 <2,3,0,1>, <3,2,7,3> + 2819408042U, // <1,3,2,u>: Cost 3 vuzpr LHS, <0,2,0,u> + 1483276390U, // <1,3,3,0>: Cost 2 vext1 <1,1,3,3>, LHS + 1483277128U, // <1,3,3,1>: Cost 2 vext1 <1,1,3,3>, <1,1,3,3> + 2557019752U, // <1,3,3,2>: Cost 3 vext1 <1,1,3,3>, <2,2,2,2> + 2819408856U, // <1,3,3,3>: Cost 3 vuzpr LHS, <1,3,1,3> + 1483279670U, // <1,3,3,4>: Cost 2 vext1 <1,1,3,3>, RHS + 2819409614U, // <1,3,3,5>: Cost 3 vuzpr LHS, <2,3,4,5> + 2598826490U, // <1,3,3,6>: Cost 3 vext1 , <6,2,7,3> + 3087844352U, // <1,3,3,7>: Cost 3 vtrnr LHS, <1,3,5,7> + 1483282222U, // <1,3,3,u>: Cost 2 vext1 <1,1,3,3>, LHS + 2568970342U, // <1,3,4,0>: Cost 3 vext1 <3,1,3,4>, LHS + 2568971224U, // <1,3,4,1>: Cost 3 vext1 <3,1,3,4>, <1,3,1,3> + 3832761290U, // <1,3,4,2>: Cost 4 vuzpl <1,2,3,4>, <4,1,2,3> + 2233428219U, // <1,3,4,3>: Cost 3 vrev <3,1,3,4> + 2568973622U, // <1,3,4,4>: Cost 3 vext1 <3,1,3,4>, RHS + 1551568182U, // <1,3,4,5>: Cost 2 vext2 <1,3,1,3>, RHS + 2819410434U, // <1,3,4,6>: Cost 3 vuzpr LHS, <3,4,5,6> + 3666605151U, // <1,3,4,7>: Cost 4 vext1 <7,1,3,4>, <7,1,3,4> + 1551568425U, // <1,3,4,u>: Cost 2 vext2 <1,3,1,3>, RHS + 2563006566U, // <1,3,5,0>: Cost 3 vext1 <2,1,3,5>, LHS + 2568979456U, // <1,3,5,1>: Cost 3 vext1 <3,1,3,5>, <1,3,5,7> + 2563008035U, // <1,3,5,2>: Cost 3 vext1 <2,1,3,5>, <2,1,3,5> + 2233436412U, // <1,3,5,3>: Cost 3 vrev <3,1,3,5> + 2563009846U, // <1,3,5,4>: Cost 3 vext1 <2,1,3,5>, RHS + 2867187716U, // <1,3,5,5>: Cost 3 vuzpr LHS, <5,5,5,5> + 2655834214U, // <1,3,5,6>: Cost 3 vext2 <6,4,1,3>, <5,6,7,4> + 1745669430U, // <1,3,5,7>: Cost 2 vuzpr LHS, RHS + 1745669431U, // <1,3,5,u>: Cost 2 vuzpr LHS, RHS + 2867187810U, // <1,3,6,0>: Cost 3 vuzpr LHS, <5,6,7,0> + 3699052931U, // <1,3,6,1>: Cost 4 vext2 <1,3,1,3>, <6,1,3,1> + 2654507460U, // <1,3,6,2>: Cost 3 vext2 <6,2,1,3>, <6,2,1,3> + 3766291091U, // <1,3,6,3>: Cost 4 vext3 <1,3,3,1>, <3,6,3,7> + 2655834726U, // <1,3,6,4>: Cost 3 vext2 <6,4,1,3>, <6,4,1,3> + 3923384562U, // <1,3,6,5>: Cost 4 vuzpr <5,1,7,3>, + 2657161992U, // <1,3,6,6>: Cost 3 vext2 <6,6,1,3>, <6,6,1,3> + 2819852218U, // <1,3,6,7>: Cost 3 vuzpr LHS, <2,6,3,7> + 2819852219U, // <1,3,6,u>: Cost 3 vuzpr LHS, <2,6,3,u> + 2706926275U, // <1,3,7,0>: Cost 3 vext3 <3,7,0,1>, <3,7,0,1> + 2659816524U, // <1,3,7,1>: Cost 3 vext2 <7,1,1,3>, <7,1,1,3> + 3636766245U, // <1,3,7,2>: Cost 4 vext1 <2,1,3,7>, <2,1,3,7> + 2867187903U, // <1,3,7,3>: Cost 3 vuzpr LHS, <5,7,u,3> + 2625312102U, // <1,3,7,4>: Cost 3 vext2 <1,3,1,3>, <7,4,5,6> + 2867188598U, // <1,3,7,5>: Cost 3 vuzpr LHS, <6,7,4,5> + 3728250344U, // <1,3,7,6>: Cost 4 vext2 <6,2,1,3>, <7,6,2,1> + 2867187880U, // <1,3,7,7>: Cost 3 vuzpr LHS, <5,7,5,7> + 2707516171U, // <1,3,7,u>: Cost 3 vext3 <3,7,u,1>, <3,7,u,1> + 1483317350U, // <1,3,u,0>: Cost 2 vext1 <1,1,3,u>, LHS + 1483318093U, // <1,3,u,1>: Cost 2 vext1 <1,1,3,u>, <1,1,3,u> + 2819410718U, // <1,3,u,2>: Cost 3 vuzpr LHS, <3,u,1,2> + 1745666717U, // 
<1,3,u,3>: Cost 2 vuzpr LHS, LHS + 1483320630U, // <1,3,u,4>: Cost 2 vext1 <1,1,3,u>, RHS + 1551571098U, // <1,3,u,5>: Cost 2 vext2 <1,3,1,3>, RHS + 2819410758U, // <1,3,u,6>: Cost 3 vuzpr LHS, <3,u,5,6> + 1745669673U, // <1,3,u,7>: Cost 2 vuzpr LHS, RHS + 1745666722U, // <1,3,u,u>: Cost 2 vuzpr LHS, LHS + 2617352205U, // <1,4,0,0>: Cost 3 vext2 <0,0,1,4>, <0,0,1,4> + 2619342950U, // <1,4,0,1>: Cost 3 vext2 <0,3,1,4>, LHS + 3692421295U, // <1,4,0,2>: Cost 4 vext2 <0,2,1,4>, <0,2,1,4> + 2619343104U, // <1,4,0,3>: Cost 3 vext2 <0,3,1,4>, <0,3,1,4> + 2617352530U, // <1,4,0,4>: Cost 3 vext2 <0,0,1,4>, <0,4,1,5> + 1634880402U, // <1,4,0,5>: Cost 2 vext3 <4,0,5,1>, <4,0,5,1> + 2713930652U, // <1,4,0,6>: Cost 3 vext3 <4,u,5,1>, <4,0,6,2> + 3732898396U, // <1,4,0,7>: Cost 4 vext2 <7,0,1,4>, <0,7,4,1> + 1635101613U, // <1,4,0,u>: Cost 2 vext3 <4,0,u,1>, <4,0,u,1> + 3693085430U, // <1,4,1,0>: Cost 4 vext2 <0,3,1,4>, <1,0,3,2> + 2623988535U, // <1,4,1,1>: Cost 3 vext2 <1,1,1,4>, <1,1,1,4> + 3693085590U, // <1,4,1,2>: Cost 4 vext2 <0,3,1,4>, <1,2,3,0> + 3692422134U, // <1,4,1,3>: Cost 4 vext2 <0,2,1,4>, <1,3,4,6> + 3693085726U, // <1,4,1,4>: Cost 4 vext2 <0,3,1,4>, <1,4,0,1> + 2892401974U, // <1,4,1,5>: Cost 3 vzipl <1,1,1,1>, RHS + 3026619702U, // <1,4,1,6>: Cost 3 vtrnl <1,1,1,1>, RHS + 3800206324U, // <1,4,1,7>: Cost 4 vext3 <7,0,4,1>, <4,1,7,0> + 2892402217U, // <1,4,1,u>: Cost 3 vzipl <1,1,1,1>, RHS + 3966978927U, // <1,4,2,0>: Cost 4 vzipl <1,2,3,4>, <4,0,1,2> + 3966979018U, // <1,4,2,1>: Cost 4 vzipl <1,2,3,4>, <4,1,2,3> + 3693086312U, // <1,4,2,2>: Cost 4 vext2 <0,3,1,4>, <2,2,2,2> + 2635269798U, // <1,4,2,3>: Cost 3 vext2 <3,0,1,4>, <2,3,0,1> + 3966979280U, // <1,4,2,4>: Cost 4 vzipl <1,2,3,4>, <4,4,4,4> + 2893204790U, // <1,4,2,5>: Cost 3 vzipl <1,2,3,0>, RHS + 3693086650U, // <1,4,2,6>: Cost 4 vext2 <0,3,1,4>, <2,6,3,7> + 3666662502U, // <1,4,2,7>: Cost 4 vext1 <7,1,4,2>, <7,1,4,2> + 2893205033U, // <1,4,2,u>: Cost 3 vzipl <1,2,3,0>, RHS + 2563063910U, // <1,4,3,0>: Cost 3 vext1 <2,1,4,3>, LHS + 2563064730U, // <1,4,3,1>: Cost 3 vext1 <2,1,4,3>, <1,2,3,4> + 2563065386U, // <1,4,3,2>: Cost 3 vext1 <2,1,4,3>, <2,1,4,3> + 3693087132U, // <1,4,3,3>: Cost 4 vext2 <0,3,1,4>, <3,3,3,3> + 2619345410U, // <1,4,3,4>: Cost 3 vext2 <0,3,1,4>, <3,4,5,6> + 3087843666U, // <1,4,3,5>: Cost 3 vtrnr LHS, <0,4,1,5> + 3087843676U, // <1,4,3,6>: Cost 3 vtrnr LHS, <0,4,2,6> + 3666670695U, // <1,4,3,7>: Cost 4 vext1 <7,1,4,3>, <7,1,4,3> + 3087843669U, // <1,4,3,u>: Cost 3 vtrnr LHS, <0,4,1,u> + 2620672914U, // <1,4,4,0>: Cost 3 vext2 <0,5,1,4>, <4,0,5,1> + 3630842706U, // <1,4,4,1>: Cost 4 vext1 <1,1,4,4>, <1,1,4,4> + 3313069003U, // <1,4,4,2>: Cost 4 vrev <4,1,2,4> + 3642788100U, // <1,4,4,3>: Cost 4 vext1 <3,1,4,4>, <3,1,4,4> + 2713930960U, // <1,4,4,4>: Cost 3 vext3 <4,u,5,1>, <4,4,4,4> + 2619346230U, // <1,4,4,5>: Cost 3 vext2 <0,3,1,4>, RHS + 2713930980U, // <1,4,4,6>: Cost 3 vext3 <4,u,5,1>, <4,4,6,6> + 3736882642U, // <1,4,4,7>: Cost 4 vext2 <7,6,1,4>, <4,7,6,1> + 2619346473U, // <1,4,4,u>: Cost 3 vext2 <0,3,1,4>, RHS + 2557108326U, // <1,4,5,0>: Cost 3 vext1 <1,1,4,5>, LHS + 2557109075U, // <1,4,5,1>: Cost 3 vext1 <1,1,4,5>, <1,1,4,5> + 2598913774U, // <1,4,5,2>: Cost 3 vext1 , <2,3,u,1> + 3630852246U, // <1,4,5,3>: Cost 4 vext1 <1,1,4,5>, <3,0,1,2> + 2557111606U, // <1,4,5,4>: Cost 3 vext1 <1,1,4,5>, RHS + 2895252790U, // <1,4,5,5>: Cost 3 vzipl <1,5,3,7>, RHS + 1616006454U, // <1,4,5,6>: Cost 2 vext3 <0,u,1,1>, RHS + 3899059510U, // <1,4,5,7>: Cost 4 vuzpr <1,1,1,4>, RHS + 1616006472U, // <1,4,5,u>: 
Cost 2 vext3 <0,u,1,1>, RHS + 2557116518U, // <1,4,6,0>: Cost 3 vext1 <1,1,4,6>, LHS + 2557117236U, // <1,4,6,1>: Cost 3 vext1 <1,1,4,6>, <1,1,1,1> + 3630859880U, // <1,4,6,2>: Cost 4 vext1 <1,1,4,6>, <2,2,2,2> + 2569062550U, // <1,4,6,3>: Cost 3 vext1 <3,1,4,6>, <3,0,1,2> + 2557119798U, // <1,4,6,4>: Cost 3 vext1 <1,1,4,6>, RHS + 3763490174U, // <1,4,6,5>: Cost 4 vext3 <0,u,1,1>, <4,6,5,7> + 3763490183U, // <1,4,6,6>: Cost 4 vext3 <0,u,1,1>, <4,6,6,7> + 2712751498U, // <1,4,6,7>: Cost 3 vext3 <4,6,7,1>, <4,6,7,1> + 2557122350U, // <1,4,6,u>: Cost 3 vext1 <1,1,4,6>, LHS + 2659161084U, // <1,4,7,0>: Cost 3 vext2 <7,0,1,4>, <7,0,1,4> + 3732903040U, // <1,4,7,1>: Cost 4 vext2 <7,0,1,4>, <7,1,7,1> + 3734230174U, // <1,4,7,2>: Cost 4 vext2 <7,2,1,4>, <7,2,1,4> + 3734893807U, // <1,4,7,3>: Cost 4 vext2 <7,3,1,4>, <7,3,1,4> + 3660729654U, // <1,4,7,4>: Cost 4 vext1 <6,1,4,7>, RHS + 3786493384U, // <1,4,7,5>: Cost 4 vext3 <4,6,7,1>, <4,7,5,0> + 2713341394U, // <1,4,7,6>: Cost 3 vext3 <4,7,6,1>, <4,7,6,1> + 3660731386U, // <1,4,7,7>: Cost 4 vext1 <6,1,4,7>, <7,0,1,2> + 2664470148U, // <1,4,7,u>: Cost 3 vext2 <7,u,1,4>, <7,u,1,4> + 2557132902U, // <1,4,u,0>: Cost 3 vext1 <1,1,4,u>, LHS + 2619348782U, // <1,4,u,1>: Cost 3 vext2 <0,3,1,4>, LHS + 2563106351U, // <1,4,u,2>: Cost 3 vext1 <2,1,4,u>, <2,1,4,u> + 2713783816U, // <1,4,u,3>: Cost 3 vext3 <4,u,3,1>, <4,u,3,1> + 2622666815U, // <1,4,u,4>: Cost 3 vext2 <0,u,1,4>, + 1640189466U, // <1,4,u,5>: Cost 2 vext3 <4,u,5,1>, <4,u,5,1> + 1616006697U, // <1,4,u,6>: Cost 2 vext3 <0,u,1,1>, RHS + 2712751498U, // <1,4,u,7>: Cost 3 vext3 <4,6,7,1>, <4,6,7,1> + 1616006715U, // <1,4,u,u>: Cost 2 vext3 <0,u,1,1>, RHS + 2620014592U, // <1,5,0,0>: Cost 3 vext2 <0,4,1,5>, <0,0,0,0> + 1546272870U, // <1,5,0,1>: Cost 2 vext2 <0,4,1,5>, LHS + 2618687664U, // <1,5,0,2>: Cost 3 vext2 <0,2,1,5>, <0,2,1,5> + 3693093120U, // <1,5,0,3>: Cost 4 vext2 <0,3,1,5>, <0,3,1,4> + 1546273106U, // <1,5,0,4>: Cost 2 vext2 <0,4,1,5>, <0,4,1,5> + 2620678563U, // <1,5,0,5>: Cost 3 vext2 <0,5,1,5>, <0,5,1,5> + 2714668660U, // <1,5,0,6>: Cost 3 vext3 <5,0,6,1>, <5,0,6,1> + 3772042877U, // <1,5,0,7>: Cost 4 vext3 <2,3,0,1>, <5,0,7,1> + 1546273437U, // <1,5,0,u>: Cost 2 vext2 <0,4,1,5>, LHS + 2620015350U, // <1,5,1,0>: Cost 3 vext2 <0,4,1,5>, <1,0,3,2> + 2620015412U, // <1,5,1,1>: Cost 3 vext2 <0,4,1,5>, <1,1,1,1> + 2620015510U, // <1,5,1,2>: Cost 3 vext2 <0,4,1,5>, <1,2,3,0> + 2618688512U, // <1,5,1,3>: Cost 3 vext2 <0,2,1,5>, <1,3,5,7> + 2620015677U, // <1,5,1,4>: Cost 3 vext2 <0,4,1,5>, <1,4,3,5> + 2620015727U, // <1,5,1,5>: Cost 3 vext2 <0,4,1,5>, <1,5,0,1> + 2620015859U, // <1,5,1,6>: Cost 3 vext2 <0,4,1,5>, <1,6,5,7> + 3093728566U, // <1,5,1,7>: Cost 3 vtrnr <1,1,1,1>, RHS + 2620015981U, // <1,5,1,u>: Cost 3 vext2 <0,4,1,5>, <1,u,1,3> + 3692430816U, // <1,5,2,0>: Cost 4 vext2 <0,2,1,5>, <2,0,5,1> + 2620016163U, // <1,5,2,1>: Cost 3 vext2 <0,4,1,5>, <2,1,3,5> + 2620016232U, // <1,5,2,2>: Cost 3 vext2 <0,4,1,5>, <2,2,2,2> + 2620016294U, // <1,5,2,3>: Cost 3 vext2 <0,4,1,5>, <2,3,0,1> + 3693758221U, // <1,5,2,4>: Cost 4 vext2 <0,4,1,5>, <2,4,2,5> + 3692431209U, // <1,5,2,5>: Cost 4 vext2 <0,2,1,5>, <2,5,3,7> + 2620016570U, // <1,5,2,6>: Cost 3 vext2 <0,4,1,5>, <2,6,3,7> + 4173598006U, // <1,5,2,7>: Cost 4 vtrnr <2,1,3,2>, RHS + 2620016699U, // <1,5,2,u>: Cost 3 vext2 <0,4,1,5>, <2,u,0,1> + 2620016790U, // <1,5,3,0>: Cost 3 vext2 <0,4,1,5>, <3,0,1,2> + 2569110672U, // <1,5,3,1>: Cost 3 vext1 <3,1,5,3>, <1,5,3,7> + 3693758785U, // <1,5,3,2>: Cost 4 vext2 <0,4,1,5>, <3,2,2,2> + 2620017052U, // 
<1,5,3,3>: Cost 3 vext2 <0,4,1,5>, <3,3,3,3> + 2620017154U, // <1,5,3,4>: Cost 3 vext2 <0,4,1,5>, <3,4,5,6> + 3135623172U, // <1,5,3,5>: Cost 3 vtrnr LHS, <5,5,5,5> + 4161587048U, // <1,5,3,6>: Cost 4 vtrnr LHS, <2,5,3,6> + 2014104886U, // <1,5,3,7>: Cost 2 vtrnr LHS, RHS + 2014104887U, // <1,5,3,u>: Cost 2 vtrnr LHS, RHS + 2620017554U, // <1,5,4,0>: Cost 3 vext2 <0,4,1,5>, <4,0,5,1> + 2620017634U, // <1,5,4,1>: Cost 3 vext2 <0,4,1,5>, <4,1,5,0> + 3693759551U, // <1,5,4,2>: Cost 4 vext2 <0,4,1,5>, <4,2,6,3> + 3642861837U, // <1,5,4,3>: Cost 4 vext1 <3,1,5,4>, <3,1,5,4> + 2575092710U, // <1,5,4,4>: Cost 3 vext1 <4,1,5,4>, <4,1,5,4> + 1546276150U, // <1,5,4,5>: Cost 2 vext2 <0,4,1,5>, RHS + 2759855414U, // <1,5,4,6>: Cost 3 vuzpl <1,3,5,7>, RHS + 2713931718U, // <1,5,4,7>: Cost 3 vext3 <4,u,5,1>, <5,4,7,6> + 1546276393U, // <1,5,4,u>: Cost 2 vext2 <0,4,1,5>, RHS + 2557182054U, // <1,5,5,0>: Cost 3 vext1 <1,1,5,5>, LHS + 2557182812U, // <1,5,5,1>: Cost 3 vext1 <1,1,5,5>, <1,1,5,5> + 3630925347U, // <1,5,5,2>: Cost 4 vext1 <1,1,5,5>, <2,1,3,5> + 4029301675U, // <1,5,5,3>: Cost 4 vzipr <0,4,1,5>, <1,2,5,3> + 2557185334U, // <1,5,5,4>: Cost 3 vext1 <1,1,5,5>, RHS + 2713931780U, // <1,5,5,5>: Cost 3 vext3 <4,u,5,1>, <5,5,5,5> + 2667794530U, // <1,5,5,6>: Cost 3 vext2 , <5,6,7,0> + 2713931800U, // <1,5,5,7>: Cost 3 vext3 <4,u,5,1>, <5,5,7,7> + 2557187886U, // <1,5,5,u>: Cost 3 vext1 <1,1,5,5>, LHS + 2718208036U, // <1,5,6,0>: Cost 3 vext3 <5,6,0,1>, <5,6,0,1> + 2620019115U, // <1,5,6,1>: Cost 3 vext2 <0,4,1,5>, <6,1,7,5> + 2667794938U, // <1,5,6,2>: Cost 3 vext2 , <6,2,7,3> + 3787673666U, // <1,5,6,3>: Cost 4 vext3 <4,u,5,1>, <5,6,3,4> + 3693761165U, // <1,5,6,4>: Cost 4 vext2 <0,4,1,5>, <6,4,5,6> + 3319279297U, // <1,5,6,5>: Cost 4 vrev <5,1,5,6> + 2667795256U, // <1,5,6,6>: Cost 3 vext2 , <6,6,6,6> + 2713931874U, // <1,5,6,7>: Cost 3 vext3 <4,u,5,1>, <5,6,7,0> + 2713931883U, // <1,5,6,u>: Cost 3 vext3 <4,u,5,1>, <5,6,u,0> + 2557198438U, // <1,5,7,0>: Cost 3 vext1 <1,1,5,7>, LHS + 2557199156U, // <1,5,7,1>: Cost 3 vext1 <1,1,5,7>, <1,1,1,1> + 2569143974U, // <1,5,7,2>: Cost 3 vext1 <3,1,5,7>, <2,3,0,1> + 2569144592U, // <1,5,7,3>: Cost 3 vext1 <3,1,5,7>, <3,1,5,7> + 2557201718U, // <1,5,7,4>: Cost 3 vext1 <1,1,5,7>, RHS + 2713931944U, // <1,5,7,5>: Cost 3 vext3 <4,u,5,1>, <5,7,5,7> + 3787673770U, // <1,5,7,6>: Cost 4 vext3 <4,u,5,1>, <5,7,6,0> + 2719387828U, // <1,5,7,7>: Cost 3 vext3 <5,7,7,1>, <5,7,7,1> + 2557204270U, // <1,5,7,u>: Cost 3 vext1 <1,1,5,7>, LHS + 2620020435U, // <1,5,u,0>: Cost 3 vext2 <0,4,1,5>, + 1546278702U, // <1,5,u,1>: Cost 2 vext2 <0,4,1,5>, LHS + 2620020616U, // <1,5,u,2>: Cost 3 vext2 <0,4,1,5>, + 2620020668U, // <1,5,u,3>: Cost 3 vext2 <0,4,1,5>, + 1594054682U, // <1,5,u,4>: Cost 2 vext2 , + 1546279066U, // <1,5,u,5>: Cost 2 vext2 <0,4,1,5>, RHS + 2620020944U, // <1,5,u,6>: Cost 3 vext2 <0,4,1,5>, + 2014145846U, // <1,5,u,7>: Cost 2 vtrnr LHS, RHS + 2014145847U, // <1,5,u,u>: Cost 2 vtrnr LHS, RHS + 3692437504U, // <1,6,0,0>: Cost 4 vext2 <0,2,1,6>, <0,0,0,0> + 2618695782U, // <1,6,0,1>: Cost 3 vext2 <0,2,1,6>, LHS + 2618695857U, // <1,6,0,2>: Cost 3 vext2 <0,2,1,6>, <0,2,1,6> + 3794161970U, // <1,6,0,3>: Cost 4 vext3 <6,0,3,1>, <6,0,3,1> + 2620023122U, // <1,6,0,4>: Cost 3 vext2 <0,4,1,6>, <0,4,1,5> + 2620686756U, // <1,6,0,5>: Cost 3 vext2 <0,5,1,6>, <0,5,1,6> + 2621350389U, // <1,6,0,6>: Cost 3 vext2 <0,6,1,6>, <0,6,1,6> + 4028599606U, // <1,6,0,7>: Cost 4 vzipr <0,3,1,0>, RHS + 2618696349U, // <1,6,0,u>: Cost 3 vext2 <0,2,1,6>, LHS + 3692438262U, // <1,6,1,0>: Cost 
4 vext2 <0,2,1,6>, <1,0,3,2> + 2625995572U, // <1,6,1,1>: Cost 3 vext2 <1,4,1,6>, <1,1,1,1> + 3692438422U, // <1,6,1,2>: Cost 4 vext2 <0,2,1,6>, <1,2,3,0> + 3692438488U, // <1,6,1,3>: Cost 4 vext2 <0,2,1,6>, <1,3,1,3> + 2625995820U, // <1,6,1,4>: Cost 3 vext2 <1,4,1,6>, <1,4,1,6> + 3692438672U, // <1,6,1,5>: Cost 4 vext2 <0,2,1,6>, <1,5,3,7> + 3692438720U, // <1,6,1,6>: Cost 4 vext2 <0,2,1,6>, <1,6,0,1> + 2958183734U, // <1,6,1,7>: Cost 3 vzipr <0,u,1,1>, RHS + 2958183735U, // <1,6,1,u>: Cost 3 vzipr <0,u,1,1>, RHS + 2721526201U, // <1,6,2,0>: Cost 3 vext3 <6,2,0,1>, <6,2,0,1> + 3692439097U, // <1,6,2,1>: Cost 4 vext2 <0,2,1,6>, <2,1,6,0> + 3692439144U, // <1,6,2,2>: Cost 4 vext2 <0,2,1,6>, <2,2,2,2> + 3692439206U, // <1,6,2,3>: Cost 4 vext2 <0,2,1,6>, <2,3,0,1> + 3636948278U, // <1,6,2,4>: Cost 4 vext1 <2,1,6,2>, RHS + 3787674092U, // <1,6,2,5>: Cost 4 vext3 <4,u,5,1>, <6,2,5,7> + 2618697658U, // <1,6,2,6>: Cost 3 vext2 <0,2,1,6>, <2,6,3,7> + 2970799414U, // <1,6,2,7>: Cost 3 vzipr <3,0,1,2>, RHS + 2970799415U, // <1,6,2,u>: Cost 3 vzipr <3,0,1,2>, RHS + 2563211366U, // <1,6,3,0>: Cost 3 vext1 <2,1,6,3>, LHS + 3699738854U, // <1,6,3,1>: Cost 4 vext2 <1,4,1,6>, <3,1,1,1> + 2563212860U, // <1,6,3,2>: Cost 3 vext1 <2,1,6,3>, <2,1,6,3> + 3692439964U, // <1,6,3,3>: Cost 4 vext2 <0,2,1,6>, <3,3,3,3> + 2563214646U, // <1,6,3,4>: Cost 3 vext1 <2,1,6,3>, RHS + 4191820018U, // <1,6,3,5>: Cost 4 vtrnr <5,1,7,3>, + 2587103648U, // <1,6,3,6>: Cost 3 vext1 <6,1,6,3>, <6,1,6,3> + 3087845306U, // <1,6,3,7>: Cost 3 vtrnr LHS, <2,6,3,7> + 3087845307U, // <1,6,3,u>: Cost 3 vtrnr LHS, <2,6,3,u> + 3693767570U, // <1,6,4,0>: Cost 4 vext2 <0,4,1,6>, <4,0,5,1> + 3693767650U, // <1,6,4,1>: Cost 4 vext2 <0,4,1,6>, <4,1,5,0> + 3636962877U, // <1,6,4,2>: Cost 4 vext1 <2,1,6,4>, <2,1,6,4> + 3325088134U, // <1,6,4,3>: Cost 4 vrev <6,1,3,4> + 3693767898U, // <1,6,4,4>: Cost 4 vext2 <0,4,1,6>, <4,4,5,5> + 2618699062U, // <1,6,4,5>: Cost 3 vext2 <0,2,1,6>, RHS + 3833670966U, // <1,6,4,6>: Cost 4 vuzpl <1,3,6,7>, RHS + 4028632374U, // <1,6,4,7>: Cost 4 vzipr <0,3,1,4>, RHS + 2618699305U, // <1,6,4,u>: Cost 3 vext2 <0,2,1,6>, RHS + 3693768264U, // <1,6,5,0>: Cost 4 vext2 <0,4,1,6>, <5,0,1,2> + 3630998373U, // <1,6,5,1>: Cost 4 vext1 <1,1,6,5>, <1,1,6,5> + 3636971070U, // <1,6,5,2>: Cost 4 vext1 <2,1,6,5>, <2,1,6,5> + 3642943767U, // <1,6,5,3>: Cost 4 vext1 <3,1,6,5>, <3,1,6,5> + 3693768628U, // <1,6,5,4>: Cost 4 vext2 <0,4,1,6>, <5,4,5,6> + 3732918276U, // <1,6,5,5>: Cost 4 vext2 <7,0,1,6>, <5,5,5,5> + 2620690530U, // <1,6,5,6>: Cost 3 vext2 <0,5,1,6>, <5,6,7,0> + 2955562294U, // <1,6,5,7>: Cost 3 vzipr <0,4,1,5>, RHS + 2955562295U, // <1,6,5,u>: Cost 3 vzipr <0,4,1,5>, RHS + 2724180733U, // <1,6,6,0>: Cost 3 vext3 <6,6,0,1>, <6,6,0,1> + 3631006566U, // <1,6,6,1>: Cost 4 vext1 <1,1,6,6>, <1,1,6,6> + 3631007674U, // <1,6,6,2>: Cost 4 vext1 <1,1,6,6>, <2,6,3,7> + 3692442184U, // <1,6,6,3>: Cost 4 vext2 <0,2,1,6>, <6,3,7,0> + 3631009078U, // <1,6,6,4>: Cost 4 vext1 <1,1,6,6>, RHS + 3787674416U, // <1,6,6,5>: Cost 4 vext3 <4,u,5,1>, <6,6,5,7> + 2713932600U, // <1,6,6,6>: Cost 3 vext3 <4,u,5,1>, <6,6,6,6> + 2713932610U, // <1,6,6,7>: Cost 3 vext3 <4,u,5,1>, <6,6,7,7> + 2713932619U, // <1,6,6,u>: Cost 3 vext3 <4,u,5,1>, <6,6,u,7> + 1651102542U, // <1,6,7,0>: Cost 2 vext3 <6,7,0,1>, <6,7,0,1> + 2724918103U, // <1,6,7,1>: Cost 3 vext3 <6,7,1,1>, <6,7,1,1> + 2698302306U, // <1,6,7,2>: Cost 3 vext3 <2,3,0,1>, <6,7,2,3> + 3642960153U, // <1,6,7,3>: Cost 4 vext1 <3,1,6,7>, <3,1,6,7> + 2713932662U, // <1,6,7,4>: Cost 3 vext3 <4,u,5,1>, 
<6,7,4,5> + 2725213051U, // <1,6,7,5>: Cost 3 vext3 <6,7,5,1>, <6,7,5,1> + 2724844426U, // <1,6,7,6>: Cost 3 vext3 <6,7,0,1>, <6,7,6,7> + 4035956022U, // <1,6,7,7>: Cost 4 vzipr <1,5,1,7>, RHS + 1651692438U, // <1,6,7,u>: Cost 2 vext3 <6,7,u,1>, <6,7,u,1> + 1651766175U, // <1,6,u,0>: Cost 2 vext3 <6,u,0,1>, <6,u,0,1> + 2618701614U, // <1,6,u,1>: Cost 3 vext2 <0,2,1,6>, LHS + 3135663508U, // <1,6,u,2>: Cost 3 vtrnr LHS, <4,6,u,2> + 3692443580U, // <1,6,u,3>: Cost 4 vext2 <0,2,1,6>, + 2713932743U, // <1,6,u,4>: Cost 3 vext3 <4,u,5,1>, <6,u,4,5> + 2618701978U, // <1,6,u,5>: Cost 3 vext2 <0,2,1,6>, RHS + 2622683344U, // <1,6,u,6>: Cost 3 vext2 <0,u,1,6>, + 3087886266U, // <1,6,u,7>: Cost 3 vtrnr LHS, <2,6,3,7> + 1652356071U, // <1,6,u,u>: Cost 2 vext3 <6,u,u,1>, <6,u,u,1> + 2726171632U, // <1,7,0,0>: Cost 3 vext3 <7,0,0,1>, <7,0,0,1> + 2626666598U, // <1,7,0,1>: Cost 3 vext2 <1,5,1,7>, LHS + 3695100067U, // <1,7,0,2>: Cost 4 vext2 <0,6,1,7>, <0,2,0,1> + 3707044102U, // <1,7,0,3>: Cost 4 vext2 <2,6,1,7>, <0,3,2,1> + 2726466580U, // <1,7,0,4>: Cost 3 vext3 <7,0,4,1>, <7,0,4,1> + 3654921933U, // <1,7,0,5>: Cost 4 vext1 <5,1,7,0>, <5,1,7,0> + 2621358582U, // <1,7,0,6>: Cost 3 vext2 <0,6,1,7>, <0,6,1,7> + 2622022215U, // <1,7,0,7>: Cost 3 vext2 <0,7,1,7>, <0,7,1,7> + 2626667165U, // <1,7,0,u>: Cost 3 vext2 <1,5,1,7>, LHS + 2593128550U, // <1,7,1,0>: Cost 3 vext1 <7,1,7,1>, LHS + 2626667316U, // <1,7,1,1>: Cost 3 vext2 <1,5,1,7>, <1,1,1,1> + 3700409238U, // <1,7,1,2>: Cost 4 vext2 <1,5,1,7>, <1,2,3,0> + 2257294428U, // <1,7,1,3>: Cost 3 vrev <7,1,3,1> + 2593131830U, // <1,7,1,4>: Cost 3 vext1 <7,1,7,1>, RHS + 2626667646U, // <1,7,1,5>: Cost 3 vext2 <1,5,1,7>, <1,5,1,7> + 2627331279U, // <1,7,1,6>: Cost 3 vext2 <1,6,1,7>, <1,6,1,7> + 2593133696U, // <1,7,1,7>: Cost 3 vext1 <7,1,7,1>, <7,1,7,1> + 2628658545U, // <1,7,1,u>: Cost 3 vext2 <1,u,1,7>, <1,u,1,7> + 2587164774U, // <1,7,2,0>: Cost 3 vext1 <6,1,7,2>, LHS + 3701073445U, // <1,7,2,1>: Cost 4 vext2 <1,6,1,7>, <2,1,3,7> + 3700409960U, // <1,7,2,2>: Cost 4 vext2 <1,5,1,7>, <2,2,2,2> + 2638612134U, // <1,7,2,3>: Cost 3 vext2 <3,5,1,7>, <2,3,0,1> + 2587168054U, // <1,7,2,4>: Cost 3 vext1 <6,1,7,2>, RHS + 3706382167U, // <1,7,2,5>: Cost 4 vext2 <2,5,1,7>, <2,5,1,7> + 2587169192U, // <1,7,2,6>: Cost 3 vext1 <6,1,7,2>, <6,1,7,2> + 3660911610U, // <1,7,2,7>: Cost 4 vext1 <6,1,7,2>, <7,0,1,2> + 2587170606U, // <1,7,2,u>: Cost 3 vext1 <6,1,7,2>, LHS + 1507459174U, // <1,7,3,0>: Cost 2 vext1 <5,1,7,3>, LHS + 2569257984U, // <1,7,3,1>: Cost 3 vext1 <3,1,7,3>, <1,3,5,7> + 2581202536U, // <1,7,3,2>: Cost 3 vext1 <5,1,7,3>, <2,2,2,2> + 2569259294U, // <1,7,3,3>: Cost 3 vext1 <3,1,7,3>, <3,1,7,3> + 1507462454U, // <1,7,3,4>: Cost 2 vext1 <5,1,7,3>, RHS + 1507462864U, // <1,7,3,5>: Cost 2 vext1 <5,1,7,3>, <5,1,7,3> + 2581205498U, // <1,7,3,6>: Cost 3 vext1 <5,1,7,3>, <6,2,7,3> + 2581206010U, // <1,7,3,7>: Cost 3 vext1 <5,1,7,3>, <7,0,1,2> + 1507465006U, // <1,7,3,u>: Cost 2 vext1 <5,1,7,3>, LHS + 2728826164U, // <1,7,4,0>: Cost 3 vext3 <7,4,0,1>, <7,4,0,1> + 3654951732U, // <1,7,4,1>: Cost 4 vext1 <5,1,7,4>, <1,1,1,1> + 3330987094U, // <1,7,4,2>: Cost 4 vrev <7,1,2,4> + 3331060831U, // <1,7,4,3>: Cost 4 vrev <7,1,3,4> + 3787674971U, // <1,7,4,4>: Cost 4 vext3 <4,u,5,1>, <7,4,4,4> + 2626669878U, // <1,7,4,5>: Cost 3 vext2 <1,5,1,7>, RHS + 3785979241U, // <1,7,4,6>: Cost 4 vext3 <4,6,0,1>, <7,4,6,0> + 3787085176U, // <1,7,4,7>: Cost 4 vext3 <4,7,6,1>, <7,4,7,6> + 2626670121U, // <1,7,4,u>: Cost 3 vext2 <1,5,1,7>, RHS + 2569273446U, // <1,7,5,0>: Cost 3 vext1 
<3,1,7,5>, LHS + 2569274368U, // <1,7,5,1>: Cost 3 vext1 <3,1,7,5>, <1,3,5,7> + 3643016808U, // <1,7,5,2>: Cost 4 vext1 <3,1,7,5>, <2,2,2,2> + 2569275680U, // <1,7,5,3>: Cost 3 vext1 <3,1,7,5>, <3,1,7,5> + 2569276726U, // <1,7,5,4>: Cost 3 vext1 <3,1,7,5>, RHS + 4102034790U, // <1,7,5,5>: Cost 4 vtrnl <1,3,5,7>, <7,4,5,6> + 2651222067U, // <1,7,5,6>: Cost 3 vext2 <5,6,1,7>, <5,6,1,7> + 3899378998U, // <1,7,5,7>: Cost 4 vuzpr <1,1,5,7>, RHS + 2569279278U, // <1,7,5,u>: Cost 3 vext1 <3,1,7,5>, LHS + 2730153430U, // <1,7,6,0>: Cost 3 vext3 <7,6,0,1>, <7,6,0,1> + 2724845022U, // <1,7,6,1>: Cost 3 vext3 <6,7,0,1>, <7,6,1,0> + 3643025338U, // <1,7,6,2>: Cost 4 vext1 <3,1,7,6>, <2,6,3,7> + 3643025697U, // <1,7,6,3>: Cost 4 vext1 <3,1,7,6>, <3,1,7,6> + 3643026742U, // <1,7,6,4>: Cost 4 vext1 <3,1,7,6>, RHS + 3654971091U, // <1,7,6,5>: Cost 4 vext1 <5,1,7,6>, <5,1,7,6> + 3787675153U, // <1,7,6,6>: Cost 4 vext3 <4,u,5,1>, <7,6,6,6> + 2724845076U, // <1,7,6,7>: Cost 3 vext3 <6,7,0,1>, <7,6,7,0> + 2725508637U, // <1,7,6,u>: Cost 3 vext3 <6,u,0,1>, <7,6,u,0> + 2730817063U, // <1,7,7,0>: Cost 3 vext3 <7,7,0,1>, <7,7,0,1> + 3631088436U, // <1,7,7,1>: Cost 4 vext1 <1,1,7,7>, <1,1,1,1> + 3660949158U, // <1,7,7,2>: Cost 4 vext1 <6,1,7,7>, <2,3,0,1> + 3801904705U, // <1,7,7,3>: Cost 4 vext3 <7,3,0,1>, <7,7,3,0> + 3631090998U, // <1,7,7,4>: Cost 4 vext1 <1,1,7,7>, RHS + 2662503828U, // <1,7,7,5>: Cost 3 vext2 <7,5,1,7>, <7,5,1,7> + 3660951981U, // <1,7,7,6>: Cost 4 vext1 <6,1,7,7>, <6,1,7,7> + 2713933420U, // <1,7,7,7>: Cost 3 vext3 <4,u,5,1>, <7,7,7,7> + 2731406959U, // <1,7,7,u>: Cost 3 vext3 <7,7,u,1>, <7,7,u,1> + 1507500134U, // <1,7,u,0>: Cost 2 vext1 <5,1,7,u>, LHS + 2626672430U, // <1,7,u,1>: Cost 3 vext2 <1,5,1,7>, LHS + 2581243496U, // <1,7,u,2>: Cost 3 vext1 <5,1,7,u>, <2,2,2,2> + 2569300259U, // <1,7,u,3>: Cost 3 vext1 <3,1,7,u>, <3,1,7,u> + 1507503414U, // <1,7,u,4>: Cost 2 vext1 <5,1,7,u>, RHS + 1507503829U, // <1,7,u,5>: Cost 2 vext1 <5,1,7,u>, <5,1,7,u> + 2581246458U, // <1,7,u,6>: Cost 3 vext1 <5,1,7,u>, <6,2,7,3> + 2581246970U, // <1,7,u,7>: Cost 3 vext1 <5,1,7,u>, <7,0,1,2> + 1507505966U, // <1,7,u,u>: Cost 2 vext1 <5,1,7,u>, LHS + 1543643153U, // <1,u,0,0>: Cost 2 vext2 <0,0,1,u>, <0,0,1,u> + 1546297446U, // <1,u,0,1>: Cost 2 vext2 <0,4,1,u>, LHS + 2819448852U, // <1,u,0,2>: Cost 3 vuzpr LHS, <0,0,2,2> + 2619375876U, // <1,u,0,3>: Cost 3 vext2 <0,3,1,u>, <0,3,1,u> + 1546297685U, // <1,u,0,4>: Cost 2 vext2 <0,4,1,u>, <0,4,1,u> + 1658771190U, // <1,u,0,5>: Cost 2 vext3 , + 2736789248U, // <1,u,0,6>: Cost 3 vext3 , + 2659189376U, // <1,u,0,7>: Cost 3 vext2 <7,0,1,u>, <0,7,u,1> + 1546298013U, // <1,u,0,u>: Cost 2 vext2 <0,4,1,u>, LHS + 1483112550U, // <1,u,1,0>: Cost 2 vext1 <1,1,1,1>, LHS + 202162278U, // <1,u,1,1>: Cost 1 vdup1 LHS + 1616009006U, // <1,u,1,2>: Cost 2 vext3 <0,u,1,1>, LHS + 1745707110U, // <1,u,1,3>: Cost 2 vuzpr LHS, LHS + 1483115830U, // <1,u,1,4>: Cost 2 vext1 <1,1,1,1>, RHS + 2620040336U, // <1,u,1,5>: Cost 3 vext2 <0,4,1,u>, <1,5,3,7> + 3026622618U, // <1,u,1,6>: Cost 3 vtrnl <1,1,1,1>, RHS + 2958183752U, // <1,u,1,7>: Cost 3 vzipr <0,u,1,1>, RHS + 202162278U, // <1,u,1,u>: Cost 1 vdup1 LHS + 2819449750U, // <1,u,2,0>: Cost 3 vuzpr LHS, <1,2,3,0> + 2893207342U, // <1,u,2,1>: Cost 3 vzipl <1,2,3,0>, LHS + 2819448996U, // <1,u,2,2>: Cost 3 vuzpr LHS, <0,2,0,2> + 2819450482U, // <1,u,2,3>: Cost 3 vuzpr LHS, <2,2,3,3> + 2819449754U, // <1,u,2,4>: Cost 3 vuzpr LHS, <1,2,3,4> + 2893207706U, // <1,u,2,5>: Cost 3 vzipl <1,2,3,0>, RHS + 2819449036U, // <1,u,2,6>: Cost 3 vuzpr LHS, 
<0,2,4,6> + 2970799432U, // <1,u,2,7>: Cost 3 vzipr <3,0,1,2>, RHS + 2819449002U, // <1,u,2,u>: Cost 3 vuzpr LHS, <0,2,0,u> + 403931292U, // <1,u,3,0>: Cost 1 vext1 LHS, LHS + 1477673718U, // <1,u,3,1>: Cost 2 vext1 LHS, <1,0,3,2> + 115726126U, // <1,u,3,2>: Cost 1 vrev LHS + 2014102173U, // <1,u,3,3>: Cost 2 vtrnr LHS, LHS + 403934518U, // <1,u,3,4>: Cost 1 vext1 LHS, RHS + 1507536601U, // <1,u,3,5>: Cost 2 vext1 <5,1,u,3>, <5,1,u,3> + 1525453306U, // <1,u,3,6>: Cost 2 vext1 LHS, <6,2,7,3> + 2014105129U, // <1,u,3,7>: Cost 2 vtrnr LHS, RHS + 403937070U, // <1,u,3,u>: Cost 1 vext1 LHS, LHS + 2620042157U, // <1,u,4,0>: Cost 3 vext2 <0,4,1,u>, <4,0,u,1> + 2620042237U, // <1,u,4,1>: Cost 3 vext2 <0,4,1,u>, <4,1,u,0> + 2263217967U, // <1,u,4,2>: Cost 3 vrev + 2569341224U, // <1,u,4,3>: Cost 3 vext1 <3,1,u,4>, <3,1,u,4> + 2569342262U, // <1,u,4,4>: Cost 3 vext1 <3,1,u,4>, RHS + 1546300726U, // <1,u,4,5>: Cost 2 vext2 <0,4,1,u>, RHS + 2819449180U, // <1,u,4,6>: Cost 3 vuzpr LHS, <0,4,2,6> + 2724845649U, // <1,u,4,7>: Cost 3 vext3 <6,7,0,1>, + 1546300969U, // <1,u,4,u>: Cost 2 vext2 <0,4,1,u>, RHS + 2551431270U, // <1,u,5,0>: Cost 3 vext1 <0,1,u,5>, LHS + 2551432192U, // <1,u,5,1>: Cost 3 vext1 <0,1,u,5>, <1,3,5,7> + 3028293422U, // <1,u,5,2>: Cost 3 vtrnl <1,3,5,7>, LHS + 2955559068U, // <1,u,5,3>: Cost 3 vzipr <0,4,1,5>, LHS + 2551434550U, // <1,u,5,4>: Cost 3 vext1 <0,1,u,5>, RHS + 2895255706U, // <1,u,5,5>: Cost 3 vzipl <1,5,3,7>, RHS + 1616009370U, // <1,u,5,6>: Cost 2 vext3 <0,u,1,1>, RHS + 1745710390U, // <1,u,5,7>: Cost 2 vuzpr LHS, RHS + 1745710391U, // <1,u,5,u>: Cost 2 vuzpr LHS, RHS + 2653221159U, // <1,u,6,0>: Cost 3 vext2 <6,0,1,u>, <6,0,1,u> + 2725509303U, // <1,u,6,1>: Cost 3 vext3 <6,u,0,1>, + 2659193338U, // <1,u,6,2>: Cost 3 vext2 <7,0,1,u>, <6,2,7,3> + 2689751248U, // <1,u,6,3>: Cost 3 vext3 <0,u,1,1>, + 2867228774U, // <1,u,6,4>: Cost 3 vuzpr LHS, <5,6,7,4> + 3764820194U, // <1,u,6,5>: Cost 4 vext3 <1,1,1,1>, + 2657202957U, // <1,u,6,6>: Cost 3 vext2 <6,6,1,u>, <6,6,1,u> + 2819450810U, // <1,u,6,7>: Cost 3 vuzpr LHS, <2,6,3,7> + 2819450811U, // <1,u,6,u>: Cost 3 vuzpr LHS, <2,6,3,u> + 1585452032U, // <1,u,7,0>: Cost 2 vext2 <7,0,1,u>, <7,0,1,u> + 2557420340U, // <1,u,7,1>: Cost 3 vext1 <1,1,u,7>, <1,1,1,1> + 2569365158U, // <1,u,7,2>: Cost 3 vext1 <3,1,u,7>, <2,3,0,1> + 2569365803U, // <1,u,7,3>: Cost 3 vext1 <3,1,u,7>, <3,1,u,7> + 2557422902U, // <1,u,7,4>: Cost 3 vext1 <1,1,u,7>, RHS + 2662512021U, // <1,u,7,5>: Cost 3 vext2 <7,5,1,u>, <7,5,1,u> + 2724845884U, // <1,u,7,6>: Cost 3 vext3 <6,7,0,1>, + 2659194476U, // <1,u,7,7>: Cost 3 vext2 <7,0,1,u>, <7,7,7,7> + 1590761096U, // <1,u,7,u>: Cost 2 vext2 <7,u,1,u>, <7,u,1,u> + 403972257U, // <1,u,u,0>: Cost 1 vext1 LHS, LHS + 202162278U, // <1,u,u,1>: Cost 1 vdup1 LHS + 115767091U, // <1,u,u,2>: Cost 1 vrev LHS + 1745707677U, // <1,u,u,3>: Cost 2 vuzpr LHS, LHS + 403975478U, // <1,u,u,4>: Cost 1 vext1 LHS, RHS + 1546303642U, // <1,u,u,5>: Cost 2 vext2 <0,4,1,u>, RHS + 1616009613U, // <1,u,u,6>: Cost 2 vext3 <0,u,1,1>, RHS + 1745710633U, // <1,u,u,7>: Cost 2 vuzpr LHS, RHS + 403978030U, // <1,u,u,u>: Cost 1 vext1 LHS, LHS + 2551463936U, // <2,0,0,0>: Cost 3 vext1 <0,2,0,0>, <0,0,0,0> + 2685698058U, // <2,0,0,1>: Cost 3 vext3 <0,2,0,2>, <0,0,1,1> + 1610776596U, // <2,0,0,2>: Cost 2 vext3 <0,0,2,2>, <0,0,2,2> + 2619384069U, // <2,0,0,3>: Cost 3 vext2 <0,3,2,0>, <0,3,2,0> + 2551467318U, // <2,0,0,4>: Cost 3 vext1 <0,2,0,0>, RHS + 3899836596U, // <2,0,0,5>: Cost 4 vuzpr <1,2,3,0>, <3,0,4,5> + 2621374968U, // <2,0,0,6>: Cost 3 vext2 
<0,6,2,0>, <0,6,2,0> + 4168271334U, // <2,0,0,7>: Cost 4 vtrnr <1,2,3,0>, <2,0,5,7> + 1611219018U, // <2,0,0,u>: Cost 2 vext3 <0,0,u,2>, <0,0,u,2> + 2551472138U, // <2,0,1,0>: Cost 3 vext1 <0,2,0,1>, <0,0,1,1> + 2690564186U, // <2,0,1,1>: Cost 3 vext3 <1,0,3,2>, <0,1,1,0> + 1611956326U, // <2,0,1,2>: Cost 2 vext3 <0,2,0,2>, LHS + 2826092646U, // <2,0,1,3>: Cost 3 vuzpr <1,2,3,0>, LHS + 2551475510U, // <2,0,1,4>: Cost 3 vext1 <0,2,0,1>, RHS + 3692463248U, // <2,0,1,5>: Cost 4 vext2 <0,2,2,0>, <1,5,3,7> + 2587308473U, // <2,0,1,6>: Cost 3 vext1 <6,2,0,1>, <6,2,0,1> + 3661050874U, // <2,0,1,7>: Cost 4 vext1 <6,2,0,1>, <7,0,1,2> + 1611956380U, // <2,0,1,u>: Cost 2 vext3 <0,2,0,2>, LHS + 1477738598U, // <2,0,2,0>: Cost 2 vext1 <0,2,0,2>, LHS + 2551481078U, // <2,0,2,1>: Cost 3 vext1 <0,2,0,2>, <1,0,3,2> + 2551481796U, // <2,0,2,2>: Cost 3 vext1 <0,2,0,2>, <2,0,2,0> + 2551482518U, // <2,0,2,3>: Cost 3 vext1 <0,2,0,2>, <3,0,1,2> + 1477741878U, // <2,0,2,4>: Cost 2 vext1 <0,2,0,2>, RHS + 2551484112U, // <2,0,2,5>: Cost 3 vext1 <0,2,0,2>, <5,1,7,3> + 2551484759U, // <2,0,2,6>: Cost 3 vext1 <0,2,0,2>, <6,0,7,2> + 2551485434U, // <2,0,2,7>: Cost 3 vext1 <0,2,0,2>, <7,0,1,2> + 1477744430U, // <2,0,2,u>: Cost 2 vext1 <0,2,0,2>, LHS + 2953625600U, // <2,0,3,0>: Cost 3 vzipr LHS, <0,0,0,0> + 2953627302U, // <2,0,3,1>: Cost 3 vzipr LHS, <2,3,0,1> + 2953625764U, // <2,0,3,2>: Cost 3 vzipr LHS, <0,2,0,2> + 4027369695U, // <2,0,3,3>: Cost 4 vzipr LHS, <3,1,0,3> + 3625233718U, // <2,0,3,4>: Cost 4 vext1 <0,2,0,3>, RHS + 3899836110U, // <2,0,3,5>: Cost 4 vuzpr <1,2,3,0>, <2,3,4,5> + 4032012618U, // <2,0,3,6>: Cost 4 vzipr LHS, <0,4,0,6> + 3899835392U, // <2,0,3,7>: Cost 4 vuzpr <1,2,3,0>, <1,3,5,7> + 2953625770U, // <2,0,3,u>: Cost 3 vzipr LHS, <0,2,0,u> + 2551496806U, // <2,0,4,0>: Cost 3 vext1 <0,2,0,4>, LHS + 2685698386U, // <2,0,4,1>: Cost 3 vext3 <0,2,0,2>, <0,4,1,5> + 2685698396U, // <2,0,4,2>: Cost 3 vext3 <0,2,0,2>, <0,4,2,6> + 3625240726U, // <2,0,4,3>: Cost 4 vext1 <0,2,0,4>, <3,0,1,2> + 2551500086U, // <2,0,4,4>: Cost 3 vext1 <0,2,0,4>, RHS + 2618723638U, // <2,0,4,5>: Cost 3 vext2 <0,2,2,0>, RHS + 2765409590U, // <2,0,4,6>: Cost 3 vuzpl <2,3,0,1>, RHS + 3799990664U, // <2,0,4,7>: Cost 4 vext3 <7,0,1,2>, <0,4,7,5> + 2685698450U, // <2,0,4,u>: Cost 3 vext3 <0,2,0,2>, <0,4,u,6> + 3625246822U, // <2,0,5,0>: Cost 4 vext1 <0,2,0,5>, LHS + 3289776304U, // <2,0,5,1>: Cost 4 vrev <0,2,1,5> + 2690564526U, // <2,0,5,2>: Cost 3 vext3 <1,0,3,2>, <0,5,2,7> + 3289923778U, // <2,0,5,3>: Cost 4 vrev <0,2,3,5> + 2216255691U, // <2,0,5,4>: Cost 3 vrev <0,2,4,5> + 3726307332U, // <2,0,5,5>: Cost 4 vext2 <5,u,2,0>, <5,5,5,5> + 3726307426U, // <2,0,5,6>: Cost 4 vext2 <5,u,2,0>, <5,6,7,0> + 2826095926U, // <2,0,5,7>: Cost 3 vuzpr <1,2,3,0>, RHS + 2216550639U, // <2,0,5,u>: Cost 3 vrev <0,2,u,5> + 4162420736U, // <2,0,6,0>: Cost 4 vtrnr <0,2,4,6>, <0,0,0,0> + 2901885030U, // <2,0,6,1>: Cost 3 vzipl <2,6,3,7>, LHS + 2685698559U, // <2,0,6,2>: Cost 3 vext3 <0,2,0,2>, <0,6,2,7> + 3643173171U, // <2,0,6,3>: Cost 4 vext1 <3,2,0,6>, <3,2,0,6> + 2216263884U, // <2,0,6,4>: Cost 3 vrev <0,2,4,6> + 3730289341U, // <2,0,6,5>: Cost 4 vext2 <6,5,2,0>, <6,5,2,0> + 3726308152U, // <2,0,6,6>: Cost 4 vext2 <5,u,2,0>, <6,6,6,6> + 3899836346U, // <2,0,6,7>: Cost 4 vuzpr <1,2,3,0>, <2,6,3,7> + 2216558832U, // <2,0,6,u>: Cost 3 vrev <0,2,u,6> + 2659202049U, // <2,0,7,0>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0> + 3726308437U, // <2,0,7,1>: Cost 4 vext2 <5,u,2,0>, <7,1,2,3> + 2726249034U, // <2,0,7,2>: Cost 3 vext3 <7,0,1,2>, <0,7,2,1> + 3734934772U, 
// <2,0,7,3>: Cost 4 vext2 <7,3,2,0>, <7,3,2,0> + 3726308710U, // <2,0,7,4>: Cost 4 vext2 <5,u,2,0>, <7,4,5,6> + 3726308814U, // <2,0,7,5>: Cost 4 vext2 <5,u,2,0>, <7,5,u,2> + 3736925671U, // <2,0,7,6>: Cost 4 vext2 <7,6,2,0>, <7,6,2,0> + 3726308972U, // <2,0,7,7>: Cost 4 vext2 <5,u,2,0>, <7,7,7,7> + 2659202049U, // <2,0,7,u>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0> + 1477787750U, // <2,0,u,0>: Cost 2 vext1 <0,2,0,u>, LHS + 2953668262U, // <2,0,u,1>: Cost 3 vzipr LHS, <2,3,0,1> + 1611956893U, // <2,0,u,2>: Cost 2 vext3 <0,2,0,2>, LHS + 2551531670U, // <2,0,u,3>: Cost 3 vext1 <0,2,0,u>, <3,0,1,2> + 1477791030U, // <2,0,u,4>: Cost 2 vext1 <0,2,0,u>, RHS + 2618726554U, // <2,0,u,5>: Cost 3 vext2 <0,2,2,0>, RHS + 2765412506U, // <2,0,u,6>: Cost 3 vuzpl <2,3,0,1>, RHS + 2826096169U, // <2,0,u,7>: Cost 3 vuzpr <1,2,3,0>, RHS + 1611956947U, // <2,0,u,u>: Cost 2 vext3 <0,2,0,2>, LHS + 2569453670U, // <2,1,0,0>: Cost 3 vext1 <3,2,1,0>, LHS + 2619392102U, // <2,1,0,1>: Cost 3 vext2 <0,3,2,1>, LHS + 3759440619U, // <2,1,0,2>: Cost 4 vext3 <0,2,0,2>, <1,0,2,0> + 1616823030U, // <2,1,0,3>: Cost 2 vext3 <1,0,3,2>, <1,0,3,2> + 2569456950U, // <2,1,0,4>: Cost 3 vext1 <3,2,1,0>, RHS + 2690712328U, // <2,1,0,5>: Cost 3 vext3 <1,0,5,2>, <1,0,5,2> + 3661115841U, // <2,1,0,6>: Cost 4 vext1 <6,2,1,0>, <6,2,1,0> + 2622046794U, // <2,1,0,7>: Cost 3 vext2 <0,7,2,1>, <0,7,2,1> + 1617191715U, // <2,1,0,u>: Cost 2 vext3 <1,0,u,2>, <1,0,u,2> + 2551545958U, // <2,1,1,0>: Cost 3 vext1 <0,2,1,1>, LHS + 2685698868U, // <2,1,1,1>: Cost 3 vext3 <0,2,0,2>, <1,1,1,1> + 2628682646U, // <2,1,1,2>: Cost 3 vext2 <1,u,2,1>, <1,2,3,0> + 2685698888U, // <2,1,1,3>: Cost 3 vext3 <0,2,0,2>, <1,1,3,3> + 2551549238U, // <2,1,1,4>: Cost 3 vext1 <0,2,1,1>, RHS + 3693134992U, // <2,1,1,5>: Cost 4 vext2 <0,3,2,1>, <1,5,3,7> + 3661124034U, // <2,1,1,6>: Cost 4 vext1 <6,2,1,1>, <6,2,1,1> + 3625292794U, // <2,1,1,7>: Cost 4 vext1 <0,2,1,1>, <7,0,1,2> + 2685698933U, // <2,1,1,u>: Cost 3 vext3 <0,2,0,2>, <1,1,u,3> + 2551554150U, // <2,1,2,0>: Cost 3 vext1 <0,2,1,2>, LHS + 3893649571U, // <2,1,2,1>: Cost 4 vuzpr <0,2,0,1>, <0,2,0,1> + 2551555688U, // <2,1,2,2>: Cost 3 vext1 <0,2,1,2>, <2,2,2,2> + 2685698966U, // <2,1,2,3>: Cost 3 vext3 <0,2,0,2>, <1,2,3,0> + 2551557430U, // <2,1,2,4>: Cost 3 vext1 <0,2,1,2>, RHS + 3763422123U, // <2,1,2,5>: Cost 4 vext3 <0,u,0,2>, <1,2,5,3> + 3693135802U, // <2,1,2,6>: Cost 4 vext2 <0,3,2,1>, <2,6,3,7> + 2726249402U, // <2,1,2,7>: Cost 3 vext3 <7,0,1,2>, <1,2,7,0> + 2685699011U, // <2,1,2,u>: Cost 3 vext3 <0,2,0,2>, <1,2,u,0> + 2551562342U, // <2,1,3,0>: Cost 3 vext1 <0,2,1,3>, LHS + 2953625610U, // <2,1,3,1>: Cost 3 vzipr LHS, <0,0,1,1> + 2953627798U, // <2,1,3,2>: Cost 3 vzipr LHS, <3,0,1,2> + 2953626584U, // <2,1,3,3>: Cost 3 vzipr LHS, <1,3,1,3> + 2551565622U, // <2,1,3,4>: Cost 3 vext1 <0,2,1,3>, RHS + 2953625938U, // <2,1,3,5>: Cost 3 vzipr LHS, <0,4,1,5> + 2587398596U, // <2,1,3,6>: Cost 3 vext1 <6,2,1,3>, <6,2,1,3> + 4032013519U, // <2,1,3,7>: Cost 4 vzipr LHS, <1,6,1,7> + 2953625617U, // <2,1,3,u>: Cost 3 vzipr LHS, <0,0,1,u> + 2690565154U, // <2,1,4,0>: Cost 3 vext3 <1,0,3,2>, <1,4,0,5> + 3625313270U, // <2,1,4,1>: Cost 4 vext1 <0,2,1,4>, <1,3,4,6> + 3771532340U, // <2,1,4,2>: Cost 4 vext3 <2,2,2,2>, <1,4,2,5> + 1148404634U, // <2,1,4,3>: Cost 2 vrev <1,2,3,4> + 3625315638U, // <2,1,4,4>: Cost 4 vext1 <0,2,1,4>, RHS + 2619395382U, // <2,1,4,5>: Cost 3 vext2 <0,3,2,1>, RHS + 3837242678U, // <2,1,4,6>: Cost 4 vuzpl <2,0,1,2>, RHS + 3799991394U, // <2,1,4,7>: Cost 4 vext3 <7,0,1,2>, <1,4,7,6> + 1148773319U, // 
<2,1,4,u>: Cost 2 vrev <1,2,u,4> + 2551578726U, // <2,1,5,0>: Cost 3 vext1 <0,2,1,5>, LHS + 2551579648U, // <2,1,5,1>: Cost 3 vext1 <0,2,1,5>, <1,3,5,7> + 3625321952U, // <2,1,5,2>: Cost 4 vext1 <0,2,1,5>, <2,0,5,1> + 2685699216U, // <2,1,5,3>: Cost 3 vext3 <0,2,0,2>, <1,5,3,7> + 2551582006U, // <2,1,5,4>: Cost 3 vext1 <0,2,1,5>, RHS + 3740913668U, // <2,1,5,5>: Cost 4 vext2 , <5,5,5,5> + 3661156806U, // <2,1,5,6>: Cost 4 vext1 <6,2,1,5>, <6,2,1,5> + 3893652790U, // <2,1,5,7>: Cost 4 vuzpr <0,2,0,1>, RHS + 2685699261U, // <2,1,5,u>: Cost 3 vext3 <0,2,0,2>, <1,5,u,7> + 2551586918U, // <2,1,6,0>: Cost 3 vext1 <0,2,1,6>, LHS + 3625329398U, // <2,1,6,1>: Cost 4 vext1 <0,2,1,6>, <1,0,3,2> + 2551588794U, // <2,1,6,2>: Cost 3 vext1 <0,2,1,6>, <2,6,3,7> + 3088679014U, // <2,1,6,3>: Cost 3 vtrnr <0,2,4,6>, LHS + 2551590198U, // <2,1,6,4>: Cost 3 vext1 <0,2,1,6>, RHS + 4029382994U, // <2,1,6,5>: Cost 4 vzipr <0,4,2,6>, <0,4,1,5> + 3625333560U, // <2,1,6,6>: Cost 4 vext1 <0,2,1,6>, <6,6,6,6> + 3731624800U, // <2,1,6,7>: Cost 4 vext2 <6,7,2,1>, <6,7,2,1> + 2551592750U, // <2,1,6,u>: Cost 3 vext1 <0,2,1,6>, LHS + 2622051322U, // <2,1,7,0>: Cost 3 vext2 <0,7,2,1>, <7,0,1,2> + 3733615699U, // <2,1,7,1>: Cost 4 vext2 <7,1,2,1>, <7,1,2,1> + 3795125538U, // <2,1,7,2>: Cost 4 vext3 <6,1,7,2>, <1,7,2,0> + 2222171037U, // <2,1,7,3>: Cost 3 vrev <1,2,3,7> + 3740915046U, // <2,1,7,4>: Cost 4 vext2 , <7,4,5,6> + 3296060335U, // <2,1,7,5>: Cost 4 vrev <1,2,5,7> + 3736933864U, // <2,1,7,6>: Cost 4 vext2 <7,6,2,1>, <7,6,2,1> + 3805300055U, // <2,1,7,7>: Cost 4 vext3 <7,u,1,2>, <1,7,7,u> + 2669827714U, // <2,1,7,u>: Cost 3 vext2 , <7,u,1,2> + 2551603302U, // <2,1,u,0>: Cost 3 vext1 <0,2,1,u>, LHS + 2953666570U, // <2,1,u,1>: Cost 3 vzipr LHS, <0,0,1,1> + 2953668758U, // <2,1,u,2>: Cost 3 vzipr LHS, <3,0,1,2> + 1148437406U, // <2,1,u,3>: Cost 2 vrev <1,2,3,u> + 2551606582U, // <2,1,u,4>: Cost 3 vext1 <0,2,1,u>, RHS + 2953666898U, // <2,1,u,5>: Cost 3 vzipr LHS, <0,4,1,5> + 2587398596U, // <2,1,u,6>: Cost 3 vext1 <6,2,1,3>, <6,2,1,3> + 2669828370U, // <2,1,u,7>: Cost 3 vext2 , + 1148806091U, // <2,1,u,u>: Cost 2 vrev <1,2,u,u> + 1543667732U, // <2,2,0,0>: Cost 2 vext2 <0,0,2,2>, <0,0,2,2> + 1548976230U, // <2,2,0,1>: Cost 2 vext2 <0,u,2,2>, LHS + 2685699524U, // <2,2,0,2>: Cost 3 vext3 <0,2,0,2>, <2,0,2,0> + 2685699535U, // <2,2,0,3>: Cost 3 vext3 <0,2,0,2>, <2,0,3,2> + 2551614774U, // <2,2,0,4>: Cost 3 vext1 <0,2,2,0>, RHS + 3704422830U, // <2,2,0,5>: Cost 4 vext2 <2,2,2,2>, <0,5,2,7> + 3893657642U, // <2,2,0,6>: Cost 4 vuzpr <0,2,0,2>, <0,0,4,6> + 3770574323U, // <2,2,0,7>: Cost 4 vext3 <2,0,7,2>, <2,0,7,2> + 1548976796U, // <2,2,0,u>: Cost 2 vext2 <0,u,2,2>, <0,u,2,2> + 2622718710U, // <2,2,1,0>: Cost 3 vext2 <0,u,2,2>, <1,0,3,2> + 2622718772U, // <2,2,1,1>: Cost 3 vext2 <0,u,2,2>, <1,1,1,1> + 2622718870U, // <2,2,1,2>: Cost 3 vext2 <0,u,2,2>, <1,2,3,0> + 2819915878U, // <2,2,1,3>: Cost 3 vuzpr <0,2,0,2>, LHS + 3625364790U, // <2,2,1,4>: Cost 4 vext1 <0,2,2,1>, RHS + 2622719120U, // <2,2,1,5>: Cost 3 vext2 <0,u,2,2>, <1,5,3,7> + 3760031292U, // <2,2,1,6>: Cost 4 vext3 <0,2,u,2>, <2,1,6,3> + 3667170468U, // <2,2,1,7>: Cost 4 vext1 <7,2,2,1>, <7,2,2,1> + 2819915883U, // <2,2,1,u>: Cost 3 vuzpr <0,2,0,2>, LHS + 1489829990U, // <2,2,2,0>: Cost 2 vext1 <2,2,2,2>, LHS + 2563572470U, // <2,2,2,1>: Cost 3 vext1 <2,2,2,2>, <1,0,3,2> + 269271142U, // <2,2,2,2>: Cost 1 vdup2 LHS + 2685699698U, // <2,2,2,3>: Cost 3 vext3 <0,2,0,2>, <2,2,3,3> + 1489833270U, // <2,2,2,4>: Cost 2 vext1 <2,2,2,2>, RHS + 2685699720U, // <2,2,2,5>: 
Cost 3 vext3 <0,2,0,2>, <2,2,5,7> + 2622719930U, // <2,2,2,6>: Cost 3 vext2 <0,u,2,2>, <2,6,3,7> + 2593436837U, // <2,2,2,7>: Cost 3 vext1 <7,2,2,2>, <7,2,2,2> + 269271142U, // <2,2,2,u>: Cost 1 vdup2 LHS + 2685699750U, // <2,2,3,0>: Cost 3 vext3 <0,2,0,2>, <2,3,0,1> + 2690565806U, // <2,2,3,1>: Cost 3 vext3 <1,0,3,2>, <2,3,1,0> + 2953627240U, // <2,2,3,2>: Cost 3 vzipr LHS, <2,2,2,2> + 1879883878U, // <2,2,3,3>: Cost 2 vzipr LHS, LHS + 2685699790U, // <2,2,3,4>: Cost 3 vext3 <0,2,0,2>, <2,3,4,5> + 3893659342U, // <2,2,3,5>: Cost 4 vuzpr <0,2,0,2>, <2,3,4,5> + 2958270812U, // <2,2,3,6>: Cost 3 vzipr LHS, <0,4,2,6> + 2593445030U, // <2,2,3,7>: Cost 3 vext1 <7,2,2,3>, <7,2,2,3> + 1879883883U, // <2,2,3,u>: Cost 2 vzipr LHS, LHS + 2551644262U, // <2,2,4,0>: Cost 3 vext1 <0,2,2,4>, LHS + 3625386742U, // <2,2,4,1>: Cost 4 vext1 <0,2,2,4>, <1,0,3,2> + 2551645902U, // <2,2,4,2>: Cost 3 vext1 <0,2,2,4>, <2,3,4,5> + 3759441686U, // <2,2,4,3>: Cost 4 vext3 <0,2,0,2>, <2,4,3,5> + 2551647542U, // <2,2,4,4>: Cost 3 vext1 <0,2,2,4>, RHS + 1548979510U, // <2,2,4,5>: Cost 2 vext2 <0,u,2,2>, RHS + 2764901686U, // <2,2,4,6>: Cost 3 vuzpl <2,2,2,2>, RHS + 3667195047U, // <2,2,4,7>: Cost 4 vext1 <7,2,2,4>, <7,2,2,4> + 1548979753U, // <2,2,4,u>: Cost 2 vext2 <0,u,2,2>, RHS + 3696463432U, // <2,2,5,0>: Cost 4 vext2 <0,u,2,2>, <5,0,1,2> + 2617413328U, // <2,2,5,1>: Cost 3 vext2 <0,0,2,2>, <5,1,7,3> + 2685699936U, // <2,2,5,2>: Cost 3 vext3 <0,2,0,2>, <2,5,2,7> + 4027383910U, // <2,2,5,3>: Cost 4 vzipr <0,1,2,5>, LHS + 2228201085U, // <2,2,5,4>: Cost 3 vrev <2,2,4,5> + 2617413636U, // <2,2,5,5>: Cost 3 vext2 <0,0,2,2>, <5,5,5,5> + 2617413730U, // <2,2,5,6>: Cost 3 vext2 <0,0,2,2>, <5,6,7,0> + 2819919158U, // <2,2,5,7>: Cost 3 vuzpr <0,2,0,2>, RHS + 2819919159U, // <2,2,5,u>: Cost 3 vuzpr <0,2,0,2>, RHS + 3625402554U, // <2,2,6,0>: Cost 4 vext1 <0,2,2,6>, <0,2,2,6> + 3760031652U, // <2,2,6,1>: Cost 4 vext3 <0,2,u,2>, <2,6,1,3> + 2617414138U, // <2,2,6,2>: Cost 3 vext2 <0,0,2,2>, <6,2,7,3> + 2685700026U, // <2,2,6,3>: Cost 3 vext3 <0,2,0,2>, <2,6,3,7> + 3625405750U, // <2,2,6,4>: Cost 4 vext1 <0,2,2,6>, RHS + 3760031692U, // <2,2,6,5>: Cost 4 vext3 <0,2,u,2>, <2,6,5,7> + 3088679116U, // <2,2,6,6>: Cost 3 vtrnr <0,2,4,6>, <0,2,4,6> + 2657891169U, // <2,2,6,7>: Cost 3 vext2 <6,7,2,2>, <6,7,2,2> + 2685700071U, // <2,2,6,u>: Cost 3 vext3 <0,2,0,2>, <2,6,u,7> + 2726250474U, // <2,2,7,0>: Cost 3 vext3 <7,0,1,2>, <2,7,0,1> + 3704427616U, // <2,2,7,1>: Cost 4 vext2 <2,2,2,2>, <7,1,3,5> + 2660545701U, // <2,2,7,2>: Cost 3 vext2 <7,2,2,2>, <7,2,2,2> + 4030718054U, // <2,2,7,3>: Cost 4 vzipr <0,6,2,7>, LHS + 2617415014U, // <2,2,7,4>: Cost 3 vext2 <0,0,2,2>, <7,4,5,6> + 3302033032U, // <2,2,7,5>: Cost 4 vrev <2,2,5,7> + 3661246929U, // <2,2,7,6>: Cost 4 vext1 <6,2,2,7>, <6,2,2,7> + 2617415276U, // <2,2,7,7>: Cost 3 vext2 <0,0,2,2>, <7,7,7,7> + 2731558962U, // <2,2,7,u>: Cost 3 vext3 <7,u,1,2>, <2,7,u,1> + 1489829990U, // <2,2,u,0>: Cost 2 vext1 <2,2,2,2>, LHS + 1548982062U, // <2,2,u,1>: Cost 2 vext2 <0,u,2,2>, LHS + 269271142U, // <2,2,u,2>: Cost 1 vdup2 LHS + 1879924838U, // <2,2,u,3>: Cost 2 vzipr LHS, LHS + 1489833270U, // <2,2,u,4>: Cost 2 vext1 <2,2,2,2>, RHS + 1548982426U, // <2,2,u,5>: Cost 2 vext2 <0,u,2,2>, RHS + 2953666908U, // <2,2,u,6>: Cost 3 vzipr LHS, <0,4,2,6> + 2819919401U, // <2,2,u,7>: Cost 3 vuzpr <0,2,0,2>, RHS + 269271142U, // <2,2,u,u>: Cost 1 vdup2 LHS + 1544339456U, // <2,3,0,0>: Cost 2 vext2 LHS, <0,0,0,0> + 470597734U, // <2,3,0,1>: Cost 1 vext2 LHS, LHS + 1548984484U, // <2,3,0,2>: Cost 2 vext2 
LHS, <0,2,0,2> + 2619408648U, // <2,3,0,3>: Cost 3 vext2 <0,3,2,3>, <0,3,2,3> + 1548984658U, // <2,3,0,4>: Cost 2 vext2 LHS, <0,4,1,5> + 2665857454U, // <2,3,0,5>: Cost 3 vext2 LHS, <0,5,2,7> + 2622726655U, // <2,3,0,6>: Cost 3 vext2 LHS, <0,6,2,7> + 2593494188U, // <2,3,0,7>: Cost 3 vext1 <7,2,3,0>, <7,2,3,0> + 470598301U, // <2,3,0,u>: Cost 1 vext2 LHS, LHS + 1544340214U, // <2,3,1,0>: Cost 2 vext2 LHS, <1,0,3,2> + 1544340276U, // <2,3,1,1>: Cost 2 vext2 LHS, <1,1,1,1> + 1544340374U, // <2,3,1,2>: Cost 2 vext2 LHS, <1,2,3,0> + 1548985304U, // <2,3,1,3>: Cost 2 vext2 LHS, <1,3,1,3> + 2551696694U, // <2,3,1,4>: Cost 3 vext1 <0,2,3,1>, RHS + 1548985488U, // <2,3,1,5>: Cost 2 vext2 LHS, <1,5,3,7> + 2622727375U, // <2,3,1,6>: Cost 3 vext2 LHS, <1,6,1,7> + 2665858347U, // <2,3,1,7>: Cost 3 vext2 LHS, <1,7,3,0> + 1548985709U, // <2,3,1,u>: Cost 2 vext2 LHS, <1,u,1,3> + 2622727613U, // <2,3,2,0>: Cost 3 vext2 LHS, <2,0,1,2> + 2622727711U, // <2,3,2,1>: Cost 3 vext2 LHS, <2,1,3,1> + 1544341096U, // <2,3,2,2>: Cost 2 vext2 LHS, <2,2,2,2> + 1544341158U, // <2,3,2,3>: Cost 2 vext2 LHS, <2,3,0,1> + 2622727958U, // <2,3,2,4>: Cost 3 vext2 LHS, <2,4,3,5> + 2622728032U, // <2,3,2,5>: Cost 3 vext2 LHS, <2,5,2,7> + 1548986298U, // <2,3,2,6>: Cost 2 vext2 LHS, <2,6,3,7> + 2665859050U, // <2,3,2,7>: Cost 3 vext2 LHS, <2,7,0,1> + 1548986427U, // <2,3,2,u>: Cost 2 vext2 LHS, <2,u,0,1> + 1548986518U, // <2,3,3,0>: Cost 2 vext2 LHS, <3,0,1,2> + 2622728415U, // <2,3,3,1>: Cost 3 vext2 LHS, <3,1,0,3> + 1489913458U, // <2,3,3,2>: Cost 2 vext1 <2,2,3,3>, <2,2,3,3> + 1544341916U, // <2,3,3,3>: Cost 2 vext2 LHS, <3,3,3,3> + 1548986882U, // <2,3,3,4>: Cost 2 vext2 LHS, <3,4,5,6> + 2665859632U, // <2,3,3,5>: Cost 3 vext2 LHS, <3,5,1,7> + 2234304870U, // <2,3,3,6>: Cost 3 vrev <3,2,6,3> + 2958271632U, // <2,3,3,7>: Cost 3 vzipr LHS, <1,5,3,7> + 1548987166U, // <2,3,3,u>: Cost 2 vext2 LHS, <3,u,1,2> + 1483948134U, // <2,3,4,0>: Cost 2 vext1 <1,2,3,4>, LHS + 1483948954U, // <2,3,4,1>: Cost 2 vext1 <1,2,3,4>, <1,2,3,4> + 2622729276U, // <2,3,4,2>: Cost 3 vext2 LHS, <4,2,6,0> + 2557692054U, // <2,3,4,3>: Cost 3 vext1 <1,2,3,4>, <3,0,1,2> + 1483951414U, // <2,3,4,4>: Cost 2 vext1 <1,2,3,4>, RHS + 470601014U, // <2,3,4,5>: Cost 1 vext2 LHS, RHS + 1592118644U, // <2,3,4,6>: Cost 2 vext2 LHS, <4,6,4,6> + 2593526960U, // <2,3,4,7>: Cost 3 vext1 <7,2,3,4>, <7,2,3,4> + 470601257U, // <2,3,4,u>: Cost 1 vext2 LHS, RHS + 2551726182U, // <2,3,5,0>: Cost 3 vext1 <0,2,3,5>, LHS + 1592118992U, // <2,3,5,1>: Cost 2 vext2 LHS, <5,1,7,3> + 2665860862U, // <2,3,5,2>: Cost 3 vext2 LHS, <5,2,3,4> + 2551728642U, // <2,3,5,3>: Cost 3 vext1 <0,2,3,5>, <3,4,5,6> + 1592119238U, // <2,3,5,4>: Cost 2 vext2 LHS, <5,4,7,6> + 1592119300U, // <2,3,5,5>: Cost 2 vext2 LHS, <5,5,5,5> + 1592119394U, // <2,3,5,6>: Cost 2 vext2 LHS, <5,6,7,0> + 1592119464U, // <2,3,5,7>: Cost 2 vext2 LHS, <5,7,5,7> + 1592119545U, // <2,3,5,u>: Cost 2 vext2 LHS, <5,u,5,7> + 2622730529U, // <2,3,6,0>: Cost 3 vext2 LHS, <6,0,1,2> + 2557707164U, // <2,3,6,1>: Cost 3 vext1 <1,2,3,6>, <1,2,3,6> + 1592119802U, // <2,3,6,2>: Cost 2 vext2 LHS, <6,2,7,3> + 2665861682U, // <2,3,6,3>: Cost 3 vext2 LHS, <6,3,4,5> + 2622730893U, // <2,3,6,4>: Cost 3 vext2 LHS, <6,4,5,6> + 2665861810U, // <2,3,6,5>: Cost 3 vext2 LHS, <6,5,0,7> + 1592120120U, // <2,3,6,6>: Cost 2 vext2 LHS, <6,6,6,6> + 1592120142U, // <2,3,6,7>: Cost 2 vext2 LHS, <6,7,0,1> + 1592120223U, // <2,3,6,u>: Cost 2 vext2 LHS, <6,u,0,1> + 1592120314U, // <2,3,7,0>: Cost 2 vext2 LHS, <7,0,1,2> + 2659890261U, // <2,3,7,1>: Cost 3 vext2 
<7,1,2,3>, <7,1,2,3> + 2660553894U, // <2,3,7,2>: Cost 3 vext2 <7,2,2,3>, <7,2,2,3> + 2665862371U, // <2,3,7,3>: Cost 3 vext2 LHS, <7,3,0,1> + 1592120678U, // <2,3,7,4>: Cost 2 vext2 LHS, <7,4,5,6> + 2665862534U, // <2,3,7,5>: Cost 3 vext2 LHS, <7,5,0,2> + 2665862614U, // <2,3,7,6>: Cost 3 vext2 LHS, <7,6,0,1> + 1592120940U, // <2,3,7,7>: Cost 2 vext2 LHS, <7,7,7,7> + 1592120962U, // <2,3,7,u>: Cost 2 vext2 LHS, <7,u,1,2> + 1548990163U, // <2,3,u,0>: Cost 2 vext2 LHS, + 470603566U, // <2,3,u,1>: Cost 1 vext2 LHS, LHS + 1548990341U, // <2,3,u,2>: Cost 2 vext2 LHS, + 1548990396U, // <2,3,u,3>: Cost 2 vext2 LHS, + 1548990527U, // <2,3,u,4>: Cost 2 vext2 LHS, + 470603930U, // <2,3,u,5>: Cost 1 vext2 LHS, RHS + 1548990672U, // <2,3,u,6>: Cost 2 vext2 LHS, + 1592121600U, // <2,3,u,7>: Cost 2 vext2 LHS, + 470604133U, // <2,3,u,u>: Cost 1 vext2 LHS, LHS + 2617425942U, // <2,4,0,0>: Cost 3 vext2 <0,0,2,4>, <0,0,2,4> + 2618753126U, // <2,4,0,1>: Cost 3 vext2 <0,2,2,4>, LHS + 2618753208U, // <2,4,0,2>: Cost 3 vext2 <0,2,2,4>, <0,2,2,4> + 2619416841U, // <2,4,0,3>: Cost 3 vext2 <0,3,2,4>, <0,3,2,4> + 2587593628U, // <2,4,0,4>: Cost 3 vext1 <6,2,4,0>, <4,0,6,2> + 2712832914U, // <2,4,0,5>: Cost 3 vext3 <4,6,u,2>, <4,0,5,1> + 1634962332U, // <2,4,0,6>: Cost 2 vext3 <4,0,6,2>, <4,0,6,2> + 3799993252U, // <2,4,0,7>: Cost 4 vext3 <7,0,1,2>, <4,0,7,1> + 1634962332U, // <2,4,0,u>: Cost 2 vext3 <4,0,6,2>, <4,0,6,2> + 2619417334U, // <2,4,1,0>: Cost 3 vext2 <0,3,2,4>, <1,0,3,2> + 3692495668U, // <2,4,1,1>: Cost 4 vext2 <0,2,2,4>, <1,1,1,1> + 2625389466U, // <2,4,1,2>: Cost 3 vext2 <1,3,2,4>, <1,2,3,4> + 2826125414U, // <2,4,1,3>: Cost 3 vuzpr <1,2,3,4>, LHS + 3699794995U, // <2,4,1,4>: Cost 4 vext2 <1,4,2,4>, <1,4,2,4> + 3692496016U, // <2,4,1,5>: Cost 4 vext2 <0,2,2,4>, <1,5,3,7> + 3763424238U, // <2,4,1,6>: Cost 4 vext3 <0,u,0,2>, <4,1,6,3> + 3667317942U, // <2,4,1,7>: Cost 4 vext1 <7,2,4,1>, <7,2,4,1> + 2826125419U, // <2,4,1,u>: Cost 3 vuzpr <1,2,3,4>, LHS + 2629371336U, // <2,4,2,0>: Cost 3 vext2 <2,0,2,4>, <2,0,2,4> + 3699131946U, // <2,4,2,1>: Cost 4 vext2 <1,3,2,4>, <2,1,4,3> + 2630698602U, // <2,4,2,2>: Cost 3 vext2 <2,2,2,4>, <2,2,2,4> + 2618754766U, // <2,4,2,3>: Cost 3 vext2 <0,2,2,4>, <2,3,4,5> + 2826126234U, // <2,4,2,4>: Cost 3 vuzpr <1,2,3,4>, <1,2,3,4> + 2899119414U, // <2,4,2,5>: Cost 3 vzipl <2,2,2,2>, RHS + 3033337142U, // <2,4,2,6>: Cost 3 vtrnl <2,2,2,2>, RHS + 3800214597U, // <2,4,2,7>: Cost 4 vext3 <7,0,4,2>, <4,2,7,0> + 2899119657U, // <2,4,2,u>: Cost 3 vzipl <2,2,2,2>, RHS + 2635344033U, // <2,4,3,0>: Cost 3 vext2 <3,0,2,4>, <3,0,2,4> + 4032012325U, // <2,4,3,1>: Cost 4 vzipr LHS, <0,0,4,1> + 3692497228U, // <2,4,3,2>: Cost 4 vext2 <0,2,2,4>, <3,2,3,4> + 3692497308U, // <2,4,3,3>: Cost 4 vext2 <0,2,2,4>, <3,3,3,3> + 3001404624U, // <2,4,3,4>: Cost 3 vzipr LHS, <4,4,4,4> + 2953627342U, // <2,4,3,5>: Cost 3 vzipr LHS, <2,3,4,5> + 2953625804U, // <2,4,3,6>: Cost 3 vzipr LHS, <0,2,4,6> + 3899868160U, // <2,4,3,7>: Cost 4 vuzpr <1,2,3,4>, <1,3,5,7> + 2953625806U, // <2,4,3,u>: Cost 3 vzipr LHS, <0,2,4,u> + 2710916266U, // <2,4,4,0>: Cost 3 vext3 <4,4,0,2>, <4,4,0,2> + 3899869648U, // <2,4,4,1>: Cost 4 vuzpr <1,2,3,4>, <3,4,0,1> + 3899869658U, // <2,4,4,2>: Cost 4 vuzpr <1,2,3,4>, <3,4,1,2> + 3899868930U, // <2,4,4,3>: Cost 4 vuzpr <1,2,3,4>, <2,4,1,3> + 2712833232U, // <2,4,4,4>: Cost 3 vext3 <4,6,u,2>, <4,4,4,4> + 2618756406U, // <2,4,4,5>: Cost 3 vext2 <0,2,2,4>, RHS + 2765737270U, // <2,4,4,6>: Cost 3 vuzpl <2,3,4,5>, RHS + 4168304426U, // <2,4,4,7>: Cost 4 vtrnr <1,2,3,4>, <2,4,5,7> + 
2618756649U, // <2,4,4,u>: Cost 3 vext2 <0,2,2,4>, RHS + 2551800011U, // <2,4,5,0>: Cost 3 vext1 <0,2,4,5>, <0,2,4,5> + 2569716470U, // <2,4,5,1>: Cost 3 vext1 <3,2,4,5>, <1,0,3,2> + 2563745405U, // <2,4,5,2>: Cost 3 vext1 <2,2,4,5>, <2,2,4,5> + 2569718102U, // <2,4,5,3>: Cost 3 vext1 <3,2,4,5>, <3,2,4,5> + 2551803190U, // <2,4,5,4>: Cost 3 vext1 <0,2,4,5>, RHS + 3625545732U, // <2,4,5,5>: Cost 4 vext1 <0,2,4,5>, <5,5,5,5> + 1611959606U, // <2,4,5,6>: Cost 2 vext3 <0,2,0,2>, RHS + 2826128694U, // <2,4,5,7>: Cost 3 vuzpr <1,2,3,4>, RHS + 1611959624U, // <2,4,5,u>: Cost 2 vext3 <0,2,0,2>, RHS + 1478066278U, // <2,4,6,0>: Cost 2 vext1 <0,2,4,6>, LHS + 2551808758U, // <2,4,6,1>: Cost 3 vext1 <0,2,4,6>, <1,0,3,2> + 2551809516U, // <2,4,6,2>: Cost 3 vext1 <0,2,4,6>, <2,0,6,4> + 2551810198U, // <2,4,6,3>: Cost 3 vext1 <0,2,4,6>, <3,0,1,2> + 1478069558U, // <2,4,6,4>: Cost 2 vext1 <0,2,4,6>, RHS + 2901888310U, // <2,4,6,5>: Cost 3 vzipl <2,6,3,7>, RHS + 2551812920U, // <2,4,6,6>: Cost 3 vext1 <0,2,4,6>, <6,6,6,6> + 2726251914U, // <2,4,6,7>: Cost 3 vext3 <7,0,1,2>, <4,6,7,1> + 1478072110U, // <2,4,6,u>: Cost 2 vext1 <0,2,4,6>, LHS + 2659234821U, // <2,4,7,0>: Cost 3 vext2 <7,0,2,4>, <7,0,2,4> + 3786722726U, // <2,4,7,1>: Cost 4 vext3 <4,7,1,2>, <4,7,1,2> + 3734303911U, // <2,4,7,2>: Cost 4 vext2 <7,2,2,4>, <7,2,2,4> + 3734967544U, // <2,4,7,3>: Cost 4 vext2 <7,3,2,4>, <7,3,2,4> + 3727005030U, // <2,4,7,4>: Cost 4 vext2 <6,0,2,4>, <7,4,5,6> + 2726251976U, // <2,4,7,5>: Cost 3 vext3 <7,0,1,2>, <4,7,5,0> + 2726251986U, // <2,4,7,6>: Cost 3 vext3 <7,0,1,2>, <4,7,6,1> + 3727005292U, // <2,4,7,7>: Cost 4 vext2 <6,0,2,4>, <7,7,7,7> + 2659234821U, // <2,4,7,u>: Cost 3 vext2 <7,0,2,4>, <7,0,2,4> + 1478082662U, // <2,4,u,0>: Cost 2 vext1 <0,2,4,u>, LHS + 2618758958U, // <2,4,u,1>: Cost 3 vext2 <0,2,2,4>, LHS + 2551826024U, // <2,4,u,2>: Cost 3 vext1 <0,2,4,u>, <2,2,2,2> + 2551826582U, // <2,4,u,3>: Cost 3 vext1 <0,2,4,u>, <3,0,1,2> + 1478085942U, // <2,4,u,4>: Cost 2 vext1 <0,2,4,u>, RHS + 2953668302U, // <2,4,u,5>: Cost 3 vzipr LHS, <2,3,4,5> + 1611959849U, // <2,4,u,6>: Cost 2 vext3 <0,2,0,2>, RHS + 2826128937U, // <2,4,u,7>: Cost 3 vuzpr <1,2,3,4>, RHS + 1611959867U, // <2,4,u,u>: Cost 2 vext3 <0,2,0,2>, RHS + 3691839488U, // <2,5,0,0>: Cost 4 vext2 <0,1,2,5>, <0,0,0,0> + 2618097766U, // <2,5,0,1>: Cost 3 vext2 <0,1,2,5>, LHS + 2620088484U, // <2,5,0,2>: Cost 3 vext2 <0,4,2,5>, <0,2,0,2> + 2619425034U, // <2,5,0,3>: Cost 3 vext2 <0,3,2,5>, <0,3,2,5> + 2620088667U, // <2,5,0,4>: Cost 3 vext2 <0,4,2,5>, <0,4,2,5> + 2620752300U, // <2,5,0,5>: Cost 3 vext2 <0,5,2,5>, <0,5,2,5> + 3693830655U, // <2,5,0,6>: Cost 4 vext2 <0,4,2,5>, <0,6,2,7> + 3094531382U, // <2,5,0,7>: Cost 3 vtrnr <1,2,3,0>, RHS + 2618098333U, // <2,5,0,u>: Cost 3 vext2 <0,1,2,5>, LHS + 3691840246U, // <2,5,1,0>: Cost 4 vext2 <0,1,2,5>, <1,0,3,2> + 3691840308U, // <2,5,1,1>: Cost 4 vext2 <0,1,2,5>, <1,1,1,1> + 2626061206U, // <2,5,1,2>: Cost 3 vext2 <1,4,2,5>, <1,2,3,0> + 2618098688U, // <2,5,1,3>: Cost 3 vext2 <0,1,2,5>, <1,3,5,7> + 2626061364U, // <2,5,1,4>: Cost 3 vext2 <1,4,2,5>, <1,4,2,5> + 3691840656U, // <2,5,1,5>: Cost 4 vext2 <0,1,2,5>, <1,5,3,7> + 3789082310U, // <2,5,1,6>: Cost 4 vext3 <5,1,6,2>, <5,1,6,2> + 2712833744U, // <2,5,1,7>: Cost 3 vext3 <4,6,u,2>, <5,1,7,3> + 2628715896U, // <2,5,1,u>: Cost 3 vext2 <1,u,2,5>, <1,u,2,5> + 3693831613U, // <2,5,2,0>: Cost 4 vext2 <0,4,2,5>, <2,0,1,2> + 4026698642U, // <2,5,2,1>: Cost 4 vzipr <0,0,2,2>, <4,0,5,1> + 2632033896U, // <2,5,2,2>: Cost 3 vext2 <2,4,2,5>, <2,2,2,2> + 3691841190U, 
// <2,5,2,3>: Cost 4 vext2 <0,1,2,5>, <2,3,0,1> + 2632034061U, // <2,5,2,4>: Cost 3 vext2 <2,4,2,5>, <2,4,2,5> + 3691841352U, // <2,5,2,5>: Cost 4 vext2 <0,1,2,5>, <2,5,0,1> + 3691841466U, // <2,5,2,6>: Cost 4 vext2 <0,1,2,5>, <2,6,3,7> + 3088354614U, // <2,5,2,7>: Cost 3 vtrnr <0,2,0,2>, RHS + 3088354615U, // <2,5,2,u>: Cost 3 vtrnr <0,2,0,2>, RHS + 2557829222U, // <2,5,3,0>: Cost 3 vext1 <1,2,5,3>, LHS + 2557830059U, // <2,5,3,1>: Cost 3 vext1 <1,2,5,3>, <1,2,5,3> + 2575746766U, // <2,5,3,2>: Cost 3 vext1 <4,2,5,3>, <2,3,4,5> + 3691841948U, // <2,5,3,3>: Cost 4 vext2 <0,1,2,5>, <3,3,3,3> + 2619427330U, // <2,5,3,4>: Cost 3 vext2 <0,3,2,5>, <3,4,5,6> + 2581720847U, // <2,5,3,5>: Cost 3 vext1 <5,2,5,3>, <5,2,5,3> + 2953628162U, // <2,5,3,6>: Cost 3 vzipr LHS, <3,4,5,6> + 2953626624U, // <2,5,3,7>: Cost 3 vzipr LHS, <1,3,5,7> + 2953626625U, // <2,5,3,u>: Cost 3 vzipr LHS, <1,3,5,u> + 2569781350U, // <2,5,4,0>: Cost 3 vext1 <3,2,5,4>, LHS + 3631580076U, // <2,5,4,1>: Cost 4 vext1 <1,2,5,4>, <1,2,5,4> + 2569782990U, // <2,5,4,2>: Cost 3 vext1 <3,2,5,4>, <2,3,4,5> + 2569783646U, // <2,5,4,3>: Cost 3 vext1 <3,2,5,4>, <3,2,5,4> + 2569784630U, // <2,5,4,4>: Cost 3 vext1 <3,2,5,4>, RHS + 2618101046U, // <2,5,4,5>: Cost 3 vext2 <0,1,2,5>, RHS + 3893905922U, // <2,5,4,6>: Cost 4 vuzpr <0,2,3,5>, <3,4,5,6> + 3094564150U, // <2,5,4,7>: Cost 3 vtrnr <1,2,3,4>, RHS + 2618101289U, // <2,5,4,u>: Cost 3 vext2 <0,1,2,5>, RHS + 2551873638U, // <2,5,5,0>: Cost 3 vext1 <0,2,5,5>, LHS + 3637560320U, // <2,5,5,1>: Cost 4 vext1 <2,2,5,5>, <1,3,5,7> + 3637560966U, // <2,5,5,2>: Cost 4 vext1 <2,2,5,5>, <2,2,5,5> + 3723030343U, // <2,5,5,3>: Cost 4 vext2 <5,3,2,5>, <5,3,2,5> + 2551876918U, // <2,5,5,4>: Cost 3 vext1 <0,2,5,5>, RHS + 2712834052U, // <2,5,5,5>: Cost 3 vext3 <4,6,u,2>, <5,5,5,5> + 4028713474U, // <2,5,5,6>: Cost 4 vzipr <0,3,2,5>, <3,4,5,6> + 2712834072U, // <2,5,5,7>: Cost 3 vext3 <4,6,u,2>, <5,5,7,7> + 2712834081U, // <2,5,5,u>: Cost 3 vext3 <4,6,u,2>, <5,5,u,7> + 2575769702U, // <2,5,6,0>: Cost 3 vext1 <4,2,5,6>, LHS + 3631596462U, // <2,5,6,1>: Cost 4 vext1 <1,2,5,6>, <1,2,5,6> + 2655924730U, // <2,5,6,2>: Cost 3 vext2 <6,4,2,5>, <6,2,7,3> + 3643541856U, // <2,5,6,3>: Cost 4 vext1 <3,2,5,6>, <3,2,5,6> + 2655924849U, // <2,5,6,4>: Cost 3 vext2 <6,4,2,5>, <6,4,2,5> + 3787755607U, // <2,5,6,5>: Cost 4 vext3 <4,u,6,2>, <5,6,5,7> + 4029385218U, // <2,5,6,6>: Cost 4 vzipr <0,4,2,6>, <3,4,5,6> + 3088682294U, // <2,5,6,7>: Cost 3 vtrnr <0,2,4,6>, RHS + 3088682295U, // <2,5,6,u>: Cost 3 vtrnr <0,2,4,6>, RHS + 2563833958U, // <2,5,7,0>: Cost 3 vext1 <2,2,5,7>, LHS + 2551890678U, // <2,5,7,1>: Cost 3 vext1 <0,2,5,7>, <1,0,3,2> + 2563835528U, // <2,5,7,2>: Cost 3 vext1 <2,2,5,7>, <2,2,5,7> + 3637577878U, // <2,5,7,3>: Cost 4 vext1 <2,2,5,7>, <3,0,1,2> + 2563837238U, // <2,5,7,4>: Cost 3 vext1 <2,2,5,7>, RHS + 2712834216U, // <2,5,7,5>: Cost 3 vext3 <4,6,u,2>, <5,7,5,7> + 2712834220U, // <2,5,7,6>: Cost 3 vext3 <4,6,u,2>, <5,7,6,2> + 4174449974U, // <2,5,7,7>: Cost 4 vtrnr <2,2,5,7>, RHS + 2563839790U, // <2,5,7,u>: Cost 3 vext1 <2,2,5,7>, LHS + 2563842150U, // <2,5,u,0>: Cost 3 vext1 <2,2,5,u>, LHS + 2618103598U, // <2,5,u,1>: Cost 3 vext2 <0,1,2,5>, LHS + 2563843721U, // <2,5,u,2>: Cost 3 vext1 <2,2,5,u>, <2,2,5,u> + 2569816418U, // <2,5,u,3>: Cost 3 vext1 <3,2,5,u>, <3,2,5,u> + 2622748735U, // <2,5,u,4>: Cost 3 vext2 <0,u,2,5>, + 2618103962U, // <2,5,u,5>: Cost 3 vext2 <0,1,2,5>, RHS + 2953669122U, // <2,5,u,6>: Cost 3 vzipr LHS, <3,4,5,6> + 2953667584U, // <2,5,u,7>: Cost 3 vzipr LHS, <1,3,5,7> + 
2618104165U, // <2,5,u,u>: Cost 3 vext2 <0,1,2,5>, LHS + 2620096512U, // <2,6,0,0>: Cost 3 vext2 <0,4,2,6>, <0,0,0,0> + 1546354790U, // <2,6,0,1>: Cost 2 vext2 <0,4,2,6>, LHS + 2620096676U, // <2,6,0,2>: Cost 3 vext2 <0,4,2,6>, <0,2,0,2> + 3693838588U, // <2,6,0,3>: Cost 4 vext2 <0,4,2,6>, <0,3,1,0> + 1546355036U, // <2,6,0,4>: Cost 2 vext2 <0,4,2,6>, <0,4,2,6> + 3694502317U, // <2,6,0,5>: Cost 4 vext2 <0,5,2,6>, <0,5,2,6> + 2551911246U, // <2,6,0,6>: Cost 3 vext1 <0,2,6,0>, <6,7,0,1> + 2720723287U, // <2,6,0,7>: Cost 3 vext3 <6,0,7,2>, <6,0,7,2> + 1546355357U, // <2,6,0,u>: Cost 2 vext2 <0,4,2,6>, LHS + 2620097270U, // <2,6,1,0>: Cost 3 vext2 <0,4,2,6>, <1,0,3,2> + 2620097332U, // <2,6,1,1>: Cost 3 vext2 <0,4,2,6>, <1,1,1,1> + 2620097430U, // <2,6,1,2>: Cost 3 vext2 <0,4,2,6>, <1,2,3,0> + 2820243558U, // <2,6,1,3>: Cost 3 vuzpr <0,2,4,6>, LHS + 2620097598U, // <2,6,1,4>: Cost 3 vext2 <0,4,2,6>, <1,4,3,6> + 2620097680U, // <2,6,1,5>: Cost 3 vext2 <0,4,2,6>, <1,5,3,7> + 3693839585U, // <2,6,1,6>: Cost 4 vext2 <0,4,2,6>, <1,6,3,7> + 2721386920U, // <2,6,1,7>: Cost 3 vext3 <6,1,7,2>, <6,1,7,2> + 2820243563U, // <2,6,1,u>: Cost 3 vuzpr <0,2,4,6>, LHS + 2714014137U, // <2,6,2,0>: Cost 3 vext3 <4,u,6,2>, <6,2,0,1> + 2712834500U, // <2,6,2,1>: Cost 3 vext3 <4,6,u,2>, <6,2,1,3> + 2620098152U, // <2,6,2,2>: Cost 3 vext2 <0,4,2,6>, <2,2,2,2> + 2620098214U, // <2,6,2,3>: Cost 3 vext2 <0,4,2,6>, <2,3,0,1> + 2632042254U, // <2,6,2,4>: Cost 3 vext2 <2,4,2,6>, <2,4,2,6> + 2712834540U, // <2,6,2,5>: Cost 3 vext3 <4,6,u,2>, <6,2,5,7> + 2820243660U, // <2,6,2,6>: Cost 3 vuzpr <0,2,4,6>, <0,2,4,6> + 2958265654U, // <2,6,2,7>: Cost 3 vzipr <0,u,2,2>, RHS + 2620098619U, // <2,6,2,u>: Cost 3 vext2 <0,4,2,6>, <2,u,0,1> + 2620098710U, // <2,6,3,0>: Cost 3 vext2 <0,4,2,6>, <3,0,1,2> + 3893986982U, // <2,6,3,1>: Cost 4 vuzpr <0,2,4,6>, <2,3,0,1> + 2569848762U, // <2,6,3,2>: Cost 3 vext1 <3,2,6,3>, <2,6,3,7> + 2620098972U, // <2,6,3,3>: Cost 3 vext2 <0,4,2,6>, <3,3,3,3> + 2620099074U, // <2,6,3,4>: Cost 3 vext2 <0,4,2,6>, <3,4,5,6> + 3893987022U, // <2,6,3,5>: Cost 4 vuzpr <0,2,4,6>, <2,3,4,5> + 3001404644U, // <2,6,3,6>: Cost 3 vzipr LHS, <4,4,6,6> + 1879887158U, // <2,6,3,7>: Cost 2 vzipr LHS, RHS + 1879887159U, // <2,6,3,u>: Cost 2 vzipr LHS, RHS + 2620099484U, // <2,6,4,0>: Cost 3 vext2 <0,4,2,6>, <4,0,6,2> + 2620099566U, // <2,6,4,1>: Cost 3 vext2 <0,4,2,6>, <4,1,6,3> + 2620099644U, // <2,6,4,2>: Cost 3 vext2 <0,4,2,6>, <4,2,6,0> + 3643599207U, // <2,6,4,3>: Cost 4 vext1 <3,2,6,4>, <3,2,6,4> + 2575830080U, // <2,6,4,4>: Cost 3 vext1 <4,2,6,4>, <4,2,6,4> + 1546358070U, // <2,6,4,5>: Cost 2 vext2 <0,4,2,6>, RHS + 2667875700U, // <2,6,4,6>: Cost 3 vext2 , <4,6,4,6> + 4028042550U, // <2,6,4,7>: Cost 4 vzipr <0,2,2,4>, RHS + 1546358313U, // <2,6,4,u>: Cost 2 vext2 <0,4,2,6>, RHS + 3693841992U, // <2,6,5,0>: Cost 4 vext2 <0,4,2,6>, <5,0,1,2> + 2667876048U, // <2,6,5,1>: Cost 3 vext2 , <5,1,7,3> + 2712834756U, // <2,6,5,2>: Cost 3 vext3 <4,6,u,2>, <6,5,2,7> + 3643607400U, // <2,6,5,3>: Cost 4 vext1 <3,2,6,5>, <3,2,6,5> + 2252091873U, // <2,6,5,4>: Cost 3 vrev <6,2,4,5> + 2667876356U, // <2,6,5,5>: Cost 3 vext2 , <5,5,5,5> + 2667876450U, // <2,6,5,6>: Cost 3 vext2 , <5,6,7,0> + 2820246838U, // <2,6,5,7>: Cost 3 vuzpr <0,2,4,6>, RHS + 2820246839U, // <2,6,5,u>: Cost 3 vuzpr <0,2,4,6>, RHS + 2563899494U, // <2,6,6,0>: Cost 3 vext1 <2,2,6,6>, LHS + 3893988683U, // <2,6,6,1>: Cost 4 vuzpr <0,2,4,6>, <4,6,0,1> + 2563901072U, // <2,6,6,2>: Cost 3 vext1 <2,2,6,6>, <2,2,6,6> + 3893987236U, // <2,6,6,3>: Cost 4 vuzpr <0,2,4,6>, 
<2,6,1,3> + 2563902774U, // <2,6,6,4>: Cost 3 vext1 <2,2,6,6>, RHS + 3893988723U, // <2,6,6,5>: Cost 4 vuzpr <0,2,4,6>, <4,6,4,5> + 2712834872U, // <2,6,6,6>: Cost 3 vext3 <4,6,u,2>, <6,6,6,6> + 2955644214U, // <2,6,6,7>: Cost 3 vzipr <0,4,2,6>, RHS + 2955644215U, // <2,6,6,u>: Cost 3 vzipr <0,4,2,6>, RHS + 2712834894U, // <2,6,7,0>: Cost 3 vext3 <4,6,u,2>, <6,7,0,1> + 2724926296U, // <2,6,7,1>: Cost 3 vext3 <6,7,1,2>, <6,7,1,2> + 2725000033U, // <2,6,7,2>: Cost 3 vext3 <6,7,2,2>, <6,7,2,2> + 2702365544U, // <2,6,7,3>: Cost 3 vext3 <3,0,1,2>, <6,7,3,0> + 2712834934U, // <2,6,7,4>: Cost 3 vext3 <4,6,u,2>, <6,7,4,5> + 3776107393U, // <2,6,7,5>: Cost 4 vext3 <3,0,1,2>, <6,7,5,7> + 2725294981U, // <2,6,7,6>: Cost 3 vext3 <6,7,6,2>, <6,7,6,2> + 2726253452U, // <2,6,7,7>: Cost 3 vext3 <7,0,1,2>, <6,7,7,0> + 2712834966U, // <2,6,7,u>: Cost 3 vext3 <4,6,u,2>, <6,7,u,1> + 2620102355U, // <2,6,u,0>: Cost 3 vext2 <0,4,2,6>, + 1546360622U, // <2,6,u,1>: Cost 2 vext2 <0,4,2,6>, LHS + 2620102536U, // <2,6,u,2>: Cost 3 vext2 <0,4,2,6>, + 2820244125U, // <2,6,u,3>: Cost 3 vuzpr <0,2,4,6>, LHS + 1594136612U, // <2,6,u,4>: Cost 2 vext2 , + 1546360986U, // <2,6,u,5>: Cost 2 vext2 <0,4,2,6>, RHS + 2620102864U, // <2,6,u,6>: Cost 3 vext2 <0,4,2,6>, + 1879928118U, // <2,6,u,7>: Cost 2 vzipr LHS, RHS + 1879928119U, // <2,6,u,u>: Cost 2 vzipr LHS, RHS + 2726179825U, // <2,7,0,0>: Cost 3 vext3 <7,0,0,2>, <7,0,0,2> + 1652511738U, // <2,7,0,1>: Cost 2 vext3 <7,0,1,2>, <7,0,1,2> + 2621431972U, // <2,7,0,2>: Cost 3 vext2 <0,6,2,7>, <0,2,0,2> + 2257949868U, // <2,7,0,3>: Cost 3 vrev <7,2,3,0> + 2726474773U, // <2,7,0,4>: Cost 3 vext3 <7,0,4,2>, <7,0,4,2> + 2620768686U, // <2,7,0,5>: Cost 3 vext2 <0,5,2,7>, <0,5,2,7> + 2621432319U, // <2,7,0,6>: Cost 3 vext2 <0,6,2,7>, <0,6,2,7> + 2599760953U, // <2,7,0,7>: Cost 3 vext1 , <7,0,u,2> + 1653027897U, // <2,7,0,u>: Cost 2 vext3 <7,0,u,2>, <7,0,u,2> + 2639348470U, // <2,7,1,0>: Cost 3 vext2 <3,6,2,7>, <1,0,3,2> + 3695174452U, // <2,7,1,1>: Cost 4 vext2 <0,6,2,7>, <1,1,1,1> + 3695174550U, // <2,7,1,2>: Cost 4 vext2 <0,6,2,7>, <1,2,3,0> + 3694511104U, // <2,7,1,3>: Cost 4 vext2 <0,5,2,7>, <1,3,5,7> + 3713090594U, // <2,7,1,4>: Cost 4 vext2 <3,6,2,7>, <1,4,0,5> + 3693184144U, // <2,7,1,5>: Cost 4 vext2 <0,3,2,7>, <1,5,3,7> + 2627405016U, // <2,7,1,6>: Cost 3 vext2 <1,6,2,7>, <1,6,2,7> + 3799995519U, // <2,7,1,7>: Cost 4 vext3 <7,0,1,2>, <7,1,7,0> + 2639348470U, // <2,7,1,u>: Cost 3 vext2 <3,6,2,7>, <1,0,3,2> + 3695175101U, // <2,7,2,0>: Cost 4 vext2 <0,6,2,7>, <2,0,1,2> + 3643655168U, // <2,7,2,1>: Cost 4 vext1 <3,2,7,2>, <1,3,5,7> + 2257892517U, // <2,7,2,2>: Cost 3 vrev <7,2,2,2> + 3695175334U, // <2,7,2,3>: Cost 4 vext2 <0,6,2,7>, <2,3,0,1> + 3695175465U, // <2,7,2,4>: Cost 4 vext2 <0,6,2,7>, <2,4,5,6> + 2632714080U, // <2,7,2,5>: Cost 3 vext2 <2,5,2,7>, <2,5,2,7> + 2633377713U, // <2,7,2,6>: Cost 3 vext2 <2,6,2,7>, <2,6,2,7> + 3695175658U, // <2,7,2,7>: Cost 4 vext2 <0,6,2,7>, <2,7,0,1> + 2634704979U, // <2,7,2,u>: Cost 3 vext2 <2,u,2,7>, <2,u,2,7> + 1514094694U, // <2,7,3,0>: Cost 2 vext1 <6,2,7,3>, LHS + 2569921680U, // <2,7,3,1>: Cost 3 vext1 <3,2,7,3>, <1,5,3,7> + 2587838056U, // <2,7,3,2>: Cost 3 vext1 <6,2,7,3>, <2,2,2,2> + 2569922927U, // <2,7,3,3>: Cost 3 vext1 <3,2,7,3>, <3,2,7,3> + 1514097974U, // <2,7,3,4>: Cost 2 vext1 <6,2,7,3>, RHS + 2581868321U, // <2,7,3,5>: Cost 3 vext1 <5,2,7,3>, <5,2,7,3> + 1514099194U, // <2,7,3,6>: Cost 2 vext1 <6,2,7,3>, <6,2,7,3> + 2587841530U, // <2,7,3,7>: Cost 3 vext1 <6,2,7,3>, <7,0,1,2> + 1514100526U, // <2,7,3,u>: Cost 2 vext1 
<6,2,7,3>, LHS + 2708706617U, // <2,7,4,0>: Cost 3 vext3 <4,0,6,2>, <7,4,0,6> + 3649643418U, // <2,7,4,1>: Cost 4 vext1 <4,2,7,4>, <1,2,3,4> + 3649644330U, // <2,7,4,2>: Cost 4 vext1 <4,2,7,4>, <2,4,5,7> + 2257982640U, // <2,7,4,3>: Cost 3 vrev <7,2,3,4> + 3649645641U, // <2,7,4,4>: Cost 4 vext1 <4,2,7,4>, <4,2,7,4> + 2621435190U, // <2,7,4,5>: Cost 3 vext2 <0,6,2,7>, RHS + 2712835441U, // <2,7,4,6>: Cost 3 vext3 <4,6,u,2>, <7,4,6,u> + 3799995762U, // <2,7,4,7>: Cost 4 vext3 <7,0,1,2>, <7,4,7,0> + 2621435433U, // <2,7,4,u>: Cost 3 vext2 <0,6,2,7>, RHS + 2729497990U, // <2,7,5,0>: Cost 3 vext3 <7,5,0,2>, <7,5,0,2> + 3643679744U, // <2,7,5,1>: Cost 4 vext1 <3,2,7,5>, <1,3,5,7> + 3637708424U, // <2,7,5,2>: Cost 4 vext1 <2,2,7,5>, <2,2,5,7> + 3643681137U, // <2,7,5,3>: Cost 4 vext1 <3,2,7,5>, <3,2,7,5> + 2599800118U, // <2,7,5,4>: Cost 3 vext1 , RHS + 3786577334U, // <2,7,5,5>: Cost 4 vext3 <4,6,u,2>, <7,5,5,5> + 3786577345U, // <2,7,5,6>: Cost 4 vext3 <4,6,u,2>, <7,5,6,7> + 2599802214U, // <2,7,5,7>: Cost 3 vext1 , <7,4,5,6> + 2599802670U, // <2,7,5,u>: Cost 3 vext1 , LHS + 2581889126U, // <2,7,6,0>: Cost 3 vext1 <5,2,7,6>, LHS + 3643687936U, // <2,7,6,1>: Cost 4 vext1 <3,2,7,6>, <1,3,5,7> + 2663240186U, // <2,7,6,2>: Cost 3 vext2 <7,6,2,7>, <6,2,7,3> + 3643689330U, // <2,7,6,3>: Cost 4 vext1 <3,2,7,6>, <3,2,7,6> + 2581892406U, // <2,7,6,4>: Cost 3 vext1 <5,2,7,6>, RHS + 2581892900U, // <2,7,6,5>: Cost 3 vext1 <5,2,7,6>, <5,2,7,6> + 2587865597U, // <2,7,6,6>: Cost 3 vext1 <6,2,7,6>, <6,2,7,6> + 3786577428U, // <2,7,6,7>: Cost 4 vext3 <4,6,u,2>, <7,6,7,0> + 2581894958U, // <2,7,6,u>: Cost 3 vext1 <5,2,7,6>, LHS + 2726254119U, // <2,7,7,0>: Cost 3 vext3 <7,0,1,2>, <7,7,0,1> + 3804640817U, // <2,7,7,1>: Cost 4 vext3 <7,7,1,2>, <7,7,1,2> + 3637724826U, // <2,7,7,2>: Cost 4 vext1 <2,2,7,7>, <2,2,7,7> + 3734992123U, // <2,7,7,3>: Cost 4 vext2 <7,3,2,7>, <7,3,2,7> + 2552040758U, // <2,7,7,4>: Cost 3 vext1 <0,2,7,7>, RHS + 3799995992U, // <2,7,7,5>: Cost 4 vext3 <7,0,1,2>, <7,7,5,5> + 2663241198U, // <2,7,7,6>: Cost 3 vext2 <7,6,2,7>, <7,6,2,7> + 2712835692U, // <2,7,7,7>: Cost 3 vext3 <4,6,u,2>, <7,7,7,7> + 2731562607U, // <2,7,7,u>: Cost 3 vext3 <7,u,1,2>, <7,7,u,1> + 1514135654U, // <2,7,u,0>: Cost 2 vext1 <6,2,7,u>, LHS + 1657820802U, // <2,7,u,1>: Cost 2 vext3 <7,u,1,2>, <7,u,1,2> + 2587879016U, // <2,7,u,2>: Cost 3 vext1 <6,2,7,u>, <2,2,2,2> + 2569963892U, // <2,7,u,3>: Cost 3 vext1 <3,2,7,u>, <3,2,7,u> + 1514138934U, // <2,7,u,4>: Cost 2 vext1 <6,2,7,u>, RHS + 2621438106U, // <2,7,u,5>: Cost 3 vext2 <0,6,2,7>, RHS + 1514140159U, // <2,7,u,6>: Cost 2 vext1 <6,2,7,u>, <6,2,7,u> + 2587882490U, // <2,7,u,7>: Cost 3 vext1 <6,2,7,u>, <7,0,1,2> + 1514141486U, // <2,7,u,u>: Cost 2 vext1 <6,2,7,u>, LHS + 1544380416U, // <2,u,0,0>: Cost 2 vext2 LHS, <0,0,0,0> + 470638699U, // <2,u,0,1>: Cost 1 vext2 LHS, LHS + 1544380580U, // <2,u,0,2>: Cost 2 vext2 LHS, <0,2,0,2> + 1658631909U, // <2,u,0,3>: Cost 2 vext3 , + 1544380754U, // <2,u,0,4>: Cost 2 vext2 LHS, <0,4,1,5> + 2665898414U, // <2,u,0,5>: Cost 3 vext2 LHS, <0,5,2,7> + 1658853120U, // <2,u,0,6>: Cost 2 vext3 , + 3094531625U, // <2,u,0,7>: Cost 3 vtrnr <1,2,3,0>, RHS + 470639261U, // <2,u,0,u>: Cost 1 vext2 LHS, LHS + 1544381174U, // <2,u,1,0>: Cost 2 vext2 LHS, <1,0,3,2> + 1544381236U, // <2,u,1,1>: Cost 2 vext2 LHS, <1,1,1,1> + 1544381334U, // <2,u,1,2>: Cost 2 vext2 LHS, <1,2,3,0> + 1544381400U, // <2,u,1,3>: Cost 2 vext2 LHS, <1,3,1,3> + 2618123325U, // <2,u,1,4>: Cost 3 vext2 LHS, <1,4,3,5> + 1544381584U, // <2,u,1,5>: Cost 2 vext2 LHS, <1,5,3,7> 
+ 2618123489U, // <2,u,1,6>: Cost 3 vext2 LHS, <1,6,3,7> + 2726254427U, // <2,u,1,7>: Cost 3 vext3 <7,0,1,2>, + 1544381823U, // <2,u,1,u>: Cost 2 vext2 LHS, <1,u,3,3> + 1478328422U, // <2,u,2,0>: Cost 2 vext1 <0,2,u,2>, LHS + 2618123807U, // <2,u,2,1>: Cost 3 vext2 LHS, <2,1,3,1> + 269271142U, // <2,u,2,2>: Cost 1 vdup2 LHS + 1544382118U, // <2,u,2,3>: Cost 2 vext2 LHS, <2,3,0,1> + 1478331702U, // <2,u,2,4>: Cost 2 vext1 <0,2,u,2>, RHS + 2618124136U, // <2,u,2,5>: Cost 3 vext2 LHS, <2,5,3,6> + 1544382394U, // <2,u,2,6>: Cost 2 vext2 LHS, <2,6,3,7> + 3088354857U, // <2,u,2,7>: Cost 3 vtrnr <0,2,0,2>, RHS + 269271142U, // <2,u,2,u>: Cost 1 vdup2 LHS + 1544382614U, // <2,u,3,0>: Cost 2 vext2 LHS, <3,0,1,2> + 2953627374U, // <2,u,3,1>: Cost 3 vzipr LHS, <2,3,u,1> + 1490282143U, // <2,u,3,2>: Cost 2 vext1 <2,2,u,3>, <2,2,u,3> + 1879883932U, // <2,u,3,3>: Cost 2 vzipr LHS, LHS + 1544382978U, // <2,u,3,4>: Cost 2 vext2 LHS, <3,4,5,6> + 2953627378U, // <2,u,3,5>: Cost 3 vzipr LHS, <2,3,u,5> + 1514172931U, // <2,u,3,6>: Cost 2 vext1 <6,2,u,3>, <6,2,u,3> + 1879887176U, // <2,u,3,7>: Cost 2 vzipr LHS, RHS + 1879883937U, // <2,u,3,u>: Cost 2 vzipr LHS, LHS + 1484316774U, // <2,u,4,0>: Cost 2 vext1 <1,2,u,4>, LHS + 1484317639U, // <2,u,4,1>: Cost 2 vext1 <1,2,u,4>, <1,2,u,4> + 2552088270U, // <2,u,4,2>: Cost 3 vext1 <0,2,u,4>, <2,3,4,5> + 1190213513U, // <2,u,4,3>: Cost 2 vrev + 1484320054U, // <2,u,4,4>: Cost 2 vext1 <1,2,u,4>, RHS + 470641974U, // <2,u,4,5>: Cost 1 vext2 LHS, RHS + 1592159604U, // <2,u,4,6>: Cost 2 vext2 LHS, <4,6,4,6> + 3094564393U, // <2,u,4,7>: Cost 3 vtrnr <1,2,3,4>, RHS + 470642217U, // <2,u,4,u>: Cost 1 vext2 LHS, RHS + 2552094959U, // <2,u,5,0>: Cost 3 vext1 <0,2,u,5>, <0,2,u,5> + 1592159952U, // <2,u,5,1>: Cost 2 vext2 LHS, <5,1,7,3> + 2564040353U, // <2,u,5,2>: Cost 3 vext1 <2,2,u,5>, <2,2,u,5> + 2690275455U, // <2,u,5,3>: Cost 3 vext3 <0,u,u,2>, + 1592160198U, // <2,u,5,4>: Cost 2 vext2 LHS, <5,4,7,6> + 1592160260U, // <2,u,5,5>: Cost 2 vext2 LHS, <5,5,5,5> + 1611962522U, // <2,u,5,6>: Cost 2 vext3 <0,2,0,2>, RHS + 1592160424U, // <2,u,5,7>: Cost 2 vext2 LHS, <5,7,5,7> + 1611962540U, // <2,u,5,u>: Cost 2 vext3 <0,2,0,2>, RHS + 1478361190U, // <2,u,6,0>: Cost 2 vext1 <0,2,u,6>, LHS + 2552103670U, // <2,u,6,1>: Cost 3 vext1 <0,2,u,6>, <1,0,3,2> + 1592160762U, // <2,u,6,2>: Cost 2 vext2 LHS, <6,2,7,3> + 2685704400U, // <2,u,6,3>: Cost 3 vext3 <0,2,0,2>, + 1478364470U, // <2,u,6,4>: Cost 2 vext1 <0,2,u,6>, RHS + 2901891226U, // <2,u,6,5>: Cost 3 vzipl <2,6,3,7>, RHS + 1592161080U, // <2,u,6,6>: Cost 2 vext2 LHS, <6,6,6,6> + 1592161102U, // <2,u,6,7>: Cost 2 vext2 LHS, <6,7,0,1> + 1478367022U, // <2,u,6,u>: Cost 2 vext1 <0,2,u,6>, LHS + 1592161274U, // <2,u,7,0>: Cost 2 vext2 LHS, <7,0,1,2> + 2659931226U, // <2,u,7,1>: Cost 3 vext2 <7,1,2,u>, <7,1,2,u> + 2564056739U, // <2,u,7,2>: Cost 3 vext1 <2,2,u,7>, <2,2,u,7> + 2665903331U, // <2,u,7,3>: Cost 3 vext2 LHS, <7,3,0,1> + 1592161638U, // <2,u,7,4>: Cost 2 vext2 LHS, <7,4,5,6> + 2665903494U, // <2,u,7,5>: Cost 3 vext2 LHS, <7,5,0,2> + 2587947527U, // <2,u,7,6>: Cost 3 vext1 <6,2,u,7>, <6,2,u,7> + 1592161900U, // <2,u,7,7>: Cost 2 vext2 LHS, <7,7,7,7> + 1592161922U, // <2,u,7,u>: Cost 2 vext2 LHS, <7,u,1,2> + 1478377574U, // <2,u,u,0>: Cost 2 vext1 <0,2,u,u>, LHS + 470644526U, // <2,u,u,1>: Cost 1 vext2 LHS, LHS + 269271142U, // <2,u,u,2>: Cost 1 vdup2 LHS + 1879924892U, // <2,u,u,3>: Cost 2 vzipr LHS, LHS + 1478380854U, // <2,u,u,4>: Cost 2 vext1 <0,2,u,u>, RHS + 470644890U, // <2,u,u,5>: Cost 1 vext2 LHS, RHS + 1611962765U, // 
<2,u,u,6>: Cost 2 vext3 <0,2,0,2>, RHS + 1879928136U, // <2,u,u,7>: Cost 2 vzipr LHS, RHS + 470645093U, // <2,u,u,u>: Cost 1 vext2 LHS, LHS + 1611448320U, // <3,0,0,0>: Cost 2 vext3 LHS, <0,0,0,0> + 1611890698U, // <3,0,0,1>: Cost 2 vext3 LHS, <0,0,1,1> + 1611890708U, // <3,0,0,2>: Cost 2 vext3 LHS, <0,0,2,2> + 3763576860U, // <3,0,0,3>: Cost 4 vext3 LHS, <0,0,3,1> + 2689835045U, // <3,0,0,4>: Cost 3 vext3 LHS, <0,0,4,1> + 3698508206U, // <3,0,0,5>: Cost 4 vext2 <1,2,3,0>, <0,5,2,7> + 3763576887U, // <3,0,0,6>: Cost 4 vext3 LHS, <0,0,6,1> + 3667678434U, // <3,0,0,7>: Cost 4 vext1 <7,3,0,0>, <7,3,0,0> + 1616093258U, // <3,0,0,u>: Cost 2 vext3 LHS, <0,0,u,2> + 1490337894U, // <3,0,1,0>: Cost 2 vext1 <2,3,0,1>, LHS + 2685632602U, // <3,0,1,1>: Cost 3 vext3 LHS, <0,1,1,0> + 537706598U, // <3,0,1,2>: Cost 1 vext3 LHS, LHS + 2624766936U, // <3,0,1,3>: Cost 3 vext2 <1,2,3,0>, <1,3,1,3> + 1490341174U, // <3,0,1,4>: Cost 2 vext1 <2,3,0,1>, RHS + 2624767120U, // <3,0,1,5>: Cost 3 vext2 <1,2,3,0>, <1,5,3,7> + 2732966030U, // <3,0,1,6>: Cost 3 vext3 LHS, <0,1,6,7> + 2593944803U, // <3,0,1,7>: Cost 3 vext1 <7,3,0,1>, <7,3,0,1> + 537706652U, // <3,0,1,u>: Cost 1 vext3 LHS, LHS + 1611890852U, // <3,0,2,0>: Cost 2 vext3 LHS, <0,2,0,2> + 2685632684U, // <3,0,2,1>: Cost 3 vext3 LHS, <0,2,1,1> + 2685632692U, // <3,0,2,2>: Cost 3 vext3 LHS, <0,2,2,0> + 2685632702U, // <3,0,2,3>: Cost 3 vext3 LHS, <0,2,3,1> + 1611890892U, // <3,0,2,4>: Cost 2 vext3 LHS, <0,2,4,6> + 2732966102U, // <3,0,2,5>: Cost 3 vext3 LHS, <0,2,5,7> + 2624767930U, // <3,0,2,6>: Cost 3 vext2 <1,2,3,0>, <2,6,3,7> + 2685632744U, // <3,0,2,7>: Cost 3 vext3 LHS, <0,2,7,7> + 1611890924U, // <3,0,2,u>: Cost 2 vext3 LHS, <0,2,u,2> + 2624768150U, // <3,0,3,0>: Cost 3 vext2 <1,2,3,0>, <3,0,1,2> + 2685632764U, // <3,0,3,1>: Cost 3 vext3 LHS, <0,3,1,0> + 2685632774U, // <3,0,3,2>: Cost 3 vext3 LHS, <0,3,2,1> + 2624768412U, // <3,0,3,3>: Cost 3 vext2 <1,2,3,0>, <3,3,3,3> + 2624768514U, // <3,0,3,4>: Cost 3 vext2 <1,2,3,0>, <3,4,5,6> + 3702491714U, // <3,0,3,5>: Cost 4 vext2 <1,u,3,0>, <3,5,3,7> + 2624768632U, // <3,0,3,6>: Cost 3 vext2 <1,2,3,0>, <3,6,0,7> + 3702491843U, // <3,0,3,7>: Cost 4 vext2 <1,u,3,0>, <3,7,0,1> + 2686959934U, // <3,0,3,u>: Cost 3 vext3 <0,3,u,3>, <0,3,u,3> + 2689835336U, // <3,0,4,0>: Cost 3 vext3 LHS, <0,4,0,4> + 1611891026U, // <3,0,4,1>: Cost 2 vext3 LHS, <0,4,1,5> + 1611891036U, // <3,0,4,2>: Cost 2 vext3 LHS, <0,4,2,6> + 3763577184U, // <3,0,4,3>: Cost 4 vext3 LHS, <0,4,3,1> + 2689835374U, // <3,0,4,4>: Cost 3 vext3 LHS, <0,4,4,6> + 1551027510U, // <3,0,4,5>: Cost 2 vext2 <1,2,3,0>, RHS + 2666573172U, // <3,0,4,6>: Cost 3 vext2 , <4,6,4,6> + 3667711206U, // <3,0,4,7>: Cost 4 vext1 <7,3,0,4>, <7,3,0,4> + 1616093586U, // <3,0,4,u>: Cost 2 vext3 LHS, <0,4,u,6> + 2685190556U, // <3,0,5,0>: Cost 3 vext3 LHS, <0,5,0,7> + 2666573520U, // <3,0,5,1>: Cost 3 vext2 , <5,1,7,3> + 3040886886U, // <3,0,5,2>: Cost 3 vtrnl <3,4,5,6>, LHS + 3625912834U, // <3,0,5,3>: Cost 4 vext1 <0,3,0,5>, <3,4,5,6> + 2666573766U, // <3,0,5,4>: Cost 3 vext2 , <5,4,7,6> + 2666573828U, // <3,0,5,5>: Cost 3 vext2 , <5,5,5,5> + 2732966354U, // <3,0,5,6>: Cost 3 vext3 LHS, <0,5,6,7> + 2666573992U, // <3,0,5,7>: Cost 3 vext2 , <5,7,5,7> + 3040886940U, // <3,0,5,u>: Cost 3 vtrnl <3,4,5,6>, LHS + 2685190637U, // <3,0,6,0>: Cost 3 vext3 LHS, <0,6,0,7> + 2732966390U, // <3,0,6,1>: Cost 3 vext3 LHS, <0,6,1,7> + 2689835519U, // <3,0,6,2>: Cost 3 vext3 LHS, <0,6,2,7> + 3667724438U, // <3,0,6,3>: Cost 4 vext1 <7,3,0,6>, <3,0,1,2> + 3763577355U, // <3,0,6,4>: Cost 4 vext3 
LHS, <0,6,4,1> + 3806708243U, // <3,0,6,5>: Cost 4 vext3 LHS, <0,6,5,0> + 2666574648U, // <3,0,6,6>: Cost 3 vext2 , <6,6,6,6> + 2657948520U, // <3,0,6,7>: Cost 3 vext2 <6,7,3,0>, <6,7,3,0> + 2689835573U, // <3,0,6,u>: Cost 3 vext3 LHS, <0,6,u,7> + 2666574842U, // <3,0,7,0>: Cost 3 vext2 , <7,0,1,2> + 2685633095U, // <3,0,7,1>: Cost 3 vext3 LHS, <0,7,1,7> + 2660603052U, // <3,0,7,2>: Cost 3 vext2 <7,2,3,0>, <7,2,3,0> + 3643844997U, // <3,0,7,3>: Cost 4 vext1 <3,3,0,7>, <3,3,0,7> + 2666575206U, // <3,0,7,4>: Cost 3 vext2 , <7,4,5,6> + 3655790391U, // <3,0,7,5>: Cost 4 vext1 <5,3,0,7>, <5,3,0,7> + 3731690968U, // <3,0,7,6>: Cost 4 vext2 <6,7,3,0>, <7,6,0,3> + 2666575468U, // <3,0,7,7>: Cost 3 vext2 , <7,7,7,7> + 2664584850U, // <3,0,7,u>: Cost 3 vext2 <7,u,3,0>, <7,u,3,0> + 1616093834U, // <3,0,u,0>: Cost 2 vext3 LHS, <0,u,0,2> + 1611891346U, // <3,0,u,1>: Cost 2 vext3 LHS, <0,u,1,1> + 537707165U, // <3,0,u,2>: Cost 1 vext3 LHS, LHS + 2689835684U, // <3,0,u,3>: Cost 3 vext3 LHS, <0,u,3,1> + 1616093874U, // <3,0,u,4>: Cost 2 vext3 LHS, <0,u,4,6> + 1551030426U, // <3,0,u,5>: Cost 2 vext2 <1,2,3,0>, RHS + 2624772304U, // <3,0,u,6>: Cost 3 vext2 <1,2,3,0>, + 2594002154U, // <3,0,u,7>: Cost 3 vext1 <7,3,0,u>, <7,3,0,u> + 537707219U, // <3,0,u,u>: Cost 1 vext3 LHS, LHS + 2552201318U, // <3,1,0,0>: Cost 3 vext1 <0,3,1,0>, LHS + 2618802278U, // <3,1,0,1>: Cost 3 vext2 <0,2,3,1>, LHS + 2618802366U, // <3,1,0,2>: Cost 3 vext2 <0,2,3,1>, <0,2,3,1> + 1611449078U, // <3,1,0,3>: Cost 2 vext3 LHS, <1,0,3,2> + 2552204598U, // <3,1,0,4>: Cost 3 vext1 <0,3,1,0>, RHS + 2732966663U, // <3,1,0,5>: Cost 3 vext3 LHS, <1,0,5,1> + 3906258396U, // <3,1,0,6>: Cost 4 vuzpr <2,3,0,1>, <2,0,4,6> + 3667752171U, // <3,1,0,7>: Cost 4 vext1 <7,3,1,0>, <7,3,1,0> + 1611891491U, // <3,1,0,u>: Cost 2 vext3 LHS, <1,0,u,2> + 2689835819U, // <3,1,1,0>: Cost 3 vext3 LHS, <1,1,0,1> + 1611449140U, // <3,1,1,1>: Cost 2 vext3 LHS, <1,1,1,1> + 2624775063U, // <3,1,1,2>: Cost 3 vext2 <1,2,3,1>, <1,2,3,1> + 1611891528U, // <3,1,1,3>: Cost 2 vext3 LHS, <1,1,3,3> + 2689835859U, // <3,1,1,4>: Cost 3 vext3 LHS, <1,1,4,5> + 2689835868U, // <3,1,1,5>: Cost 3 vext3 LHS, <1,1,5,5> + 3763577701U, // <3,1,1,6>: Cost 4 vext3 LHS, <1,1,6,5> + 3765273452U, // <3,1,1,7>: Cost 4 vext3 <1,1,7,3>, <1,1,7,3> + 1611891573U, // <3,1,1,u>: Cost 2 vext3 LHS, <1,1,u,3> + 2629420494U, // <3,1,2,0>: Cost 3 vext2 <2,0,3,1>, <2,0,3,1> + 2689835911U, // <3,1,2,1>: Cost 3 vext3 LHS, <1,2,1,3> + 2564163248U, // <3,1,2,2>: Cost 3 vext1 <2,3,1,2>, <2,3,1,2> + 1611449238U, // <3,1,2,3>: Cost 2 vext3 LHS, <1,2,3,0> + 2564164918U, // <3,1,2,4>: Cost 3 vext1 <2,3,1,2>, RHS + 2689835947U, // <3,1,2,5>: Cost 3 vext3 LHS, <1,2,5,3> + 3692545978U, // <3,1,2,6>: Cost 4 vext2 <0,2,3,1>, <2,6,3,7> + 2732966842U, // <3,1,2,7>: Cost 3 vext3 LHS, <1,2,7,0> + 1611891651U, // <3,1,2,u>: Cost 2 vext3 LHS, <1,2,u,0> + 1484456038U, // <3,1,3,0>: Cost 2 vext1 <1,3,1,3>, LHS + 1611891672U, // <3,1,3,1>: Cost 2 vext3 LHS, <1,3,1,3> + 2685633502U, // <3,1,3,2>: Cost 3 vext3 LHS, <1,3,2,0> + 2685633512U, // <3,1,3,3>: Cost 3 vext3 LHS, <1,3,3,1> + 1484459318U, // <3,1,3,4>: Cost 2 vext1 <1,3,1,3>, RHS + 1611891712U, // <3,1,3,5>: Cost 2 vext3 LHS, <1,3,5,7> + 2689836041U, // <3,1,3,6>: Cost 3 vext3 LHS, <1,3,6,7> + 2733409294U, // <3,1,3,7>: Cost 3 vext3 LHS, <1,3,7,3> + 1611891735U, // <3,1,3,u>: Cost 2 vext3 LHS, <1,3,u,3> + 2552234086U, // <3,1,4,0>: Cost 3 vext1 <0,3,1,4>, LHS + 2732966955U, // <3,1,4,1>: Cost 3 vext3 LHS, <1,4,1,5> + 2732966964U, // <3,1,4,2>: Cost 3 vext3 LHS, <1,4,2,5> + 
2685633597U, // <3,1,4,3>: Cost 3 vext3 LHS, <1,4,3,5> + 2552237366U, // <3,1,4,4>: Cost 3 vext1 <0,3,1,4>, RHS + 2618805558U, // <3,1,4,5>: Cost 3 vext2 <0,2,3,1>, RHS + 2769472822U, // <3,1,4,6>: Cost 3 vuzpl <3,0,1,2>, RHS + 3667784943U, // <3,1,4,7>: Cost 4 vext1 <7,3,1,4>, <7,3,1,4> + 2685633642U, // <3,1,4,u>: Cost 3 vext3 LHS, <1,4,u,5> + 2689836143U, // <3,1,5,0>: Cost 3 vext3 LHS, <1,5,0,1> + 2564187280U, // <3,1,5,1>: Cost 3 vext1 <2,3,1,5>, <1,5,3,7> + 2564187827U, // <3,1,5,2>: Cost 3 vext1 <2,3,1,5>, <2,3,1,5> + 1611891856U, // <3,1,5,3>: Cost 2 vext3 LHS, <1,5,3,7> + 2689836183U, // <3,1,5,4>: Cost 3 vext3 LHS, <1,5,4,5> + 3759375522U, // <3,1,5,5>: Cost 4 vext3 LHS, <1,5,5,7> + 3720417378U, // <3,1,5,6>: Cost 4 vext2 <4,u,3,1>, <5,6,7,0> + 2832518454U, // <3,1,5,7>: Cost 3 vuzpr <2,3,0,1>, RHS + 1611891901U, // <3,1,5,u>: Cost 2 vext3 LHS, <1,5,u,7> + 3763578048U, // <3,1,6,0>: Cost 4 vext3 LHS, <1,6,0,1> + 2689836239U, // <3,1,6,1>: Cost 3 vext3 LHS, <1,6,1,7> + 2732967128U, // <3,1,6,2>: Cost 3 vext3 LHS, <1,6,2,7> + 2685633761U, // <3,1,6,3>: Cost 3 vext3 LHS, <1,6,3,7> + 3763578088U, // <3,1,6,4>: Cost 4 vext3 LHS, <1,6,4,5> + 2689836275U, // <3,1,6,5>: Cost 3 vext3 LHS, <1,6,5,7> + 3763578108U, // <3,1,6,6>: Cost 4 vext3 LHS, <1,6,6,7> + 2732967166U, // <3,1,6,7>: Cost 3 vext3 LHS, <1,6,7,0> + 2685633806U, // <3,1,6,u>: Cost 3 vext3 LHS, <1,6,u,7> + 3631972454U, // <3,1,7,0>: Cost 4 vext1 <1,3,1,7>, LHS + 2659947612U, // <3,1,7,1>: Cost 3 vext2 <7,1,3,1>, <7,1,3,1> + 4036102294U, // <3,1,7,2>: Cost 4 vzipr <1,5,3,7>, <3,0,1,2> + 3095396454U, // <3,1,7,3>: Cost 3 vtrnr <1,3,5,7>, LHS + 3631975734U, // <3,1,7,4>: Cost 4 vext1 <1,3,1,7>, RHS + 2222982144U, // <3,1,7,5>: Cost 3 vrev <1,3,5,7> + 3296797705U, // <3,1,7,6>: Cost 4 vrev <1,3,6,7> + 3720418924U, // <3,1,7,7>: Cost 4 vext2 <4,u,3,1>, <7,7,7,7> + 3095396459U, // <3,1,7,u>: Cost 3 vtrnr <1,3,5,7>, LHS + 1484496998U, // <3,1,u,0>: Cost 2 vext1 <1,3,1,u>, LHS + 1611892077U, // <3,1,u,1>: Cost 2 vext3 LHS, <1,u,1,3> + 2685633907U, // <3,1,u,2>: Cost 3 vext3 LHS, <1,u,2,0> + 1611892092U, // <3,1,u,3>: Cost 2 vext3 LHS, <1,u,3,0> + 1484500278U, // <3,1,u,4>: Cost 2 vext1 <1,3,1,u>, RHS + 1611892117U, // <3,1,u,5>: Cost 2 vext3 LHS, <1,u,5,7> + 2685633950U, // <3,1,u,6>: Cost 3 vext3 LHS, <1,u,6,7> + 2832518697U, // <3,1,u,7>: Cost 3 vuzpr <2,3,0,1>, RHS + 1611892140U, // <3,1,u,u>: Cost 2 vext3 LHS, <1,u,u,3> + 2623455232U, // <3,2,0,0>: Cost 3 vext2 <1,0,3,2>, <0,0,0,0> + 1549713510U, // <3,2,0,1>: Cost 2 vext2 <1,0,3,2>, LHS + 2689836484U, // <3,2,0,2>: Cost 3 vext3 LHS, <2,0,2,0> + 2685633997U, // <3,2,0,3>: Cost 3 vext3 LHS, <2,0,3,0> + 2623455570U, // <3,2,0,4>: Cost 3 vext2 <1,0,3,2>, <0,4,1,5> + 2732967398U, // <3,2,0,5>: Cost 3 vext3 LHS, <2,0,5,7> + 2689836524U, // <3,2,0,6>: Cost 3 vext3 LHS, <2,0,6,4> + 2229044964U, // <3,2,0,7>: Cost 3 vrev <2,3,7,0> + 1549714077U, // <3,2,0,u>: Cost 2 vext2 <1,0,3,2>, LHS + 1549714166U, // <3,2,1,0>: Cost 2 vext2 <1,0,3,2>, <1,0,3,2> + 2623456052U, // <3,2,1,1>: Cost 3 vext2 <1,0,3,2>, <1,1,1,1> + 2623456150U, // <3,2,1,2>: Cost 3 vext2 <1,0,3,2>, <1,2,3,0> + 2685634079U, // <3,2,1,3>: Cost 3 vext3 LHS, <2,1,3,1> + 2552286518U, // <3,2,1,4>: Cost 3 vext1 <0,3,2,1>, RHS + 2623456400U, // <3,2,1,5>: Cost 3 vext2 <1,0,3,2>, <1,5,3,7> + 2689836604U, // <3,2,1,6>: Cost 3 vext3 LHS, <2,1,6,3> + 3667834101U, // <3,2,1,7>: Cost 4 vext1 <7,3,2,1>, <7,3,2,1> + 1155385070U, // <3,2,1,u>: Cost 2 vrev <2,3,u,1> + 2689836629U, // <3,2,2,0>: Cost 3 vext3 LHS, <2,2,0,1> + 2689836640U, // 
<3,2,2,1>: Cost 3 vext3 LHS, <2,2,1,3> + 1611449960U, // <3,2,2,2>: Cost 2 vext3 LHS, <2,2,2,2> + 1611892338U, // <3,2,2,3>: Cost 2 vext3 LHS, <2,2,3,3> + 2689836669U, // <3,2,2,4>: Cost 3 vext3 LHS, <2,2,4,5> + 2689836680U, // <3,2,2,5>: Cost 3 vext3 LHS, <2,2,5,7> + 2689836688U, // <3,2,2,6>: Cost 3 vext3 LHS, <2,2,6,6> + 3763578518U, // <3,2,2,7>: Cost 4 vext3 LHS, <2,2,7,3> + 1611892383U, // <3,2,2,u>: Cost 2 vext3 LHS, <2,2,u,3> + 1611450022U, // <3,2,3,0>: Cost 2 vext3 LHS, <2,3,0,1> + 2685191854U, // <3,2,3,1>: Cost 3 vext3 LHS, <2,3,1,0> + 2685191865U, // <3,2,3,2>: Cost 3 vext3 LHS, <2,3,2,2> + 2685191875U, // <3,2,3,3>: Cost 3 vext3 LHS, <2,3,3,3> + 1611450062U, // <3,2,3,4>: Cost 2 vext3 LHS, <2,3,4,5> + 2732967635U, // <3,2,3,5>: Cost 3 vext3 LHS, <2,3,5,1> + 2732967645U, // <3,2,3,6>: Cost 3 vext3 LHS, <2,3,6,2> + 2732967652U, // <3,2,3,7>: Cost 3 vext3 LHS, <2,3,7,0> + 1611450094U, // <3,2,3,u>: Cost 2 vext3 LHS, <2,3,u,1> + 2558279782U, // <3,2,4,0>: Cost 3 vext1 <1,3,2,4>, LHS + 2558280602U, // <3,2,4,1>: Cost 3 vext1 <1,3,2,4>, <1,2,3,4> + 2732967692U, // <3,2,4,2>: Cost 3 vext3 LHS, <2,4,2,4> + 2685634326U, // <3,2,4,3>: Cost 3 vext3 LHS, <2,4,3,5> + 2558283062U, // <3,2,4,4>: Cost 3 vext1 <1,3,2,4>, RHS + 1549716790U, // <3,2,4,5>: Cost 2 vext2 <1,0,3,2>, RHS + 2689836844U, // <3,2,4,6>: Cost 3 vext3 LHS, <2,4,6,0> + 2229077736U, // <3,2,4,7>: Cost 3 vrev <2,3,7,4> + 1549717033U, // <3,2,4,u>: Cost 2 vext2 <1,0,3,2>, RHS + 2552316006U, // <3,2,5,0>: Cost 3 vext1 <0,3,2,5>, LHS + 2228643507U, // <3,2,5,1>: Cost 3 vrev <2,3,1,5> + 2689836896U, // <3,2,5,2>: Cost 3 vext3 LHS, <2,5,2,7> + 2685634408U, // <3,2,5,3>: Cost 3 vext3 LHS, <2,5,3,6> + 1155122894U, // <3,2,5,4>: Cost 2 vrev <2,3,4,5> + 2665263108U, // <3,2,5,5>: Cost 3 vext2 , <5,5,5,5> + 2689836932U, // <3,2,5,6>: Cost 3 vext3 LHS, <2,5,6,7> + 2665263272U, // <3,2,5,7>: Cost 3 vext2 , <5,7,5,7> + 1155417842U, // <3,2,5,u>: Cost 2 vrev <2,3,u,5> + 2689836953U, // <3,2,6,0>: Cost 3 vext3 LHS, <2,6,0,1> + 2689836964U, // <3,2,6,1>: Cost 3 vext3 LHS, <2,6,1,3> + 2689836976U, // <3,2,6,2>: Cost 3 vext3 LHS, <2,6,2,6> + 1611892666U, // <3,2,6,3>: Cost 2 vext3 LHS, <2,6,3,7> + 2689836993U, // <3,2,6,4>: Cost 3 vext3 LHS, <2,6,4,5> + 2689837004U, // <3,2,6,5>: Cost 3 vext3 LHS, <2,6,5,7> + 2689837013U, // <3,2,6,6>: Cost 3 vext3 LHS, <2,6,6,7> + 2665263950U, // <3,2,6,7>: Cost 3 vext2 , <6,7,0,1> + 1611892711U, // <3,2,6,u>: Cost 2 vext3 LHS, <2,6,u,7> + 2665264122U, // <3,2,7,0>: Cost 3 vext2 , <7,0,1,2> + 2623460419U, // <3,2,7,1>: Cost 3 vext2 <1,0,3,2>, <7,1,0,3> + 4169138340U, // <3,2,7,2>: Cost 4 vtrnr <1,3,5,7>, <0,2,0,2> + 2962358374U, // <3,2,7,3>: Cost 3 vzipr <1,5,3,7>, LHS + 2665264486U, // <3,2,7,4>: Cost 3 vext2 , <7,4,5,6> + 2228954841U, // <3,2,7,5>: Cost 3 vrev <2,3,5,7> + 2229028578U, // <3,2,7,6>: Cost 3 vrev <2,3,6,7> + 2665264748U, // <3,2,7,7>: Cost 3 vext2 , <7,7,7,7> + 2962358379U, // <3,2,7,u>: Cost 3 vzipr <1,5,3,7>, LHS + 1611892795U, // <3,2,u,0>: Cost 2 vext3 LHS, <2,u,0,1> + 1549719342U, // <3,2,u,1>: Cost 2 vext2 <1,0,3,2>, LHS + 1611449960U, // <3,2,u,2>: Cost 2 vext3 LHS, <2,2,2,2> + 1611892824U, // <3,2,u,3>: Cost 2 vext3 LHS, <2,u,3,3> + 1611892835U, // <3,2,u,4>: Cost 2 vext3 LHS, <2,u,4,5> + 1549719706U, // <3,2,u,5>: Cost 2 vext2 <1,0,3,2>, RHS + 2689837168U, // <3,2,u,6>: Cost 3 vext3 LHS, <2,u,6,0> + 2665265408U, // <3,2,u,7>: Cost 3 vext2 , + 1611892867U, // <3,2,u,u>: Cost 2 vext3 LHS, <2,u,u,1> + 2685192331U, // <3,3,0,0>: Cost 3 vext3 LHS, <3,0,0,0> + 1611450518U, // <3,3,0,1>: 
Cost 2 vext3 LHS, <3,0,1,2> + 2685634717U, // <3,3,0,2>: Cost 3 vext3 LHS, <3,0,2,0> + 2564294806U, // <3,3,0,3>: Cost 3 vext1 <2,3,3,0>, <3,0,1,2> + 2685634736U, // <3,3,0,4>: Cost 3 vext3 LHS, <3,0,4,1> + 2732968122U, // <3,3,0,5>: Cost 3 vext3 LHS, <3,0,5,2> + 3763579075U, // <3,3,0,6>: Cost 4 vext3 LHS, <3,0,6,2> + 4034053264U, // <3,3,0,7>: Cost 4 vzipr <1,2,3,0>, <1,5,3,7> + 1611450581U, // <3,3,0,u>: Cost 2 vext3 LHS, <3,0,u,2> + 2685192415U, // <3,3,1,0>: Cost 3 vext3 LHS, <3,1,0,3> + 1550385992U, // <3,3,1,1>: Cost 2 vext2 <1,1,3,3>, <1,1,3,3> + 2685192433U, // <3,3,1,2>: Cost 3 vext3 LHS, <3,1,2,3> + 2685634808U, // <3,3,1,3>: Cost 3 vext3 LHS, <3,1,3,1> + 2558332214U, // <3,3,1,4>: Cost 3 vext1 <1,3,3,1>, RHS + 2685634828U, // <3,3,1,5>: Cost 3 vext3 LHS, <3,1,5,3> + 3759376661U, // <3,3,1,6>: Cost 4 vext3 LHS, <3,1,6,3> + 2703477022U, // <3,3,1,7>: Cost 3 vext3 <3,1,7,3>, <3,1,7,3> + 1555031423U, // <3,3,1,u>: Cost 2 vext2 <1,u,3,3>, <1,u,3,3> + 2564309094U, // <3,3,2,0>: Cost 3 vext1 <2,3,3,2>, LHS + 2630100513U, // <3,3,2,1>: Cost 3 vext2 <2,1,3,3>, <2,1,3,3> + 1557022322U, // <3,3,2,2>: Cost 2 vext2 <2,2,3,3>, <2,2,3,3> + 2685192520U, // <3,3,2,3>: Cost 3 vext3 LHS, <3,2,3,0> + 2564312374U, // <3,3,2,4>: Cost 3 vext1 <2,3,3,2>, RHS + 2732968286U, // <3,3,2,5>: Cost 3 vext3 LHS, <3,2,5,4> + 2685634918U, // <3,3,2,6>: Cost 3 vext3 LHS, <3,2,6,3> + 2704140655U, // <3,3,2,7>: Cost 3 vext3 <3,2,7,3>, <3,2,7,3> + 1561004120U, // <3,3,2,u>: Cost 2 vext2 <2,u,3,3>, <2,u,3,3> + 1496547430U, // <3,3,3,0>: Cost 2 vext1 <3,3,3,3>, LHS + 2624129256U, // <3,3,3,1>: Cost 3 vext2 <1,1,3,3>, <3,1,1,3> + 2630764866U, // <3,3,3,2>: Cost 3 vext2 <2,2,3,3>, <3,2,2,3> + 336380006U, // <3,3,3,3>: Cost 1 vdup3 LHS + 1496550710U, // <3,3,3,4>: Cost 2 vext1 <3,3,3,3>, RHS + 2732968368U, // <3,3,3,5>: Cost 3 vext3 LHS, <3,3,5,5> + 2624129683U, // <3,3,3,6>: Cost 3 vext2 <1,1,3,3>, <3,6,3,7> + 2594182400U, // <3,3,3,7>: Cost 3 vext1 <7,3,3,3>, <7,3,3,3> + 336380006U, // <3,3,3,u>: Cost 1 vdup3 LHS + 2558353510U, // <3,3,4,0>: Cost 3 vext1 <1,3,3,4>, LHS + 2558354411U, // <3,3,4,1>: Cost 3 vext1 <1,3,3,4>, <1,3,3,4> + 2564327108U, // <3,3,4,2>: Cost 3 vext1 <2,3,3,4>, <2,3,3,4> + 2564327938U, // <3,3,4,3>: Cost 3 vext1 <2,3,3,4>, <3,4,5,6> + 2960343962U, // <3,3,4,4>: Cost 3 vzipr <1,2,3,4>, <1,2,3,4> + 1611893250U, // <3,3,4,5>: Cost 2 vext3 LHS, <3,4,5,6> + 2771619126U, // <3,3,4,6>: Cost 3 vuzpl <3,3,3,3>, RHS + 4034086032U, // <3,3,4,7>: Cost 4 vzipr <1,2,3,4>, <1,5,3,7> + 1611893277U, // <3,3,4,u>: Cost 2 vext3 LHS, <3,4,u,6> + 2558361702U, // <3,3,5,0>: Cost 3 vext1 <1,3,3,5>, LHS + 2558362604U, // <3,3,5,1>: Cost 3 vext1 <1,3,3,5>, <1,3,3,5> + 2558363342U, // <3,3,5,2>: Cost 3 vext1 <1,3,3,5>, <2,3,4,5> + 2732968512U, // <3,3,5,3>: Cost 3 vext3 LHS, <3,5,3,5> + 2558364982U, // <3,3,5,4>: Cost 3 vext1 <1,3,3,5>, RHS + 3101279950U, // <3,3,5,5>: Cost 3 vtrnr <2,3,4,5>, <2,3,4,5> + 2665934946U, // <3,3,5,6>: Cost 3 vext2 , <5,6,7,0> + 2826636598U, // <3,3,5,7>: Cost 3 vuzpr <1,3,1,3>, RHS + 2826636599U, // <3,3,5,u>: Cost 3 vuzpr <1,3,1,3>, RHS + 2732968568U, // <3,3,6,0>: Cost 3 vext3 LHS, <3,6,0,7> + 3763579521U, // <3,3,6,1>: Cost 4 vext3 LHS, <3,6,1,7> + 2732968586U, // <3,3,6,2>: Cost 3 vext3 LHS, <3,6,2,7> + 2732968595U, // <3,3,6,3>: Cost 3 vext3 LHS, <3,6,3,7> + 2732968604U, // <3,3,6,4>: Cost 3 vext3 LHS, <3,6,4,7> + 3763579557U, // <3,3,6,5>: Cost 4 vext3 LHS, <3,6,5,7> + 2732968621U, // <3,3,6,6>: Cost 3 vext3 LHS, <3,6,6,6> + 2657973099U, // <3,3,6,7>: Cost 3 vext2 <6,7,3,3>, <6,7,3,3> + 
2658636732U, // <3,3,6,u>: Cost 3 vext2 <6,u,3,3>, <6,u,3,3> + 2558378086U, // <3,3,7,0>: Cost 3 vext1 <1,3,3,7>, LHS + 2558378990U, // <3,3,7,1>: Cost 3 vext1 <1,3,3,7>, <1,3,3,7> + 2564351687U, // <3,3,7,2>: Cost 3 vext1 <2,3,3,7>, <2,3,3,7> + 2661291264U, // <3,3,7,3>: Cost 3 vext2 <7,3,3,3>, <7,3,3,3> + 2558381366U, // <3,3,7,4>: Cost 3 vext1 <1,3,3,7>, RHS + 2732968694U, // <3,3,7,5>: Cost 3 vext3 LHS, <3,7,5,7> + 3781126907U, // <3,3,7,6>: Cost 4 vext3 <3,7,6,3>, <3,7,6,3> + 3095397376U, // <3,3,7,7>: Cost 3 vtrnr <1,3,5,7>, <1,3,5,7> + 2558383918U, // <3,3,7,u>: Cost 3 vext1 <1,3,3,7>, LHS + 1496547430U, // <3,3,u,0>: Cost 2 vext1 <3,3,3,3>, LHS + 1611893534U, // <3,3,u,1>: Cost 2 vext3 LHS, <3,u,1,2> + 1592858504U, // <3,3,u,2>: Cost 2 vext2 , + 336380006U, // <3,3,u,3>: Cost 1 vdup3 LHS + 1496550710U, // <3,3,u,4>: Cost 2 vext1 <3,3,3,3>, RHS + 1611893574U, // <3,3,u,5>: Cost 2 vext3 LHS, <3,u,5,6> + 2690280268U, // <3,3,u,6>: Cost 3 vext3 LHS, <3,u,6,3> + 2826636841U, // <3,3,u,7>: Cost 3 vuzpr <1,3,1,3>, RHS + 336380006U, // <3,3,u,u>: Cost 1 vdup3 LHS + 2624798720U, // <3,4,0,0>: Cost 3 vext2 <1,2,3,4>, <0,0,0,0> + 1551056998U, // <3,4,0,1>: Cost 2 vext2 <1,2,3,4>, LHS + 2624798884U, // <3,4,0,2>: Cost 3 vext2 <1,2,3,4>, <0,2,0,2> + 3693232384U, // <3,4,0,3>: Cost 4 vext2 <0,3,3,4>, <0,3,1,4> + 2624799058U, // <3,4,0,4>: Cost 3 vext2 <1,2,3,4>, <0,4,1,5> + 1659227026U, // <3,4,0,5>: Cost 2 vext3 LHS, <4,0,5,1> + 1659227036U, // <3,4,0,6>: Cost 2 vext3 LHS, <4,0,6,2> + 3667973382U, // <3,4,0,7>: Cost 4 vext1 <7,3,4,0>, <7,3,4,0> + 1551057565U, // <3,4,0,u>: Cost 2 vext2 <1,2,3,4>, LHS + 2624799478U, // <3,4,1,0>: Cost 3 vext2 <1,2,3,4>, <1,0,3,2> + 2624799540U, // <3,4,1,1>: Cost 3 vext2 <1,2,3,4>, <1,1,1,1> + 1551057818U, // <3,4,1,2>: Cost 2 vext2 <1,2,3,4>, <1,2,3,4> + 2624799704U, // <3,4,1,3>: Cost 3 vext2 <1,2,3,4>, <1,3,1,3> + 2564377910U, // <3,4,1,4>: Cost 3 vext1 <2,3,4,1>, RHS + 2689838050U, // <3,4,1,5>: Cost 3 vext3 LHS, <4,1,5,0> + 2689838062U, // <3,4,1,6>: Cost 3 vext3 LHS, <4,1,6,3> + 2628117807U, // <3,4,1,7>: Cost 3 vext2 <1,7,3,4>, <1,7,3,4> + 1555039616U, // <3,4,1,u>: Cost 2 vext2 <1,u,3,4>, <1,u,3,4> + 3626180710U, // <3,4,2,0>: Cost 4 vext1 <0,3,4,2>, LHS + 2624800298U, // <3,4,2,1>: Cost 3 vext2 <1,2,3,4>, <2,1,4,3> + 2624800360U, // <3,4,2,2>: Cost 3 vext2 <1,2,3,4>, <2,2,2,2> + 2624800422U, // <3,4,2,3>: Cost 3 vext2 <1,2,3,4>, <2,3,0,1> + 2624800514U, // <3,4,2,4>: Cost 3 vext2 <1,2,3,4>, <2,4,1,3> + 2709965878U, // <3,4,2,5>: Cost 3 vext3 <4,2,5,3>, <4,2,5,3> + 2689838140U, // <3,4,2,6>: Cost 3 vext3 LHS, <4,2,6,0> + 2634090504U, // <3,4,2,7>: Cost 3 vext2 <2,7,3,4>, <2,7,3,4> + 2689838158U, // <3,4,2,u>: Cost 3 vext3 LHS, <4,2,u,0> + 2624800918U, // <3,4,3,0>: Cost 3 vext2 <1,2,3,4>, <3,0,1,2> + 2636081403U, // <3,4,3,1>: Cost 3 vext2 <3,1,3,4>, <3,1,3,4> + 2636745036U, // <3,4,3,2>: Cost 3 vext2 <3,2,3,4>, <3,2,3,4> + 2624801180U, // <3,4,3,3>: Cost 3 vext2 <1,2,3,4>, <3,3,3,3> + 2624801232U, // <3,4,3,4>: Cost 3 vext2 <1,2,3,4>, <3,4,0,1> + 2905836854U, // <3,4,3,5>: Cost 3 vzipl <3,3,3,3>, RHS + 3040054582U, // <3,4,3,6>: Cost 3 vtrnl <3,3,3,3>, RHS + 3702524611U, // <3,4,3,7>: Cost 4 vext2 <1,u,3,4>, <3,7,0,1> + 2624801566U, // <3,4,3,u>: Cost 3 vext2 <1,2,3,4>, <3,u,1,2> + 2564399206U, // <3,4,4,0>: Cost 3 vext1 <2,3,4,4>, LHS + 2564400026U, // <3,4,4,1>: Cost 3 vext1 <2,3,4,4>, <1,2,3,4> + 2564400845U, // <3,4,4,2>: Cost 3 vext1 <2,3,4,4>, <2,3,4,4> + 2570373542U, // <3,4,4,3>: Cost 3 vext1 <3,3,4,4>, <3,3,4,4> + 1659227344U, // <3,4,4,4>: 
Cost 2 vext3 LHS, <4,4,4,4> + 1551060278U, // <3,4,4,5>: Cost 2 vext2 <1,2,3,4>, RHS + 1659227364U, // <3,4,4,6>: Cost 2 vext3 LHS, <4,4,6,6> + 3668006154U, // <3,4,4,7>: Cost 4 vext1 <7,3,4,4>, <7,3,4,4> + 1551060521U, // <3,4,4,u>: Cost 2 vext2 <1,2,3,4>, RHS + 1490665574U, // <3,4,5,0>: Cost 2 vext1 <2,3,4,5>, LHS + 2689838341U, // <3,4,5,1>: Cost 3 vext3 LHS, <4,5,1,3> + 1490667214U, // <3,4,5,2>: Cost 2 vext1 <2,3,4,5>, <2,3,4,5> + 2564409494U, // <3,4,5,3>: Cost 3 vext1 <2,3,4,5>, <3,0,1,2> + 1490668854U, // <3,4,5,4>: Cost 2 vext1 <2,3,4,5>, RHS + 2689838381U, // <3,4,5,5>: Cost 3 vext3 LHS, <4,5,5,7> + 537709878U, // <3,4,5,6>: Cost 1 vext3 LHS, RHS + 2594272523U, // <3,4,5,7>: Cost 3 vext1 <7,3,4,5>, <7,3,4,5> + 537709896U, // <3,4,5,u>: Cost 1 vext3 LHS, RHS + 2689838411U, // <3,4,6,0>: Cost 3 vext3 LHS, <4,6,0,1> + 2558444534U, // <3,4,6,1>: Cost 3 vext1 <1,3,4,6>, <1,3,4,6> + 2666607098U, // <3,4,6,2>: Cost 3 vext2 , <6,2,7,3> + 2558446082U, // <3,4,6,3>: Cost 3 vext1 <1,3,4,6>, <3,4,5,6> + 1659227508U, // <3,4,6,4>: Cost 2 vext3 LHS, <4,6,4,6> + 2689838462U, // <3,4,6,5>: Cost 3 vext3 LHS, <4,6,5,7> + 2689838471U, // <3,4,6,6>: Cost 3 vext3 LHS, <4,6,6,7> + 2657981292U, // <3,4,6,7>: Cost 3 vext2 <6,7,3,4>, <6,7,3,4> + 1659227540U, // <3,4,6,u>: Cost 2 vext3 LHS, <4,6,u,2> + 2666607610U, // <3,4,7,0>: Cost 3 vext2 , <7,0,1,2> + 3702527072U, // <3,4,7,1>: Cost 4 vext2 <1,u,3,4>, <7,1,3,5> + 2660635824U, // <3,4,7,2>: Cost 3 vext2 <7,2,3,4>, <7,2,3,4> + 3644139945U, // <3,4,7,3>: Cost 4 vext1 <3,3,4,7>, <3,3,4,7> + 2666607974U, // <3,4,7,4>: Cost 3 vext2 , <7,4,5,6> + 2732969416U, // <3,4,7,5>: Cost 3 vext3 LHS, <4,7,5,0> + 2732969425U, // <3,4,7,6>: Cost 3 vext3 LHS, <4,7,6,0> + 2666608236U, // <3,4,7,7>: Cost 3 vext2 , <7,7,7,7> + 2664617622U, // <3,4,7,u>: Cost 3 vext2 <7,u,3,4>, <7,u,3,4> + 1490690150U, // <3,4,u,0>: Cost 2 vext1 <2,3,4,u>, LHS + 1551062830U, // <3,4,u,1>: Cost 2 vext2 <1,2,3,4>, LHS + 1490691793U, // <3,4,u,2>: Cost 2 vext1 <2,3,4,u>, <2,3,4,u> + 2624804796U, // <3,4,u,3>: Cost 3 vext2 <1,2,3,4>, + 1490693430U, // <3,4,u,4>: Cost 2 vext1 <2,3,4,u>, RHS + 1551063194U, // <3,4,u,5>: Cost 2 vext2 <1,2,3,4>, RHS + 537710121U, // <3,4,u,6>: Cost 1 vext3 LHS, RHS + 2594297102U, // <3,4,u,7>: Cost 3 vext1 <7,3,4,u>, <7,3,4,u> + 537710139U, // <3,4,u,u>: Cost 1 vext3 LHS, RHS + 3692576768U, // <3,5,0,0>: Cost 4 vext2 <0,2,3,5>, <0,0,0,0> + 2618835046U, // <3,5,0,1>: Cost 3 vext2 <0,2,3,5>, LHS + 2618835138U, // <3,5,0,2>: Cost 3 vext2 <0,2,3,5>, <0,2,3,5> + 3692577024U, // <3,5,0,3>: Cost 4 vext2 <0,2,3,5>, <0,3,1,4> + 2689838690U, // <3,5,0,4>: Cost 3 vext3 LHS, <5,0,4,1> + 2732969579U, // <3,5,0,5>: Cost 3 vext3 LHS, <5,0,5,1> + 2732969588U, // <3,5,0,6>: Cost 3 vext3 LHS, <5,0,6,1> + 2246963055U, // <3,5,0,7>: Cost 3 vrev <5,3,7,0> + 2618835613U, // <3,5,0,u>: Cost 3 vext2 <0,2,3,5>, LHS + 2594308198U, // <3,5,1,0>: Cost 3 vext1 <7,3,5,1>, LHS + 3692577588U, // <3,5,1,1>: Cost 4 vext2 <0,2,3,5>, <1,1,1,1> + 2624807835U, // <3,5,1,2>: Cost 3 vext2 <1,2,3,5>, <1,2,3,5> + 2625471468U, // <3,5,1,3>: Cost 3 vext2 <1,3,3,5>, <1,3,3,5> + 2626135101U, // <3,5,1,4>: Cost 3 vext2 <1,4,3,5>, <1,4,3,5> + 2594311888U, // <3,5,1,5>: Cost 3 vext1 <7,3,5,1>, <5,1,7,3> + 3699877107U, // <3,5,1,6>: Cost 4 vext2 <1,4,3,5>, <1,6,5,7> + 1641680592U, // <3,5,1,7>: Cost 2 vext3 <5,1,7,3>, <5,1,7,3> + 1641754329U, // <3,5,1,u>: Cost 2 vext3 <5,1,u,3>, <5,1,u,3> + 3692578274U, // <3,5,2,0>: Cost 4 vext2 <0,2,3,5>, <2,0,5,3> + 2630116899U, // <3,5,2,1>: Cost 3 vext2 <2,1,3,5>, <2,1,3,5> 
+ 3692578408U, // <3,5,2,2>: Cost 4 vext2 <0,2,3,5>, <2,2,2,2> + 2625472206U, // <3,5,2,3>: Cost 3 vext2 <1,3,3,5>, <2,3,4,5> + 2632107798U, // <3,5,2,4>: Cost 3 vext2 <2,4,3,5>, <2,4,3,5> + 2715938575U, // <3,5,2,5>: Cost 3 vext3 <5,2,5,3>, <5,2,5,3> + 3692578746U, // <3,5,2,6>: Cost 4 vext2 <0,2,3,5>, <2,6,3,7> + 2716086049U, // <3,5,2,7>: Cost 3 vext3 <5,2,7,3>, <5,2,7,3> + 2634762330U, // <3,5,2,u>: Cost 3 vext2 <2,u,3,5>, <2,u,3,5> + 3692578966U, // <3,5,3,0>: Cost 4 vext2 <0,2,3,5>, <3,0,1,2> + 2636089596U, // <3,5,3,1>: Cost 3 vext2 <3,1,3,5>, <3,1,3,5> + 3699214668U, // <3,5,3,2>: Cost 4 vext2 <1,3,3,5>, <3,2,3,4> + 2638080412U, // <3,5,3,3>: Cost 3 vext2 <3,4,3,5>, <3,3,3,3> + 2618837506U, // <3,5,3,4>: Cost 3 vext2 <0,2,3,5>, <3,4,5,6> + 2832844494U, // <3,5,3,5>: Cost 3 vuzpr <2,3,4,5>, <2,3,4,5> + 4033415682U, // <3,5,3,6>: Cost 4 vzipr <1,1,3,3>, <3,4,5,6> + 3095072054U, // <3,5,3,7>: Cost 3 vtrnr <1,3,1,3>, RHS + 3095072055U, // <3,5,3,u>: Cost 3 vtrnr <1,3,1,3>, RHS + 2600304742U, // <3,5,4,0>: Cost 3 vext1 , LHS + 3763580815U, // <3,5,4,1>: Cost 4 vext3 LHS, <5,4,1,5> + 2564474582U, // <3,5,4,2>: Cost 3 vext1 <2,3,5,4>, <2,3,5,4> + 3699879044U, // <3,5,4,3>: Cost 4 vext2 <1,4,3,5>, <4,3,5,0> + 2600308022U, // <3,5,4,4>: Cost 3 vext1 , RHS + 2618838326U, // <3,5,4,5>: Cost 3 vext2 <0,2,3,5>, RHS + 2772454710U, // <3,5,4,6>: Cost 3 vuzpl <3,4,5,6>, RHS + 1659228102U, // <3,5,4,7>: Cost 2 vext3 LHS, <5,4,7,6> + 1659228111U, // <3,5,4,u>: Cost 2 vext3 LHS, <5,4,u,6> + 2570453094U, // <3,5,5,0>: Cost 3 vext1 <3,3,5,5>, LHS + 2624810704U, // <3,5,5,1>: Cost 3 vext2 <1,2,3,5>, <5,1,7,3> + 2570454734U, // <3,5,5,2>: Cost 3 vext1 <3,3,5,5>, <2,3,4,5> + 2570455472U, // <3,5,5,3>: Cost 3 vext1 <3,3,5,5>, <3,3,5,5> + 2570456374U, // <3,5,5,4>: Cost 3 vext1 <3,3,5,5>, RHS + 1659228164U, // <3,5,5,5>: Cost 2 vext3 LHS, <5,5,5,5> + 2732969998U, // <3,5,5,6>: Cost 3 vext3 LHS, <5,5,6,6> + 1659228184U, // <3,5,5,7>: Cost 2 vext3 LHS, <5,5,7,7> + 1659228193U, // <3,5,5,u>: Cost 2 vext3 LHS, <5,5,u,7> + 2732970020U, // <3,5,6,0>: Cost 3 vext3 LHS, <5,6,0,1> + 2732970035U, // <3,5,6,1>: Cost 3 vext3 LHS, <5,6,1,7> + 2564490968U, // <3,5,6,2>: Cost 3 vext1 <2,3,5,6>, <2,3,5,6> + 2732970050U, // <3,5,6,3>: Cost 3 vext3 LHS, <5,6,3,4> + 2732970060U, // <3,5,6,4>: Cost 3 vext3 LHS, <5,6,4,5> + 2732970071U, // <3,5,6,5>: Cost 3 vext3 LHS, <5,6,5,7> + 2732970080U, // <3,5,6,6>: Cost 3 vext3 LHS, <5,6,6,7> + 1659228258U, // <3,5,6,7>: Cost 2 vext3 LHS, <5,6,7,0> + 1659228267U, // <3,5,6,u>: Cost 2 vext3 LHS, <5,6,u,0> + 1484783718U, // <3,5,7,0>: Cost 2 vext1 <1,3,5,7>, LHS + 1484784640U, // <3,5,7,1>: Cost 2 vext1 <1,3,5,7>, <1,3,5,7> + 2558527080U, // <3,5,7,2>: Cost 3 vext1 <1,3,5,7>, <2,2,2,2> + 2558527638U, // <3,5,7,3>: Cost 3 vext1 <1,3,5,7>, <3,0,1,2> + 1484786998U, // <3,5,7,4>: Cost 2 vext1 <1,3,5,7>, RHS + 1659228328U, // <3,5,7,5>: Cost 2 vext3 LHS, <5,7,5,7> + 2732970154U, // <3,5,7,6>: Cost 3 vext3 LHS, <5,7,6,0> + 2558531180U, // <3,5,7,7>: Cost 3 vext1 <1,3,5,7>, <7,7,7,7> + 1484789550U, // <3,5,7,u>: Cost 2 vext1 <1,3,5,7>, LHS + 1484791910U, // <3,5,u,0>: Cost 2 vext1 <1,3,5,u>, LHS + 1484792833U, // <3,5,u,1>: Cost 2 vext1 <1,3,5,u>, <1,3,5,u> + 2558535272U, // <3,5,u,2>: Cost 3 vext1 <1,3,5,u>, <2,2,2,2> + 2558535830U, // <3,5,u,3>: Cost 3 vext1 <1,3,5,u>, <3,0,1,2> + 1484795190U, // <3,5,u,4>: Cost 2 vext1 <1,3,5,u>, RHS + 1659228409U, // <3,5,u,5>: Cost 2 vext3 LHS, <5,u,5,7> + 2772457626U, // <3,5,u,6>: Cost 3 vuzpl <3,4,5,6>, RHS + 1646326023U, // <3,5,u,7>: Cost 2 vext3 
<5,u,7,3>, <5,u,7,3> + 1484797742U, // <3,5,u,u>: Cost 2 vext1 <1,3,5,u>, LHS + 2558541926U, // <3,6,0,0>: Cost 3 vext1 <1,3,6,0>, LHS + 2689839393U, // <3,6,0,1>: Cost 3 vext3 LHS, <6,0,1,2> + 2689839404U, // <3,6,0,2>: Cost 3 vext3 LHS, <6,0,2,4> + 3706519808U, // <3,6,0,3>: Cost 4 vext2 <2,5,3,6>, <0,3,1,4> + 2689839420U, // <3,6,0,4>: Cost 3 vext3 LHS, <6,0,4,2> + 2732970314U, // <3,6,0,5>: Cost 3 vext3 LHS, <6,0,5,7> + 2732970316U, // <3,6,0,6>: Cost 3 vext3 LHS, <6,0,6,0> + 2960313654U, // <3,6,0,7>: Cost 3 vzipr <1,2,3,0>, RHS + 2689839456U, // <3,6,0,u>: Cost 3 vext3 LHS, <6,0,u,2> + 3763581290U, // <3,6,1,0>: Cost 4 vext3 LHS, <6,1,0,3> + 3763581297U, // <3,6,1,1>: Cost 4 vext3 LHS, <6,1,1,1> + 2624816028U, // <3,6,1,2>: Cost 3 vext2 <1,2,3,6>, <1,2,3,6> + 3763581315U, // <3,6,1,3>: Cost 4 vext3 LHS, <6,1,3,1> + 2626143294U, // <3,6,1,4>: Cost 3 vext2 <1,4,3,6>, <1,4,3,6> + 3763581335U, // <3,6,1,5>: Cost 4 vext3 LHS, <6,1,5,3> + 2721321376U, // <3,6,1,6>: Cost 3 vext3 <6,1,6,3>, <6,1,6,3> + 2721395113U, // <3,6,1,7>: Cost 3 vext3 <6,1,7,3>, <6,1,7,3> + 2628797826U, // <3,6,1,u>: Cost 3 vext2 <1,u,3,6>, <1,u,3,6> + 2594390118U, // <3,6,2,0>: Cost 3 vext1 <7,3,6,2>, LHS + 2721616324U, // <3,6,2,1>: Cost 3 vext3 <6,2,1,3>, <6,2,1,3> + 2630788725U, // <3,6,2,2>: Cost 3 vext2 <2,2,3,6>, <2,2,3,6> + 3763581395U, // <3,6,2,3>: Cost 4 vext3 LHS, <6,2,3,0> + 2632115991U, // <3,6,2,4>: Cost 3 vext2 <2,4,3,6>, <2,4,3,6> + 2632779624U, // <3,6,2,5>: Cost 3 vext2 <2,5,3,6>, <2,5,3,6> + 2594394618U, // <3,6,2,6>: Cost 3 vext1 <7,3,6,2>, <6,2,7,3> + 1648316922U, // <3,6,2,7>: Cost 2 vext3 <6,2,7,3>, <6,2,7,3> + 1648390659U, // <3,6,2,u>: Cost 2 vext3 <6,2,u,3>, <6,2,u,3> + 3693914262U, // <3,6,3,0>: Cost 4 vext2 <0,4,3,6>, <3,0,1,2> + 3638281176U, // <3,6,3,1>: Cost 4 vext1 <2,3,6,3>, <1,3,1,3> + 3696568678U, // <3,6,3,2>: Cost 4 vext2 <0,u,3,6>, <3,2,6,3> + 2638088604U, // <3,6,3,3>: Cost 3 vext2 <3,4,3,6>, <3,3,3,3> + 2632780290U, // <3,6,3,4>: Cost 3 vext2 <2,5,3,6>, <3,4,5,6> + 3712494145U, // <3,6,3,5>: Cost 4 vext2 <3,5,3,6>, <3,5,3,6> + 3698559612U, // <3,6,3,6>: Cost 4 vext2 <1,2,3,6>, <3,6,1,2> + 2959674678U, // <3,6,3,7>: Cost 3 vzipr <1,1,3,3>, RHS + 2959674679U, // <3,6,3,u>: Cost 3 vzipr <1,1,3,3>, RHS + 3763581536U, // <3,6,4,0>: Cost 4 vext3 LHS, <6,4,0,6> + 2722943590U, // <3,6,4,1>: Cost 3 vext3 <6,4,1,3>, <6,4,1,3> + 2732970609U, // <3,6,4,2>: Cost 3 vext3 LHS, <6,4,2,5> + 3698560147U, // <3,6,4,3>: Cost 4 vext2 <1,2,3,6>, <4,3,6,6> + 2732970628U, // <3,6,4,4>: Cost 3 vext3 LHS, <6,4,4,6> + 2689839757U, // <3,6,4,5>: Cost 3 vext3 LHS, <6,4,5,6> + 2732970640U, // <3,6,4,6>: Cost 3 vext3 LHS, <6,4,6,0> + 2960346422U, // <3,6,4,7>: Cost 3 vzipr <1,2,3,4>, RHS + 2689839784U, // <3,6,4,u>: Cost 3 vext3 LHS, <6,4,u,6> + 2576498790U, // <3,6,5,0>: Cost 3 vext1 <4,3,6,5>, LHS + 3650241270U, // <3,6,5,1>: Cost 4 vext1 <4,3,6,5>, <1,0,3,2> + 2732970692U, // <3,6,5,2>: Cost 3 vext3 LHS, <6,5,2,7> + 2576501250U, // <3,6,5,3>: Cost 3 vext1 <4,3,6,5>, <3,4,5,6> + 2576501906U, // <3,6,5,4>: Cost 3 vext1 <4,3,6,5>, <4,3,6,5> + 3650244622U, // <3,6,5,5>: Cost 4 vext1 <4,3,6,5>, <5,5,6,6> + 4114633528U, // <3,6,5,6>: Cost 4 vtrnl <3,4,5,6>, <6,6,6,6> + 2732970735U, // <3,6,5,7>: Cost 3 vext3 LHS, <6,5,7,5> + 2576504622U, // <3,6,5,u>: Cost 3 vext1 <4,3,6,5>, LHS + 2732970749U, // <3,6,6,0>: Cost 3 vext3 LHS, <6,6,0,1> + 2724270856U, // <3,6,6,1>: Cost 3 vext3 <6,6,1,3>, <6,6,1,3> + 2624819706U, // <3,6,6,2>: Cost 3 vext2 <1,2,3,6>, <6,2,7,3> + 3656223234U, // <3,6,6,3>: Cost 4 vext1 <5,3,6,6>, 
<3,4,5,6> + 2732970788U, // <3,6,6,4>: Cost 3 vext3 LHS, <6,6,4,4> + 2732970800U, // <3,6,6,5>: Cost 3 vext3 LHS, <6,6,5,7> + 1659228984U, // <3,6,6,6>: Cost 2 vext3 LHS, <6,6,6,6> + 1659228994U, // <3,6,6,7>: Cost 2 vext3 LHS, <6,6,7,7> + 1659229003U, // <3,6,6,u>: Cost 2 vext3 LHS, <6,6,u,7> + 1659229006U, // <3,6,7,0>: Cost 2 vext3 LHS, <6,7,0,1> + 2558600201U, // <3,6,7,1>: Cost 3 vext1 <1,3,6,7>, <1,3,6,7> + 2558601146U, // <3,6,7,2>: Cost 3 vext1 <1,3,6,7>, <2,6,3,7> + 2725081963U, // <3,6,7,3>: Cost 3 vext3 <6,7,3,3>, <6,7,3,3> + 1659229046U, // <3,6,7,4>: Cost 2 vext3 LHS, <6,7,4,5> + 2715423611U, // <3,6,7,5>: Cost 3 vext3 <5,1,7,3>, <6,7,5,1> + 2722059141U, // <3,6,7,6>: Cost 3 vext3 <6,2,7,3>, <6,7,6,2> + 2962361654U, // <3,6,7,7>: Cost 3 vzipr <1,5,3,7>, RHS + 1659229078U, // <3,6,7,u>: Cost 2 vext3 LHS, <6,7,u,1> + 1659229087U, // <3,6,u,0>: Cost 2 vext3 LHS, <6,u,0,1> + 2689840041U, // <3,6,u,1>: Cost 3 vext3 LHS, <6,u,1,2> + 2558609339U, // <3,6,u,2>: Cost 3 vext1 <1,3,6,u>, <2,6,3,u> + 2576525853U, // <3,6,u,3>: Cost 3 vext1 <4,3,6,u>, <3,4,u,6> + 1659229127U, // <3,6,u,4>: Cost 2 vext3 LHS, <6,u,4,5> + 2689840081U, // <3,6,u,5>: Cost 3 vext3 LHS, <6,u,5,6> + 1659228984U, // <3,6,u,6>: Cost 2 vext3 LHS, <6,6,6,6> + 1652298720U, // <3,6,u,7>: Cost 2 vext3 <6,u,7,3>, <6,u,7,3> + 1659229159U, // <3,6,u,u>: Cost 2 vext3 LHS, <6,u,u,1> + 2626813952U, // <3,7,0,0>: Cost 3 vext2 <1,5,3,7>, <0,0,0,0> + 1553072230U, // <3,7,0,1>: Cost 2 vext2 <1,5,3,7>, LHS + 2626814116U, // <3,7,0,2>: Cost 3 vext2 <1,5,3,7>, <0,2,0,2> + 3700556028U, // <3,7,0,3>: Cost 4 vext2 <1,5,3,7>, <0,3,1,0> + 2626814290U, // <3,7,0,4>: Cost 3 vext2 <1,5,3,7>, <0,4,1,5> + 2582507375U, // <3,7,0,5>: Cost 3 vext1 <5,3,7,0>, <5,3,7,0> + 2588480072U, // <3,7,0,6>: Cost 3 vext1 <6,3,7,0>, <6,3,7,0> + 2732971055U, // <3,7,0,7>: Cost 3 vext3 LHS, <7,0,7,1> + 1553072797U, // <3,7,0,u>: Cost 2 vext2 <1,5,3,7>, LHS + 2626814710U, // <3,7,1,0>: Cost 3 vext2 <1,5,3,7>, <1,0,3,2> + 2626814772U, // <3,7,1,1>: Cost 3 vext2 <1,5,3,7>, <1,1,1,1> + 2626814870U, // <3,7,1,2>: Cost 3 vext2 <1,5,3,7>, <1,2,3,0> + 2625487854U, // <3,7,1,3>: Cost 3 vext2 <1,3,3,7>, <1,3,3,7> + 2582514998U, // <3,7,1,4>: Cost 3 vext1 <5,3,7,1>, RHS + 1553073296U, // <3,7,1,5>: Cost 2 vext2 <1,5,3,7>, <1,5,3,7> + 2627478753U, // <3,7,1,6>: Cost 3 vext2 <1,6,3,7>, <1,6,3,7> + 2727367810U, // <3,7,1,7>: Cost 3 vext3 <7,1,7,3>, <7,1,7,3> + 1555064195U, // <3,7,1,u>: Cost 2 vext2 <1,u,3,7>, <1,u,3,7> + 2588491878U, // <3,7,2,0>: Cost 3 vext1 <6,3,7,2>, LHS + 3700557318U, // <3,7,2,1>: Cost 4 vext2 <1,5,3,7>, <2,1,0,3> + 2626815592U, // <3,7,2,2>: Cost 3 vext2 <1,5,3,7>, <2,2,2,2> + 2626815654U, // <3,7,2,3>: Cost 3 vext2 <1,5,3,7>, <2,3,0,1> + 2588495158U, // <3,7,2,4>: Cost 3 vext1 <6,3,7,2>, RHS + 2632787817U, // <3,7,2,5>: Cost 3 vext2 <2,5,3,7>, <2,5,3,7> + 1559709626U, // <3,7,2,6>: Cost 2 vext2 <2,6,3,7>, <2,6,3,7> + 2728031443U, // <3,7,2,7>: Cost 3 vext3 <7,2,7,3>, <7,2,7,3> + 1561036892U, // <3,7,2,u>: Cost 2 vext2 <2,u,3,7>, <2,u,3,7> + 2626816150U, // <3,7,3,0>: Cost 3 vext2 <1,5,3,7>, <3,0,1,2> + 2626816268U, // <3,7,3,1>: Cost 3 vext2 <1,5,3,7>, <3,1,5,3> + 2633451878U, // <3,7,3,2>: Cost 3 vext2 <2,6,3,7>, <3,2,6,3> + 2626816412U, // <3,7,3,3>: Cost 3 vext2 <1,5,3,7>, <3,3,3,3> + 2626816514U, // <3,7,3,4>: Cost 3 vext2 <1,5,3,7>, <3,4,5,6> + 2638760514U, // <3,7,3,5>: Cost 3 vext2 <3,5,3,7>, <3,5,3,7> + 2639424147U, // <3,7,3,6>: Cost 3 vext2 <3,6,3,7>, <3,6,3,7> + 2826961920U, // <3,7,3,7>: Cost 3 vuzpr <1,3,5,7>, <1,3,5,7> + 2626816798U, 
// <3,7,3,u>: Cost 3 vext2 <1,5,3,7>, <3,u,1,2> + 2582536294U, // <3,7,4,0>: Cost 3 vext1 <5,3,7,4>, LHS + 2582537360U, // <3,7,4,1>: Cost 3 vext1 <5,3,7,4>, <1,5,3,7> + 2588510138U, // <3,7,4,2>: Cost 3 vext1 <6,3,7,4>, <2,6,3,7> + 3700558996U, // <3,7,4,3>: Cost 4 vext2 <1,5,3,7>, <4,3,6,7> + 2582539574U, // <3,7,4,4>: Cost 3 vext1 <5,3,7,4>, RHS + 1553075510U, // <3,7,4,5>: Cost 2 vext2 <1,5,3,7>, RHS + 2588512844U, // <3,7,4,6>: Cost 3 vext1 <6,3,7,4>, <6,3,7,4> + 2564625766U, // <3,7,4,7>: Cost 3 vext1 <2,3,7,4>, <7,4,5,6> + 1553075753U, // <3,7,4,u>: Cost 2 vext2 <1,5,3,7>, RHS + 2732971398U, // <3,7,5,0>: Cost 3 vext3 LHS, <7,5,0,2> + 2626817744U, // <3,7,5,1>: Cost 3 vext2 <1,5,3,7>, <5,1,7,3> + 3700559649U, // <3,7,5,2>: Cost 4 vext2 <1,5,3,7>, <5,2,7,3> + 2626817903U, // <3,7,5,3>: Cost 3 vext2 <1,5,3,7>, <5,3,7,0> + 2258728203U, // <3,7,5,4>: Cost 3 vrev <7,3,4,5> + 2732971446U, // <3,7,5,5>: Cost 3 vext3 LHS, <7,5,5,5> + 2732971457U, // <3,7,5,6>: Cost 3 vext3 LHS, <7,5,6,7> + 2826964278U, // <3,7,5,7>: Cost 3 vuzpr <1,3,5,7>, RHS + 2826964279U, // <3,7,5,u>: Cost 3 vuzpr <1,3,5,7>, RHS + 2732971478U, // <3,7,6,0>: Cost 3 vext3 LHS, <7,6,0,1> + 2732971486U, // <3,7,6,1>: Cost 3 vext3 LHS, <7,6,1,0> + 2633454074U, // <3,7,6,2>: Cost 3 vext2 <2,6,3,7>, <6,2,7,3> + 2633454152U, // <3,7,6,3>: Cost 3 vext2 <2,6,3,7>, <6,3,7,0> + 2732971518U, // <3,7,6,4>: Cost 3 vext3 LHS, <7,6,4,5> + 2732971526U, // <3,7,6,5>: Cost 3 vext3 LHS, <7,6,5,4> + 2732971537U, // <3,7,6,6>: Cost 3 vext3 LHS, <7,6,6,6> + 2732971540U, // <3,7,6,7>: Cost 3 vext3 LHS, <7,6,7,0> + 2726041124U, // <3,7,6,u>: Cost 3 vext3 <6,u,7,3>, <7,6,u,7> + 2570616934U, // <3,7,7,0>: Cost 3 vext1 <3,3,7,7>, LHS + 2570617856U, // <3,7,7,1>: Cost 3 vext1 <3,3,7,7>, <1,3,5,7> + 2564646635U, // <3,7,7,2>: Cost 3 vext1 <2,3,7,7>, <2,3,7,7> + 2570619332U, // <3,7,7,3>: Cost 3 vext1 <3,3,7,7>, <3,3,7,7> + 2570620214U, // <3,7,7,4>: Cost 3 vext1 <3,3,7,7>, RHS + 2582564726U, // <3,7,7,5>: Cost 3 vext1 <5,3,7,7>, <5,3,7,7> + 2588537423U, // <3,7,7,6>: Cost 3 vext1 <6,3,7,7>, <6,3,7,7> + 1659229804U, // <3,7,7,7>: Cost 2 vext3 LHS, <7,7,7,7> + 1659229804U, // <3,7,7,u>: Cost 2 vext3 LHS, <7,7,7,7> + 2626819795U, // <3,7,u,0>: Cost 3 vext2 <1,5,3,7>, + 1553078062U, // <3,7,u,1>: Cost 2 vext2 <1,5,3,7>, LHS + 2626819973U, // <3,7,u,2>: Cost 3 vext2 <1,5,3,7>, + 2826961565U, // <3,7,u,3>: Cost 3 vuzpr <1,3,5,7>, LHS + 2626820159U, // <3,7,u,4>: Cost 3 vext2 <1,5,3,7>, + 1553078426U, // <3,7,u,5>: Cost 2 vext2 <1,5,3,7>, RHS + 1595545808U, // <3,7,u,6>: Cost 2 vext2 , + 1659229804U, // <3,7,u,7>: Cost 2 vext3 LHS, <7,7,7,7> + 1553078629U, // <3,7,u,u>: Cost 2 vext2 <1,5,3,7>, LHS + 1611448320U, // <3,u,0,0>: Cost 2 vext3 LHS, <0,0,0,0> + 1611896531U, // <3,u,0,1>: Cost 2 vext3 LHS, + 1659672284U, // <3,u,0,2>: Cost 2 vext3 LHS, + 1616099045U, // <3,u,0,3>: Cost 2 vext3 LHS, + 2685638381U, // <3,u,0,4>: Cost 3 vext3 LHS, + 1663874806U, // <3,u,0,5>: Cost 2 vext3 LHS, + 1663874816U, // <3,u,0,6>: Cost 2 vext3 LHS, + 2960313672U, // <3,u,0,7>: Cost 3 vzipr <1,2,3,0>, RHS + 1611896594U, // <3,u,0,u>: Cost 2 vext3 LHS, + 1549763324U, // <3,u,1,0>: Cost 2 vext2 <1,0,3,u>, <1,0,3,u> + 1550426957U, // <3,u,1,1>: Cost 2 vext2 <1,1,3,u>, <1,1,3,u> + 537712430U, // <3,u,1,2>: Cost 1 vext3 LHS, LHS + 1616541495U, // <3,u,1,3>: Cost 2 vext3 LHS, + 1490930998U, // <3,u,1,4>: Cost 2 vext1 <2,3,u,1>, RHS + 1553081489U, // <3,u,1,5>: Cost 2 vext2 <1,5,3,u>, <1,5,3,u> + 2627486946U, // <3,u,1,6>: Cost 3 vext2 <1,6,3,u>, <1,6,3,u> + 1659230043U, // 
<3,u,1,7>: Cost 2 vext3 LHS, + 537712484U, // <3,u,1,u>: Cost 1 vext3 LHS, LHS + 1611890852U, // <3,u,2,0>: Cost 2 vext3 LHS, <0,2,0,2> + 2624833102U, // <3,u,2,1>: Cost 3 vext2 <1,2,3,u>, <2,1,u,3> + 1557063287U, // <3,u,2,2>: Cost 2 vext2 <2,2,3,u>, <2,2,3,u> + 1616099205U, // <3,u,2,3>: Cost 2 vext3 LHS, + 1611890892U, // <3,u,2,4>: Cost 2 vext3 LHS, <0,2,4,6> + 2689841054U, // <3,u,2,5>: Cost 3 vext3 LHS, + 1559717819U, // <3,u,2,6>: Cost 2 vext2 <2,6,3,u>, <2,6,3,u> + 1659230124U, // <3,u,2,7>: Cost 2 vext3 LHS, + 1616541618U, // <3,u,2,u>: Cost 2 vext3 LHS, + 1611896764U, // <3,u,3,0>: Cost 2 vext3 LHS, + 1484973079U, // <3,u,3,1>: Cost 2 vext1 <1,3,u,3>, <1,3,u,3> + 2685638607U, // <3,u,3,2>: Cost 3 vext3 LHS, + 336380006U, // <3,u,3,3>: Cost 1 vdup3 LHS + 1611896804U, // <3,u,3,4>: Cost 2 vext3 LHS, + 1616541679U, // <3,u,3,5>: Cost 2 vext3 LHS, + 2690283512U, // <3,u,3,6>: Cost 3 vext3 LHS, + 2959674696U, // <3,u,3,7>: Cost 3 vzipr <1,1,3,3>, RHS + 336380006U, // <3,u,3,u>: Cost 1 vdup3 LHS + 2558722150U, // <3,u,4,0>: Cost 3 vext1 <1,3,u,4>, LHS + 1659672602U, // <3,u,4,1>: Cost 2 vext3 LHS, + 1659672612U, // <3,u,4,2>: Cost 2 vext3 LHS, + 2689841196U, // <3,u,4,3>: Cost 3 vext3 LHS, + 1659227344U, // <3,u,4,4>: Cost 2 vext3 LHS, <4,4,4,4> + 1611896895U, // <3,u,4,5>: Cost 2 vext3 LHS, + 1663875144U, // <3,u,4,6>: Cost 2 vext3 LHS, + 1659230289U, // <3,u,4,7>: Cost 2 vext3 LHS, + 1611896922U, // <3,u,4,u>: Cost 2 vext3 LHS, + 1490960486U, // <3,u,5,0>: Cost 2 vext1 <2,3,u,5>, LHS + 2689841261U, // <3,u,5,1>: Cost 3 vext3 LHS, + 1490962162U, // <3,u,5,2>: Cost 2 vext1 <2,3,u,5>, <2,3,u,5> + 1616541823U, // <3,u,5,3>: Cost 2 vext3 LHS, + 1490963766U, // <3,u,5,4>: Cost 2 vext1 <2,3,u,5>, RHS + 1659228164U, // <3,u,5,5>: Cost 2 vext3 LHS, <5,5,5,5> + 537712794U, // <3,u,5,6>: Cost 1 vext3 LHS, RHS + 1659230371U, // <3,u,5,7>: Cost 2 vext3 LHS, + 537712812U, // <3,u,5,u>: Cost 1 vext3 LHS, RHS + 2689841327U, // <3,u,6,0>: Cost 3 vext3 LHS, + 2558739482U, // <3,u,6,1>: Cost 3 vext1 <1,3,u,6>, <1,3,u,6> + 2689841351U, // <3,u,6,2>: Cost 3 vext3 LHS, + 1616099536U, // <3,u,6,3>: Cost 2 vext3 LHS, + 1659227508U, // <3,u,6,4>: Cost 2 vext3 LHS, <4,6,4,6> + 2690283746U, // <3,u,6,5>: Cost 3 vext3 LHS, + 1659228984U, // <3,u,6,6>: Cost 2 vext3 LHS, <6,6,6,6> + 1659230445U, // <3,u,6,7>: Cost 2 vext3 LHS, + 1616099581U, // <3,u,6,u>: Cost 2 vext3 LHS, + 1485004902U, // <3,u,7,0>: Cost 2 vext1 <1,3,u,7>, LHS + 1485005851U, // <3,u,7,1>: Cost 2 vext1 <1,3,u,7>, <1,3,u,7> + 2558748264U, // <3,u,7,2>: Cost 3 vext1 <1,3,u,7>, <2,2,2,2> + 3095397021U, // <3,u,7,3>: Cost 3 vtrnr <1,3,5,7>, LHS + 1485008182U, // <3,u,7,4>: Cost 2 vext1 <1,3,u,7>, RHS + 1659228328U, // <3,u,7,5>: Cost 2 vext3 LHS, <5,7,5,7> + 2722060599U, // <3,u,7,6>: Cost 3 vext3 <6,2,7,3>, + 1659229804U, // <3,u,7,7>: Cost 2 vext3 LHS, <7,7,7,7> + 1485010734U, // <3,u,7,u>: Cost 2 vext1 <1,3,u,7>, LHS + 1616099665U, // <3,u,u,0>: Cost 2 vext3 LHS, + 1611897179U, // <3,u,u,1>: Cost 2 vext3 LHS, + 537712997U, // <3,u,u,2>: Cost 1 vext3 LHS, LHS + 336380006U, // <3,u,u,3>: Cost 1 vdup3 LHS + 1616099705U, // <3,u,u,4>: Cost 2 vext3 LHS, + 1611897219U, // <3,u,u,5>: Cost 2 vext3 LHS, + 537713037U, // <3,u,u,6>: Cost 1 vext3 LHS, RHS + 1659230607U, // <3,u,u,7>: Cost 2 vext3 LHS, + 537713051U, // <3,u,u,u>: Cost 1 vext3 LHS, LHS + 2691907584U, // <4,0,0,0>: Cost 3 vext3 <1,2,3,4>, <0,0,0,0> + 2691907594U, // <4,0,0,1>: Cost 3 vext3 <1,2,3,4>, <0,0,1,1> + 2691907604U, // <4,0,0,2>: Cost 3 vext3 <1,2,3,4>, <0,0,2,2> + 3709862144U, // 
<4,0,0,3>: Cost 4 vext2 <3,1,4,0>, <0,3,1,4> + 2684682280U, // <4,0,0,4>: Cost 3 vext3 <0,0,4,4>, <0,0,4,4> + 3694600633U, // <4,0,0,5>: Cost 4 vext2 <0,5,4,0>, <0,5,4,0> + 3291431290U, // <4,0,0,6>: Cost 4 vrev <0,4,6,0> + 3668342067U, // <4,0,0,7>: Cost 4 vext1 <7,4,0,0>, <7,4,0,0> + 2691907657U, // <4,0,0,u>: Cost 3 vext3 <1,2,3,4>, <0,0,u,1> + 2570715238U, // <4,0,1,0>: Cost 3 vext1 <3,4,0,1>, LHS + 2570716058U, // <4,0,1,1>: Cost 3 vext1 <3,4,0,1>, <1,2,3,4> + 1618165862U, // <4,0,1,2>: Cost 2 vext3 <1,2,3,4>, LHS + 2570717648U, // <4,0,1,3>: Cost 3 vext1 <3,4,0,1>, <3,4,0,1> + 2570718518U, // <4,0,1,4>: Cost 3 vext1 <3,4,0,1>, RHS + 2594607206U, // <4,0,1,5>: Cost 3 vext1 <7,4,0,1>, <5,6,7,4> + 3662377563U, // <4,0,1,6>: Cost 4 vext1 <6,4,0,1>, <6,4,0,1> + 2594608436U, // <4,0,1,7>: Cost 3 vext1 <7,4,0,1>, <7,4,0,1> + 1618165916U, // <4,0,1,u>: Cost 2 vext3 <1,2,3,4>, LHS + 2685714598U, // <4,0,2,0>: Cost 3 vext3 <0,2,0,4>, <0,2,0,4> + 3759530159U, // <4,0,2,1>: Cost 4 vext3 <0,2,1,4>, <0,2,1,4> + 2685862072U, // <4,0,2,2>: Cost 3 vext3 <0,2,2,4>, <0,2,2,4> + 2631476937U, // <4,0,2,3>: Cost 3 vext2 <2,3,4,0>, <2,3,4,0> + 2685714636U, // <4,0,2,4>: Cost 3 vext3 <0,2,0,4>, <0,2,4,6> + 3765649622U, // <4,0,2,5>: Cost 4 vext3 <1,2,3,4>, <0,2,5,7> + 2686157020U, // <4,0,2,6>: Cost 3 vext3 <0,2,6,4>, <0,2,6,4> + 3668358453U, // <4,0,2,7>: Cost 4 vext1 <7,4,0,2>, <7,4,0,2> + 2686304494U, // <4,0,2,u>: Cost 3 vext3 <0,2,u,4>, <0,2,u,4> + 3632529510U, // <4,0,3,0>: Cost 4 vext1 <1,4,0,3>, LHS + 2686451968U, // <4,0,3,1>: Cost 3 vext3 <0,3,1,4>, <0,3,1,4> + 2686525705U, // <4,0,3,2>: Cost 3 vext3 <0,3,2,4>, <0,3,2,4> + 3760341266U, // <4,0,3,3>: Cost 4 vext3 <0,3,3,4>, <0,3,3,4> + 3632532790U, // <4,0,3,4>: Cost 4 vext1 <1,4,0,3>, RHS + 3913254606U, // <4,0,3,5>: Cost 4 vuzpr <3,4,5,0>, <2,3,4,5> + 3705219740U, // <4,0,3,6>: Cost 4 vext2 <2,3,4,0>, <3,6,4,7> + 3713845990U, // <4,0,3,7>: Cost 4 vext2 <3,7,4,0>, <3,7,4,0> + 2686451968U, // <4,0,3,u>: Cost 3 vext3 <0,3,1,4>, <0,3,1,4> + 2552823910U, // <4,0,4,0>: Cost 3 vext1 <0,4,0,4>, LHS + 2691907922U, // <4,0,4,1>: Cost 3 vext3 <1,2,3,4>, <0,4,1,5> + 2691907932U, // <4,0,4,2>: Cost 3 vext3 <1,2,3,4>, <0,4,2,6> + 3626567830U, // <4,0,4,3>: Cost 4 vext1 <0,4,0,4>, <3,0,1,2> + 2552827190U, // <4,0,4,4>: Cost 3 vext1 <0,4,0,4>, RHS + 2631478582U, // <4,0,4,5>: Cost 3 vext2 <2,3,4,0>, RHS + 3626570017U, // <4,0,4,6>: Cost 4 vext1 <0,4,0,4>, <6,0,1,2> + 3668374839U, // <4,0,4,7>: Cost 4 vext1 <7,4,0,4>, <7,4,0,4> + 2552829742U, // <4,0,4,u>: Cost 3 vext1 <0,4,0,4>, LHS + 2558804070U, // <4,0,5,0>: Cost 3 vext1 <1,4,0,5>, LHS + 1839644774U, // <4,0,5,1>: Cost 2 vzipl RHS, LHS + 2913386660U, // <4,0,5,2>: Cost 3 vzipl RHS, <0,2,0,2> + 2570750420U, // <4,0,5,3>: Cost 3 vext1 <3,4,0,5>, <3,4,0,5> + 2558807350U, // <4,0,5,4>: Cost 3 vext1 <1,4,0,5>, RHS + 3987128750U, // <4,0,5,5>: Cost 4 vzipl RHS, <0,5,2,7> + 3987128822U, // <4,0,5,6>: Cost 4 vzipl RHS, <0,6,1,7> + 2594641208U, // <4,0,5,7>: Cost 3 vext1 <7,4,0,5>, <7,4,0,5> + 1839645341U, // <4,0,5,u>: Cost 2 vzipl RHS, LHS + 2552840294U, // <4,0,6,0>: Cost 3 vext1 <0,4,0,6>, LHS + 3047604234U, // <4,0,6,1>: Cost 3 vtrnl RHS, <0,0,1,1> + 1973862502U, // <4,0,6,2>: Cost 2 vtrnl RHS, LHS + 2570758613U, // <4,0,6,3>: Cost 3 vext1 <3,4,0,6>, <3,4,0,6> + 2552843574U, // <4,0,6,4>: Cost 3 vext1 <0,4,0,6>, RHS + 2217664887U, // <4,0,6,5>: Cost 3 vrev <0,4,5,6> + 3662418528U, // <4,0,6,6>: Cost 4 vext1 <6,4,0,6>, <6,4,0,6> + 2658022257U, // <4,0,6,7>: Cost 3 vext2 <6,7,4,0>, <6,7,4,0> + 1973862556U, // 
<4,0,6,u>: Cost 2 vtrnl RHS, LHS + 3731764218U, // <4,0,7,0>: Cost 4 vext2 <6,7,4,0>, <7,0,1,2> + 3988324454U, // <4,0,7,1>: Cost 4 vzipl <4,7,5,0>, LHS + 4122034278U, // <4,0,7,2>: Cost 4 vtrnl <4,6,7,1>, LHS + 3735082246U, // <4,0,7,3>: Cost 4 vext2 <7,3,4,0>, <7,3,4,0> + 3731764536U, // <4,0,7,4>: Cost 4 vext2 <6,7,4,0>, <7,4,0,5> + 3937145718U, // <4,0,7,5>: Cost 4 vuzpr <7,4,5,0>, <6,7,4,5> + 3737073145U, // <4,0,7,6>: Cost 4 vext2 <7,6,4,0>, <7,6,4,0> + 3731764844U, // <4,0,7,7>: Cost 4 vext2 <6,7,4,0>, <7,7,7,7> + 4122034332U, // <4,0,7,u>: Cost 4 vtrnl <4,6,7,1>, LHS + 2552856678U, // <4,0,u,0>: Cost 3 vext1 <0,4,0,u>, LHS + 1841635430U, // <4,0,u,1>: Cost 2 vzipl RHS, LHS + 1618166429U, // <4,0,u,2>: Cost 2 vext3 <1,2,3,4>, LHS + 2570774999U, // <4,0,u,3>: Cost 3 vext1 <3,4,0,u>, <3,4,0,u> + 2552859958U, // <4,0,u,4>: Cost 3 vext1 <0,4,0,u>, RHS + 2631481498U, // <4,0,u,5>: Cost 3 vext2 <2,3,4,0>, RHS + 2686157020U, // <4,0,u,6>: Cost 3 vext3 <0,2,6,4>, <0,2,6,4> + 2594665787U, // <4,0,u,7>: Cost 3 vext1 <7,4,0,u>, <7,4,0,u> + 1618166483U, // <4,0,u,u>: Cost 2 vext3 <1,2,3,4>, LHS + 2617548837U, // <4,1,0,0>: Cost 3 vext2 <0,0,4,1>, <0,0,4,1> + 2622857318U, // <4,1,0,1>: Cost 3 vext2 <0,u,4,1>, LHS + 3693281484U, // <4,1,0,2>: Cost 4 vext2 <0,3,4,1>, <0,2,4,6> + 2691908342U, // <4,1,0,3>: Cost 3 vext3 <1,2,3,4>, <1,0,3,2> + 2622857554U, // <4,1,0,4>: Cost 3 vext2 <0,u,4,1>, <0,4,1,5> + 3764470538U, // <4,1,0,5>: Cost 4 vext3 <1,0,5,4>, <1,0,5,4> + 3695272459U, // <4,1,0,6>: Cost 4 vext2 <0,6,4,1>, <0,6,4,1> + 3733094980U, // <4,1,0,7>: Cost 4 vext2 <7,0,4,1>, <0,7,1,4> + 2622857885U, // <4,1,0,u>: Cost 3 vext2 <0,u,4,1>, LHS + 3696599798U, // <4,1,1,0>: Cost 4 vext2 <0,u,4,1>, <1,0,3,2> + 2691097399U, // <4,1,1,1>: Cost 3 vext3 <1,1,1,4>, <1,1,1,4> + 2631484314U, // <4,1,1,2>: Cost 3 vext2 <2,3,4,1>, <1,2,3,4> + 2691908424U, // <4,1,1,3>: Cost 3 vext3 <1,2,3,4>, <1,1,3,3> + 3696600125U, // <4,1,1,4>: Cost 4 vext2 <0,u,4,1>, <1,4,3,5> + 3696600175U, // <4,1,1,5>: Cost 4 vext2 <0,u,4,1>, <1,5,0,1> + 3696600307U, // <4,1,1,6>: Cost 4 vext2 <0,u,4,1>, <1,6,5,7> + 3668423997U, // <4,1,1,7>: Cost 4 vext1 <7,4,1,1>, <7,4,1,1> + 2691908469U, // <4,1,1,u>: Cost 3 vext3 <1,2,3,4>, <1,1,u,3> + 2570797158U, // <4,1,2,0>: Cost 3 vext1 <3,4,1,2>, LHS + 2570797978U, // <4,1,2,1>: Cost 3 vext1 <3,4,1,2>, <1,2,3,4> + 3696600680U, // <4,1,2,2>: Cost 4 vext2 <0,u,4,1>, <2,2,2,2> + 1618166682U, // <4,1,2,3>: Cost 2 vext3 <1,2,3,4>, <1,2,3,4> + 2570800438U, // <4,1,2,4>: Cost 3 vext1 <3,4,1,2>, RHS + 3765650347U, // <4,1,2,5>: Cost 4 vext3 <1,2,3,4>, <1,2,5,3> + 3696601018U, // <4,1,2,6>: Cost 4 vext2 <0,u,4,1>, <2,6,3,7> + 3668432190U, // <4,1,2,7>: Cost 4 vext1 <7,4,1,2>, <7,4,1,2> + 1618535367U, // <4,1,2,u>: Cost 2 vext3 <1,2,u,4>, <1,2,u,4> + 2564833382U, // <4,1,3,0>: Cost 3 vext1 <2,4,1,3>, LHS + 2691908568U, // <4,1,3,1>: Cost 3 vext3 <1,2,3,4>, <1,3,1,3> + 2691908578U, // <4,1,3,2>: Cost 3 vext3 <1,2,3,4>, <1,3,2,4> + 2692572139U, // <4,1,3,3>: Cost 3 vext3 <1,3,3,4>, <1,3,3,4> + 2564836662U, // <4,1,3,4>: Cost 3 vext1 <2,4,1,3>, RHS + 2691908608U, // <4,1,3,5>: Cost 3 vext3 <1,2,3,4>, <1,3,5,7> + 2588725862U, // <4,1,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3> + 3662468090U, // <4,1,3,7>: Cost 4 vext1 <6,4,1,3>, <7,0,1,2> + 2691908631U, // <4,1,3,u>: Cost 3 vext3 <1,2,3,4>, <1,3,u,3> + 3760194590U, // <4,1,4,0>: Cost 4 vext3 <0,3,1,4>, <1,4,0,1> + 3693947874U, // <4,1,4,1>: Cost 4 vext2 <0,4,4,1>, <4,1,5,0> + 3765650484U, // <4,1,4,2>: Cost 4 vext3 <1,2,3,4>, <1,4,2,5> + 3113877606U, // 
<4,1,4,3>: Cost 3 vtrnr <4,4,4,4>, LHS + 3760194630U, // <4,1,4,4>: Cost 4 vext3 <0,3,1,4>, <1,4,4,5> + 2622860598U, // <4,1,4,5>: Cost 3 vext2 <0,u,4,1>, RHS + 3297436759U, // <4,1,4,6>: Cost 4 vrev <1,4,6,4> + 3800007772U, // <4,1,4,7>: Cost 4 vext3 <7,0,1,4>, <1,4,7,0> + 2622860841U, // <4,1,4,u>: Cost 3 vext2 <0,u,4,1>, RHS + 1479164006U, // <4,1,5,0>: Cost 2 vext1 <0,4,1,5>, LHS + 2552906486U, // <4,1,5,1>: Cost 3 vext1 <0,4,1,5>, <1,0,3,2> + 2552907299U, // <4,1,5,2>: Cost 3 vext1 <0,4,1,5>, <2,1,3,5> + 2552907926U, // <4,1,5,3>: Cost 3 vext1 <0,4,1,5>, <3,0,1,2> + 1479167286U, // <4,1,5,4>: Cost 2 vext1 <0,4,1,5>, RHS + 2913387664U, // <4,1,5,5>: Cost 3 vzipl RHS, <1,5,3,7> + 2600686074U, // <4,1,5,6>: Cost 3 vext1 , <6,2,7,3> + 2600686586U, // <4,1,5,7>: Cost 3 vext1 , <7,0,1,2> + 1479169838U, // <4,1,5,u>: Cost 2 vext1 <0,4,1,5>, LHS + 2552914022U, // <4,1,6,0>: Cost 3 vext1 <0,4,1,6>, LHS + 2558886708U, // <4,1,6,1>: Cost 3 vext1 <1,4,1,6>, <1,1,1,1> + 4028205206U, // <4,1,6,2>: Cost 4 vzipr <0,2,4,6>, <3,0,1,2> + 3089858662U, // <4,1,6,3>: Cost 3 vtrnr <0,4,2,6>, LHS + 2552917302U, // <4,1,6,4>: Cost 3 vext1 <0,4,1,6>, RHS + 2223637584U, // <4,1,6,5>: Cost 3 vrev <1,4,5,6> + 4121347081U, // <4,1,6,6>: Cost 4 vtrnl RHS, <1,3,6,7> + 3721155406U, // <4,1,6,7>: Cost 4 vext2 <5,0,4,1>, <6,7,0,1> + 2552919854U, // <4,1,6,u>: Cost 3 vext1 <0,4,1,6>, LHS + 2659357716U, // <4,1,7,0>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1> + 3733763173U, // <4,1,7,1>: Cost 4 vext2 <7,1,4,1>, <7,1,4,1> + 3734426806U, // <4,1,7,2>: Cost 4 vext2 <7,2,4,1>, <7,2,4,1> + 2695226671U, // <4,1,7,3>: Cost 3 vext3 <1,7,3,4>, <1,7,3,4> + 3721155942U, // <4,1,7,4>: Cost 4 vext2 <5,0,4,1>, <7,4,5,6> + 3721155976U, // <4,1,7,5>: Cost 4 vext2 <5,0,4,1>, <7,5,0,4> + 3662500458U, // <4,1,7,6>: Cost 4 vext1 <6,4,1,7>, <6,4,1,7> + 3721156204U, // <4,1,7,7>: Cost 4 vext2 <5,0,4,1>, <7,7,7,7> + 2659357716U, // <4,1,7,u>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1> + 1479188582U, // <4,1,u,0>: Cost 2 vext1 <0,4,1,u>, LHS + 2552931062U, // <4,1,u,1>: Cost 3 vext1 <0,4,1,u>, <1,0,3,2> + 2552931944U, // <4,1,u,2>: Cost 3 vext1 <0,4,1,u>, <2,2,2,2> + 1622148480U, // <4,1,u,3>: Cost 2 vext3 <1,u,3,4>, <1,u,3,4> + 1479191862U, // <4,1,u,4>: Cost 2 vext1 <0,4,1,u>, RHS + 2622863514U, // <4,1,u,5>: Cost 3 vext2 <0,u,4,1>, RHS + 2588725862U, // <4,1,u,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3> + 2600686586U, // <4,1,u,7>: Cost 3 vext1 , <7,0,1,2> + 1479194414U, // <4,1,u,u>: Cost 2 vext1 <0,4,1,u>, LHS + 2617557030U, // <4,2,0,0>: Cost 3 vext2 <0,0,4,2>, <0,0,4,2> + 2622865510U, // <4,2,0,1>: Cost 3 vext2 <0,u,4,2>, LHS + 2622865612U, // <4,2,0,2>: Cost 3 vext2 <0,u,4,2>, <0,2,4,6> + 3693289753U, // <4,2,0,3>: Cost 4 vext2 <0,3,4,2>, <0,3,4,2> + 2635473244U, // <4,2,0,4>: Cost 3 vext2 <3,0,4,2>, <0,4,2,6> + 3765650918U, // <4,2,0,5>: Cost 4 vext3 <1,2,3,4>, <2,0,5,7> + 2696775148U, // <4,2,0,6>: Cost 3 vext3 <2,0,6,4>, <2,0,6,4> + 3695944285U, // <4,2,0,7>: Cost 4 vext2 <0,7,4,2>, <0,7,4,2> + 2622866077U, // <4,2,0,u>: Cost 3 vext2 <0,u,4,2>, LHS + 3696607990U, // <4,2,1,0>: Cost 4 vext2 <0,u,4,2>, <1,0,3,2> + 3696608052U, // <4,2,1,1>: Cost 4 vext2 <0,u,4,2>, <1,1,1,1> + 3696608150U, // <4,2,1,2>: Cost 4 vext2 <0,u,4,2>, <1,2,3,0> + 3895574630U, // <4,2,1,3>: Cost 4 vuzpr <0,4,u,2>, LHS + 2691909162U, // <4,2,1,4>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3> + 3696608400U, // <4,2,1,5>: Cost 4 vext2 <0,u,4,2>, <1,5,3,7> + 3760784956U, // <4,2,1,6>: Cost 4 vext3 <0,4,0,4>, <2,1,6,3> + 3773908549U, // <4,2,1,7>: Cost 5 vext3 <2,5,7,4>, <2,1,7,3> + 2691909162U, // 
<4,2,1,u>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3> + 3696608748U, // <4,2,2,0>: Cost 4 vext2 <0,u,4,2>, <2,0,6,4> + 3696608828U, // <4,2,2,1>: Cost 4 vext2 <0,u,4,2>, <2,1,6,3> + 2691909224U, // <4,2,2,2>: Cost 3 vext3 <1,2,3,4>, <2,2,2,2> + 2691909234U, // <4,2,2,3>: Cost 3 vext3 <1,2,3,4>, <2,2,3,3> + 3759605368U, // <4,2,2,4>: Cost 4 vext3 <0,2,2,4>, <2,2,4,0> + 3696609156U, // <4,2,2,5>: Cost 4 vext2 <0,u,4,2>, <2,5,6,7> + 3760785040U, // <4,2,2,6>: Cost 4 vext3 <0,4,0,4>, <2,2,6,6> + 3668505927U, // <4,2,2,7>: Cost 4 vext1 <7,4,2,2>, <7,4,2,2> + 2691909279U, // <4,2,2,u>: Cost 3 vext3 <1,2,3,4>, <2,2,u,3> + 2691909286U, // <4,2,3,0>: Cost 3 vext3 <1,2,3,4>, <2,3,0,1> + 3764840111U, // <4,2,3,1>: Cost 4 vext3 <1,1,1,4>, <2,3,1,1> + 3765651129U, // <4,2,3,2>: Cost 4 vext3 <1,2,3,4>, <2,3,2,2> + 2698544836U, // <4,2,3,3>: Cost 3 vext3 <2,3,3,4>, <2,3,3,4> + 2685863630U, // <4,2,3,4>: Cost 3 vext3 <0,2,2,4>, <2,3,4,5> + 2698692310U, // <4,2,3,5>: Cost 3 vext3 <2,3,5,4>, <2,3,5,4> + 3772507871U, // <4,2,3,6>: Cost 4 vext3 <2,3,6,4>, <2,3,6,4> + 2698839784U, // <4,2,3,7>: Cost 3 vext3 <2,3,7,4>, <2,3,7,4> + 2691909358U, // <4,2,3,u>: Cost 3 vext3 <1,2,3,4>, <2,3,u,1> + 2564915302U, // <4,2,4,0>: Cost 3 vext1 <2,4,2,4>, LHS + 2564916122U, // <4,2,4,1>: Cost 3 vext1 <2,4,2,4>, <1,2,3,4> + 2564917004U, // <4,2,4,2>: Cost 3 vext1 <2,4,2,4>, <2,4,2,4> + 2699208469U, // <4,2,4,3>: Cost 3 vext3 <2,4,3,4>, <2,4,3,4> + 2564918582U, // <4,2,4,4>: Cost 3 vext1 <2,4,2,4>, RHS + 2622868790U, // <4,2,4,5>: Cost 3 vext2 <0,u,4,2>, RHS + 2229667632U, // <4,2,4,6>: Cost 3 vrev <2,4,6,4> + 3800082229U, // <4,2,4,7>: Cost 4 vext3 <7,0,2,4>, <2,4,7,0> + 2622869033U, // <4,2,4,u>: Cost 3 vext2 <0,u,4,2>, RHS + 2552979558U, // <4,2,5,0>: Cost 3 vext1 <0,4,2,5>, LHS + 2558952342U, // <4,2,5,1>: Cost 3 vext1 <1,4,2,5>, <1,2,3,0> + 2564925032U, // <4,2,5,2>: Cost 3 vext1 <2,4,2,5>, <2,2,2,2> + 2967060582U, // <4,2,5,3>: Cost 3 vzipr <2,3,4,5>, LHS + 2552982838U, // <4,2,5,4>: Cost 3 vext1 <0,4,2,5>, RHS + 3987130190U, // <4,2,5,5>: Cost 4 vzipl RHS, <2,5,0,7> + 2913388474U, // <4,2,5,6>: Cost 3 vzipl RHS, <2,6,3,7> + 3895577910U, // <4,2,5,7>: Cost 4 vuzpr <0,4,u,2>, RHS + 2552985390U, // <4,2,5,u>: Cost 3 vext1 <0,4,2,5>, LHS + 1479245926U, // <4,2,6,0>: Cost 2 vext1 <0,4,2,6>, LHS + 2552988406U, // <4,2,6,1>: Cost 3 vext1 <0,4,2,6>, <1,0,3,2> + 2552989288U, // <4,2,6,2>: Cost 3 vext1 <0,4,2,6>, <2,2,2,2> + 2954461286U, // <4,2,6,3>: Cost 3 vzipr <0,2,4,6>, LHS + 1479249206U, // <4,2,6,4>: Cost 2 vext1 <0,4,2,6>, RHS + 2229610281U, // <4,2,6,5>: Cost 3 vrev <2,4,5,6> + 2600767994U, // <4,2,6,6>: Cost 3 vext1 , <6,2,7,3> + 2600768506U, // <4,2,6,7>: Cost 3 vext1 , <7,0,1,2> + 1479251758U, // <4,2,6,u>: Cost 2 vext1 <0,4,2,6>, LHS + 2659365909U, // <4,2,7,0>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2> + 3733771366U, // <4,2,7,1>: Cost 4 vext2 <7,1,4,2>, <7,1,4,2> + 3734434999U, // <4,2,7,2>: Cost 4 vext2 <7,2,4,2>, <7,2,4,2> + 2701199368U, // <4,2,7,3>: Cost 3 vext3 <2,7,3,4>, <2,7,3,4> + 4175774618U, // <4,2,7,4>: Cost 4 vtrnr <2,4,5,7>, <1,2,3,4> + 3303360298U, // <4,2,7,5>: Cost 4 vrev <2,4,5,7> + 3727136217U, // <4,2,7,6>: Cost 4 vext2 <6,0,4,2>, <7,6,0,4> + 3727136364U, // <4,2,7,7>: Cost 4 vext2 <6,0,4,2>, <7,7,7,7> + 2659365909U, // <4,2,7,u>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2> + 1479262310U, // <4,2,u,0>: Cost 2 vext1 <0,4,2,u>, LHS + 2553004790U, // <4,2,u,1>: Cost 3 vext1 <0,4,2,u>, <1,0,3,2> + 2553005672U, // <4,2,u,2>: Cost 3 vext1 <0,4,2,u>, <2,2,2,2> + 2954477670U, // <4,2,u,3>: Cost 3 vzipr <0,2,4,u>, LHS + 
1479265590U, // <4,2,u,4>: Cost 2 vext1 <0,4,2,u>, RHS + 2622871706U, // <4,2,u,5>: Cost 3 vext2 <0,u,4,2>, RHS + 2229700404U, // <4,2,u,6>: Cost 3 vrev <2,4,6,u> + 2600784890U, // <4,2,u,7>: Cost 3 vext1 , <7,0,1,2> + 1479268142U, // <4,2,u,u>: Cost 2 vext1 <0,4,2,u>, LHS + 3765651595U, // <4,3,0,0>: Cost 4 vext3 <1,2,3,4>, <3,0,0,0> + 2691909782U, // <4,3,0,1>: Cost 3 vext3 <1,2,3,4>, <3,0,1,2> + 2702452897U, // <4,3,0,2>: Cost 3 vext3 <3,0,2,4>, <3,0,2,4> + 3693297946U, // <4,3,0,3>: Cost 4 vext2 <0,3,4,3>, <0,3,4,3> + 3760711856U, // <4,3,0,4>: Cost 4 vext3 <0,3,u,4>, <3,0,4,1> + 2235533820U, // <4,3,0,5>: Cost 3 vrev <3,4,5,0> + 3309349381U, // <4,3,0,6>: Cost 4 vrev <3,4,6,0> + 3668563278U, // <4,3,0,7>: Cost 4 vext1 <7,4,3,0>, <7,4,3,0> + 2691909845U, // <4,3,0,u>: Cost 3 vext3 <1,2,3,4>, <3,0,u,2> + 2235173328U, // <4,3,1,0>: Cost 3 vrev <3,4,0,1> + 3764840678U, // <4,3,1,1>: Cost 4 vext3 <1,1,1,4>, <3,1,1,1> + 2630173594U, // <4,3,1,2>: Cost 3 vext2 <2,1,4,3>, <1,2,3,4> + 2703190267U, // <4,3,1,3>: Cost 3 vext3 <3,1,3,4>, <3,1,3,4> + 3760195840U, // <4,3,1,4>: Cost 4 vext3 <0,3,1,4>, <3,1,4,0> + 3765651724U, // <4,3,1,5>: Cost 4 vext3 <1,2,3,4>, <3,1,5,3> + 3309357574U, // <4,3,1,6>: Cost 4 vrev <3,4,6,1> + 3769633054U, // <4,3,1,7>: Cost 4 vext3 <1,u,3,4>, <3,1,7,3> + 2703558952U, // <4,3,1,u>: Cost 3 vext3 <3,1,u,4>, <3,1,u,4> + 3626770534U, // <4,3,2,0>: Cost 4 vext1 <0,4,3,2>, LHS + 2630174250U, // <4,3,2,1>: Cost 3 vext2 <2,1,4,3>, <2,1,4,3> + 3765651777U, // <4,3,2,2>: Cost 4 vext3 <1,2,3,4>, <3,2,2,2> + 2703853900U, // <4,3,2,3>: Cost 3 vext3 <3,2,3,4>, <3,2,3,4> + 3626773814U, // <4,3,2,4>: Cost 4 vext1 <0,4,3,2>, RHS + 2704001374U, // <4,3,2,5>: Cost 3 vext3 <3,2,5,4>, <3,2,5,4> + 3765651814U, // <4,3,2,6>: Cost 4 vext3 <1,2,3,4>, <3,2,6,3> + 3769633135U, // <4,3,2,7>: Cost 4 vext3 <1,u,3,4>, <3,2,7,3> + 2634819681U, // <4,3,2,u>: Cost 3 vext2 <2,u,4,3>, <2,u,4,3> + 3765651839U, // <4,3,3,0>: Cost 4 vext3 <1,2,3,4>, <3,3,0,1> + 3765651848U, // <4,3,3,1>: Cost 4 vext3 <1,2,3,4>, <3,3,1,1> + 3710552404U, // <4,3,3,2>: Cost 4 vext2 <3,2,4,3>, <3,2,4,3> + 2691910044U, // <4,3,3,3>: Cost 3 vext3 <1,2,3,4>, <3,3,3,3> + 2704591270U, // <4,3,3,4>: Cost 3 vext3 <3,3,4,4>, <3,3,4,4> + 3769633202U, // <4,3,3,5>: Cost 4 vext3 <1,u,3,4>, <3,3,5,7> + 3703917212U, // <4,3,3,6>: Cost 4 vext2 <2,1,4,3>, <3,6,4,7> + 3769633220U, // <4,3,3,7>: Cost 4 vext3 <1,u,3,4>, <3,3,7,7> + 2691910044U, // <4,3,3,u>: Cost 3 vext3 <1,2,3,4>, <3,3,3,3> + 2691910096U, // <4,3,4,0>: Cost 3 vext3 <1,2,3,4>, <3,4,0,1> + 2691910106U, // <4,3,4,1>: Cost 3 vext3 <1,2,3,4>, <3,4,1,2> + 2564990741U, // <4,3,4,2>: Cost 3 vext1 <2,4,3,4>, <2,4,3,4> + 3765651946U, // <4,3,4,3>: Cost 4 vext3 <1,2,3,4>, <3,4,3,0> + 2691910136U, // <4,3,4,4>: Cost 3 vext3 <1,2,3,4>, <3,4,4,5> + 2686454274U, // <4,3,4,5>: Cost 3 vext3 <0,3,1,4>, <3,4,5,6> + 2235640329U, // <4,3,4,6>: Cost 3 vrev <3,4,6,4> + 3801483792U, // <4,3,4,7>: Cost 4 vext3 <7,2,3,4>, <3,4,7,2> + 2691910168U, // <4,3,4,u>: Cost 3 vext3 <1,2,3,4>, <3,4,u,1> + 2559025254U, // <4,3,5,0>: Cost 3 vext1 <1,4,3,5>, LHS + 2559026237U, // <4,3,5,1>: Cost 3 vext1 <1,4,3,5>, <1,4,3,5> + 2564998862U, // <4,3,5,2>: Cost 3 vext1 <2,4,3,5>, <2,3,4,5> + 2570971548U, // <4,3,5,3>: Cost 3 vext1 <3,4,3,5>, <3,3,3,3> + 2559028534U, // <4,3,5,4>: Cost 3 vext1 <1,4,3,5>, RHS + 4163519477U, // <4,3,5,5>: Cost 4 vtrnr <0,4,1,5>, <1,3,4,5> + 3309390346U, // <4,3,5,6>: Cost 4 vrev <3,4,6,5> + 2706139747U, // <4,3,5,7>: Cost 3 vext3 <3,5,7,4>, <3,5,7,4> + 2559031086U, // <4,3,5,u>: Cost 3 
vext1 <1,4,3,5>, LHS + 2559033446U, // <4,3,6,0>: Cost 3 vext1 <1,4,3,6>, LHS + 2559034430U, // <4,3,6,1>: Cost 3 vext1 <1,4,3,6>, <1,4,3,6> + 2565007127U, // <4,3,6,2>: Cost 3 vext1 <2,4,3,6>, <2,4,3,6> + 2570979740U, // <4,3,6,3>: Cost 3 vext1 <3,4,3,6>, <3,3,3,3> + 2559036726U, // <4,3,6,4>: Cost 3 vext1 <1,4,3,6>, RHS + 1161841154U, // <4,3,6,5>: Cost 2 vrev <3,4,5,6> + 4028203932U, // <4,3,6,6>: Cost 4 vzipr <0,2,4,6>, <1,2,3,6> + 2706803380U, // <4,3,6,7>: Cost 3 vext3 <3,6,7,4>, <3,6,7,4> + 1162062365U, // <4,3,6,u>: Cost 2 vrev <3,4,u,6> + 3769633475U, // <4,3,7,0>: Cost 4 vext3 <1,u,3,4>, <3,7,0,1> + 3769633488U, // <4,3,7,1>: Cost 4 vext3 <1,u,3,4>, <3,7,1,5> + 3638757144U, // <4,3,7,2>: Cost 4 vext1 <2,4,3,7>, <2,4,3,7> + 3769633508U, // <4,3,7,3>: Cost 4 vext3 <1,u,3,4>, <3,7,3,7> + 3769633515U, // <4,3,7,4>: Cost 4 vext3 <1,u,3,4>, <3,7,4,5> + 3769633526U, // <4,3,7,5>: Cost 4 vext3 <1,u,3,4>, <3,7,5,7> + 3662647932U, // <4,3,7,6>: Cost 4 vext1 <6,4,3,7>, <6,4,3,7> + 3781208837U, // <4,3,7,7>: Cost 4 vext3 <3,7,7,4>, <3,7,7,4> + 3769633547U, // <4,3,7,u>: Cost 4 vext3 <1,u,3,4>, <3,7,u,1> + 2559049830U, // <4,3,u,0>: Cost 3 vext1 <1,4,3,u>, LHS + 2691910430U, // <4,3,u,1>: Cost 3 vext3 <1,2,3,4>, <3,u,1,2> + 2565023513U, // <4,3,u,2>: Cost 3 vext1 <2,4,3,u>, <2,4,3,u> + 2707835698U, // <4,3,u,3>: Cost 3 vext3 <3,u,3,4>, <3,u,3,4> + 2559053110U, // <4,3,u,4>: Cost 3 vext1 <1,4,3,u>, RHS + 1161857540U, // <4,3,u,5>: Cost 2 vrev <3,4,5,u> + 2235673101U, // <4,3,u,6>: Cost 3 vrev <3,4,6,u> + 2708130646U, // <4,3,u,7>: Cost 3 vext3 <3,u,7,4>, <3,u,7,4> + 1162078751U, // <4,3,u,u>: Cost 2 vrev <3,4,u,u> + 2617573416U, // <4,4,0,0>: Cost 3 vext2 <0,0,4,4>, <0,0,4,4> + 1570373734U, // <4,4,0,1>: Cost 2 vext2 <4,4,4,4>, LHS + 2779676774U, // <4,4,0,2>: Cost 3 vuzpl <4,6,4,6>, LHS + 3760196480U, // <4,4,0,3>: Cost 4 vext3 <0,3,1,4>, <4,0,3,1> + 2576977100U, // <4,4,0,4>: Cost 3 vext1 <4,4,4,0>, <4,4,4,0> + 2718747538U, // <4,4,0,5>: Cost 3 vext3 <5,6,7,4>, <4,0,5,1> + 2718747548U, // <4,4,0,6>: Cost 3 vext3 <5,6,7,4>, <4,0,6,2> + 3668637015U, // <4,4,0,7>: Cost 4 vext1 <7,4,4,0>, <7,4,4,0> + 1570374301U, // <4,4,0,u>: Cost 2 vext2 <4,4,4,4>, LHS + 2644116214U, // <4,4,1,0>: Cost 3 vext2 <4,4,4,4>, <1,0,3,2> + 2644116276U, // <4,4,1,1>: Cost 3 vext2 <4,4,4,4>, <1,1,1,1> + 2691910602U, // <4,4,1,2>: Cost 3 vext3 <1,2,3,4>, <4,1,2,3> + 2644116440U, // <4,4,1,3>: Cost 3 vext2 <4,4,4,4>, <1,3,1,3> + 2711227356U, // <4,4,1,4>: Cost 3 vext3 <4,4,4,4>, <4,1,4,3> + 2709310438U, // <4,4,1,5>: Cost 3 vext3 <4,1,5,4>, <4,1,5,4> + 3765652462U, // <4,4,1,6>: Cost 4 vext3 <1,2,3,4>, <4,1,6,3> + 3768970231U, // <4,4,1,7>: Cost 4 vext3 <1,7,3,4>, <4,1,7,3> + 2695891968U, // <4,4,1,u>: Cost 3 vext3 <1,u,3,4>, <4,1,u,3> + 3703260634U, // <4,4,2,0>: Cost 4 vext2 <2,0,4,4>, <2,0,4,4> + 3765652499U, // <4,4,2,1>: Cost 4 vext3 <1,2,3,4>, <4,2,1,4> + 2644117096U, // <4,4,2,2>: Cost 3 vext2 <4,4,4,4>, <2,2,2,2> + 2631509709U, // <4,4,2,3>: Cost 3 vext2 <2,3,4,4>, <2,3,4,4> + 2644117269U, // <4,4,2,4>: Cost 3 vext2 <4,4,4,4>, <2,4,3,4> + 3705251698U, // <4,4,2,5>: Cost 4 vext2 <2,3,4,4>, <2,5,4,7> + 2710047808U, // <4,4,2,6>: Cost 3 vext3 <4,2,6,4>, <4,2,6,4> + 3783863369U, // <4,4,2,7>: Cost 4 vext3 <4,2,7,4>, <4,2,7,4> + 2634827874U, // <4,4,2,u>: Cost 3 vext2 <2,u,4,4>, <2,u,4,4> + 2644117654U, // <4,4,3,0>: Cost 3 vext2 <4,4,4,4>, <3,0,1,2> + 3638797210U, // <4,4,3,1>: Cost 4 vext1 <2,4,4,3>, <1,2,3,4> + 3638798082U, // <4,4,3,2>: Cost 4 vext1 <2,4,4,3>, <2,4,1,3> + 2637482406U, // <4,4,3,3>: Cost 3 vext2 
<3,3,4,4>, <3,3,4,4> + 2638146039U, // <4,4,3,4>: Cost 3 vext2 <3,4,4,4>, <3,4,4,4> + 3913287374U, // <4,4,3,5>: Cost 4 vuzpr <3,4,5,4>, <2,3,4,5> + 3765652625U, // <4,4,3,6>: Cost 4 vext3 <1,2,3,4>, <4,3,6,4> + 3713878762U, // <4,4,3,7>: Cost 4 vext2 <3,7,4,4>, <3,7,4,4> + 2637482406U, // <4,4,3,u>: Cost 3 vext2 <3,3,4,4>, <3,3,4,4> + 1503264870U, // <4,4,4,0>: Cost 2 vext1 <4,4,4,4>, LHS + 2577007514U, // <4,4,4,1>: Cost 3 vext1 <4,4,4,4>, <1,2,3,4> + 2577008232U, // <4,4,4,2>: Cost 3 vext1 <4,4,4,4>, <2,2,2,2> + 2571037175U, // <4,4,4,3>: Cost 3 vext1 <3,4,4,4>, <3,4,4,4> + 161926454U, // <4,4,4,4>: Cost 1 vdup0 RHS + 1570377014U, // <4,4,4,5>: Cost 2 vext2 <4,4,4,4>, RHS + 2779680054U, // <4,4,4,6>: Cost 3 vuzpl <4,6,4,6>, RHS + 2594927963U, // <4,4,4,7>: Cost 3 vext1 <7,4,4,4>, <7,4,4,4> + 161926454U, // <4,4,4,u>: Cost 1 vdup0 RHS + 2571042918U, // <4,4,5,0>: Cost 3 vext1 <3,4,4,5>, LHS + 2571043738U, // <4,4,5,1>: Cost 3 vext1 <3,4,4,5>, <1,2,3,4> + 3638814495U, // <4,4,5,2>: Cost 4 vext1 <2,4,4,5>, <2,4,4,5> + 2571045368U, // <4,4,5,3>: Cost 3 vext1 <3,4,4,5>, <3,4,4,5> + 2571046198U, // <4,4,5,4>: Cost 3 vext1 <3,4,4,5>, RHS + 1839648054U, // <4,4,5,5>: Cost 2 vzipl RHS, RHS + 1618169142U, // <4,4,5,6>: Cost 2 vext3 <1,2,3,4>, RHS + 2594936156U, // <4,4,5,7>: Cost 3 vext1 <7,4,4,5>, <7,4,4,5> + 1618169160U, // <4,4,5,u>: Cost 2 vext3 <1,2,3,4>, RHS + 2553135206U, // <4,4,6,0>: Cost 3 vext1 <0,4,4,6>, LHS + 3626877686U, // <4,4,6,1>: Cost 4 vext1 <0,4,4,6>, <1,0,3,2> + 2565080782U, // <4,4,6,2>: Cost 3 vext1 <2,4,4,6>, <2,3,4,5> + 2571053561U, // <4,4,6,3>: Cost 3 vext1 <3,4,4,6>, <3,4,4,6> + 2553138486U, // <4,4,6,4>: Cost 3 vext1 <0,4,4,6>, RHS + 2241555675U, // <4,4,6,5>: Cost 3 vrev <4,4,5,6> + 1973865782U, // <4,4,6,6>: Cost 2 vtrnl RHS, RHS + 2658055029U, // <4,4,6,7>: Cost 3 vext2 <6,7,4,4>, <6,7,4,4> + 1973865800U, // <4,4,6,u>: Cost 2 vtrnl RHS, RHS + 2644120570U, // <4,4,7,0>: Cost 3 vext2 <4,4,4,4>, <7,0,1,2> + 3638829978U, // <4,4,7,1>: Cost 4 vext1 <2,4,4,7>, <1,2,3,4> + 3638830881U, // <4,4,7,2>: Cost 4 vext1 <2,4,4,7>, <2,4,4,7> + 3735115018U, // <4,4,7,3>: Cost 4 vext2 <7,3,4,4>, <7,3,4,4> + 2662036827U, // <4,4,7,4>: Cost 3 vext2 <7,4,4,4>, <7,4,4,4> + 2713292236U, // <4,4,7,5>: Cost 3 vext3 <4,7,5,4>, <4,7,5,4> + 2713365973U, // <4,4,7,6>: Cost 3 vext3 <4,7,6,4>, <4,7,6,4> + 2644121196U, // <4,4,7,7>: Cost 3 vext2 <4,4,4,4>, <7,7,7,7> + 2662036827U, // <4,4,7,u>: Cost 3 vext2 <7,4,4,4>, <7,4,4,4> + 1503297638U, // <4,4,u,0>: Cost 2 vext1 <4,4,4,u>, LHS + 1570379566U, // <4,4,u,1>: Cost 2 vext2 <4,4,4,4>, LHS + 2779682606U, // <4,4,u,2>: Cost 3 vuzpl <4,6,4,6>, LHS + 2571069947U, // <4,4,u,3>: Cost 3 vext1 <3,4,4,u>, <3,4,4,u> + 161926454U, // <4,4,u,4>: Cost 1 vdup0 RHS + 1841638710U, // <4,4,u,5>: Cost 2 vzipl RHS, RHS + 1618169385U, // <4,4,u,6>: Cost 2 vext3 <1,2,3,4>, RHS + 2594960735U, // <4,4,u,7>: Cost 3 vext1 <7,4,4,u>, <7,4,4,u> + 161926454U, // <4,4,u,u>: Cost 1 vdup0 RHS + 2631516160U, // <4,5,0,0>: Cost 3 vext2 <2,3,4,5>, <0,0,0,0> + 1557774438U, // <4,5,0,1>: Cost 2 vext2 <2,3,4,5>, LHS + 2618908875U, // <4,5,0,2>: Cost 3 vext2 <0,2,4,5>, <0,2,4,5> + 2571078140U, // <4,5,0,3>: Cost 3 vext1 <3,4,5,0>, <3,4,5,0> + 2626871634U, // <4,5,0,4>: Cost 3 vext2 <1,5,4,5>, <0,4,1,5> + 3705258414U, // <4,5,0,5>: Cost 4 vext2 <2,3,4,5>, <0,5,2,7> + 2594968438U, // <4,5,0,6>: Cost 3 vext1 <7,4,5,0>, <6,7,4,5> + 2594968928U, // <4,5,0,7>: Cost 3 vext1 <7,4,5,0>, <7,4,5,0> + 1557775005U, // <4,5,0,u>: Cost 2 vext2 <2,3,4,5>, LHS + 2631516918U, // <4,5,1,0>: Cost 3 
vext2 <2,3,4,5>, <1,0,3,2> + 2624217939U, // <4,5,1,1>: Cost 3 vext2 <1,1,4,5>, <1,1,4,5> + 2631517078U, // <4,5,1,2>: Cost 3 vext2 <2,3,4,5>, <1,2,3,0> + 2821341286U, // <4,5,1,3>: Cost 3 vuzpr <0,4,1,5>, LHS + 3895086054U, // <4,5,1,4>: Cost 4 vuzpr <0,4,1,5>, <4,1,5,4> + 2626872471U, // <4,5,1,5>: Cost 3 vext2 <1,5,4,5>, <1,5,4,5> + 3895083131U, // <4,5,1,6>: Cost 4 vuzpr <0,4,1,5>, <0,1,4,6> + 2718748368U, // <4,5,1,7>: Cost 3 vext3 <5,6,7,4>, <5,1,7,3> + 2821341291U, // <4,5,1,u>: Cost 3 vuzpr <0,4,1,5>, LHS + 2571092070U, // <4,5,2,0>: Cost 3 vext1 <3,4,5,2>, LHS + 3699287585U, // <4,5,2,1>: Cost 4 vext2 <1,3,4,5>, <2,1,3,3> + 2630854269U, // <4,5,2,2>: Cost 3 vext2 <2,2,4,5>, <2,2,4,5> + 1557776078U, // <4,5,2,3>: Cost 2 vext2 <2,3,4,5>, <2,3,4,5> + 2631517974U, // <4,5,2,4>: Cost 3 vext2 <2,3,4,5>, <2,4,3,5> + 3692652384U, // <4,5,2,5>: Cost 4 vext2 <0,2,4,5>, <2,5,2,7> + 2631518138U, // <4,5,2,6>: Cost 3 vext2 <2,3,4,5>, <2,6,3,7> + 4164013366U, // <4,5,2,7>: Cost 4 vtrnr <0,4,u,2>, RHS + 1561094243U, // <4,5,2,u>: Cost 2 vext2 <2,u,4,5>, <2,u,4,5> + 2631518358U, // <4,5,3,0>: Cost 3 vext2 <2,3,4,5>, <3,0,1,2> + 3895084710U, // <4,5,3,1>: Cost 4 vuzpr <0,4,1,5>, <2,3,0,1> + 2631518540U, // <4,5,3,2>: Cost 3 vext2 <2,3,4,5>, <3,2,3,4> + 2631518620U, // <4,5,3,3>: Cost 3 vext2 <2,3,4,5>, <3,3,3,3> + 2631518716U, // <4,5,3,4>: Cost 3 vext2 <2,3,4,5>, <3,4,5,0> + 2631518784U, // <4,5,3,5>: Cost 3 vext2 <2,3,4,5>, <3,5,3,5> + 2658060980U, // <4,5,3,6>: Cost 3 vext2 <6,7,4,5>, <3,6,7,4> + 2640145131U, // <4,5,3,7>: Cost 3 vext2 <3,7,4,5>, <3,7,4,5> + 2631519006U, // <4,5,3,u>: Cost 3 vext2 <2,3,4,5>, <3,u,1,2> + 2571108454U, // <4,5,4,0>: Cost 3 vext1 <3,4,5,4>, LHS + 3632907342U, // <4,5,4,1>: Cost 4 vext1 <1,4,5,4>, <1,4,5,4> + 2571110094U, // <4,5,4,2>: Cost 3 vext1 <3,4,5,4>, <2,3,4,5> + 2571110912U, // <4,5,4,3>: Cost 3 vext1 <3,4,5,4>, <3,4,5,4> + 2571111734U, // <4,5,4,4>: Cost 3 vext1 <3,4,5,4>, RHS + 1557777718U, // <4,5,4,5>: Cost 2 vext2 <2,3,4,5>, RHS + 2645454195U, // <4,5,4,6>: Cost 3 vext2 <4,6,4,5>, <4,6,4,5> + 2718748614U, // <4,5,4,7>: Cost 3 vext3 <5,6,7,4>, <5,4,7,6> + 1557777961U, // <4,5,4,u>: Cost 2 vext2 <2,3,4,5>, RHS + 1503346790U, // <4,5,5,0>: Cost 2 vext1 <4,4,5,5>, LHS + 2913398480U, // <4,5,5,1>: Cost 3 vzipl RHS, <5,1,7,3> + 2631519998U, // <4,5,5,2>: Cost 3 vext2 <2,3,4,5>, <5,2,3,4> + 2577090710U, // <4,5,5,3>: Cost 3 vext1 <4,4,5,5>, <3,0,1,2> + 1503349978U, // <4,5,5,4>: Cost 2 vext1 <4,4,5,5>, <4,4,5,5> + 2631520260U, // <4,5,5,5>: Cost 3 vext2 <2,3,4,5>, <5,5,5,5> + 2913390690U, // <4,5,5,6>: Cost 3 vzipl RHS, <5,6,7,0> + 2821344566U, // <4,5,5,7>: Cost 3 vuzpr <0,4,1,5>, RHS + 1503352622U, // <4,5,5,u>: Cost 2 vext1 <4,4,5,5>, LHS + 1497383014U, // <4,5,6,0>: Cost 2 vext1 <3,4,5,6>, LHS + 2559181904U, // <4,5,6,1>: Cost 3 vext1 <1,4,5,6>, <1,4,5,6> + 2565154601U, // <4,5,6,2>: Cost 3 vext1 <2,4,5,6>, <2,4,5,6> + 1497385474U, // <4,5,6,3>: Cost 2 vext1 <3,4,5,6>, <3,4,5,6> + 1497386294U, // <4,5,6,4>: Cost 2 vext1 <3,4,5,6>, RHS + 3047608324U, // <4,5,6,5>: Cost 3 vtrnl RHS, <5,5,5,5> + 2571129656U, // <4,5,6,6>: Cost 3 vext1 <3,4,5,6>, <6,6,6,6> + 27705344U, // <4,5,6,7>: Cost 0 copy RHS + 27705344U, // <4,5,6,u>: Cost 0 copy RHS + 2565161062U, // <4,5,7,0>: Cost 3 vext1 <2,4,5,7>, LHS + 2565161882U, // <4,5,7,1>: Cost 3 vext1 <2,4,5,7>, <1,2,3,4> + 2565162794U, // <4,5,7,2>: Cost 3 vext1 <2,4,5,7>, <2,4,5,7> + 2661381387U, // <4,5,7,3>: Cost 3 vext2 <7,3,4,5>, <7,3,4,5> + 2565164342U, // <4,5,7,4>: Cost 3 vext1 <2,4,5,7>, RHS + 2718748840U, // 
<4,5,7,5>: Cost 3 vext3 <5,6,7,4>, <5,7,5,7> + 2718748846U, // <4,5,7,6>: Cost 3 vext3 <5,6,7,4>, <5,7,6,4> + 2719412407U, // <4,5,7,7>: Cost 3 vext3 <5,7,7,4>, <5,7,7,4> + 2565166894U, // <4,5,7,u>: Cost 3 vext1 <2,4,5,7>, LHS + 1497399398U, // <4,5,u,0>: Cost 2 vext1 <3,4,5,u>, LHS + 1557780270U, // <4,5,u,1>: Cost 2 vext2 <2,3,4,5>, LHS + 2631522181U, // <4,5,u,2>: Cost 3 vext2 <2,3,4,5>, + 1497401860U, // <4,5,u,3>: Cost 2 vext1 <3,4,5,u>, <3,4,5,u> + 1497402678U, // <4,5,u,4>: Cost 2 vext1 <3,4,5,u>, RHS + 1557780634U, // <4,5,u,5>: Cost 2 vext2 <2,3,4,5>, RHS + 2631522512U, // <4,5,u,6>: Cost 3 vext2 <2,3,4,5>, + 27705344U, // <4,5,u,7>: Cost 0 copy RHS + 27705344U, // <4,5,u,u>: Cost 0 copy RHS + 2618916864U, // <4,6,0,0>: Cost 3 vext2 <0,2,4,6>, <0,0,0,0> + 1545175142U, // <4,6,0,1>: Cost 2 vext2 <0,2,4,6>, LHS + 1545175244U, // <4,6,0,2>: Cost 2 vext2 <0,2,4,6>, <0,2,4,6> + 3692658940U, // <4,6,0,3>: Cost 4 vext2 <0,2,4,6>, <0,3,1,0> + 2618917202U, // <4,6,0,4>: Cost 3 vext2 <0,2,4,6>, <0,4,1,5> + 3852910806U, // <4,6,0,5>: Cost 4 vuzpl RHS, <0,2,5,7> + 2253525648U, // <4,6,0,6>: Cost 3 vrev <6,4,6,0> + 4040764726U, // <4,6,0,7>: Cost 4 vzipr <2,3,4,0>, RHS + 1545175709U, // <4,6,0,u>: Cost 2 vext2 <0,2,4,6>, LHS + 2618917622U, // <4,6,1,0>: Cost 3 vext2 <0,2,4,6>, <1,0,3,2> + 2618917684U, // <4,6,1,1>: Cost 3 vext2 <0,2,4,6>, <1,1,1,1> + 2618917782U, // <4,6,1,2>: Cost 3 vext2 <0,2,4,6>, <1,2,3,0> + 2618917848U, // <4,6,1,3>: Cost 3 vext2 <0,2,4,6>, <1,3,1,3> + 3692659773U, // <4,6,1,4>: Cost 4 vext2 <0,2,4,6>, <1,4,3,5> + 2618918032U, // <4,6,1,5>: Cost 3 vext2 <0,2,4,6>, <1,5,3,7> + 3692659937U, // <4,6,1,6>: Cost 4 vext2 <0,2,4,6>, <1,6,3,7> + 4032146742U, // <4,6,1,7>: Cost 4 vzipr <0,u,4,1>, RHS + 2618918253U, // <4,6,1,u>: Cost 3 vext2 <0,2,4,6>, <1,u,1,3> + 2618918380U, // <4,6,2,0>: Cost 3 vext2 <0,2,4,6>, <2,0,6,4> + 2618918460U, // <4,6,2,1>: Cost 3 vext2 <0,2,4,6>, <2,1,6,3> + 2618918504U, // <4,6,2,2>: Cost 3 vext2 <0,2,4,6>, <2,2,2,2> + 2618918566U, // <4,6,2,3>: Cost 3 vext2 <0,2,4,6>, <2,3,0,1> + 2618918679U, // <4,6,2,4>: Cost 3 vext2 <0,2,4,6>, <2,4,3,6> + 2618918788U, // <4,6,2,5>: Cost 3 vext2 <0,2,4,6>, <2,5,6,7> + 2618918842U, // <4,6,2,6>: Cost 3 vext2 <0,2,4,6>, <2,6,3,7> + 2718749178U, // <4,6,2,7>: Cost 3 vext3 <5,6,7,4>, <6,2,7,3> + 2618918971U, // <4,6,2,u>: Cost 3 vext2 <0,2,4,6>, <2,u,0,1> + 2618919062U, // <4,6,3,0>: Cost 3 vext2 <0,2,4,6>, <3,0,1,2> + 2636171526U, // <4,6,3,1>: Cost 3 vext2 <3,1,4,6>, <3,1,4,6> + 3692661057U, // <4,6,3,2>: Cost 4 vext2 <0,2,4,6>, <3,2,2,2> + 2618919324U, // <4,6,3,3>: Cost 3 vext2 <0,2,4,6>, <3,3,3,3> + 2618919426U, // <4,6,3,4>: Cost 3 vext2 <0,2,4,6>, <3,4,5,6> + 2638826058U, // <4,6,3,5>: Cost 3 vext2 <3,5,4,6>, <3,5,4,6> + 3913303030U, // <4,6,3,6>: Cost 4 vuzpr <3,4,5,6>, <1,3,4,6> + 2722730572U, // <4,6,3,7>: Cost 3 vext3 <6,3,7,4>, <6,3,7,4> + 2618919710U, // <4,6,3,u>: Cost 3 vext2 <0,2,4,6>, <3,u,1,2> + 2565210214U, // <4,6,4,0>: Cost 3 vext1 <2,4,6,4>, LHS + 2718749286U, // <4,6,4,1>: Cost 3 vext3 <5,6,7,4>, <6,4,1,3> + 2565211952U, // <4,6,4,2>: Cost 3 vext1 <2,4,6,4>, <2,4,6,4> + 2571184649U, // <4,6,4,3>: Cost 3 vext1 <3,4,6,4>, <3,4,6,4> + 2565213494U, // <4,6,4,4>: Cost 3 vext1 <2,4,6,4>, RHS + 1545178422U, // <4,6,4,5>: Cost 2 vext2 <0,2,4,6>, RHS + 1705430326U, // <4,6,4,6>: Cost 2 vuzpl RHS, RHS + 2595075437U, // <4,6,4,7>: Cost 3 vext1 <7,4,6,4>, <7,4,6,4> + 1545178665U, // <4,6,4,u>: Cost 2 vext2 <0,2,4,6>, RHS + 2565218406U, // <4,6,5,0>: Cost 3 vext1 <2,4,6,5>, LHS + 2645462736U, // <4,6,5,1>: 
Cost 3 vext2 <4,6,4,6>, <5,1,7,3> + 2913399290U, // <4,6,5,2>: Cost 3 vzipl RHS, <6,2,7,3> + 3913305394U, // <4,6,5,3>: Cost 4 vuzpr <3,4,5,6>, <4,5,6,3> + 2645462982U, // <4,6,5,4>: Cost 3 vext2 <4,6,4,6>, <5,4,7,6> + 2779172868U, // <4,6,5,5>: Cost 3 vuzpl RHS, <5,5,5,5> + 2913391416U, // <4,6,5,6>: Cost 3 vzipl RHS, <6,6,6,6> + 2821426486U, // <4,6,5,7>: Cost 3 vuzpr <0,4,2,6>, RHS + 2821426487U, // <4,6,5,u>: Cost 3 vuzpr <0,4,2,6>, RHS + 1503428710U, // <4,6,6,0>: Cost 2 vext1 <4,4,6,6>, LHS + 2577171190U, // <4,6,6,1>: Cost 3 vext1 <4,4,6,6>, <1,0,3,2> + 2645463546U, // <4,6,6,2>: Cost 3 vext2 <4,6,4,6>, <6,2,7,3> + 2577172630U, // <4,6,6,3>: Cost 3 vext1 <4,4,6,6>, <3,0,1,2> + 1503431908U, // <4,6,6,4>: Cost 2 vext1 <4,4,6,6>, <4,4,6,6> + 2253501069U, // <4,6,6,5>: Cost 3 vrev <6,4,5,6> + 2618921784U, // <4,6,6,6>: Cost 3 vext2 <0,2,4,6>, <6,6,6,6> + 2954464566U, // <4,6,6,7>: Cost 3 vzipr <0,2,4,6>, RHS + 1503434542U, // <4,6,6,u>: Cost 2 vext1 <4,4,6,6>, LHS + 2645464058U, // <4,6,7,0>: Cost 3 vext2 <4,6,4,6>, <7,0,1,2> + 2779173882U, // <4,6,7,1>: Cost 3 vuzpl RHS, <7,0,1,2> + 3638978355U, // <4,6,7,2>: Cost 4 vext1 <2,4,6,7>, <2,4,6,7> + 2725090156U, // <4,6,7,3>: Cost 3 vext3 <6,7,3,4>, <6,7,3,4> + 2645464422U, // <4,6,7,4>: Cost 3 vext2 <4,6,4,6>, <7,4,5,6> + 2779174246U, // <4,6,7,5>: Cost 3 vuzpl RHS, <7,4,5,6> + 3852915914U, // <4,6,7,6>: Cost 4 vuzpl RHS, <7,2,6,3> + 2779174508U, // <4,6,7,7>: Cost 3 vuzpl RHS, <7,7,7,7> + 2779173945U, // <4,6,7,u>: Cost 3 vuzpl RHS, <7,0,u,2> + 1503445094U, // <4,6,u,0>: Cost 2 vext1 <4,4,6,u>, LHS + 1545180974U, // <4,6,u,1>: Cost 2 vext2 <0,2,4,6>, LHS + 1705432878U, // <4,6,u,2>: Cost 2 vuzpl RHS, LHS + 2618922940U, // <4,6,u,3>: Cost 3 vext2 <0,2,4,6>, + 1503448294U, // <4,6,u,4>: Cost 2 vext1 <4,4,6,u>, <4,4,6,u> + 1545181338U, // <4,6,u,5>: Cost 2 vext2 <0,2,4,6>, RHS + 1705433242U, // <4,6,u,6>: Cost 2 vuzpl RHS, RHS + 2954480950U, // <4,6,u,7>: Cost 3 vzipr <0,2,4,u>, RHS + 1545181541U, // <4,6,u,u>: Cost 2 vext2 <0,2,4,6>, LHS + 3706601472U, // <4,7,0,0>: Cost 4 vext2 <2,5,4,7>, <0,0,0,0> + 2632859750U, // <4,7,0,1>: Cost 3 vext2 <2,5,4,7>, LHS + 2726343685U, // <4,7,0,2>: Cost 3 vext3 <7,0,2,4>, <7,0,2,4> + 3701293312U, // <4,7,0,3>: Cost 4 vext2 <1,6,4,7>, <0,3,1,4> + 3706601810U, // <4,7,0,4>: Cost 4 vext2 <2,5,4,7>, <0,4,1,5> + 2259424608U, // <4,7,0,5>: Cost 3 vrev <7,4,5,0> + 3695321617U, // <4,7,0,6>: Cost 4 vext2 <0,6,4,7>, <0,6,4,7> + 3800454194U, // <4,7,0,7>: Cost 4 vext3 <7,0,7,4>, <7,0,7,4> + 2632860317U, // <4,7,0,u>: Cost 3 vext2 <2,5,4,7>, LHS + 2259064116U, // <4,7,1,0>: Cost 3 vrev <7,4,0,1> + 3700630324U, // <4,7,1,1>: Cost 4 vext2 <1,5,4,7>, <1,1,1,1> + 2632860570U, // <4,7,1,2>: Cost 3 vext2 <2,5,4,7>, <1,2,3,4> + 3769635936U, // <4,7,1,3>: Cost 4 vext3 <1,u,3,4>, <7,1,3,5> + 3656920374U, // <4,7,1,4>: Cost 4 vext1 <5,4,7,1>, RHS + 3700630681U, // <4,7,1,5>: Cost 4 vext2 <1,5,4,7>, <1,5,4,7> + 3701294314U, // <4,7,1,6>: Cost 4 vext2 <1,6,4,7>, <1,6,4,7> + 3793818754U, // <4,7,1,7>: Cost 4 vext3 <5,u,7,4>, <7,1,7,3> + 2259654012U, // <4,7,1,u>: Cost 3 vrev <7,4,u,1> + 3656925286U, // <4,7,2,0>: Cost 4 vext1 <5,4,7,2>, LHS + 3706603050U, // <4,7,2,1>: Cost 4 vext2 <2,5,4,7>, <2,1,4,3> + 3706603112U, // <4,7,2,2>: Cost 4 vext2 <2,5,4,7>, <2,2,2,2> + 2727744688U, // <4,7,2,3>: Cost 3 vext3 <7,2,3,4>, <7,2,3,4> + 3705939745U, // <4,7,2,4>: Cost 4 vext2 <2,4,4,7>, <2,4,4,7> + 2632861554U, // <4,7,2,5>: Cost 3 vext2 <2,5,4,7>, <2,5,4,7> + 3706603450U, // <4,7,2,6>: Cost 4 vext2 <2,5,4,7>, <2,6,3,7> + 3792491731U, // 
<4,7,2,7>: Cost 4 vext3 <5,6,7,4>, <7,2,7,3> + 2634852453U, // <4,7,2,u>: Cost 3 vext2 <2,u,4,7>, <2,u,4,7> + 3706603670U, // <4,7,3,0>: Cost 4 vext2 <2,5,4,7>, <3,0,1,2> + 3662906266U, // <4,7,3,1>: Cost 4 vext1 <6,4,7,3>, <1,2,3,4> + 3725183326U, // <4,7,3,2>: Cost 4 vext2 <5,6,4,7>, <3,2,5,4> + 3706603932U, // <4,7,3,3>: Cost 4 vext2 <2,5,4,7>, <3,3,3,3> + 3701295618U, // <4,7,3,4>: Cost 4 vext2 <1,6,4,7>, <3,4,5,6> + 2638834251U, // <4,7,3,5>: Cost 3 vext2 <3,5,4,7>, <3,5,4,7> + 2639497884U, // <4,7,3,6>: Cost 3 vext2 <3,6,4,7>, <3,6,4,7> + 3802445093U, // <4,7,3,7>: Cost 4 vext3 <7,3,7,4>, <7,3,7,4> + 2640825150U, // <4,7,3,u>: Cost 3 vext2 <3,u,4,7>, <3,u,4,7> + 2718750004U, // <4,7,4,0>: Cost 3 vext3 <5,6,7,4>, <7,4,0,1> + 3706604490U, // <4,7,4,1>: Cost 4 vext2 <2,5,4,7>, <4,1,2,3> + 3656943474U, // <4,7,4,2>: Cost 4 vext1 <5,4,7,4>, <2,5,4,7> + 3779884371U, // <4,7,4,3>: Cost 4 vext3 <3,5,7,4>, <7,4,3,5> + 2259383643U, // <4,7,4,4>: Cost 3 vrev <7,4,4,4> + 2632863030U, // <4,7,4,5>: Cost 3 vext2 <2,5,4,7>, RHS + 2259531117U, // <4,7,4,6>: Cost 3 vrev <7,4,6,4> + 3907340074U, // <4,7,4,7>: Cost 4 vuzpr <2,4,5,7>, <2,4,5,7> + 2632863273U, // <4,7,4,u>: Cost 3 vext2 <2,5,4,7>, RHS + 2913391610U, // <4,7,5,0>: Cost 3 vzipl RHS, <7,0,1,2> + 3645006848U, // <4,7,5,1>: Cost 4 vext1 <3,4,7,5>, <1,3,5,7> + 2589181646U, // <4,7,5,2>: Cost 3 vext1 <6,4,7,5>, <2,3,4,5> + 3645008403U, // <4,7,5,3>: Cost 4 vext1 <3,4,7,5>, <3,4,7,5> + 2913391974U, // <4,7,5,4>: Cost 3 vzipl RHS, <7,4,5,6> + 2583211973U, // <4,7,5,5>: Cost 3 vext1 <5,4,7,5>, <5,4,7,5> + 2589184670U, // <4,7,5,6>: Cost 3 vext1 <6,4,7,5>, <6,4,7,5> + 2913392236U, // <4,7,5,7>: Cost 3 vzipl RHS, <7,7,7,7> + 2913392258U, // <4,7,5,u>: Cost 3 vzipl RHS, <7,u,1,2> + 1509474406U, // <4,7,6,0>: Cost 2 vext1 <5,4,7,6>, LHS + 3047609338U, // <4,7,6,1>: Cost 3 vtrnl RHS, <7,0,1,2> + 2583217768U, // <4,7,6,2>: Cost 3 vext1 <5,4,7,6>, <2,2,2,2> + 2583218326U, // <4,7,6,3>: Cost 3 vext1 <5,4,7,6>, <3,0,1,2> + 1509477686U, // <4,7,6,4>: Cost 2 vext1 <5,4,7,6>, RHS + 1509478342U, // <4,7,6,5>: Cost 2 vext1 <5,4,7,6>, <5,4,7,6> + 2583220730U, // <4,7,6,6>: Cost 3 vext1 <5,4,7,6>, <6,2,7,3> + 3047609964U, // <4,7,6,7>: Cost 3 vtrnl RHS, <7,7,7,7> + 1509480238U, // <4,7,6,u>: Cost 2 vext1 <5,4,7,6>, LHS + 3650994278U, // <4,7,7,0>: Cost 4 vext1 <4,4,7,7>, LHS + 3650995098U, // <4,7,7,1>: Cost 4 vext1 <4,4,7,7>, <1,2,3,4> + 3650996010U, // <4,7,7,2>: Cost 4 vext1 <4,4,7,7>, <2,4,5,7> + 3804804677U, // <4,7,7,3>: Cost 4 vext3 <7,7,3,4>, <7,7,3,4> + 3650997486U, // <4,7,7,4>: Cost 4 vext1 <4,4,7,7>, <4,4,7,7> + 2662725039U, // <4,7,7,5>: Cost 3 vext2 <7,5,4,7>, <7,5,4,7> + 3662942880U, // <4,7,7,6>: Cost 4 vext1 <6,4,7,7>, <6,4,7,7> + 2718750316U, // <4,7,7,7>: Cost 3 vext3 <5,6,7,4>, <7,7,7,7> + 2664715938U, // <4,7,7,u>: Cost 3 vext2 <7,u,4,7>, <7,u,4,7> + 1509490790U, // <4,7,u,0>: Cost 2 vext1 <5,4,7,u>, LHS + 2632865582U, // <4,7,u,1>: Cost 3 vext2 <2,5,4,7>, LHS + 2583234152U, // <4,7,u,2>: Cost 3 vext1 <5,4,7,u>, <2,2,2,2> + 2583234710U, // <4,7,u,3>: Cost 3 vext1 <5,4,7,u>, <3,0,1,2> + 1509494070U, // <4,7,u,4>: Cost 2 vext1 <5,4,7,u>, RHS + 1509494728U, // <4,7,u,5>: Cost 2 vext1 <5,4,7,u>, <5,4,7,u> + 2583237114U, // <4,7,u,6>: Cost 3 vext1 <5,4,7,u>, <6,2,7,3> + 3047757420U, // <4,7,u,7>: Cost 3 vtrnl RHS, <7,7,7,7> + 1509496622U, // <4,7,u,u>: Cost 2 vext1 <5,4,7,u>, LHS + 2618933248U, // <4,u,0,0>: Cost 3 vext2 <0,2,4,u>, <0,0,0,0> + 1545191526U, // <4,u,0,1>: Cost 2 vext2 <0,2,4,u>, LHS + 1545191630U, // <4,u,0,2>: Cost 2 vext2 
<0,2,4,u>, <0,2,4,u> + 2691913445U, // <4,u,0,3>: Cost 3 vext3 <1,2,3,4>, + 2618933586U, // <4,u,0,4>: Cost 3 vext2 <0,2,4,u>, <0,4,1,5> + 2265397305U, // <4,u,0,5>: Cost 3 vrev + 2595189625U, // <4,u,0,6>: Cost 3 vext1 <7,4,u,0>, <6,7,4,u> + 2595190139U, // <4,u,0,7>: Cost 3 vext1 <7,4,u,0>, <7,4,u,0> + 1545192093U, // <4,u,0,u>: Cost 2 vext2 <0,2,4,u>, LHS + 2618934006U, // <4,u,1,0>: Cost 3 vext2 <0,2,4,u>, <1,0,3,2> + 2618934068U, // <4,u,1,1>: Cost 3 vext2 <0,2,4,u>, <1,1,1,1> + 1618171694U, // <4,u,1,2>: Cost 2 vext3 <1,2,3,4>, LHS + 2618934232U, // <4,u,1,3>: Cost 3 vext2 <0,2,4,u>, <1,3,1,3> + 2695894848U, // <4,u,1,4>: Cost 3 vext3 <1,u,3,4>, + 2618934416U, // <4,u,1,5>: Cost 3 vext2 <0,2,4,u>, <1,5,3,7> + 3692676321U, // <4,u,1,6>: Cost 4 vext2 <0,2,4,u>, <1,6,3,7> + 2718750555U, // <4,u,1,7>: Cost 3 vext3 <5,6,7,4>, + 1618171748U, // <4,u,1,u>: Cost 2 vext3 <1,2,3,4>, LHS + 2553397350U, // <4,u,2,0>: Cost 3 vext1 <0,4,u,2>, LHS + 2630215215U, // <4,u,2,1>: Cost 3 vext2 <2,1,4,u>, <2,1,4,u> + 2618934888U, // <4,u,2,2>: Cost 3 vext2 <0,2,4,u>, <2,2,2,2> + 1557800657U, // <4,u,2,3>: Cost 2 vext2 <2,3,4,u>, <2,3,4,u> + 2618935065U, // <4,u,2,4>: Cost 3 vext2 <0,2,4,u>, <2,4,3,u> + 2733864859U, // <4,u,2,5>: Cost 3 vext3 , + 2618935226U, // <4,u,2,6>: Cost 3 vext2 <0,2,4,u>, <2,6,3,7> + 2718750636U, // <4,u,2,7>: Cost 3 vext3 <5,6,7,4>, + 1561118822U, // <4,u,2,u>: Cost 2 vext2 <2,u,4,u>, <2,u,4,u> + 2618935446U, // <4,u,3,0>: Cost 3 vext2 <0,2,4,u>, <3,0,1,2> + 2779318422U, // <4,u,3,1>: Cost 3 vuzpl RHS, <3,0,1,2> + 2636851545U, // <4,u,3,2>: Cost 3 vext2 <3,2,4,u>, <3,2,4,u> + 2618935708U, // <4,u,3,3>: Cost 3 vext2 <0,2,4,u>, <3,3,3,3> + 2618935810U, // <4,u,3,4>: Cost 3 vext2 <0,2,4,u>, <3,4,5,6> + 2691913711U, // <4,u,3,5>: Cost 3 vext3 <1,2,3,4>, + 2588725862U, // <4,u,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3> + 2640169710U, // <4,u,3,7>: Cost 3 vext2 <3,7,4,u>, <3,7,4,u> + 2618936094U, // <4,u,3,u>: Cost 3 vext2 <0,2,4,u>, <3,u,1,2> + 1503559782U, // <4,u,4,0>: Cost 2 vext1 <4,4,u,4>, LHS + 2692282391U, // <4,u,4,1>: Cost 3 vext3 <1,2,u,4>, + 2565359426U, // <4,u,4,2>: Cost 3 vext1 <2,4,u,4>, <2,4,u,4> + 2571332123U, // <4,u,4,3>: Cost 3 vext1 <3,4,u,4>, <3,4,u,4> + 161926454U, // <4,u,4,4>: Cost 1 vdup0 RHS + 1545194806U, // <4,u,4,5>: Cost 2 vext2 <0,2,4,u>, RHS + 1705577782U, // <4,u,4,6>: Cost 2 vuzpl RHS, RHS + 2718750801U, // <4,u,4,7>: Cost 3 vext3 <5,6,7,4>, + 161926454U, // <4,u,4,u>: Cost 1 vdup0 RHS + 1479164006U, // <4,u,5,0>: Cost 2 vext1 <0,4,1,5>, LHS + 1839650606U, // <4,u,5,1>: Cost 2 vzipl RHS, LHS + 2565367502U, // <4,u,5,2>: Cost 3 vext1 <2,4,u,5>, <2,3,4,5> + 3089777309U, // <4,u,5,3>: Cost 3 vtrnr <0,4,1,5>, LHS + 1479167286U, // <4,u,5,4>: Cost 2 vext1 <0,4,1,5>, RHS + 1839650970U, // <4,u,5,5>: Cost 2 vzipl RHS, RHS + 1618172058U, // <4,u,5,6>: Cost 2 vext3 <1,2,3,4>, RHS + 3089780265U, // <4,u,5,7>: Cost 3 vtrnr <0,4,1,5>, RHS + 1618172076U, // <4,u,5,u>: Cost 2 vext3 <1,2,3,4>, RHS + 1479688294U, // <4,u,6,0>: Cost 2 vext1 <0,4,u,6>, LHS + 2553430774U, // <4,u,6,1>: Cost 3 vext1 <0,4,u,6>, <1,0,3,2> + 1973868334U, // <4,u,6,2>: Cost 2 vtrnl RHS, LHS + 1497606685U, // <4,u,6,3>: Cost 2 vext1 <3,4,u,6>, <3,4,u,6> + 1479691574U, // <4,u,6,4>: Cost 2 vext1 <0,4,u,6>, RHS + 1509552079U, // <4,u,6,5>: Cost 2 vext1 <5,4,u,6>, <5,4,u,6> + 1973868698U, // <4,u,6,6>: Cost 2 vtrnl RHS, RHS + 27705344U, // <4,u,6,7>: Cost 0 copy RHS + 27705344U, // <4,u,6,u>: Cost 0 copy RHS + 2565382246U, // <4,u,7,0>: Cost 3 vext1 <2,4,u,7>, LHS + 2565383066U, // <4,u,7,1>: Cost 
3 vext1 <2,4,u,7>, <1,2,3,4> + 2565384005U, // <4,u,7,2>: Cost 3 vext1 <2,4,u,7>, <2,4,u,7> + 2661405966U, // <4,u,7,3>: Cost 3 vext2 <7,3,4,u>, <7,3,4,u> + 2565385526U, // <4,u,7,4>: Cost 3 vext1 <2,4,u,7>, RHS + 2779321702U, // <4,u,7,5>: Cost 3 vuzpl RHS, <7,4,5,6> + 2589274793U, // <4,u,7,6>: Cost 3 vext1 <6,4,u,7>, <6,4,u,7> + 2779321964U, // <4,u,7,7>: Cost 3 vuzpl RHS, <7,7,7,7> + 2565388078U, // <4,u,7,u>: Cost 3 vext1 <2,4,u,7>, LHS + 1479704678U, // <4,u,u,0>: Cost 2 vext1 <0,4,u,u>, LHS + 1545197358U, // <4,u,u,1>: Cost 2 vext2 <0,2,4,u>, LHS + 1618172261U, // <4,u,u,2>: Cost 2 vext3 <1,2,3,4>, LHS + 1497623071U, // <4,u,u,3>: Cost 2 vext1 <3,4,u,u>, <3,4,u,u> + 161926454U, // <4,u,u,4>: Cost 1 vdup0 RHS + 1545197722U, // <4,u,u,5>: Cost 2 vext2 <0,2,4,u>, RHS + 1618172301U, // <4,u,u,6>: Cost 2 vext3 <1,2,3,4>, RHS + 27705344U, // <4,u,u,7>: Cost 0 copy RHS + 27705344U, // <4,u,u,u>: Cost 0 copy RHS + 2687123456U, // <5,0,0,0>: Cost 3 vext3 <0,4,1,5>, <0,0,0,0> + 2687123466U, // <5,0,0,1>: Cost 3 vext3 <0,4,1,5>, <0,0,1,1> + 2687123476U, // <5,0,0,2>: Cost 3 vext3 <0,4,1,5>, <0,0,2,2> + 3710599434U, // <5,0,0,3>: Cost 4 vext2 <3,2,5,0>, <0,3,2,5> + 2642166098U, // <5,0,0,4>: Cost 3 vext2 <4,1,5,0>, <0,4,1,5> + 3657060306U, // <5,0,0,5>: Cost 4 vext1 <5,5,0,0>, <5,5,0,0> + 3292094923U, // <5,0,0,6>: Cost 4 vrev <0,5,6,0> + 3669005700U, // <5,0,0,7>: Cost 4 vext1 <7,5,0,0>, <7,5,0,0> + 2687123530U, // <5,0,0,u>: Cost 3 vext3 <0,4,1,5>, <0,0,u,2> + 2559434854U, // <5,0,1,0>: Cost 3 vext1 <1,5,0,1>, LHS + 2559435887U, // <5,0,1,1>: Cost 3 vext1 <1,5,0,1>, <1,5,0,1> + 1613381734U, // <5,0,1,2>: Cost 2 vext3 <0,4,1,5>, LHS + 3698656256U, // <5,0,1,3>: Cost 4 vext2 <1,2,5,0>, <1,3,5,7> + 2559438134U, // <5,0,1,4>: Cost 3 vext1 <1,5,0,1>, RHS + 2583326675U, // <5,0,1,5>: Cost 3 vext1 <5,5,0,1>, <5,5,0,1> + 3715908851U, // <5,0,1,6>: Cost 4 vext2 <4,1,5,0>, <1,6,5,7> + 3657069562U, // <5,0,1,7>: Cost 4 vext1 <5,5,0,1>, <7,0,1,2> + 1613381788U, // <5,0,1,u>: Cost 2 vext3 <0,4,1,5>, LHS + 2686017700U, // <5,0,2,0>: Cost 3 vext3 <0,2,4,5>, <0,2,0,2> + 2685796528U, // <5,0,2,1>: Cost 3 vext3 <0,2,1,5>, <0,2,1,5> + 2698625208U, // <5,0,2,2>: Cost 3 vext3 <2,3,4,5>, <0,2,2,4> + 2685944002U, // <5,0,2,3>: Cost 3 vext3 <0,2,3,5>, <0,2,3,5> + 2686017739U, // <5,0,2,4>: Cost 3 vext3 <0,2,4,5>, <0,2,4,5> + 2686091476U, // <5,0,2,5>: Cost 3 vext3 <0,2,5,5>, <0,2,5,5> + 2725167324U, // <5,0,2,6>: Cost 3 vext3 <6,7,4,5>, <0,2,6,4> + 2595280230U, // <5,0,2,7>: Cost 3 vext1 <7,5,0,2>, <7,4,5,6> + 2686312687U, // <5,0,2,u>: Cost 3 vext3 <0,2,u,5>, <0,2,u,5> + 3760128248U, // <5,0,3,0>: Cost 4 vext3 <0,3,0,5>, <0,3,0,5> + 3759685888U, // <5,0,3,1>: Cost 4 vext3 <0,2,3,5>, <0,3,1,4> + 2686533898U, // <5,0,3,2>: Cost 3 vext3 <0,3,2,5>, <0,3,2,5> + 3760349459U, // <5,0,3,3>: Cost 4 vext3 <0,3,3,5>, <0,3,3,5> + 2638187004U, // <5,0,3,4>: Cost 3 vext2 <3,4,5,0>, <3,4,5,0> + 3776348452U, // <5,0,3,5>: Cost 4 vext3 <3,0,4,5>, <0,3,5,4> + 3713256094U, // <5,0,3,6>: Cost 4 vext2 <3,6,5,0>, <3,6,5,0> + 3914064896U, // <5,0,3,7>: Cost 4 vuzpr <3,5,7,0>, <1,3,5,7> + 2686976320U, // <5,0,3,u>: Cost 3 vext3 <0,3,u,5>, <0,3,u,5> + 2559459430U, // <5,0,4,0>: Cost 3 vext1 <1,5,0,4>, LHS + 1613381970U, // <5,0,4,1>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5> + 2687123804U, // <5,0,4,2>: Cost 3 vext3 <0,4,1,5>, <0,4,2,6> + 3761013092U, // <5,0,4,3>: Cost 4 vext3 <0,4,3,5>, <0,4,3,5> + 2559462710U, // <5,0,4,4>: Cost 3 vext1 <1,5,0,4>, RHS + 2638187830U, // <5,0,4,5>: Cost 3 vext2 <3,4,5,0>, RHS + 3761234303U, // <5,0,4,6>: Cost 4 
vext3 <0,4,6,5>, <0,4,6,5> + 2646150600U, // <5,0,4,7>: Cost 3 vext2 <4,7,5,0>, <4,7,5,0> + 1613381970U, // <5,0,4,u>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5> + 3766763926U, // <5,0,5,0>: Cost 4 vext3 <1,4,0,5>, <0,5,0,1> + 2919268454U, // <5,0,5,1>: Cost 3 vzipl <5,5,5,5>, LHS + 3053486182U, // <5,0,5,2>: Cost 3 vtrnl <5,5,5,5>, LHS + 3723210589U, // <5,0,5,3>: Cost 4 vext2 <5,3,5,0>, <5,3,5,0> + 3766763966U, // <5,0,5,4>: Cost 4 vext3 <1,4,0,5>, <0,5,4,5> + 2650796031U, // <5,0,5,5>: Cost 3 vext2 <5,5,5,0>, <5,5,5,0> + 3719893090U, // <5,0,5,6>: Cost 4 vext2 <4,7,5,0>, <5,6,7,0> + 3914067254U, // <5,0,5,7>: Cost 4 vuzpr <3,5,7,0>, RHS + 2919269021U, // <5,0,5,u>: Cost 3 vzipl <5,5,5,5>, LHS + 4047519744U, // <5,0,6,0>: Cost 4 vzipr <3,4,5,6>, <0,0,0,0> + 2920038502U, // <5,0,6,1>: Cost 3 vzipl <5,6,7,0>, LHS + 3759759871U, // <5,0,6,2>: Cost 4 vext3 <0,2,4,5>, <0,6,2,7> + 3645164070U, // <5,0,6,3>: Cost 4 vext1 <3,5,0,6>, <3,5,0,6> + 3762414095U, // <5,0,6,4>: Cost 4 vext3 <0,6,4,5>, <0,6,4,5> + 3993780690U, // <5,0,6,5>: Cost 4 vzipl <5,6,7,0>, <0,5,6,7> + 3719893816U, // <5,0,6,6>: Cost 4 vext2 <4,7,5,0>, <6,6,6,6> + 2662077302U, // <5,0,6,7>: Cost 3 vext2 <7,4,5,0>, <6,7,4,5> + 2920039069U, // <5,0,6,u>: Cost 3 vzipl <5,6,7,0>, LHS + 2565455974U, // <5,0,7,0>: Cost 3 vext1 <2,5,0,7>, LHS + 2565456790U, // <5,0,7,1>: Cost 3 vext1 <2,5,0,7>, <1,2,3,0> + 2565457742U, // <5,0,7,2>: Cost 3 vext1 <2,5,0,7>, <2,5,0,7> + 3639199894U, // <5,0,7,3>: Cost 4 vext1 <2,5,0,7>, <3,0,1,2> + 2565459254U, // <5,0,7,4>: Cost 3 vext1 <2,5,0,7>, RHS + 2589347938U, // <5,0,7,5>: Cost 3 vext1 <6,5,0,7>, <5,6,7,0> + 2589348530U, // <5,0,7,6>: Cost 3 vext1 <6,5,0,7>, <6,5,0,7> + 4188456422U, // <5,0,7,7>: Cost 4 vtrnr RHS, <2,0,5,7> + 2565461806U, // <5,0,7,u>: Cost 3 vext1 <2,5,0,7>, LHS + 2687124106U, // <5,0,u,0>: Cost 3 vext3 <0,4,1,5>, <0,u,0,2> + 1616036502U, // <5,0,u,1>: Cost 2 vext3 <0,u,1,5>, <0,u,1,5> + 1613382301U, // <5,0,u,2>: Cost 2 vext3 <0,4,1,5>, LHS + 2689925800U, // <5,0,u,3>: Cost 3 vext3 <0,u,3,5>, <0,u,3,5> + 2687124146U, // <5,0,u,4>: Cost 3 vext3 <0,4,1,5>, <0,u,4,6> + 2638190746U, // <5,0,u,5>: Cost 3 vext2 <3,4,5,0>, RHS + 2589356723U, // <5,0,u,6>: Cost 3 vext1 <6,5,0,u>, <6,5,0,u> + 2595280230U, // <5,0,u,7>: Cost 3 vext1 <7,5,0,2>, <7,4,5,6> + 1613382355U, // <5,0,u,u>: Cost 2 vext3 <0,4,1,5>, LHS + 2646818816U, // <5,1,0,0>: Cost 3 vext2 <4,u,5,1>, <0,0,0,0> + 1573077094U, // <5,1,0,1>: Cost 2 vext2 <4,u,5,1>, LHS + 2646818980U, // <5,1,0,2>: Cost 3 vext2 <4,u,5,1>, <0,2,0,2> + 2687124214U, // <5,1,0,3>: Cost 3 vext3 <0,4,1,5>, <1,0,3,2> + 2641510738U, // <5,1,0,4>: Cost 3 vext2 <4,0,5,1>, <0,4,1,5> + 2641510814U, // <5,1,0,5>: Cost 3 vext2 <4,0,5,1>, <0,5,1,0> + 3720561142U, // <5,1,0,6>: Cost 4 vext2 <4,u,5,1>, <0,6,1,7> + 3298141357U, // <5,1,0,7>: Cost 4 vrev <1,5,7,0> + 1573077661U, // <5,1,0,u>: Cost 2 vext2 <4,u,5,1>, LHS + 2223891567U, // <5,1,1,0>: Cost 3 vrev <1,5,0,1> + 2687124276U, // <5,1,1,1>: Cost 3 vext3 <0,4,1,5>, <1,1,1,1> + 2646819734U, // <5,1,1,2>: Cost 3 vext2 <4,u,5,1>, <1,2,3,0> + 2687124296U, // <5,1,1,3>: Cost 3 vext3 <0,4,1,5>, <1,1,3,3> + 2691326803U, // <5,1,1,4>: Cost 3 vext3 <1,1,4,5>, <1,1,4,5> + 2691400540U, // <5,1,1,5>: Cost 3 vext3 <1,1,5,5>, <1,1,5,5> + 3765216101U, // <5,1,1,6>: Cost 4 vext3 <1,1,6,5>, <1,1,6,5> + 3765289838U, // <5,1,1,7>: Cost 4 vext3 <1,1,7,5>, <1,1,7,5> + 2687124341U, // <5,1,1,u>: Cost 3 vext3 <0,4,1,5>, <1,1,u,3> + 3297641584U, // <5,1,2,0>: Cost 4 vrev <1,5,0,2> + 3763520391U, // <5,1,2,1>: Cost 4 vext3 <0,u,1,5>, <1,2,1,3> 
+ 2646820456U, // <5,1,2,2>: Cost 3 vext2 <4,u,5,1>, <2,2,2,2> + 2687124374U, // <5,1,2,3>: Cost 3 vext3 <0,4,1,5>, <1,2,3,0> + 2691990436U, // <5,1,2,4>: Cost 3 vext3 <1,2,4,5>, <1,2,4,5> + 2687124395U, // <5,1,2,5>: Cost 3 vext3 <0,4,1,5>, <1,2,5,3> + 2646820794U, // <5,1,2,6>: Cost 3 vext2 <4,u,5,1>, <2,6,3,7> + 3808199610U, // <5,1,2,7>: Cost 4 vext3 , <1,2,7,0> + 2687124419U, // <5,1,2,u>: Cost 3 vext3 <0,4,1,5>, <1,2,u,0> + 2577440870U, // <5,1,3,0>: Cost 3 vext1 <4,5,1,3>, LHS + 2687124440U, // <5,1,3,1>: Cost 3 vext3 <0,4,1,5>, <1,3,1,3> + 3759686627U, // <5,1,3,2>: Cost 4 vext3 <0,2,3,5>, <1,3,2,5> + 2692580332U, // <5,1,3,3>: Cost 3 vext3 <1,3,3,5>, <1,3,3,5> + 2687124469U, // <5,1,3,4>: Cost 3 vext3 <0,4,1,5>, <1,3,4,5> + 2685207552U, // <5,1,3,5>: Cost 3 vext3 <0,1,2,5>, <1,3,5,7> + 3760866313U, // <5,1,3,6>: Cost 4 vext3 <0,4,1,5>, <1,3,6,7> + 2692875280U, // <5,1,3,7>: Cost 3 vext3 <1,3,7,5>, <1,3,7,5> + 2687124503U, // <5,1,3,u>: Cost 3 vext3 <0,4,1,5>, <1,3,u,3> + 1567771538U, // <5,1,4,0>: Cost 2 vext2 <4,0,5,1>, <4,0,5,1> + 2693096491U, // <5,1,4,1>: Cost 3 vext3 <1,4,1,5>, <1,4,1,5> + 2693170228U, // <5,1,4,2>: Cost 3 vext3 <1,4,2,5>, <1,4,2,5> + 2687124541U, // <5,1,4,3>: Cost 3 vext3 <0,4,1,5>, <1,4,3,5> + 2646822096U, // <5,1,4,4>: Cost 3 vext2 <4,u,5,1>, <4,4,4,4> + 1573080374U, // <5,1,4,5>: Cost 2 vext2 <4,u,5,1>, RHS + 2646822260U, // <5,1,4,6>: Cost 3 vext2 <4,u,5,1>, <4,6,4,6> + 3298174129U, // <5,1,4,7>: Cost 4 vrev <1,5,7,4> + 1573080602U, // <5,1,4,u>: Cost 2 vext2 <4,u,5,1>, <4,u,5,1> + 2687124591U, // <5,1,5,0>: Cost 3 vext3 <0,4,1,5>, <1,5,0,1> + 2646822543U, // <5,1,5,1>: Cost 3 vext2 <4,u,5,1>, <5,1,0,1> + 3760866433U, // <5,1,5,2>: Cost 4 vext3 <0,4,1,5>, <1,5,2,1> + 2687124624U, // <5,1,5,3>: Cost 3 vext3 <0,4,1,5>, <1,5,3,7> + 2687124631U, // <5,1,5,4>: Cost 3 vext3 <0,4,1,5>, <1,5,4,5> + 2646822916U, // <5,1,5,5>: Cost 3 vext2 <4,u,5,1>, <5,5,5,5> + 2646823010U, // <5,1,5,6>: Cost 3 vext2 <4,u,5,1>, <5,6,7,0> + 2646823080U, // <5,1,5,7>: Cost 3 vext2 <4,u,5,1>, <5,7,5,7> + 2687124663U, // <5,1,5,u>: Cost 3 vext3 <0,4,1,5>, <1,5,u,1> + 2553577574U, // <5,1,6,0>: Cost 3 vext1 <0,5,1,6>, LHS + 3763520719U, // <5,1,6,1>: Cost 4 vext3 <0,u,1,5>, <1,6,1,7> + 2646823418U, // <5,1,6,2>: Cost 3 vext2 <4,u,5,1>, <6,2,7,3> + 3760866529U, // <5,1,6,3>: Cost 4 vext3 <0,4,1,5>, <1,6,3,7> + 2553580854U, // <5,1,6,4>: Cost 3 vext1 <0,5,1,6>, RHS + 2687124723U, // <5,1,6,5>: Cost 3 vext3 <0,4,1,5>, <1,6,5,7> + 2646823736U, // <5,1,6,6>: Cost 3 vext2 <4,u,5,1>, <6,6,6,6> + 2646823758U, // <5,1,6,7>: Cost 3 vext2 <4,u,5,1>, <6,7,0,1> + 2646823839U, // <5,1,6,u>: Cost 3 vext2 <4,u,5,1>, <6,u,0,1> + 2559557734U, // <5,1,7,0>: Cost 3 vext1 <1,5,1,7>, LHS + 2559558452U, // <5,1,7,1>: Cost 3 vext1 <1,5,1,7>, <1,1,1,1> + 2571503270U, // <5,1,7,2>: Cost 3 vext1 <3,5,1,7>, <2,3,0,1> + 2040971366U, // <5,1,7,3>: Cost 2 vtrnr RHS, LHS + 2559561014U, // <5,1,7,4>: Cost 3 vext1 <1,5,1,7>, RHS + 2595393232U, // <5,1,7,5>: Cost 3 vext1 <7,5,1,7>, <5,1,7,3> + 4188455035U, // <5,1,7,6>: Cost 4 vtrnr RHS, <0,1,4,6> + 2646824556U, // <5,1,7,7>: Cost 3 vext2 <4,u,5,1>, <7,7,7,7> + 2040971371U, // <5,1,7,u>: Cost 2 vtrnr RHS, LHS + 1591662326U, // <5,1,u,0>: Cost 2 vext2 , + 1573082926U, // <5,1,u,1>: Cost 2 vext2 <4,u,5,1>, LHS + 2695824760U, // <5,1,u,2>: Cost 3 vext3 <1,u,2,5>, <1,u,2,5> + 2040979558U, // <5,1,u,3>: Cost 2 vtrnr RHS, LHS + 2687124874U, // <5,1,u,4>: Cost 3 vext3 <0,4,1,5>, <1,u,4,5> + 1573083290U, // <5,1,u,5>: Cost 2 vext2 <4,u,5,1>, RHS + 2646825168U, // <5,1,u,6>: 
Cost 3 vext2 <4,u,5,1>, + 2646825216U, // <5,1,u,7>: Cost 3 vext2 <4,u,5,1>, + 2040979563U, // <5,1,u,u>: Cost 2 vtrnr RHS, LHS + 3702652928U, // <5,2,0,0>: Cost 4 vext2 <1,u,5,2>, <0,0,0,0> + 2628911206U, // <5,2,0,1>: Cost 3 vext2 <1,u,5,2>, LHS + 2641518756U, // <5,2,0,2>: Cost 3 vext2 <4,0,5,2>, <0,2,0,2> + 3759760847U, // <5,2,0,3>: Cost 4 vext3 <0,2,4,5>, <2,0,3,2> + 3760866775U, // <5,2,0,4>: Cost 4 vext3 <0,4,1,5>, <2,0,4,1> + 3759539680U, // <5,2,0,5>: Cost 4 vext3 <0,2,1,5>, <2,0,5,1> + 3760866796U, // <5,2,0,6>: Cost 4 vext3 <0,4,1,5>, <2,0,6,4> + 3304114054U, // <5,2,0,7>: Cost 4 vrev <2,5,7,0> + 2628911773U, // <5,2,0,u>: Cost 3 vext2 <1,u,5,2>, LHS + 2623603464U, // <5,2,1,0>: Cost 3 vext2 <1,0,5,2>, <1,0,5,2> + 3698008921U, // <5,2,1,1>: Cost 4 vext2 <1,1,5,2>, <1,1,5,2> + 3633325603U, // <5,2,1,2>: Cost 4 vext1 <1,5,2,1>, <2,1,3,5> + 2687125027U, // <5,2,1,3>: Cost 3 vext3 <0,4,1,5>, <2,1,3,5> + 3633327414U, // <5,2,1,4>: Cost 4 vext1 <1,5,2,1>, RHS + 3759539760U, // <5,2,1,5>: Cost 4 vext3 <0,2,1,5>, <2,1,5,0> + 3760866876U, // <5,2,1,6>: Cost 4 vext3 <0,4,1,5>, <2,1,6,3> + 3304122247U, // <5,2,1,7>: Cost 4 vrev <2,5,7,1> + 2687125072U, // <5,2,1,u>: Cost 3 vext3 <0,4,1,5>, <2,1,u,5> + 3633332326U, // <5,2,2,0>: Cost 4 vext1 <1,5,2,2>, LHS + 3759760992U, // <5,2,2,1>: Cost 4 vext3 <0,2,4,5>, <2,2,1,3> + 2687125096U, // <5,2,2,2>: Cost 3 vext3 <0,4,1,5>, <2,2,2,2> + 2687125106U, // <5,2,2,3>: Cost 3 vext3 <0,4,1,5>, <2,2,3,3> + 2697963133U, // <5,2,2,4>: Cost 3 vext3 <2,2,4,5>, <2,2,4,5> + 3759466120U, // <5,2,2,5>: Cost 4 vext3 <0,2,0,5>, <2,2,5,7> + 3760866960U, // <5,2,2,6>: Cost 4 vext3 <0,4,1,5>, <2,2,6,6> + 3771926168U, // <5,2,2,7>: Cost 4 vext3 <2,2,7,5>, <2,2,7,5> + 2687125151U, // <5,2,2,u>: Cost 3 vext3 <0,4,1,5>, <2,2,u,3> + 2687125158U, // <5,2,3,0>: Cost 3 vext3 <0,4,1,5>, <2,3,0,1> + 2698405555U, // <5,2,3,1>: Cost 3 vext3 <2,3,1,5>, <2,3,1,5> + 2577516238U, // <5,2,3,2>: Cost 3 vext1 <4,5,2,3>, <2,3,4,5> + 3759687365U, // <5,2,3,3>: Cost 4 vext3 <0,2,3,5>, <2,3,3,5> + 1624884942U, // <5,2,3,4>: Cost 2 vext3 <2,3,4,5>, <2,3,4,5> + 2698700503U, // <5,2,3,5>: Cost 3 vext3 <2,3,5,5>, <2,3,5,5> + 3772368608U, // <5,2,3,6>: Cost 4 vext3 <2,3,4,5>, <2,3,6,5> + 3702655716U, // <5,2,3,7>: Cost 4 vext2 <1,u,5,2>, <3,7,3,7> + 1625179890U, // <5,2,3,u>: Cost 2 vext3 <2,3,u,5>, <2,3,u,5> + 2641521555U, // <5,2,4,0>: Cost 3 vext2 <4,0,5,2>, <4,0,5,2> + 3772368642U, // <5,2,4,1>: Cost 4 vext3 <2,3,4,5>, <2,4,1,3> + 2699142925U, // <5,2,4,2>: Cost 3 vext3 <2,4,2,5>, <2,4,2,5> + 2698626838U, // <5,2,4,3>: Cost 3 vext3 <2,3,4,5>, <2,4,3,5> + 2698626848U, // <5,2,4,4>: Cost 3 vext3 <2,3,4,5>, <2,4,4,6> + 2628914486U, // <5,2,4,5>: Cost 3 vext2 <1,u,5,2>, RHS + 2645503353U, // <5,2,4,6>: Cost 3 vext2 <4,6,5,2>, <4,6,5,2> + 3304146826U, // <5,2,4,7>: Cost 4 vrev <2,5,7,4> + 2628914729U, // <5,2,4,u>: Cost 3 vext2 <1,u,5,2>, RHS + 2553643110U, // <5,2,5,0>: Cost 3 vext1 <0,5,2,5>, LHS + 3758950227U, // <5,2,5,1>: Cost 4 vext3 <0,1,2,5>, <2,5,1,3> + 3759761248U, // <5,2,5,2>: Cost 4 vext3 <0,2,4,5>, <2,5,2,7> + 2982396006U, // <5,2,5,3>: Cost 3 vzipr <4,u,5,5>, LHS + 2553646390U, // <5,2,5,4>: Cost 3 vext1 <0,5,2,5>, RHS + 2553647108U, // <5,2,5,5>: Cost 3 vext1 <0,5,2,5>, <5,5,5,5> + 3760867204U, // <5,2,5,6>: Cost 4 vext3 <0,4,1,5>, <2,5,6,7> + 3702657141U, // <5,2,5,7>: Cost 4 vext2 <1,u,5,2>, <5,7,0,1> + 2982396011U, // <5,2,5,u>: Cost 3 vzipr <4,u,5,5>, LHS + 3627393126U, // <5,2,6,0>: Cost 4 vext1 <0,5,2,6>, LHS + 3760867236U, // <5,2,6,1>: Cost 4 vext3 <0,4,1,5>, <2,6,1,3> 
+ 2645504506U, // <5,2,6,2>: Cost 3 vext2 <4,6,5,2>, <6,2,7,3> + 2687125434U, // <5,2,6,3>: Cost 3 vext3 <0,4,1,5>, <2,6,3,7> + 2700617665U, // <5,2,6,4>: Cost 3 vext3 <2,6,4,5>, <2,6,4,5> + 3760867276U, // <5,2,6,5>: Cost 4 vext3 <0,4,1,5>, <2,6,5,7> + 3763521493U, // <5,2,6,6>: Cost 4 vext3 <0,u,1,5>, <2,6,6,7> + 3719246670U, // <5,2,6,7>: Cost 4 vext2 <4,6,5,2>, <6,7,0,1> + 2687125479U, // <5,2,6,u>: Cost 3 vext3 <0,4,1,5>, <2,6,u,7> + 2565603430U, // <5,2,7,0>: Cost 3 vext1 <2,5,2,7>, LHS + 2553660150U, // <5,2,7,1>: Cost 3 vext1 <0,5,2,7>, <1,0,3,2> + 2565605216U, // <5,2,7,2>: Cost 3 vext1 <2,5,2,7>, <2,5,2,7> + 2961178726U, // <5,2,7,3>: Cost 3 vzipr <1,3,5,7>, LHS + 2565606710U, // <5,2,7,4>: Cost 3 vext1 <2,5,2,7>, RHS + 4034920552U, // <5,2,7,5>: Cost 4 vzipr <1,3,5,7>, <0,1,2,5> + 3114713292U, // <5,2,7,6>: Cost 3 vtrnr RHS, <0,2,4,6> + 3702658668U, // <5,2,7,7>: Cost 4 vext2 <1,u,5,2>, <7,7,7,7> + 2961178731U, // <5,2,7,u>: Cost 3 vzipr <1,3,5,7>, LHS + 2687125563U, // <5,2,u,0>: Cost 3 vext3 <0,4,1,5>, <2,u,0,1> + 2628917038U, // <5,2,u,1>: Cost 3 vext2 <1,u,5,2>, LHS + 2565613409U, // <5,2,u,2>: Cost 3 vext1 <2,5,2,u>, <2,5,2,u> + 2687125592U, // <5,2,u,3>: Cost 3 vext3 <0,4,1,5>, <2,u,3,3> + 1628203107U, // <5,2,u,4>: Cost 2 vext3 <2,u,4,5>, <2,u,4,5> + 2628917402U, // <5,2,u,5>: Cost 3 vext2 <1,u,5,2>, RHS + 2702092405U, // <5,2,u,6>: Cost 3 vext3 <2,u,6,5>, <2,u,6,5> + 3304179598U, // <5,2,u,7>: Cost 4 vrev <2,5,7,u> + 1628498055U, // <5,2,u,u>: Cost 2 vext3 <2,u,u,5>, <2,u,u,5> + 3760867467U, // <5,3,0,0>: Cost 4 vext3 <0,4,1,5>, <3,0,0,0> + 2687125654U, // <5,3,0,1>: Cost 3 vext3 <0,4,1,5>, <3,0,1,2> + 3759761565U, // <5,3,0,2>: Cost 4 vext3 <0,2,4,5>, <3,0,2,0> + 3633391766U, // <5,3,0,3>: Cost 4 vext1 <1,5,3,0>, <3,0,1,2> + 2687125680U, // <5,3,0,4>: Cost 3 vext3 <0,4,1,5>, <3,0,4,1> + 3760277690U, // <5,3,0,5>: Cost 4 vext3 <0,3,2,5>, <3,0,5,2> + 3310013014U, // <5,3,0,6>: Cost 4 vrev <3,5,6,0> + 2236344927U, // <5,3,0,7>: Cost 3 vrev <3,5,7,0> + 2687125717U, // <5,3,0,u>: Cost 3 vext3 <0,4,1,5>, <3,0,u,2> + 3760867551U, // <5,3,1,0>: Cost 4 vext3 <0,4,1,5>, <3,1,0,3> + 3760867558U, // <5,3,1,1>: Cost 4 vext3 <0,4,1,5>, <3,1,1,1> + 2624938923U, // <5,3,1,2>: Cost 3 vext2 <1,2,5,3>, <1,2,5,3> + 2703198460U, // <5,3,1,3>: Cost 3 vext3 <3,1,3,5>, <3,1,3,5> + 3760867587U, // <5,3,1,4>: Cost 4 vext3 <0,4,1,5>, <3,1,4,3> + 2636219536U, // <5,3,1,5>: Cost 3 vext2 <3,1,5,3>, <1,5,3,7> + 3698681075U, // <5,3,1,6>: Cost 4 vext2 <1,2,5,3>, <1,6,5,7> + 2703493408U, // <5,3,1,7>: Cost 3 vext3 <3,1,7,5>, <3,1,7,5> + 2628920721U, // <5,3,1,u>: Cost 3 vext2 <1,u,5,3>, <1,u,5,3> + 3766765870U, // <5,3,2,0>: Cost 4 vext3 <1,4,0,5>, <3,2,0,1> + 3698681379U, // <5,3,2,1>: Cost 4 vext2 <1,2,5,3>, <2,1,3,5> + 3760867649U, // <5,3,2,2>: Cost 4 vext3 <0,4,1,5>, <3,2,2,2> + 2698627404U, // <5,3,2,3>: Cost 3 vext3 <2,3,4,5>, <3,2,3,4> + 2703935830U, // <5,3,2,4>: Cost 3 vext3 <3,2,4,5>, <3,2,4,5> + 2698627422U, // <5,3,2,5>: Cost 3 vext3 <2,3,4,5>, <3,2,5,4> + 3760867686U, // <5,3,2,6>: Cost 4 vext3 <0,4,1,5>, <3,2,6,3> + 3769788783U, // <5,3,2,7>: Cost 4 vext3 <1,u,5,5>, <3,2,7,3> + 2701945209U, // <5,3,2,u>: Cost 3 vext3 <2,u,4,5>, <3,2,u,4> + 3760867711U, // <5,3,3,0>: Cost 4 vext3 <0,4,1,5>, <3,3,0,1> + 2636220684U, // <5,3,3,1>: Cost 3 vext2 <3,1,5,3>, <3,1,5,3> + 3772369298U, // <5,3,3,2>: Cost 4 vext3 <2,3,4,5>, <3,3,2,2> + 2687125916U, // <5,3,3,3>: Cost 3 vext3 <0,4,1,5>, <3,3,3,3> + 2704599463U, // <5,3,3,4>: Cost 3 vext3 <3,3,4,5>, <3,3,4,5> + 2704673200U, // <5,3,3,5>: Cost 3 vext3 
<3,3,5,5>, <3,3,5,5> + 3709962935U, // <5,3,3,6>: Cost 4 vext2 <3,1,5,3>, <3,6,7,7> + 3772369346U, // <5,3,3,7>: Cost 4 vext3 <2,3,4,5>, <3,3,7,5> + 2704894411U, // <5,3,3,u>: Cost 3 vext3 <3,3,u,5>, <3,3,u,5> + 2704968148U, // <5,3,4,0>: Cost 3 vext3 <3,4,0,5>, <3,4,0,5> + 3698682850U, // <5,3,4,1>: Cost 4 vext2 <1,2,5,3>, <4,1,5,0> + 2642857014U, // <5,3,4,2>: Cost 3 vext2 <4,2,5,3>, <4,2,5,3> + 2705189359U, // <5,3,4,3>: Cost 3 vext3 <3,4,3,5>, <3,4,3,5> + 2705263096U, // <5,3,4,4>: Cost 3 vext3 <3,4,4,5>, <3,4,4,5> + 2685946370U, // <5,3,4,5>: Cost 3 vext3 <0,2,3,5>, <3,4,5,6> + 3779152394U, // <5,3,4,6>: Cost 4 vext3 <3,4,6,5>, <3,4,6,5> + 2236377699U, // <5,3,4,7>: Cost 3 vrev <3,5,7,4> + 2687126045U, // <5,3,4,u>: Cost 3 vext3 <0,4,1,5>, <3,4,u,6> + 2571632742U, // <5,3,5,0>: Cost 3 vext1 <3,5,3,5>, LHS + 2559689870U, // <5,3,5,1>: Cost 3 vext1 <1,5,3,5>, <1,5,3,5> + 2571634382U, // <5,3,5,2>: Cost 3 vext1 <3,5,3,5>, <2,3,4,5> + 2571635264U, // <5,3,5,3>: Cost 3 vext1 <3,5,3,5>, <3,5,3,5> + 2571636022U, // <5,3,5,4>: Cost 3 vext1 <3,5,3,5>, RHS + 2559692804U, // <5,3,5,5>: Cost 3 vext1 <1,5,3,5>, <5,5,5,5> + 3720581218U, // <5,3,5,6>: Cost 4 vext2 <4,u,5,3>, <5,6,7,0> + 2236385892U, // <5,3,5,7>: Cost 3 vrev <3,5,7,5> + 2571638574U, // <5,3,5,u>: Cost 3 vext1 <3,5,3,5>, LHS + 2565668966U, // <5,3,6,0>: Cost 3 vext1 <2,5,3,6>, LHS + 3633439887U, // <5,3,6,1>: Cost 4 vext1 <1,5,3,6>, <1,5,3,6> + 2565670760U, // <5,3,6,2>: Cost 3 vext1 <2,5,3,6>, <2,5,3,6> + 2565671426U, // <5,3,6,3>: Cost 3 vext1 <2,5,3,6>, <3,4,5,6> + 2565672246U, // <5,3,6,4>: Cost 3 vext1 <2,5,3,6>, RHS + 3639414630U, // <5,3,6,5>: Cost 4 vext1 <2,5,3,6>, <5,3,6,0> + 4047521640U, // <5,3,6,6>: Cost 4 vzipr <3,4,5,6>, <2,5,3,6> + 2725169844U, // <5,3,6,7>: Cost 3 vext3 <6,7,4,5>, <3,6,7,4> + 2565674798U, // <5,3,6,u>: Cost 3 vext1 <2,5,3,6>, LHS + 1485963366U, // <5,3,7,0>: Cost 2 vext1 <1,5,3,7>, LHS + 1485964432U, // <5,3,7,1>: Cost 2 vext1 <1,5,3,7>, <1,5,3,7> + 2559706728U, // <5,3,7,2>: Cost 3 vext1 <1,5,3,7>, <2,2,2,2> + 2559707286U, // <5,3,7,3>: Cost 3 vext1 <1,5,3,7>, <3,0,1,2> + 1485966646U, // <5,3,7,4>: Cost 2 vext1 <1,5,3,7>, RHS + 2559708880U, // <5,3,7,5>: Cost 3 vext1 <1,5,3,7>, <5,1,7,3> + 2601513466U, // <5,3,7,6>: Cost 3 vext1 , <6,2,7,3> + 3114714112U, // <5,3,7,7>: Cost 3 vtrnr RHS, <1,3,5,7> + 1485969198U, // <5,3,7,u>: Cost 2 vext1 <1,5,3,7>, LHS + 1485971558U, // <5,3,u,0>: Cost 2 vext1 <1,5,3,u>, LHS + 1485972625U, // <5,3,u,1>: Cost 2 vext1 <1,5,3,u>, <1,5,3,u> + 2559714920U, // <5,3,u,2>: Cost 3 vext1 <1,5,3,u>, <2,2,2,2> + 2559715478U, // <5,3,u,3>: Cost 3 vext1 <1,5,3,u>, <3,0,1,2> + 1485974838U, // <5,3,u,4>: Cost 2 vext1 <1,5,3,u>, RHS + 2687126342U, // <5,3,u,5>: Cost 3 vext3 <0,4,1,5>, <3,u,5,6> + 2601521658U, // <5,3,u,6>: Cost 3 vext1 , <6,2,7,3> + 2236410471U, // <5,3,u,7>: Cost 3 vrev <3,5,7,u> + 1485977390U, // <5,3,u,u>: Cost 2 vext1 <1,5,3,u>, LHS + 3627491430U, // <5,4,0,0>: Cost 4 vext1 <0,5,4,0>, LHS + 2636890214U, // <5,4,0,1>: Cost 3 vext2 <3,2,5,4>, LHS + 3703333028U, // <5,4,0,2>: Cost 4 vext2 <2,0,5,4>, <0,2,0,2> + 3782249348U, // <5,4,0,3>: Cost 4 vext3 <4,0,3,5>, <4,0,3,5> + 2642198866U, // <5,4,0,4>: Cost 3 vext2 <4,1,5,4>, <0,4,1,5> + 2687126418U, // <5,4,0,5>: Cost 3 vext3 <0,4,1,5>, <4,0,5,1> + 2242243887U, // <5,4,0,6>: Cost 3 vrev <4,5,6,0> + 3316059448U, // <5,4,0,7>: Cost 4 vrev <4,5,7,0> + 2636890781U, // <5,4,0,u>: Cost 3 vext2 <3,2,5,4>, LHS + 2241809658U, // <5,4,1,0>: Cost 3 vrev <4,5,0,1> + 3698025307U, // <5,4,1,1>: Cost 4 vext2 <1,1,5,4>, <1,1,5,4> + 
3698688940U, // <5,4,1,2>: Cost 4 vext2 <1,2,5,4>, <1,2,5,4> + 3698689024U, // <5,4,1,3>: Cost 4 vext2 <1,2,5,4>, <1,3,5,7> + 3700016206U, // <5,4,1,4>: Cost 4 vext2 <1,4,5,4>, <1,4,5,4> + 2687126498U, // <5,4,1,5>: Cost 3 vext3 <0,4,1,5>, <4,1,5,0> + 3760868336U, // <5,4,1,6>: Cost 4 vext3 <0,4,1,5>, <4,1,6,5> + 3316067641U, // <5,4,1,7>: Cost 4 vrev <4,5,7,1> + 2242399554U, // <5,4,1,u>: Cost 3 vrev <4,5,u,1> + 3703334371U, // <5,4,2,0>: Cost 4 vext2 <2,0,5,4>, <2,0,5,4> + 3703998004U, // <5,4,2,1>: Cost 4 vext2 <2,1,5,4>, <2,1,5,4> + 3704661637U, // <5,4,2,2>: Cost 4 vext2 <2,2,5,4>, <2,2,5,4> + 2636891854U, // <5,4,2,3>: Cost 3 vext2 <3,2,5,4>, <2,3,4,5> + 3705988903U, // <5,4,2,4>: Cost 4 vext2 <2,4,5,4>, <2,4,5,4> + 2698628150U, // <5,4,2,5>: Cost 3 vext3 <2,3,4,5>, <4,2,5,3> + 3760868415U, // <5,4,2,6>: Cost 4 vext3 <0,4,1,5>, <4,2,6,3> + 3783871562U, // <5,4,2,7>: Cost 4 vext3 <4,2,7,5>, <4,2,7,5> + 2666752099U, // <5,4,2,u>: Cost 3 vext2 , <2,u,4,5> + 3639459942U, // <5,4,3,0>: Cost 4 vext1 <2,5,4,3>, LHS + 3709970701U, // <5,4,3,1>: Cost 4 vext2 <3,1,5,4>, <3,1,5,4> + 2636892510U, // <5,4,3,2>: Cost 3 vext2 <3,2,5,4>, <3,2,5,4> + 3710634396U, // <5,4,3,3>: Cost 4 vext2 <3,2,5,4>, <3,3,3,3> + 2638219776U, // <5,4,3,4>: Cost 3 vext2 <3,4,5,4>, <3,4,5,4> + 3766987908U, // <5,4,3,5>: Cost 4 vext3 <1,4,3,5>, <4,3,5,0> + 2710719634U, // <5,4,3,6>: Cost 3 vext3 <4,3,6,5>, <4,3,6,5> + 3914097664U, // <5,4,3,7>: Cost 4 vuzpr <3,5,7,4>, <1,3,5,7> + 2640874308U, // <5,4,3,u>: Cost 3 vext2 <3,u,5,4>, <3,u,5,4> + 2583642214U, // <5,4,4,0>: Cost 3 vext1 <5,5,4,4>, LHS + 2642201574U, // <5,4,4,1>: Cost 3 vext2 <4,1,5,4>, <4,1,5,4> + 3710635062U, // <5,4,4,2>: Cost 4 vext2 <3,2,5,4>, <4,2,5,3> + 3717270664U, // <5,4,4,3>: Cost 4 vext2 <4,3,5,4>, <4,3,5,4> + 2713963728U, // <5,4,4,4>: Cost 3 vext3 <4,u,5,5>, <4,4,4,4> + 1637567706U, // <5,4,4,5>: Cost 2 vext3 <4,4,5,5>, <4,4,5,5> + 2242276659U, // <5,4,4,6>: Cost 3 vrev <4,5,6,4> + 2646183372U, // <5,4,4,7>: Cost 3 vext2 <4,7,5,4>, <4,7,5,4> + 1637788917U, // <5,4,4,u>: Cost 2 vext3 <4,4,u,5>, <4,4,u,5> + 2559762534U, // <5,4,5,0>: Cost 3 vext1 <1,5,4,5>, LHS + 2559763607U, // <5,4,5,1>: Cost 3 vext1 <1,5,4,5>, <1,5,4,5> + 2698628366U, // <5,4,5,2>: Cost 3 vext3 <2,3,4,5>, <4,5,2,3> + 3633506454U, // <5,4,5,3>: Cost 4 vext1 <1,5,4,5>, <3,0,1,2> + 2559765814U, // <5,4,5,4>: Cost 3 vext1 <1,5,4,5>, RHS + 2583654395U, // <5,4,5,5>: Cost 3 vext1 <5,5,4,5>, <5,5,4,5> + 1613385014U, // <5,4,5,6>: Cost 2 vext3 <0,4,1,5>, RHS + 3901639990U, // <5,4,5,7>: Cost 4 vuzpr <1,5,0,4>, RHS + 1613385032U, // <5,4,5,u>: Cost 2 vext3 <0,4,1,5>, RHS + 2559770726U, // <5,4,6,0>: Cost 3 vext1 <1,5,4,6>, LHS + 2559771648U, // <5,4,6,1>: Cost 3 vext1 <1,5,4,6>, <1,3,5,7> + 3633514088U, // <5,4,6,2>: Cost 4 vext1 <1,5,4,6>, <2,2,2,2> + 2571717122U, // <5,4,6,3>: Cost 3 vext1 <3,5,4,6>, <3,4,5,6> + 2559774006U, // <5,4,6,4>: Cost 3 vext1 <1,5,4,6>, RHS + 2712636796U, // <5,4,6,5>: Cost 3 vext3 <4,6,5,5>, <4,6,5,5> + 3760868743U, // <5,4,6,6>: Cost 4 vext3 <0,4,1,5>, <4,6,6,7> + 2712784270U, // <5,4,6,7>: Cost 3 vext3 <4,6,7,5>, <4,6,7,5> + 2559776558U, // <5,4,6,u>: Cost 3 vext1 <1,5,4,6>, LHS + 2565750886U, // <5,4,7,0>: Cost 3 vext1 <2,5,4,7>, LHS + 2565751706U, // <5,4,7,1>: Cost 3 vext1 <2,5,4,7>, <1,2,3,4> + 2565752690U, // <5,4,7,2>: Cost 3 vext1 <2,5,4,7>, <2,5,4,7> + 2571725387U, // <5,4,7,3>: Cost 3 vext1 <3,5,4,7>, <3,5,4,7> + 2565754166U, // <5,4,7,4>: Cost 3 vext1 <2,5,4,7>, RHS + 3114713426U, // <5,4,7,5>: Cost 3 vtrnr RHS, <0,4,1,5> + 94817590U, // 
<5,4,7,6>: Cost 1 vrev RHS + 2595616175U, // <5,4,7,7>: Cost 3 vext1 <7,5,4,7>, <7,5,4,7> + 94965064U, // <5,4,7,u>: Cost 1 vrev RHS + 2559787110U, // <5,4,u,0>: Cost 3 vext1 <1,5,4,u>, LHS + 2559788186U, // <5,4,u,1>: Cost 3 vext1 <1,5,4,u>, <1,5,4,u> + 2242014483U, // <5,4,u,2>: Cost 3 vrev <4,5,2,u> + 2667419628U, // <5,4,u,3>: Cost 3 vext2 , + 2559790390U, // <5,4,u,4>: Cost 3 vext1 <1,5,4,u>, RHS + 1640222238U, // <5,4,u,5>: Cost 2 vext3 <4,u,5,5>, <4,u,5,5> + 94825783U, // <5,4,u,6>: Cost 1 vrev RHS + 2714111536U, // <5,4,u,7>: Cost 3 vext3 <4,u,7,5>, <4,u,7,5> + 94973257U, // <5,4,u,u>: Cost 1 vrev RHS + 2646851584U, // <5,5,0,0>: Cost 3 vext2 <4,u,5,5>, <0,0,0,0> + 1573109862U, // <5,5,0,1>: Cost 2 vext2 <4,u,5,5>, LHS + 2646851748U, // <5,5,0,2>: Cost 3 vext2 <4,u,5,5>, <0,2,0,2> + 3760279130U, // <5,5,0,3>: Cost 4 vext3 <0,3,2,5>, <5,0,3,2> + 2687127138U, // <5,5,0,4>: Cost 3 vext3 <0,4,1,5>, <5,0,4,1> + 2248142847U, // <5,5,0,5>: Cost 3 vrev <5,5,5,0> + 3720593910U, // <5,5,0,6>: Cost 4 vext2 <4,u,5,5>, <0,6,1,7> + 4182502710U, // <5,5,0,7>: Cost 4 vtrnr <3,5,7,0>, RHS + 1573110429U, // <5,5,0,u>: Cost 2 vext2 <4,u,5,5>, LHS + 2646852342U, // <5,5,1,0>: Cost 3 vext2 <4,u,5,5>, <1,0,3,2> + 2624291676U, // <5,5,1,1>: Cost 3 vext2 <1,1,5,5>, <1,1,5,5> + 2646852502U, // <5,5,1,2>: Cost 3 vext2 <4,u,5,5>, <1,2,3,0> + 2646852568U, // <5,5,1,3>: Cost 3 vext2 <4,u,5,5>, <1,3,1,3> + 2715217591U, // <5,5,1,4>: Cost 3 vext3 <5,1,4,5>, <5,1,4,5> + 2628936848U, // <5,5,1,5>: Cost 3 vext2 <1,u,5,5>, <1,5,3,7> + 3698033907U, // <5,5,1,6>: Cost 4 vext2 <1,1,5,5>, <1,6,5,7> + 2713964240U, // <5,5,1,7>: Cost 3 vext3 <4,u,5,5>, <5,1,7,3> + 2628937107U, // <5,5,1,u>: Cost 3 vext2 <1,u,5,5>, <1,u,5,5> + 3645497446U, // <5,5,2,0>: Cost 4 vext1 <3,5,5,2>, LHS + 3760869099U, // <5,5,2,1>: Cost 4 vext3 <0,4,1,5>, <5,2,1,3> + 2646853224U, // <5,5,2,2>: Cost 3 vext2 <4,u,5,5>, <2,2,2,2> + 2698628862U, // <5,5,2,3>: Cost 3 vext3 <2,3,4,5>, <5,2,3,4> + 3772370694U, // <5,5,2,4>: Cost 4 vext3 <2,3,4,5>, <5,2,4,3> + 2713964303U, // <5,5,2,5>: Cost 3 vext3 <4,u,5,5>, <5,2,5,3> + 2646853562U, // <5,5,2,6>: Cost 3 vext2 <4,u,5,5>, <2,6,3,7> + 4038198272U, // <5,5,2,7>: Cost 4 vzipr <1,u,5,2>, <1,3,5,7> + 2701946667U, // <5,5,2,u>: Cost 3 vext3 <2,u,4,5>, <5,2,u,4> + 2646853782U, // <5,5,3,0>: Cost 3 vext2 <4,u,5,5>, <3,0,1,2> + 3698034922U, // <5,5,3,1>: Cost 4 vext2 <1,1,5,5>, <3,1,1,5> + 3702679919U, // <5,5,3,2>: Cost 4 vext2 <1,u,5,5>, <3,2,7,3> + 2637564336U, // <5,5,3,3>: Cost 3 vext2 <3,3,5,5>, <3,3,5,5> + 2646854146U, // <5,5,3,4>: Cost 3 vext2 <4,u,5,5>, <3,4,5,6> + 2638891602U, // <5,5,3,5>: Cost 3 vext2 <3,5,5,5>, <3,5,5,5> + 3702680247U, // <5,5,3,6>: Cost 4 vext2 <1,u,5,5>, <3,6,7,7> + 3702680259U, // <5,5,3,7>: Cost 4 vext2 <1,u,5,5>, <3,7,0,1> + 2646854430U, // <5,5,3,u>: Cost 3 vext2 <4,u,5,5>, <3,u,1,2> + 2646854546U, // <5,5,4,0>: Cost 3 vext2 <4,u,5,5>, <4,0,5,1> + 2642209767U, // <5,5,4,1>: Cost 3 vext2 <4,1,5,5>, <4,1,5,5> + 3711306806U, // <5,5,4,2>: Cost 4 vext2 <3,3,5,5>, <4,2,5,3> + 3645516369U, // <5,5,4,3>: Cost 4 vext1 <3,5,5,4>, <3,5,5,4> + 1570458842U, // <5,5,4,4>: Cost 2 vext2 <4,4,5,5>, <4,4,5,5> + 1573113142U, // <5,5,4,5>: Cost 2 vext2 <4,u,5,5>, RHS + 2645527932U, // <5,5,4,6>: Cost 3 vext2 <4,6,5,5>, <4,6,5,5> + 2713964486U, // <5,5,4,7>: Cost 3 vext3 <4,u,5,5>, <5,4,7,6> + 1573113374U, // <5,5,4,u>: Cost 2 vext2 <4,u,5,5>, <4,u,5,5> + 1509982310U, // <5,5,5,0>: Cost 2 vext1 <5,5,5,5>, LHS + 2646855376U, // <5,5,5,1>: Cost 3 vext2 <4,u,5,5>, <5,1,7,3> + 2583725672U, // 
<5,5,5,2>: Cost 3 vext1 <5,5,5,5>, <2,2,2,2> + 2583726230U, // <5,5,5,3>: Cost 3 vext1 <5,5,5,5>, <3,0,1,2> + 1509985590U, // <5,5,5,4>: Cost 2 vext1 <5,5,5,5>, RHS + 229035318U, // <5,5,5,5>: Cost 1 vdup1 RHS + 2646855778U, // <5,5,5,6>: Cost 3 vext2 <4,u,5,5>, <5,6,7,0> + 2646855848U, // <5,5,5,7>: Cost 3 vext2 <4,u,5,5>, <5,7,5,7> + 229035318U, // <5,5,5,u>: Cost 1 vdup1 RHS + 2577760358U, // <5,5,6,0>: Cost 3 vext1 <4,5,5,6>, LHS + 3633587361U, // <5,5,6,1>: Cost 4 vext1 <1,5,5,6>, <1,5,5,6> + 2646856186U, // <5,5,6,2>: Cost 3 vext2 <4,u,5,5>, <6,2,7,3> + 3633588738U, // <5,5,6,3>: Cost 4 vext1 <1,5,5,6>, <3,4,5,6> + 2718535756U, // <5,5,6,4>: Cost 3 vext3 <5,6,4,5>, <5,6,4,5> + 2644202223U, // <5,5,6,5>: Cost 3 vext2 <4,4,5,5>, <6,5,7,5> + 2973780482U, // <5,5,6,6>: Cost 3 vzipr <3,4,5,6>, <3,4,5,6> + 2646856526U, // <5,5,6,7>: Cost 3 vext2 <4,u,5,5>, <6,7,0,1> + 2646856607U, // <5,5,6,u>: Cost 3 vext2 <4,u,5,5>, <6,u,0,1> + 2571796582U, // <5,5,7,0>: Cost 3 vext1 <3,5,5,7>, LHS + 3633595392U, // <5,5,7,1>: Cost 4 vext1 <1,5,5,7>, <1,3,5,7> + 2571798222U, // <5,5,7,2>: Cost 3 vext1 <3,5,5,7>, <2,3,4,5> + 2571799124U, // <5,5,7,3>: Cost 3 vext1 <3,5,5,7>, <3,5,5,7> + 2571799862U, // <5,5,7,4>: Cost 3 vext1 <3,5,5,7>, RHS + 3114717188U, // <5,5,7,5>: Cost 3 vtrnr RHS, <5,5,5,5> + 4034923010U, // <5,5,7,6>: Cost 4 vzipr <1,3,5,7>, <3,4,5,6> + 2040974646U, // <5,5,7,7>: Cost 2 vtrnr RHS, RHS + 2040974647U, // <5,5,7,u>: Cost 2 vtrnr RHS, RHS + 1509982310U, // <5,5,u,0>: Cost 2 vext1 <5,5,5,5>, LHS + 1573115694U, // <5,5,u,1>: Cost 2 vext2 <4,u,5,5>, LHS + 2571806414U, // <5,5,u,2>: Cost 3 vext1 <3,5,5,u>, <2,3,4,5> + 2571807317U, // <5,5,u,3>: Cost 3 vext1 <3,5,5,u>, <3,5,5,u> + 1509985590U, // <5,5,u,4>: Cost 2 vext1 <5,5,5,5>, RHS + 229035318U, // <5,5,u,5>: Cost 1 vdup1 RHS + 2646857936U, // <5,5,u,6>: Cost 3 vext2 <4,u,5,5>, + 2040982838U, // <5,5,u,7>: Cost 2 vtrnr RHS, RHS + 229035318U, // <5,5,u,u>: Cost 1 vdup1 RHS + 2638233600U, // <5,6,0,0>: Cost 3 vext2 <3,4,5,6>, <0,0,0,0> + 1564491878U, // <5,6,0,1>: Cost 2 vext2 <3,4,5,6>, LHS + 2632261796U, // <5,6,0,2>: Cost 3 vext2 <2,4,5,6>, <0,2,0,2> + 2638233856U, // <5,6,0,3>: Cost 3 vext2 <3,4,5,6>, <0,3,1,4> + 2638233938U, // <5,6,0,4>: Cost 3 vext2 <3,4,5,6>, <0,4,1,5> + 3706003885U, // <5,6,0,5>: Cost 4 vext2 <2,4,5,6>, <0,5,2,6> + 3706003967U, // <5,6,0,6>: Cost 4 vext2 <2,4,5,6>, <0,6,2,7> + 4047473974U, // <5,6,0,7>: Cost 4 vzipr <3,4,5,0>, RHS + 1564492445U, // <5,6,0,u>: Cost 2 vext2 <3,4,5,6>, LHS + 2638234358U, // <5,6,1,0>: Cost 3 vext2 <3,4,5,6>, <1,0,3,2> + 2638234420U, // <5,6,1,1>: Cost 3 vext2 <3,4,5,6>, <1,1,1,1> + 2638234518U, // <5,6,1,2>: Cost 3 vext2 <3,4,5,6>, <1,2,3,0> + 2638234584U, // <5,6,1,3>: Cost 3 vext2 <3,4,5,6>, <1,3,1,3> + 2626290768U, // <5,6,1,4>: Cost 3 vext2 <1,4,5,6>, <1,4,5,6> + 2638234768U, // <5,6,1,5>: Cost 3 vext2 <3,4,5,6>, <1,5,3,7> + 3700032719U, // <5,6,1,6>: Cost 4 vext2 <1,4,5,6>, <1,6,1,7> + 2982366518U, // <5,6,1,7>: Cost 3 vzipr <4,u,5,1>, RHS + 2628945300U, // <5,6,1,u>: Cost 3 vext2 <1,u,5,6>, <1,u,5,6> + 3706004925U, // <5,6,2,0>: Cost 4 vext2 <2,4,5,6>, <2,0,1,2> + 3711976966U, // <5,6,2,1>: Cost 4 vext2 <3,4,5,6>, <2,1,0,3> + 2638235240U, // <5,6,2,2>: Cost 3 vext2 <3,4,5,6>, <2,2,2,2> + 2638235302U, // <5,6,2,3>: Cost 3 vext2 <3,4,5,6>, <2,3,0,1> + 2632263465U, // <5,6,2,4>: Cost 3 vext2 <2,4,5,6>, <2,4,5,6> + 2638235496U, // <5,6,2,5>: Cost 3 vext2 <3,4,5,6>, <2,5,3,6> + 2638235578U, // <5,6,2,6>: Cost 3 vext2 <3,4,5,6>, <2,6,3,7> + 2713965050U, // <5,6,2,7>: Cost 3 vext3 
<4,u,5,5>, <6,2,7,3> + 2634917997U, // <5,6,2,u>: Cost 3 vext2 <2,u,5,6>, <2,u,5,6> + 2638235798U, // <5,6,3,0>: Cost 3 vext2 <3,4,5,6>, <3,0,1,2> + 3711977695U, // <5,6,3,1>: Cost 4 vext2 <3,4,5,6>, <3,1,0,3> + 3710650720U, // <5,6,3,2>: Cost 4 vext2 <3,2,5,6>, <3,2,5,6> + 2638236060U, // <5,6,3,3>: Cost 3 vext2 <3,4,5,6>, <3,3,3,3> + 1564494338U, // <5,6,3,4>: Cost 2 vext2 <3,4,5,6>, <3,4,5,6> + 2638236234U, // <5,6,3,5>: Cost 3 vext2 <3,4,5,6>, <3,5,4,6> + 3711978104U, // <5,6,3,6>: Cost 4 vext2 <3,4,5,6>, <3,6,0,7> + 4034227510U, // <5,6,3,7>: Cost 4 vzipr <1,2,5,3>, RHS + 1567148870U, // <5,6,3,u>: Cost 2 vext2 <3,u,5,6>, <3,u,5,6> + 2577817702U, // <5,6,4,0>: Cost 3 vext1 <4,5,6,4>, LHS + 3700034544U, // <5,6,4,1>: Cost 4 vext2 <1,4,5,6>, <4,1,6,5> + 2723033713U, // <5,6,4,2>: Cost 3 vext3 <6,4,2,5>, <6,4,2,5> + 2638236818U, // <5,6,4,3>: Cost 3 vext2 <3,4,5,6>, <4,3,6,5> + 2644208859U, // <5,6,4,4>: Cost 3 vext2 <4,4,5,6>, <4,4,5,6> + 1564495158U, // <5,6,4,5>: Cost 2 vext2 <3,4,5,6>, RHS + 2645536125U, // <5,6,4,6>: Cost 3 vext2 <4,6,5,6>, <4,6,5,6> + 2723402398U, // <5,6,4,7>: Cost 3 vext3 <6,4,7,5>, <6,4,7,5> + 1564495401U, // <5,6,4,u>: Cost 2 vext2 <3,4,5,6>, RHS + 2577825894U, // <5,6,5,0>: Cost 3 vext1 <4,5,6,5>, LHS + 2662125264U, // <5,6,5,1>: Cost 3 vext2 <7,4,5,6>, <5,1,7,3> + 3775836867U, // <5,6,5,2>: Cost 4 vext3 <2,u,6,5>, <6,5,2,6> + 3711979343U, // <5,6,5,3>: Cost 4 vext2 <3,4,5,6>, <5,3,3,4> + 2650181556U, // <5,6,5,4>: Cost 3 vext2 <5,4,5,6>, <5,4,5,6> + 2662125572U, // <5,6,5,5>: Cost 3 vext2 <7,4,5,6>, <5,5,5,5> + 2638237732U, // <5,6,5,6>: Cost 3 vext2 <3,4,5,6>, <5,6,0,1> + 2982399286U, // <5,6,5,7>: Cost 3 vzipr <4,u,5,5>, RHS + 2982399287U, // <5,6,5,u>: Cost 3 vzipr <4,u,5,5>, RHS + 2583806054U, // <5,6,6,0>: Cost 3 vext1 <5,5,6,6>, LHS + 3711979910U, // <5,6,6,1>: Cost 4 vext2 <3,4,5,6>, <6,1,3,4> + 2662126074U, // <5,6,6,2>: Cost 3 vext2 <7,4,5,6>, <6,2,7,3> + 2583808514U, // <5,6,6,3>: Cost 3 vext1 <5,5,6,6>, <3,4,5,6> + 2583809334U, // <5,6,6,4>: Cost 3 vext1 <5,5,6,6>, RHS + 2583810062U, // <5,6,6,5>: Cost 3 vext1 <5,5,6,6>, <5,5,6,6> + 2638238520U, // <5,6,6,6>: Cost 3 vext2 <3,4,5,6>, <6,6,6,6> + 2973781302U, // <5,6,6,7>: Cost 3 vzipr <3,4,5,6>, RHS + 2973781303U, // <5,6,6,u>: Cost 3 vzipr <3,4,5,6>, RHS + 430358630U, // <5,6,7,0>: Cost 1 vext1 RHS, LHS + 1504101110U, // <5,6,7,1>: Cost 2 vext1 RHS, <1,0,3,2> + 1504101992U, // <5,6,7,2>: Cost 2 vext1 RHS, <2,2,2,2> + 1504102550U, // <5,6,7,3>: Cost 2 vext1 RHS, <3,0,1,2> + 430361910U, // <5,6,7,4>: Cost 1 vext1 RHS, RHS + 1504104390U, // <5,6,7,5>: Cost 2 vext1 RHS, <5,4,7,6> + 1504105272U, // <5,6,7,6>: Cost 2 vext1 RHS, <6,6,6,6> + 1504106092U, // <5,6,7,7>: Cost 2 vext1 RHS, <7,7,7,7> + 430364462U, // <5,6,7,u>: Cost 1 vext1 RHS, LHS + 430366822U, // <5,6,u,0>: Cost 1 vext1 RHS, LHS + 1564497710U, // <5,6,u,1>: Cost 2 vext2 <3,4,5,6>, LHS + 1504110184U, // <5,6,u,2>: Cost 2 vext1 RHS, <2,2,2,2> + 1504110742U, // <5,6,u,3>: Cost 2 vext1 RHS, <3,0,1,2> + 430370103U, // <5,6,u,4>: Cost 1 vext1 RHS, RHS + 1564498074U, // <5,6,u,5>: Cost 2 vext2 <3,4,5,6>, RHS + 1504113146U, // <5,6,u,6>: Cost 2 vext1 RHS, <6,2,7,3> + 1504113658U, // <5,6,u,7>: Cost 2 vext1 RHS, <7,0,1,2> + 430372654U, // <5,6,u,u>: Cost 1 vext1 RHS, LHS + 2625634304U, // <5,7,0,0>: Cost 3 vext2 <1,3,5,7>, <0,0,0,0> + 1551892582U, // <5,7,0,1>: Cost 2 vext2 <1,3,5,7>, LHS + 2625634468U, // <5,7,0,2>: Cost 3 vext2 <1,3,5,7>, <0,2,0,2> + 2571889247U, // <5,7,0,3>: Cost 3 vext1 <3,5,7,0>, <3,5,7,0> + 2625634642U, // <5,7,0,4>: Cost 3 
vext2 <1,3,5,7>, <0,4,1,5> + 2595778728U, // <5,7,0,5>: Cost 3 vext1 <7,5,7,0>, <5,7,5,7> + 3699376639U, // <5,7,0,6>: Cost 4 vext2 <1,3,5,7>, <0,6,2,7> + 2260235715U, // <5,7,0,7>: Cost 3 vrev <7,5,7,0> + 1551893149U, // <5,7,0,u>: Cost 2 vext2 <1,3,5,7>, LHS + 2625635062U, // <5,7,1,0>: Cost 3 vext2 <1,3,5,7>, <1,0,3,2> + 2624308020U, // <5,7,1,1>: Cost 3 vext2 <1,1,5,7>, <1,1,1,1> + 2625635222U, // <5,7,1,2>: Cost 3 vext2 <1,3,5,7>, <1,2,3,0> + 1551893504U, // <5,7,1,3>: Cost 2 vext2 <1,3,5,7>, <1,3,5,7> + 2571898166U, // <5,7,1,4>: Cost 3 vext1 <3,5,7,1>, RHS + 2625635472U, // <5,7,1,5>: Cost 3 vext2 <1,3,5,7>, <1,5,3,7> + 2627626227U, // <5,7,1,6>: Cost 3 vext2 <1,6,5,7>, <1,6,5,7> + 3702031684U, // <5,7,1,7>: Cost 4 vext2 <1,7,5,7>, <1,7,5,7> + 1555211669U, // <5,7,1,u>: Cost 2 vext2 <1,u,5,7>, <1,u,5,7> + 2629617126U, // <5,7,2,0>: Cost 3 vext2 <2,0,5,7>, <2,0,5,7> + 3699377670U, // <5,7,2,1>: Cost 4 vext2 <1,3,5,7>, <2,1,0,3> + 2625635944U, // <5,7,2,2>: Cost 3 vext2 <1,3,5,7>, <2,2,2,2> + 2625636006U, // <5,7,2,3>: Cost 3 vext2 <1,3,5,7>, <2,3,0,1> + 2632271658U, // <5,7,2,4>: Cost 3 vext2 <2,4,5,7>, <2,4,5,7> + 2625636201U, // <5,7,2,5>: Cost 3 vext2 <1,3,5,7>, <2,5,3,7> + 2625636282U, // <5,7,2,6>: Cost 3 vext2 <1,3,5,7>, <2,6,3,7> + 3708004381U, // <5,7,2,7>: Cost 4 vext2 <2,7,5,7>, <2,7,5,7> + 2625636411U, // <5,7,2,u>: Cost 3 vext2 <1,3,5,7>, <2,u,0,1> + 2625636502U, // <5,7,3,0>: Cost 3 vext2 <1,3,5,7>, <3,0,1,2> + 2625636604U, // <5,7,3,1>: Cost 3 vext2 <1,3,5,7>, <3,1,3,5> + 3699378478U, // <5,7,3,2>: Cost 4 vext2 <1,3,5,7>, <3,2,0,1> + 2625636764U, // <5,7,3,3>: Cost 3 vext2 <1,3,5,7>, <3,3,3,3> + 2625636866U, // <5,7,3,4>: Cost 3 vext2 <1,3,5,7>, <3,4,5,6> + 2625636959U, // <5,7,3,5>: Cost 3 vext2 <1,3,5,7>, <3,5,7,0> + 3699378808U, // <5,7,3,6>: Cost 4 vext2 <1,3,5,7>, <3,6,0,7> + 2640235254U, // <5,7,3,7>: Cost 3 vext2 <3,7,5,7>, <3,7,5,7> + 2625637150U, // <5,7,3,u>: Cost 3 vext2 <1,3,5,7>, <3,u,1,2> + 2571919462U, // <5,7,4,0>: Cost 3 vext1 <3,5,7,4>, LHS + 2571920384U, // <5,7,4,1>: Cost 3 vext1 <3,5,7,4>, <1,3,5,7> + 3699379260U, // <5,7,4,2>: Cost 4 vext2 <1,3,5,7>, <4,2,6,0> + 2571922019U, // <5,7,4,3>: Cost 3 vext1 <3,5,7,4>, <3,5,7,4> + 2571922742U, // <5,7,4,4>: Cost 3 vext1 <3,5,7,4>, RHS + 1551895862U, // <5,7,4,5>: Cost 2 vext2 <1,3,5,7>, RHS + 2846277980U, // <5,7,4,6>: Cost 3 vuzpr RHS, <0,4,2,6> + 2646207951U, // <5,7,4,7>: Cost 3 vext2 <4,7,5,7>, <4,7,5,7> + 1551896105U, // <5,7,4,u>: Cost 2 vext2 <1,3,5,7>, RHS + 2583871590U, // <5,7,5,0>: Cost 3 vext1 <5,5,7,5>, LHS + 2652180176U, // <5,7,5,1>: Cost 3 vext2 <5,7,5,7>, <5,1,7,3> + 2625638177U, // <5,7,5,2>: Cost 3 vext2 <1,3,5,7>, <5,2,7,3> + 2625638262U, // <5,7,5,3>: Cost 3 vext2 <1,3,5,7>, <5,3,7,7> + 2583874870U, // <5,7,5,4>: Cost 3 vext1 <5,5,7,5>, RHS + 2846281732U, // <5,7,5,5>: Cost 3 vuzpr RHS, <5,5,5,5> + 2651517015U, // <5,7,5,6>: Cost 3 vext2 <5,6,5,7>, <5,6,5,7> + 1772539190U, // <5,7,5,7>: Cost 2 vuzpr RHS, RHS + 1772539191U, // <5,7,5,u>: Cost 2 vuzpr RHS, RHS + 2846281826U, // <5,7,6,0>: Cost 3 vuzpr RHS, <5,6,7,0> + 3699380615U, // <5,7,6,1>: Cost 4 vext2 <1,3,5,7>, <6,1,3,5> + 2846281108U, // <5,7,6,2>: Cost 3 vuzpr RHS, <4,6,u,2> + 2589854210U, // <5,7,6,3>: Cost 3 vext1 <6,5,7,6>, <3,4,5,6> + 2846281830U, // <5,7,6,4>: Cost 3 vuzpr RHS, <5,6,7,4> + 2725467658U, // <5,7,6,5>: Cost 3 vext3 <6,7,u,5>, <7,6,5,u> + 2846281076U, // <5,7,6,6>: Cost 3 vuzpr RHS, <4,6,4,6> + 2846279610U, // <5,7,6,7>: Cost 3 vuzpr RHS, <2,6,3,7> + 2846279611U, // <5,7,6,u>: Cost 3 vuzpr RHS, <2,6,3,u> + 
1510146150U, // <5,7,7,0>: Cost 2 vext1 <5,5,7,7>, LHS + 2846282574U, // <5,7,7,1>: Cost 3 vuzpr RHS, <6,7,0,1> + 2583889512U, // <5,7,7,2>: Cost 3 vext1 <5,5,7,7>, <2,2,2,2> + 2846281919U, // <5,7,7,3>: Cost 3 vuzpr RHS, <5,7,u,3> + 1510149430U, // <5,7,7,4>: Cost 2 vext1 <5,5,7,7>, RHS + 1510150168U, // <5,7,7,5>: Cost 2 vext1 <5,5,7,7>, <5,5,7,7> + 2583892474U, // <5,7,7,6>: Cost 3 vext1 <5,5,7,7>, <6,2,7,3> + 2625640044U, // <5,7,7,7>: Cost 3 vext2 <1,3,5,7>, <7,7,7,7> + 1510151982U, // <5,7,7,u>: Cost 2 vext1 <5,5,7,7>, LHS + 1510154342U, // <5,7,u,0>: Cost 2 vext1 <5,5,7,u>, LHS + 1551898414U, // <5,7,u,1>: Cost 2 vext2 <1,3,5,7>, LHS + 2625640325U, // <5,7,u,2>: Cost 3 vext2 <1,3,5,7>, + 1772536477U, // <5,7,u,3>: Cost 2 vuzpr RHS, LHS + 1510157622U, // <5,7,u,4>: Cost 2 vext1 <5,5,7,u>, RHS + 1551898778U, // <5,7,u,5>: Cost 2 vext2 <1,3,5,7>, RHS + 2625640656U, // <5,7,u,6>: Cost 3 vext2 <1,3,5,7>, + 1772539433U, // <5,7,u,7>: Cost 2 vuzpr RHS, RHS + 1551898981U, // <5,7,u,u>: Cost 2 vext2 <1,3,5,7>, LHS + 2625642496U, // <5,u,0,0>: Cost 3 vext2 <1,3,5,u>, <0,0,0,0> + 1551900774U, // <5,u,0,1>: Cost 2 vext2 <1,3,5,u>, LHS + 2625642660U, // <5,u,0,2>: Cost 3 vext2 <1,3,5,u>, <0,2,0,2> + 2698630885U, // <5,u,0,3>: Cost 3 vext3 <2,3,4,5>, + 2687129325U, // <5,u,0,4>: Cost 3 vext3 <0,4,1,5>, + 2689783542U, // <5,u,0,5>: Cost 3 vext3 <0,u,1,5>, + 2266134675U, // <5,u,0,6>: Cost 3 vrev + 2595853772U, // <5,u,0,7>: Cost 3 vext1 <7,5,u,0>, <7,5,u,0> + 1551901341U, // <5,u,0,u>: Cost 2 vext2 <1,3,5,u>, LHS + 2625643254U, // <5,u,1,0>: Cost 3 vext2 <1,3,5,u>, <1,0,3,2> + 2625643316U, // <5,u,1,1>: Cost 3 vext2 <1,3,5,u>, <1,1,1,1> + 1613387566U, // <5,u,1,2>: Cost 2 vext3 <0,4,1,5>, LHS + 1551901697U, // <5,u,1,3>: Cost 2 vext2 <1,3,5,u>, <1,3,5,u> + 2626307154U, // <5,u,1,4>: Cost 3 vext2 <1,4,5,u>, <1,4,5,u> + 2689783622U, // <5,u,1,5>: Cost 3 vext3 <0,u,1,5>, + 2627634420U, // <5,u,1,6>: Cost 3 vext2 <1,6,5,u>, <1,6,5,u> + 2982366536U, // <5,u,1,7>: Cost 3 vzipr <4,u,5,1>, RHS + 1613387620U, // <5,u,1,u>: Cost 2 vext3 <0,4,1,5>, LHS + 2846286742U, // <5,u,2,0>: Cost 3 vuzpr RHS, <1,2,3,0> + 2685796528U, // <5,u,2,1>: Cost 3 vext3 <0,2,1,5>, <0,2,1,5> + 2625644136U, // <5,u,2,2>: Cost 3 vext2 <1,3,5,u>, <2,2,2,2> + 2687129480U, // <5,u,2,3>: Cost 3 vext3 <0,4,1,5>, + 2632279851U, // <5,u,2,4>: Cost 3 vext2 <2,4,5,u>, <2,4,5,u> + 2625644394U, // <5,u,2,5>: Cost 3 vext2 <1,3,5,u>, <2,5,3,u> + 2625644474U, // <5,u,2,6>: Cost 3 vext2 <1,3,5,u>, <2,6,3,7> + 2713966508U, // <5,u,2,7>: Cost 3 vext3 <4,u,5,5>, + 2625644603U, // <5,u,2,u>: Cost 3 vext2 <1,3,5,u>, <2,u,0,1> + 2687129532U, // <5,u,3,0>: Cost 3 vext3 <0,4,1,5>, + 2636261649U, // <5,u,3,1>: Cost 3 vext2 <3,1,5,u>, <3,1,5,u> + 2636925282U, // <5,u,3,2>: Cost 3 vext2 <3,2,5,u>, <3,2,5,u> + 2625644956U, // <5,u,3,3>: Cost 3 vext2 <1,3,5,u>, <3,3,3,3> + 1564510724U, // <5,u,3,4>: Cost 2 vext2 <3,4,5,u>, <3,4,5,u> + 2625645160U, // <5,u,3,5>: Cost 3 vext2 <1,3,5,u>, <3,5,u,0> + 2734610422U, // <5,u,3,6>: Cost 3 vext3 , + 2640243447U, // <5,u,3,7>: Cost 3 vext2 <3,7,5,u>, <3,7,5,u> + 1567165256U, // <5,u,3,u>: Cost 2 vext2 <3,u,5,u>, <3,u,5,u> + 1567828889U, // <5,u,4,0>: Cost 2 vext2 <4,0,5,u>, <4,0,5,u> + 1661163546U, // <5,u,4,1>: Cost 2 vext3 , + 2734463012U, // <5,u,4,2>: Cost 3 vext3 , + 2698631212U, // <5,u,4,3>: Cost 3 vext3 <2,3,4,5>, + 1570458842U, // <5,u,4,4>: Cost 2 vext2 <4,4,5,5>, <4,4,5,5> + 1551904054U, // <5,u,4,5>: Cost 2 vext2 <1,3,5,u>, RHS + 2846286172U, // <5,u,4,6>: Cost 3 vuzpr RHS, <0,4,2,6> + 2646216144U, // 
<5,u,4,7>: Cost 3 vext2 <4,7,5,u>, <4,7,5,u> + 1551904297U, // <5,u,4,u>: Cost 2 vext2 <1,3,5,u>, RHS + 1509982310U, // <5,u,5,0>: Cost 2 vext1 <5,5,5,5>, LHS + 2560058555U, // <5,u,5,1>: Cost 3 vext1 <1,5,u,5>, <1,5,u,5> + 2698926194U, // <5,u,5,2>: Cost 3 vext3 <2,3,u,5>, + 2698631295U, // <5,u,5,3>: Cost 3 vext3 <2,3,4,5>, + 1509985590U, // <5,u,5,4>: Cost 2 vext1 <5,5,5,5>, RHS + 229035318U, // <5,u,5,5>: Cost 1 vdup1 RHS + 1613387930U, // <5,u,5,6>: Cost 2 vext3 <0,4,1,5>, RHS + 1772547382U, // <5,u,5,7>: Cost 2 vuzpr RHS, RHS + 229035318U, // <5,u,5,u>: Cost 1 vdup1 RHS + 2566037606U, // <5,u,6,0>: Cost 3 vext1 <2,5,u,6>, LHS + 2920044334U, // <5,u,6,1>: Cost 3 vzipl <5,6,7,0>, LHS + 2566039445U, // <5,u,6,2>: Cost 3 vext1 <2,5,u,6>, <2,5,u,6> + 2687129808U, // <5,u,6,3>: Cost 3 vext3 <0,4,1,5>, + 2566040886U, // <5,u,6,4>: Cost 3 vext1 <2,5,u,6>, RHS + 2920044698U, // <5,u,6,5>: Cost 3 vzipl <5,6,7,0>, RHS + 2846289268U, // <5,u,6,6>: Cost 3 vuzpr RHS, <4,6,4,6> + 2973781320U, // <5,u,6,7>: Cost 3 vzipr <3,4,5,6>, RHS + 2687129853U, // <5,u,6,u>: Cost 3 vext3 <0,4,1,5>, + 430506086U, // <5,u,7,0>: Cost 1 vext1 RHS, LHS + 1486333117U, // <5,u,7,1>: Cost 2 vext1 <1,5,u,7>, <1,5,u,7> + 1504249448U, // <5,u,7,2>: Cost 2 vext1 RHS, <2,2,2,2> + 2040971933U, // <5,u,7,3>: Cost 2 vtrnr RHS, LHS + 430509384U, // <5,u,7,4>: Cost 1 vext1 RHS, RHS + 1504251600U, // <5,u,7,5>: Cost 2 vext1 RHS, <5,1,7,3> + 118708378U, // <5,u,7,6>: Cost 1 vrev RHS + 2040974889U, // <5,u,7,7>: Cost 2 vtrnr RHS, RHS + 430511918U, // <5,u,7,u>: Cost 1 vext1 RHS, LHS + 430514278U, // <5,u,u,0>: Cost 1 vext1 RHS, LHS + 1551906606U, // <5,u,u,1>: Cost 2 vext2 <1,3,5,u>, LHS + 1613388133U, // <5,u,u,2>: Cost 2 vext3 <0,4,1,5>, LHS + 1772544669U, // <5,u,u,3>: Cost 2 vuzpr RHS, LHS + 430517577U, // <5,u,u,4>: Cost 1 vext1 RHS, RHS + 229035318U, // <5,u,u,5>: Cost 1 vdup1 RHS + 118716571U, // <5,u,u,6>: Cost 1 vrev RHS + 1772547625U, // <5,u,u,7>: Cost 2 vuzpr RHS, RHS + 430520110U, // <5,u,u,u>: Cost 1 vext1 RHS, LHS + 2686025728U, // <6,0,0,0>: Cost 3 vext3 <0,2,4,6>, <0,0,0,0> + 2686025738U, // <6,0,0,1>: Cost 3 vext3 <0,2,4,6>, <0,0,1,1> + 2686025748U, // <6,0,0,2>: Cost 3 vext3 <0,2,4,6>, <0,0,2,2> + 3779084320U, // <6,0,0,3>: Cost 4 vext3 <3,4,5,6>, <0,0,3,5> + 2642903388U, // <6,0,0,4>: Cost 3 vext2 <4,2,6,0>, <0,4,2,6> + 3657723939U, // <6,0,0,5>: Cost 4 vext1 <5,6,0,0>, <5,6,0,0> + 3926676514U, // <6,0,0,6>: Cost 4 vuzpr <5,6,7,0>, <7,0,5,6> + 3926675786U, // <6,0,0,7>: Cost 4 vuzpr <5,6,7,0>, <6,0,5,7> + 2686025802U, // <6,0,0,u>: Cost 3 vext3 <0,2,4,6>, <0,0,u,2> + 2566070374U, // <6,0,1,0>: Cost 3 vext1 <2,6,0,1>, LHS + 3759767642U, // <6,0,1,1>: Cost 4 vext3 <0,2,4,6>, <0,1,1,0> + 1612284006U, // <6,0,1,2>: Cost 2 vext3 <0,2,4,6>, LHS + 2583988738U, // <6,0,1,3>: Cost 3 vext1 <5,6,0,1>, <3,4,5,6> + 2566073654U, // <6,0,1,4>: Cost 3 vext1 <2,6,0,1>, RHS + 2583990308U, // <6,0,1,5>: Cost 3 vext1 <5,6,0,1>, <5,6,0,1> + 2589963005U, // <6,0,1,6>: Cost 3 vext1 <6,6,0,1>, <6,6,0,1> + 2595935702U, // <6,0,1,7>: Cost 3 vext1 <7,6,0,1>, <7,6,0,1> + 1612284060U, // <6,0,1,u>: Cost 2 vext3 <0,2,4,6>, LHS + 2686025892U, // <6,0,2,0>: Cost 3 vext3 <0,2,4,6>, <0,2,0,2> + 2685804721U, // <6,0,2,1>: Cost 3 vext3 <0,2,1,6>, <0,2,1,6> + 3759620282U, // <6,0,2,2>: Cost 4 vext3 <0,2,2,6>, <0,2,2,6> + 2705342658U, // <6,0,2,3>: Cost 3 vext3 <3,4,5,6>, <0,2,3,5> + 1612284108U, // <6,0,2,4>: Cost 2 vext3 <0,2,4,6>, <0,2,4,6> + 3706029956U, // <6,0,2,5>: Cost 4 vext2 <2,4,6,0>, <2,5,6,7> + 2686173406U, // <6,0,2,6>: Cost 3 vext3 
<0,2,6,6>, <0,2,6,6> + 3651769338U, // <6,0,2,7>: Cost 4 vext1 <4,6,0,2>, <7,0,1,2> + 1612579056U, // <6,0,2,u>: Cost 2 vext3 <0,2,u,6>, <0,2,u,6> + 3706030230U, // <6,0,3,0>: Cost 4 vext2 <2,4,6,0>, <3,0,1,2> + 2705342720U, // <6,0,3,1>: Cost 3 vext3 <3,4,5,6>, <0,3,1,4> + 2705342730U, // <6,0,3,2>: Cost 3 vext3 <3,4,5,6>, <0,3,2,5> + 3706030492U, // <6,0,3,3>: Cost 4 vext2 <2,4,6,0>, <3,3,3,3> + 2644896258U, // <6,0,3,4>: Cost 3 vext2 <4,5,6,0>, <3,4,5,6> + 3718638154U, // <6,0,3,5>: Cost 4 vext2 <4,5,6,0>, <3,5,4,6> + 3729918619U, // <6,0,3,6>: Cost 4 vext2 <6,4,6,0>, <3,6,4,6> + 3926672384U, // <6,0,3,7>: Cost 4 vuzpr <5,6,7,0>, <1,3,5,7> + 2705342784U, // <6,0,3,u>: Cost 3 vext3 <3,4,5,6>, <0,3,u,5> + 2687058250U, // <6,0,4,0>: Cost 3 vext3 <0,4,0,6>, <0,4,0,6> + 2686026066U, // <6,0,4,1>: Cost 3 vext3 <0,2,4,6>, <0,4,1,5> + 1613463900U, // <6,0,4,2>: Cost 2 vext3 <0,4,2,6>, <0,4,2,6> + 3761021285U, // <6,0,4,3>: Cost 4 vext3 <0,4,3,6>, <0,4,3,6> + 2687353198U, // <6,0,4,4>: Cost 3 vext3 <0,4,4,6>, <0,4,4,6> + 2632289590U, // <6,0,4,5>: Cost 3 vext2 <2,4,6,0>, RHS + 2645560704U, // <6,0,4,6>: Cost 3 vext2 <4,6,6,0>, <4,6,6,0> + 2646224337U, // <6,0,4,7>: Cost 3 vext2 <4,7,6,0>, <4,7,6,0> + 1613906322U, // <6,0,4,u>: Cost 2 vext3 <0,4,u,6>, <0,4,u,6> + 3651788902U, // <6,0,5,0>: Cost 4 vext1 <4,6,0,5>, LHS + 2687795620U, // <6,0,5,1>: Cost 3 vext3 <0,5,1,6>, <0,5,1,6> + 3761611181U, // <6,0,5,2>: Cost 4 vext3 <0,5,2,6>, <0,5,2,6> + 3723284326U, // <6,0,5,3>: Cost 4 vext2 <5,3,6,0>, <5,3,6,0> + 2646224838U, // <6,0,5,4>: Cost 3 vext2 <4,7,6,0>, <5,4,7,6> + 3718639630U, // <6,0,5,5>: Cost 4 vext2 <4,5,6,0>, <5,5,6,6> + 2652196962U, // <6,0,5,6>: Cost 3 vext2 <5,7,6,0>, <5,6,7,0> + 2852932918U, // <6,0,5,7>: Cost 3 vuzpr <5,6,7,0>, RHS + 2852932919U, // <6,0,5,u>: Cost 3 vuzpr <5,6,7,0>, RHS + 2852933730U, // <6,0,6,0>: Cost 3 vuzpr <5,6,7,0>, <5,6,7,0> + 2925985894U, // <6,0,6,1>: Cost 3 vzipl <6,6,6,6>, LHS + 3060203622U, // <6,0,6,2>: Cost 3 vtrnl <6,6,6,6>, LHS + 3718640178U, // <6,0,6,3>: Cost 4 vext2 <4,5,6,0>, <6,3,4,5> + 2656178832U, // <6,0,6,4>: Cost 3 vext2 <6,4,6,0>, <6,4,6,0> + 3725939378U, // <6,0,6,5>: Cost 4 vext2 <5,7,6,0>, <6,5,0,7> + 2657506098U, // <6,0,6,6>: Cost 3 vext2 <6,6,6,0>, <6,6,6,0> + 2619020110U, // <6,0,6,7>: Cost 3 vext2 <0,2,6,0>, <6,7,0,1> + 2925986461U, // <6,0,6,u>: Cost 3 vzipl <6,6,6,6>, LHS + 2572091494U, // <6,0,7,0>: Cost 3 vext1 <3,6,0,7>, LHS + 2572092310U, // <6,0,7,1>: Cost 3 vext1 <3,6,0,7>, <1,2,3,0> + 2980495524U, // <6,0,7,2>: Cost 3 vzipr RHS, <0,2,0,2> + 2572094072U, // <6,0,7,3>: Cost 3 vext1 <3,6,0,7>, <3,6,0,7> + 2572094774U, // <6,0,7,4>: Cost 3 vext1 <3,6,0,7>, RHS + 4054238242U, // <6,0,7,5>: Cost 4 vzipr RHS, <1,4,0,5> + 3645837653U, // <6,0,7,6>: Cost 4 vext1 <3,6,0,7>, <6,0,7,0> + 4054239054U, // <6,0,7,7>: Cost 4 vzipr RHS, <2,5,0,7> + 2572097326U, // <6,0,7,u>: Cost 3 vext1 <3,6,0,7>, LHS + 2686026378U, // <6,0,u,0>: Cost 3 vext3 <0,2,4,6>, <0,u,0,2> + 2686026386U, // <6,0,u,1>: Cost 3 vext3 <0,2,4,6>, <0,u,1,1> + 1612284573U, // <6,0,u,2>: Cost 2 vext3 <0,2,4,6>, LHS + 2705343144U, // <6,0,u,3>: Cost 3 vext3 <3,4,5,6>, <0,u,3,5> + 1616265906U, // <6,0,u,4>: Cost 2 vext3 <0,u,4,6>, <0,u,4,6> + 2632292506U, // <6,0,u,5>: Cost 3 vext2 <2,4,6,0>, RHS + 2590020356U, // <6,0,u,6>: Cost 3 vext1 <6,6,0,u>, <6,6,0,u> + 2852933161U, // <6,0,u,7>: Cost 3 vuzpr <5,6,7,0>, RHS + 1612284627U, // <6,0,u,u>: Cost 2 vext3 <0,2,4,6>, LHS + 2595995750U, // <6,1,0,0>: Cost 3 vext1 <7,6,1,0>, LHS + 2646229094U, // <6,1,0,1>: Cost 3 vext2 
<4,7,6,1>, LHS + 3694092492U, // <6,1,0,2>: Cost 4 vext2 <0,4,6,1>, <0,2,4,6> + 2686026486U, // <6,1,0,3>: Cost 3 vext3 <0,2,4,6>, <1,0,3,2> + 2595999030U, // <6,1,0,4>: Cost 3 vext1 <7,6,1,0>, RHS + 3767730952U, // <6,1,0,5>: Cost 4 vext3 <1,5,4,6>, <1,0,5,2> + 2596000590U, // <6,1,0,6>: Cost 3 vext1 <7,6,1,0>, <6,7,0,1> + 2596001246U, // <6,1,0,7>: Cost 3 vext1 <7,6,1,0>, <7,6,1,0> + 2686026531U, // <6,1,0,u>: Cost 3 vext3 <0,2,4,6>, <1,0,u,2> + 3763602219U, // <6,1,1,0>: Cost 4 vext3 <0,u,2,6>, <1,1,0,1> + 2686026548U, // <6,1,1,1>: Cost 3 vext3 <0,2,4,6>, <1,1,1,1> + 3764929346U, // <6,1,1,2>: Cost 4 vext3 <1,1,2,6>, <1,1,2,6> + 2686026568U, // <6,1,1,3>: Cost 3 vext3 <0,2,4,6>, <1,1,3,3> + 2691334996U, // <6,1,1,4>: Cost 3 vext3 <1,1,4,6>, <1,1,4,6> + 3760874332U, // <6,1,1,5>: Cost 4 vext3 <0,4,1,6>, <1,1,5,5> + 3765224294U, // <6,1,1,6>: Cost 4 vext3 <1,1,6,6>, <1,1,6,6> + 3669751263U, // <6,1,1,7>: Cost 4 vext1 <7,6,1,1>, <7,6,1,1> + 2686026613U, // <6,1,1,u>: Cost 3 vext3 <0,2,4,6>, <1,1,u,3> + 2554208358U, // <6,1,2,0>: Cost 3 vext1 <0,6,1,2>, LHS + 3763602311U, // <6,1,2,1>: Cost 4 vext3 <0,u,2,6>, <1,2,1,3> + 3639895971U, // <6,1,2,2>: Cost 4 vext1 <2,6,1,2>, <2,6,1,2> + 2686026646U, // <6,1,2,3>: Cost 3 vext3 <0,2,4,6>, <1,2,3,0> + 2554211638U, // <6,1,2,4>: Cost 3 vext1 <0,6,1,2>, RHS + 3760874411U, // <6,1,2,5>: Cost 4 vext3 <0,4,1,6>, <1,2,5,3> + 2554212858U, // <6,1,2,6>: Cost 3 vext1 <0,6,1,2>, <6,2,7,3> + 3802973114U, // <6,1,2,7>: Cost 4 vext3 <7,4,5,6>, <1,2,7,0> + 2686026691U, // <6,1,2,u>: Cost 3 vext3 <0,2,4,6>, <1,2,u,0> + 2566160486U, // <6,1,3,0>: Cost 3 vext1 <2,6,1,3>, LHS + 2686026712U, // <6,1,3,1>: Cost 3 vext3 <0,2,4,6>, <1,3,1,3> + 2686026724U, // <6,1,3,2>: Cost 3 vext3 <0,2,4,6>, <1,3,2,6> + 3759768552U, // <6,1,3,3>: Cost 4 vext3 <0,2,4,6>, <1,3,3,1> + 2692662262U, // <6,1,3,4>: Cost 3 vext3 <1,3,4,6>, <1,3,4,6> + 2686026752U, // <6,1,3,5>: Cost 3 vext3 <0,2,4,6>, <1,3,5,7> + 2590053128U, // <6,1,3,6>: Cost 3 vext1 <6,6,1,3>, <6,6,1,3> + 3663795194U, // <6,1,3,7>: Cost 4 vext1 <6,6,1,3>, <7,0,1,2> + 2686026775U, // <6,1,3,u>: Cost 3 vext3 <0,2,4,6>, <1,3,u,3> + 2641587099U, // <6,1,4,0>: Cost 3 vext2 <4,0,6,1>, <4,0,6,1> + 2693104684U, // <6,1,4,1>: Cost 3 vext3 <1,4,1,6>, <1,4,1,6> + 3639912357U, // <6,1,4,2>: Cost 4 vext1 <2,6,1,4>, <2,6,1,4> + 2687206462U, // <6,1,4,3>: Cost 3 vext3 <0,4,2,6>, <1,4,3,6> + 3633941814U, // <6,1,4,4>: Cost 4 vext1 <1,6,1,4>, RHS + 2693399632U, // <6,1,4,5>: Cost 3 vext3 <1,4,5,6>, <1,4,5,6> + 3765077075U, // <6,1,4,6>: Cost 4 vext3 <1,1,4,6>, <1,4,6,0> + 2646232530U, // <6,1,4,7>: Cost 3 vext2 <4,7,6,1>, <4,7,6,1> + 2687206507U, // <6,1,4,u>: Cost 3 vext3 <0,4,2,6>, <1,4,u,6> + 2647559796U, // <6,1,5,0>: Cost 3 vext2 <5,0,6,1>, <5,0,6,1> + 3765077118U, // <6,1,5,1>: Cost 4 vext3 <1,1,4,6>, <1,5,1,7> + 3767583878U, // <6,1,5,2>: Cost 4 vext3 <1,5,2,6>, <1,5,2,6> + 2686026896U, // <6,1,5,3>: Cost 3 vext3 <0,2,4,6>, <1,5,3,7> + 2693989528U, // <6,1,5,4>: Cost 3 vext3 <1,5,4,6>, <1,5,4,6> + 3767805089U, // <6,1,5,5>: Cost 4 vext3 <1,5,5,6>, <1,5,5,6> + 2652868706U, // <6,1,5,6>: Cost 3 vext2 <5,u,6,1>, <5,6,7,0> + 3908250934U, // <6,1,5,7>: Cost 4 vuzpr <2,6,0,1>, RHS + 2686026941U, // <6,1,5,u>: Cost 3 vext3 <0,2,4,6>, <1,5,u,7> + 2554241126U, // <6,1,6,0>: Cost 3 vext1 <0,6,1,6>, LHS + 3763602639U, // <6,1,6,1>: Cost 4 vext3 <0,u,2,6>, <1,6,1,7> + 3759547607U, // <6,1,6,2>: Cost 4 vext3 <0,2,1,6>, <1,6,2,6> + 3115221094U, // <6,1,6,3>: Cost 3 vtrnr <4,6,4,6>, LHS + 2554244406U, // <6,1,6,4>: Cost 3 vext1 <0,6,1,6>, RHS + 
3760874739U, // <6,1,6,5>: Cost 4 vext3 <0,4,1,6>, <1,6,5,7> + 2554245944U, // <6,1,6,6>: Cost 3 vext1 <0,6,1,6>, <6,6,6,6> + 3719975758U, // <6,1,6,7>: Cost 4 vext2 <4,7,6,1>, <6,7,0,1> + 3115221099U, // <6,1,6,u>: Cost 3 vtrnr <4,6,4,6>, LHS + 2560221286U, // <6,1,7,0>: Cost 3 vext1 <1,6,1,7>, LHS + 2560222415U, // <6,1,7,1>: Cost 3 vext1 <1,6,1,7>, <1,6,1,7> + 2980497558U, // <6,1,7,2>: Cost 3 vzipr RHS, <3,0,1,2> + 3103211622U, // <6,1,7,3>: Cost 3 vtrnr <2,6,3,7>, LHS + 2560224566U, // <6,1,7,4>: Cost 3 vext1 <1,6,1,7>, RHS + 2980495698U, // <6,1,7,5>: Cost 3 vzipr RHS, <0,4,1,5> + 3633967526U, // <6,1,7,6>: Cost 4 vext1 <1,6,1,7>, <6,1,7,0> + 4054237686U, // <6,1,7,7>: Cost 4 vzipr RHS, <0,6,1,7> + 2560227118U, // <6,1,7,u>: Cost 3 vext1 <1,6,1,7>, LHS + 2560229478U, // <6,1,u,0>: Cost 3 vext1 <1,6,1,u>, LHS + 2686027117U, // <6,1,u,1>: Cost 3 vext3 <0,2,4,6>, <1,u,1,3> + 2686027129U, // <6,1,u,2>: Cost 3 vext3 <0,2,4,6>, <1,u,2,6> + 2686027132U, // <6,1,u,3>: Cost 3 vext3 <0,2,4,6>, <1,u,3,0> + 2687206795U, // <6,1,u,4>: Cost 3 vext3 <0,4,2,6>, <1,u,4,6> + 2686027157U, // <6,1,u,5>: Cost 3 vext3 <0,2,4,6>, <1,u,5,7> + 2590094093U, // <6,1,u,6>: Cost 3 vext1 <6,6,1,u>, <6,6,1,u> + 2596066790U, // <6,1,u,7>: Cost 3 vext1 <7,6,1,u>, <7,6,1,u> + 2686027177U, // <6,1,u,u>: Cost 3 vext3 <0,2,4,6>, <1,u,u,0> + 2646900736U, // <6,2,0,0>: Cost 3 vext2 <4,u,6,2>, <0,0,0,0> + 1573159014U, // <6,2,0,1>: Cost 2 vext2 <4,u,6,2>, LHS + 2646900900U, // <6,2,0,2>: Cost 3 vext2 <4,u,6,2>, <0,2,0,2> + 3759769037U, // <6,2,0,3>: Cost 4 vext3 <0,2,4,6>, <2,0,3,0> + 2641592668U, // <6,2,0,4>: Cost 3 vext2 <4,0,6,2>, <0,4,2,6> + 3779085794U, // <6,2,0,5>: Cost 4 vext3 <3,4,5,6>, <2,0,5,3> + 2686027244U, // <6,2,0,6>: Cost 3 vext3 <0,2,4,6>, <2,0,6,4> + 3669816807U, // <6,2,0,7>: Cost 4 vext1 <7,6,2,0>, <7,6,2,0> + 1573159581U, // <6,2,0,u>: Cost 2 vext2 <4,u,6,2>, LHS + 2230527897U, // <6,2,1,0>: Cost 3 vrev <2,6,0,1> + 2646901556U, // <6,2,1,1>: Cost 3 vext2 <4,u,6,2>, <1,1,1,1> + 2646901654U, // <6,2,1,2>: Cost 3 vext2 <4,u,6,2>, <1,2,3,0> + 2847047782U, // <6,2,1,3>: Cost 3 vuzpr <4,6,u,2>, LHS + 3771049517U, // <6,2,1,4>: Cost 4 vext3 <2,1,4,6>, <2,1,4,6> + 2646901904U, // <6,2,1,5>: Cost 3 vext2 <4,u,6,2>, <1,5,3,7> + 2686027324U, // <6,2,1,6>: Cost 3 vext3 <0,2,4,6>, <2,1,6,3> + 3669825000U, // <6,2,1,7>: Cost 4 vext1 <7,6,2,1>, <7,6,2,1> + 2231117793U, // <6,2,1,u>: Cost 3 vrev <2,6,u,1> + 3763603029U, // <6,2,2,0>: Cost 4 vext3 <0,u,2,6>, <2,2,0,1> + 3759769184U, // <6,2,2,1>: Cost 4 vext3 <0,2,4,6>, <2,2,1,3> + 2686027368U, // <6,2,2,2>: Cost 3 vext3 <0,2,4,6>, <2,2,2,2> + 2686027378U, // <6,2,2,3>: Cost 3 vext3 <0,2,4,6>, <2,2,3,3> + 2697971326U, // <6,2,2,4>: Cost 3 vext3 <2,2,4,6>, <2,2,4,6> + 3759769224U, // <6,2,2,5>: Cost 4 vext3 <0,2,4,6>, <2,2,5,7> + 2698118800U, // <6,2,2,6>: Cost 3 vext3 <2,2,6,6>, <2,2,6,6> + 3920794092U, // <6,2,2,7>: Cost 4 vuzpr <4,6,u,2>, <6,2,5,7> + 2686027423U, // <6,2,2,u>: Cost 3 vext3 <0,2,4,6>, <2,2,u,3> + 2686027430U, // <6,2,3,0>: Cost 3 vext3 <0,2,4,6>, <2,3,0,1> + 3759769262U, // <6,2,3,1>: Cost 4 vext3 <0,2,4,6>, <2,3,1,0> + 2698487485U, // <6,2,3,2>: Cost 3 vext3 <2,3,2,6>, <2,3,2,6> + 2705344196U, // <6,2,3,3>: Cost 3 vext3 <3,4,5,6>, <2,3,3,4> + 2686027470U, // <6,2,3,4>: Cost 3 vext3 <0,2,4,6>, <2,3,4,5> + 2698708696U, // <6,2,3,5>: Cost 3 vext3 <2,3,5,6>, <2,3,5,6> + 2724660961U, // <6,2,3,6>: Cost 3 vext3 <6,6,6,6>, <2,3,6,6> + 2729232104U, // <6,2,3,7>: Cost 3 vext3 <7,4,5,6>, <2,3,7,4> + 2686027502U, // <6,2,3,u>: Cost 3 vext3 <0,2,4,6>, 
<2,3,u,1> + 1567853468U, // <6,2,4,0>: Cost 2 vext2 <4,0,6,2>, <4,0,6,2> + 3759769351U, // <6,2,4,1>: Cost 4 vext3 <0,2,4,6>, <2,4,1,u> + 2699151118U, // <6,2,4,2>: Cost 3 vext3 <2,4,2,6>, <2,4,2,6> + 2686027543U, // <6,2,4,3>: Cost 3 vext3 <0,2,4,6>, <2,4,3,6> + 2699298592U, // <6,2,4,4>: Cost 3 vext3 <2,4,4,6>, <2,4,4,6> + 1573162294U, // <6,2,4,5>: Cost 2 vext2 <4,u,6,2>, RHS + 2686027564U, // <6,2,4,6>: Cost 3 vext3 <0,2,4,6>, <2,4,6,0> + 3719982547U, // <6,2,4,7>: Cost 4 vext2 <4,7,6,2>, <4,7,6,2> + 1573162532U, // <6,2,4,u>: Cost 2 vext2 <4,u,6,2>, <4,u,6,2> + 3779086154U, // <6,2,5,0>: Cost 4 vext3 <3,4,5,6>, <2,5,0,3> + 2646904528U, // <6,2,5,1>: Cost 3 vext2 <4,u,6,2>, <5,1,7,3> + 3759769440U, // <6,2,5,2>: Cost 4 vext3 <0,2,4,6>, <2,5,2,7> + 2699888488U, // <6,2,5,3>: Cost 3 vext3 <2,5,3,6>, <2,5,3,6> + 2230855617U, // <6,2,5,4>: Cost 3 vrev <2,6,4,5> + 2646904836U, // <6,2,5,5>: Cost 3 vext2 <4,u,6,2>, <5,5,5,5> + 2646904930U, // <6,2,5,6>: Cost 3 vext2 <4,u,6,2>, <5,6,7,0> + 2847051062U, // <6,2,5,7>: Cost 3 vuzpr <4,6,u,2>, RHS + 2700257173U, // <6,2,5,u>: Cost 3 vext3 <2,5,u,6>, <2,5,u,6> + 2687207321U, // <6,2,6,0>: Cost 3 vext3 <0,4,2,6>, <2,6,0,1> + 2686027684U, // <6,2,6,1>: Cost 3 vext3 <0,2,4,6>, <2,6,1,3> + 2566260656U, // <6,2,6,2>: Cost 3 vext1 <2,6,2,6>, <2,6,2,6> + 2685806522U, // <6,2,6,3>: Cost 3 vext3 <0,2,1,6>, <2,6,3,7> + 2687207361U, // <6,2,6,4>: Cost 3 vext3 <0,4,2,6>, <2,6,4,5> + 2686027724U, // <6,2,6,5>: Cost 3 vext3 <0,2,4,6>, <2,6,5,7> + 2646905656U, // <6,2,6,6>: Cost 3 vext2 <4,u,6,2>, <6,6,6,6> + 2646905678U, // <6,2,6,7>: Cost 3 vext2 <4,u,6,2>, <6,7,0,1> + 2686027751U, // <6,2,6,u>: Cost 3 vext3 <0,2,4,6>, <2,6,u,7> + 2554323046U, // <6,2,7,0>: Cost 3 vext1 <0,6,2,7>, LHS + 2572239606U, // <6,2,7,1>: Cost 3 vext1 <3,6,2,7>, <1,0,3,2> + 2566268849U, // <6,2,7,2>: Cost 3 vext1 <2,6,2,7>, <2,6,2,7> + 1906753638U, // <6,2,7,3>: Cost 2 vzipr RHS, LHS + 2554326326U, // <6,2,7,4>: Cost 3 vext1 <0,6,2,7>, RHS + 3304687564U, // <6,2,7,5>: Cost 4 vrev <2,6,5,7> + 2980495708U, // <6,2,7,6>: Cost 3 vzipr RHS, <0,4,2,6> + 2646906476U, // <6,2,7,7>: Cost 3 vext2 <4,u,6,2>, <7,7,7,7> + 1906753643U, // <6,2,7,u>: Cost 2 vzipr RHS, LHS + 1591744256U, // <6,2,u,0>: Cost 2 vext2 , + 1573164846U, // <6,2,u,1>: Cost 2 vext2 <4,u,6,2>, LHS + 2701805650U, // <6,2,u,2>: Cost 3 vext3 <2,u,2,6>, <2,u,2,6> + 1906761830U, // <6,2,u,3>: Cost 2 vzipr RHS, LHS + 2686027875U, // <6,2,u,4>: Cost 3 vext3 <0,2,4,6>, <2,u,4,5> + 1573165210U, // <6,2,u,5>: Cost 2 vext2 <4,u,6,2>, RHS + 2686322800U, // <6,2,u,6>: Cost 3 vext3 <0,2,u,6>, <2,u,6,0> + 2847051305U, // <6,2,u,7>: Cost 3 vuzpr <4,6,u,2>, RHS + 1906761835U, // <6,2,u,u>: Cost 2 vzipr RHS, LHS + 3759769739U, // <6,3,0,0>: Cost 4 vext3 <0,2,4,6>, <3,0,0,0> + 2686027926U, // <6,3,0,1>: Cost 3 vext3 <0,2,4,6>, <3,0,1,2> + 2686027937U, // <6,3,0,2>: Cost 3 vext3 <0,2,4,6>, <3,0,2,4> + 3640027286U, // <6,3,0,3>: Cost 4 vext1 <2,6,3,0>, <3,0,1,2> + 2687207601U, // <6,3,0,4>: Cost 3 vext3 <0,4,2,6>, <3,0,4,2> + 2705344698U, // <6,3,0,5>: Cost 3 vext3 <3,4,5,6>, <3,0,5,2> + 3663917847U, // <6,3,0,6>: Cost 4 vext1 <6,6,3,0>, <6,6,3,0> + 2237008560U, // <6,3,0,7>: Cost 3 vrev <3,6,7,0> + 2686027989U, // <6,3,0,u>: Cost 3 vext3 <0,2,4,6>, <3,0,u,2> + 3759769823U, // <6,3,1,0>: Cost 4 vext3 <0,2,4,6>, <3,1,0,3> + 3759769830U, // <6,3,1,1>: Cost 4 vext3 <0,2,4,6>, <3,1,1,1> + 3759769841U, // <6,3,1,2>: Cost 4 vext3 <0,2,4,6>, <3,1,2,3> + 3759769848U, // <6,3,1,3>: Cost 4 vext3 <0,2,4,6>, <3,1,3,1> + 2703280390U, // <6,3,1,4>: Cost 3 vext3 
<3,1,4,6>, <3,1,4,6> + 3759769868U, // <6,3,1,5>: Cost 4 vext3 <0,2,4,6>, <3,1,5,3> + 3704063194U, // <6,3,1,6>: Cost 4 vext2 <2,1,6,3>, <1,6,3,0> + 3767732510U, // <6,3,1,7>: Cost 4 vext3 <1,5,4,6>, <3,1,7,3> + 2703280390U, // <6,3,1,u>: Cost 3 vext3 <3,1,4,6>, <3,1,4,6> + 3704063468U, // <6,3,2,0>: Cost 4 vext2 <2,1,6,3>, <2,0,6,4> + 2630321724U, // <6,3,2,1>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3> + 3759769921U, // <6,3,2,2>: Cost 4 vext3 <0,2,4,6>, <3,2,2,2> + 3759769928U, // <6,3,2,3>: Cost 4 vext3 <0,2,4,6>, <3,2,3,0> + 3704063767U, // <6,3,2,4>: Cost 4 vext2 <2,1,6,3>, <2,4,3,6> + 3704063876U, // <6,3,2,5>: Cost 4 vext2 <2,1,6,3>, <2,5,6,7> + 2636957626U, // <6,3,2,6>: Cost 3 vext2 <3,2,6,3>, <2,6,3,7> + 3777907058U, // <6,3,2,7>: Cost 4 vext3 <3,2,7,6>, <3,2,7,6> + 2630321724U, // <6,3,2,u>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3> + 3759769983U, // <6,3,3,0>: Cost 4 vext3 <0,2,4,6>, <3,3,0,1> + 3710036245U, // <6,3,3,1>: Cost 4 vext2 <3,1,6,3>, <3,1,6,3> + 2636958054U, // <6,3,3,2>: Cost 3 vext2 <3,2,6,3>, <3,2,6,3> + 2686028188U, // <6,3,3,3>: Cost 3 vext3 <0,2,4,6>, <3,3,3,3> + 2704607656U, // <6,3,3,4>: Cost 3 vext3 <3,3,4,6>, <3,3,4,6> + 3773041072U, // <6,3,3,5>: Cost 4 vext3 <2,4,4,6>, <3,3,5,5> + 3711363731U, // <6,3,3,6>: Cost 4 vext2 <3,3,6,3>, <3,6,3,7> + 3767732676U, // <6,3,3,7>: Cost 4 vext3 <1,5,4,6>, <3,3,7,7> + 2707999179U, // <6,3,3,u>: Cost 3 vext3 <3,u,5,6>, <3,3,u,5> + 2584232038U, // <6,3,4,0>: Cost 3 vext1 <5,6,3,4>, LHS + 2642267118U, // <6,3,4,1>: Cost 3 vext2 <4,1,6,3>, <4,1,6,3> + 2642930751U, // <6,3,4,2>: Cost 3 vext2 <4,2,6,3>, <4,2,6,3> + 2705197552U, // <6,3,4,3>: Cost 3 vext3 <3,4,3,6>, <3,4,3,6> + 2584235318U, // <6,3,4,4>: Cost 3 vext1 <5,6,3,4>, RHS + 1631603202U, // <6,3,4,5>: Cost 2 vext3 <3,4,5,6>, <3,4,5,6> + 2654211444U, // <6,3,4,6>: Cost 3 vext2 <6,1,6,3>, <4,6,4,6> + 2237041332U, // <6,3,4,7>: Cost 3 vrev <3,6,7,4> + 1631824413U, // <6,3,4,u>: Cost 2 vext3 <3,4,u,6>, <3,4,u,6> + 3640066150U, // <6,3,5,0>: Cost 4 vext1 <2,6,3,5>, LHS + 3772746288U, // <6,3,5,1>: Cost 4 vext3 <2,4,0,6>, <3,5,1,7> + 3640067790U, // <6,3,5,2>: Cost 4 vext1 <2,6,3,5>, <2,3,4,5> + 3773041216U, // <6,3,5,3>: Cost 4 vext3 <2,4,4,6>, <3,5,3,5> + 2705934922U, // <6,3,5,4>: Cost 3 vext3 <3,5,4,6>, <3,5,4,6> + 3773041236U, // <6,3,5,5>: Cost 4 vext3 <2,4,4,6>, <3,5,5,7> + 3779086940U, // <6,3,5,6>: Cost 4 vext3 <3,4,5,6>, <3,5,6,6> + 3767732831U, // <6,3,5,7>: Cost 4 vext3 <1,5,4,6>, <3,5,7,0> + 2706229870U, // <6,3,5,u>: Cost 3 vext3 <3,5,u,6>, <3,5,u,6> + 2602164326U, // <6,3,6,0>: Cost 3 vext1 , LHS + 2654212512U, // <6,3,6,1>: Cost 3 vext2 <6,1,6,3>, <6,1,6,3> + 2566334393U, // <6,3,6,2>: Cost 3 vext1 <2,6,3,6>, <2,6,3,6> + 3704066588U, // <6,3,6,3>: Cost 4 vext2 <2,1,6,3>, <6,3,2,1> + 2602167524U, // <6,3,6,4>: Cost 3 vext1 , <4,4,6,6> + 3710702321U, // <6,3,6,5>: Cost 4 vext2 <3,2,6,3>, <6,5,7,7> + 2724661933U, // <6,3,6,6>: Cost 3 vext3 <6,6,6,6>, <3,6,6,6> + 3710702465U, // <6,3,6,7>: Cost 4 vext2 <3,2,6,3>, <6,7,5,7> + 2602170158U, // <6,3,6,u>: Cost 3 vext1 , LHS + 1492598886U, // <6,3,7,0>: Cost 2 vext1 <2,6,3,7>, LHS + 2560369889U, // <6,3,7,1>: Cost 3 vext1 <1,6,3,7>, <1,6,3,7> + 1492600762U, // <6,3,7,2>: Cost 2 vext1 <2,6,3,7>, <2,6,3,7> + 2566342806U, // <6,3,7,3>: Cost 3 vext1 <2,6,3,7>, <3,0,1,2> + 1492602166U, // <6,3,7,4>: Cost 2 vext1 <2,6,3,7>, RHS + 2602176208U, // <6,3,7,5>: Cost 3 vext1 , <5,1,7,3> + 2566345210U, // <6,3,7,6>: Cost 3 vext1 <2,6,3,7>, <6,2,7,3> + 2980496528U, // <6,3,7,7>: Cost 3 vzipr RHS, <1,5,3,7> + 1492604718U, // <6,3,7,u>: Cost 2 
vext1 <2,6,3,7>, LHS + 1492607078U, // <6,3,u,0>: Cost 2 vext1 <2,6,3,u>, LHS + 2686028574U, // <6,3,u,1>: Cost 3 vext3 <0,2,4,6>, <3,u,1,2> + 1492608955U, // <6,3,u,2>: Cost 2 vext1 <2,6,3,u>, <2,6,3,u> + 2566350998U, // <6,3,u,3>: Cost 3 vext1 <2,6,3,u>, <3,0,1,2> + 1492610358U, // <6,3,u,4>: Cost 2 vext1 <2,6,3,u>, RHS + 1634257734U, // <6,3,u,5>: Cost 2 vext3 <3,u,5,6>, <3,u,5,6> + 2566353489U, // <6,3,u,6>: Cost 3 vext1 <2,6,3,u>, <6,3,u,0> + 2980504720U, // <6,3,u,7>: Cost 3 vzipr RHS, <1,5,3,7> + 1492612910U, // <6,3,u,u>: Cost 2 vext1 <2,6,3,u>, LHS + 3703406592U, // <6,4,0,0>: Cost 4 vext2 <2,0,6,4>, <0,0,0,0> + 2629664870U, // <6,4,0,1>: Cost 3 vext2 <2,0,6,4>, LHS + 2629664972U, // <6,4,0,2>: Cost 3 vext2 <2,0,6,4>, <0,2,4,6> + 3779087232U, // <6,4,0,3>: Cost 4 vext3 <3,4,5,6>, <4,0,3,1> + 2642936156U, // <6,4,0,4>: Cost 3 vext2 <4,2,6,4>, <0,4,2,6> + 2712570770U, // <6,4,0,5>: Cost 3 vext3 <4,6,4,6>, <4,0,5,1> + 2687208348U, // <6,4,0,6>: Cost 3 vext3 <0,4,2,6>, <4,0,6,2> + 3316723081U, // <6,4,0,7>: Cost 4 vrev <4,6,7,0> + 2629665437U, // <6,4,0,u>: Cost 3 vext2 <2,0,6,4>, LHS + 2242473291U, // <6,4,1,0>: Cost 3 vrev <4,6,0,1> + 3700089652U, // <6,4,1,1>: Cost 4 vext2 <1,4,6,4>, <1,1,1,1> + 3703407510U, // <6,4,1,2>: Cost 4 vext2 <2,0,6,4>, <1,2,3,0> + 2852962406U, // <6,4,1,3>: Cost 3 vuzpr <5,6,7,4>, LHS + 3628166454U, // <6,4,1,4>: Cost 4 vext1 <0,6,4,1>, RHS + 3760876514U, // <6,4,1,5>: Cost 4 vext3 <0,4,1,6>, <4,1,5,0> + 2687208430U, // <6,4,1,6>: Cost 3 vext3 <0,4,2,6>, <4,1,6,3> + 3316731274U, // <6,4,1,7>: Cost 4 vrev <4,6,7,1> + 2243063187U, // <6,4,1,u>: Cost 3 vrev <4,6,u,1> + 2629666284U, // <6,4,2,0>: Cost 3 vext2 <2,0,6,4>, <2,0,6,4> + 3703408188U, // <6,4,2,1>: Cost 4 vext2 <2,0,6,4>, <2,1,6,3> + 3703408232U, // <6,4,2,2>: Cost 4 vext2 <2,0,6,4>, <2,2,2,2> + 3703408294U, // <6,4,2,3>: Cost 4 vext2 <2,0,6,4>, <2,3,0,1> + 2632320816U, // <6,4,2,4>: Cost 3 vext2 <2,4,6,4>, <2,4,6,4> + 2923384118U, // <6,4,2,5>: Cost 3 vzipl <6,2,7,3>, RHS + 2687208508U, // <6,4,2,6>: Cost 3 vext3 <0,4,2,6>, <4,2,6,0> + 3760950341U, // <6,4,2,7>: Cost 4 vext3 <0,4,2,6>, <4,2,7,0> + 2634975348U, // <6,4,2,u>: Cost 3 vext2 <2,u,6,4>, <2,u,6,4> + 3703408790U, // <6,4,3,0>: Cost 4 vext2 <2,0,6,4>, <3,0,1,2> + 3316305238U, // <6,4,3,1>: Cost 4 vrev <4,6,1,3> + 3703408947U, // <6,4,3,2>: Cost 4 vext2 <2,0,6,4>, <3,2,0,6> + 3703409052U, // <6,4,3,3>: Cost 4 vext2 <2,0,6,4>, <3,3,3,3> + 2644929026U, // <6,4,3,4>: Cost 3 vext2 <4,5,6,4>, <3,4,5,6> + 3718670922U, // <6,4,3,5>: Cost 4 vext2 <4,5,6,4>, <3,5,4,6> + 2705345682U, // <6,4,3,6>: Cost 3 vext3 <3,4,5,6>, <4,3,6,5> + 3926705152U, // <6,4,3,7>: Cost 4 vuzpr <5,6,7,4>, <1,3,5,7> + 2668817222U, // <6,4,3,u>: Cost 3 vext2 , <3,u,5,6> + 2590277734U, // <6,4,4,0>: Cost 3 vext1 <6,6,4,4>, LHS + 3716017135U, // <6,4,4,1>: Cost 4 vext2 <4,1,6,4>, <4,1,6,4> + 2642938944U, // <6,4,4,2>: Cost 3 vext2 <4,2,6,4>, <4,2,6,4> + 3717344401U, // <6,4,4,3>: Cost 4 vext2 <4,3,6,4>, <4,3,6,4> + 2712571088U, // <6,4,4,4>: Cost 3 vext3 <4,6,4,6>, <4,4,4,4> + 2629668150U, // <6,4,4,5>: Cost 3 vext2 <2,0,6,4>, RHS + 1637649636U, // <6,4,4,6>: Cost 2 vext3 <4,4,6,6>, <4,4,6,6> + 2646257109U, // <6,4,4,7>: Cost 3 vext2 <4,7,6,4>, <4,7,6,4> + 1637649636U, // <6,4,4,u>: Cost 2 vext3 <4,4,6,6>, <4,4,6,6> + 2566398054U, // <6,4,5,0>: Cost 3 vext1 <2,6,4,5>, LHS + 3760876805U, // <6,4,5,1>: Cost 4 vext3 <0,4,1,6>, <4,5,1,3> + 2566399937U, // <6,4,5,2>: Cost 3 vext1 <2,6,4,5>, <2,6,4,5> + 2584316418U, // <6,4,5,3>: Cost 3 vext1 <5,6,4,5>, <3,4,5,6> + 2566401334U, // 
<6,4,5,4>: Cost 3 vext1 <2,6,4,5>, RHS + 2584318028U, // <6,4,5,5>: Cost 3 vext1 <5,6,4,5>, <5,6,4,5> + 1612287286U, // <6,4,5,6>: Cost 2 vext3 <0,2,4,6>, RHS + 2852965686U, // <6,4,5,7>: Cost 3 vuzpr <5,6,7,4>, RHS + 1612287304U, // <6,4,5,u>: Cost 2 vext3 <0,2,4,6>, RHS + 1504608358U, // <6,4,6,0>: Cost 2 vext1 <4,6,4,6>, LHS + 2578350838U, // <6,4,6,1>: Cost 3 vext1 <4,6,4,6>, <1,0,3,2> + 2578351720U, // <6,4,6,2>: Cost 3 vext1 <4,6,4,6>, <2,2,2,2> + 2578352278U, // <6,4,6,3>: Cost 3 vext1 <4,6,4,6>, <3,0,1,2> + 1504611638U, // <6,4,6,4>: Cost 2 vext1 <4,6,4,6>, RHS + 2578353872U, // <6,4,6,5>: Cost 3 vext1 <4,6,4,6>, <5,1,7,3> + 2578354682U, // <6,4,6,6>: Cost 3 vext1 <4,6,4,6>, <6,2,7,3> + 2578355194U, // <6,4,6,7>: Cost 3 vext1 <4,6,4,6>, <7,0,1,2> + 1504614190U, // <6,4,6,u>: Cost 2 vext1 <4,6,4,6>, LHS + 2572386406U, // <6,4,7,0>: Cost 3 vext1 <3,6,4,7>, LHS + 2572387226U, // <6,4,7,1>: Cost 3 vext1 <3,6,4,7>, <1,2,3,4> + 3640157902U, // <6,4,7,2>: Cost 4 vext1 <2,6,4,7>, <2,3,4,5> + 2572389020U, // <6,4,7,3>: Cost 3 vext1 <3,6,4,7>, <3,6,4,7> + 2572389686U, // <6,4,7,4>: Cost 3 vext1 <3,6,4,7>, RHS + 2980497102U, // <6,4,7,5>: Cost 3 vzipr RHS, <2,3,4,5> + 2980495564U, // <6,4,7,6>: Cost 3 vzipr RHS, <0,2,4,6> + 4054239090U, // <6,4,7,7>: Cost 4 vzipr RHS, <2,5,4,7> + 2572392238U, // <6,4,7,u>: Cost 3 vext1 <3,6,4,7>, LHS + 1504608358U, // <6,4,u,0>: Cost 2 vext1 <4,6,4,6>, LHS + 2629670702U, // <6,4,u,1>: Cost 3 vext2 <2,0,6,4>, LHS + 2566424516U, // <6,4,u,2>: Cost 3 vext1 <2,6,4,u>, <2,6,4,u> + 2584340994U, // <6,4,u,3>: Cost 3 vext1 <5,6,4,u>, <3,4,5,6> + 1640156694U, // <6,4,u,4>: Cost 2 vext3 <4,u,4,6>, <4,u,4,6> + 2629671066U, // <6,4,u,5>: Cost 3 vext2 <2,0,6,4>, RHS + 1612287529U, // <6,4,u,6>: Cost 2 vext3 <0,2,4,6>, RHS + 2852965929U, // <6,4,u,7>: Cost 3 vuzpr <5,6,7,4>, RHS + 1612287547U, // <6,4,u,u>: Cost 2 vext3 <0,2,4,6>, RHS + 3708723200U, // <6,5,0,0>: Cost 4 vext2 <2,u,6,5>, <0,0,0,0> + 2634981478U, // <6,5,0,1>: Cost 3 vext2 <2,u,6,5>, LHS + 3694125260U, // <6,5,0,2>: Cost 4 vext2 <0,4,6,5>, <0,2,4,6> + 3779087962U, // <6,5,0,3>: Cost 4 vext3 <3,4,5,6>, <5,0,3,2> + 3760877154U, // <6,5,0,4>: Cost 4 vext3 <0,4,1,6>, <5,0,4,1> + 4195110916U, // <6,5,0,5>: Cost 4 vtrnr <5,6,7,0>, <5,5,5,5> + 3696779775U, // <6,5,0,6>: Cost 4 vext2 <0,u,6,5>, <0,6,2,7> + 1175212130U, // <6,5,0,7>: Cost 2 vrev <5,6,7,0> + 1175285867U, // <6,5,0,u>: Cost 2 vrev <5,6,u,0> + 2248445988U, // <6,5,1,0>: Cost 3 vrev <5,6,0,1> + 3698107237U, // <6,5,1,1>: Cost 4 vext2 <1,1,6,5>, <1,1,6,5> + 3708724118U, // <6,5,1,2>: Cost 4 vext2 <2,u,6,5>, <1,2,3,0> + 3908575334U, // <6,5,1,3>: Cost 4 vuzpr <2,6,4,5>, LHS + 3716023376U, // <6,5,1,4>: Cost 4 vext2 <4,1,6,5>, <1,4,5,6> + 3708724368U, // <6,5,1,5>: Cost 4 vext2 <2,u,6,5>, <1,5,3,7> + 3767733960U, // <6,5,1,6>: Cost 4 vext3 <1,5,4,6>, <5,1,6,4> + 2712571600U, // <6,5,1,7>: Cost 3 vext3 <4,6,4,6>, <5,1,7,3> + 2712571609U, // <6,5,1,u>: Cost 3 vext3 <4,6,4,6>, <5,1,u,3> + 2578391142U, // <6,5,2,0>: Cost 3 vext1 <4,6,5,2>, LHS + 3704079934U, // <6,5,2,1>: Cost 4 vext2 <2,1,6,5>, <2,1,6,5> + 3708724840U, // <6,5,2,2>: Cost 4 vext2 <2,u,6,5>, <2,2,2,2> + 3705407182U, // <6,5,2,3>: Cost 4 vext2 <2,3,6,5>, <2,3,4,5> + 2578394422U, // <6,5,2,4>: Cost 3 vext1 <4,6,5,2>, RHS + 3717351272U, // <6,5,2,5>: Cost 4 vext2 <4,3,6,5>, <2,5,3,6> + 2634983354U, // <6,5,2,6>: Cost 3 vext2 <2,u,6,5>, <2,6,3,7> + 3115486518U, // <6,5,2,7>: Cost 3 vtrnr <4,6,u,2>, RHS + 2634983541U, // <6,5,2,u>: Cost 3 vext2 <2,u,6,5>, <2,u,6,5> + 3708725398U, // <6,5,3,0>: Cost 4 
vext2 <2,u,6,5>, <3,0,1,2> + 3710052631U, // <6,5,3,1>: Cost 4 vext2 <3,1,6,5>, <3,1,6,5> + 3708725606U, // <6,5,3,2>: Cost 4 vext2 <2,u,6,5>, <3,2,6,3> + 3708725660U, // <6,5,3,3>: Cost 4 vext2 <2,u,6,5>, <3,3,3,3> + 2643610114U, // <6,5,3,4>: Cost 3 vext2 <4,3,6,5>, <3,4,5,6> + 3717352010U, // <6,5,3,5>: Cost 4 vext2 <4,3,6,5>, <3,5,4,6> + 3773632358U, // <6,5,3,6>: Cost 4 vext3 <2,5,3,6>, <5,3,6,0> + 2248978533U, // <6,5,3,7>: Cost 3 vrev <5,6,7,3> + 2249052270U, // <6,5,3,u>: Cost 3 vrev <5,6,u,3> + 2596323430U, // <6,5,4,0>: Cost 3 vext1 <7,6,5,4>, LHS + 3716025328U, // <6,5,4,1>: Cost 4 vext2 <4,1,6,5>, <4,1,6,5> + 3716688961U, // <6,5,4,2>: Cost 4 vext2 <4,2,6,5>, <4,2,6,5> + 2643610770U, // <6,5,4,3>: Cost 3 vext2 <4,3,6,5>, <4,3,6,5> + 2596326710U, // <6,5,4,4>: Cost 3 vext1 <7,6,5,4>, RHS + 2634984758U, // <6,5,4,5>: Cost 3 vext2 <2,u,6,5>, RHS + 3767734199U, // <6,5,4,6>: Cost 4 vext3 <1,5,4,6>, <5,4,6,0> + 1643696070U, // <6,5,4,7>: Cost 2 vext3 <5,4,7,6>, <5,4,7,6> + 1643769807U, // <6,5,4,u>: Cost 2 vext3 <5,4,u,6>, <5,4,u,6> + 2578415718U, // <6,5,5,0>: Cost 3 vext1 <4,6,5,5>, LHS + 3652158198U, // <6,5,5,1>: Cost 4 vext1 <4,6,5,5>, <1,0,3,2> + 3652159080U, // <6,5,5,2>: Cost 4 vext1 <4,6,5,5>, <2,2,2,2> + 3652159638U, // <6,5,5,3>: Cost 4 vext1 <4,6,5,5>, <3,0,1,2> + 2578418998U, // <6,5,5,4>: Cost 3 vext1 <4,6,5,5>, RHS + 2712571908U, // <6,5,5,5>: Cost 3 vext3 <4,6,4,6>, <5,5,5,5> + 2718027790U, // <6,5,5,6>: Cost 3 vext3 <5,5,6,6>, <5,5,6,6> + 2712571928U, // <6,5,5,7>: Cost 3 vext3 <4,6,4,6>, <5,5,7,7> + 2712571937U, // <6,5,5,u>: Cost 3 vext3 <4,6,4,6>, <5,5,u,7> + 2705346596U, // <6,5,6,0>: Cost 3 vext3 <3,4,5,6>, <5,6,0,1> + 3767144496U, // <6,5,6,1>: Cost 4 vext3 <1,4,5,6>, <5,6,1,4> + 3773116473U, // <6,5,6,2>: Cost 4 vext3 <2,4,5,6>, <5,6,2,4> + 2705346626U, // <6,5,6,3>: Cost 3 vext3 <3,4,5,6>, <5,6,3,4> + 2705346636U, // <6,5,6,4>: Cost 3 vext3 <3,4,5,6>, <5,6,4,5> + 3908577217U, // <6,5,6,5>: Cost 4 vuzpr <2,6,4,5>, <2,6,4,5> + 2578428728U, // <6,5,6,6>: Cost 3 vext1 <4,6,5,6>, <6,6,6,6> + 2712572002U, // <6,5,6,7>: Cost 3 vext3 <4,6,4,6>, <5,6,7,0> + 2705346668U, // <6,5,6,u>: Cost 3 vext3 <3,4,5,6>, <5,6,u,1> + 2560516198U, // <6,5,7,0>: Cost 3 vext1 <1,6,5,7>, LHS + 2560517363U, // <6,5,7,1>: Cost 3 vext1 <1,6,5,7>, <1,6,5,7> + 2566490060U, // <6,5,7,2>: Cost 3 vext1 <2,6,5,7>, <2,6,5,7> + 3634260118U, // <6,5,7,3>: Cost 4 vext1 <1,6,5,7>, <3,0,1,2> + 2560519478U, // <6,5,7,4>: Cost 3 vext1 <1,6,5,7>, RHS + 2980498650U, // <6,5,7,5>: Cost 3 vzipr RHS, <4,4,5,5> + 2980497922U, // <6,5,7,6>: Cost 3 vzipr RHS, <3,4,5,6> + 3103214902U, // <6,5,7,7>: Cost 3 vtrnr <2,6,3,7>, RHS + 2560522030U, // <6,5,7,u>: Cost 3 vext1 <1,6,5,7>, LHS + 2560524390U, // <6,5,u,0>: Cost 3 vext1 <1,6,5,u>, LHS + 2560525556U, // <6,5,u,1>: Cost 3 vext1 <1,6,5,u>, <1,6,5,u> + 2566498253U, // <6,5,u,2>: Cost 3 vext1 <2,6,5,u>, <2,6,5,u> + 2646931439U, // <6,5,u,3>: Cost 3 vext2 <4,u,6,5>, + 2560527670U, // <6,5,u,4>: Cost 3 vext1 <1,6,5,u>, RHS + 2634987674U, // <6,5,u,5>: Cost 3 vext2 <2,u,6,5>, RHS + 2980506114U, // <6,5,u,6>: Cost 3 vzipr RHS, <3,4,5,6> + 1175277674U, // <6,5,u,7>: Cost 2 vrev <5,6,7,u> + 1175351411U, // <6,5,u,u>: Cost 2 vrev <5,6,u,u> + 2578448486U, // <6,6,0,0>: Cost 3 vext1 <4,6,6,0>, LHS + 1573191782U, // <6,6,0,1>: Cost 2 vext2 <4,u,6,6>, LHS + 2686030124U, // <6,6,0,2>: Cost 3 vext3 <0,2,4,6>, <6,0,2,4> + 3779088690U, // <6,6,0,3>: Cost 4 vext3 <3,4,5,6>, <6,0,3,1> + 2687209788U, // <6,6,0,4>: Cost 3 vext3 <0,4,2,6>, <6,0,4,2> + 3652194000U, // <6,6,0,5>: Cost 
4 vext1 <4,6,6,0>, <5,1,7,3> + 2254852914U, // <6,6,0,6>: Cost 3 vrev <6,6,6,0> + 4041575734U, // <6,6,0,7>: Cost 4 vzipr <2,4,6,0>, RHS + 1573192349U, // <6,6,0,u>: Cost 2 vext2 <4,u,6,6>, LHS + 2646934262U, // <6,6,1,0>: Cost 3 vext2 <4,u,6,6>, <1,0,3,2> + 2646934324U, // <6,6,1,1>: Cost 3 vext2 <4,u,6,6>, <1,1,1,1> + 2646934422U, // <6,6,1,2>: Cost 3 vext2 <4,u,6,6>, <1,2,3,0> + 2846785638U, // <6,6,1,3>: Cost 3 vuzpr <4,6,4,6>, LHS + 3760951694U, // <6,6,1,4>: Cost 4 vext3 <0,4,2,6>, <6,1,4,3> + 2646934672U, // <6,6,1,5>: Cost 3 vext2 <4,u,6,6>, <1,5,3,7> + 2712572320U, // <6,6,1,6>: Cost 3 vext3 <4,6,4,6>, <6,1,6,3> + 3775549865U, // <6,6,1,7>: Cost 4 vext3 <2,u,2,6>, <6,1,7,3> + 2846785643U, // <6,6,1,u>: Cost 3 vuzpr <4,6,4,6>, LHS + 3759772094U, // <6,6,2,0>: Cost 4 vext3 <0,2,4,6>, <6,2,0,6> + 3704751676U, // <6,6,2,1>: Cost 4 vext2 <2,2,6,6>, <2,1,6,3> + 2631009936U, // <6,6,2,2>: Cost 3 vext2 <2,2,6,6>, <2,2,6,6> + 2646935206U, // <6,6,2,3>: Cost 3 vext2 <4,u,6,6>, <2,3,0,1> + 3759772127U, // <6,6,2,4>: Cost 4 vext3 <0,2,4,6>, <6,2,4,3> + 3704752004U, // <6,6,2,5>: Cost 4 vext2 <2,2,6,6>, <2,5,6,7> + 2646935482U, // <6,6,2,6>: Cost 3 vext2 <4,u,6,6>, <2,6,3,7> + 2712572410U, // <6,6,2,7>: Cost 3 vext3 <4,6,4,6>, <6,2,7,3> + 2712572419U, // <6,6,2,u>: Cost 3 vext3 <4,6,4,6>, <6,2,u,3> + 2646935702U, // <6,6,3,0>: Cost 3 vext2 <4,u,6,6>, <3,0,1,2> + 3777024534U, // <6,6,3,1>: Cost 4 vext3 <3,1,4,6>, <6,3,1,4> + 3704752453U, // <6,6,3,2>: Cost 4 vext2 <2,2,6,6>, <3,2,2,6> + 2646935964U, // <6,6,3,3>: Cost 3 vext2 <4,u,6,6>, <3,3,3,3> + 2705347122U, // <6,6,3,4>: Cost 3 vext3 <3,4,5,6>, <6,3,4,5> + 3779678778U, // <6,6,3,5>: Cost 4 vext3 <3,5,4,6>, <6,3,5,4> + 2657553069U, // <6,6,3,6>: Cost 3 vext2 <6,6,6,6>, <3,6,6,6> + 4039609654U, // <6,6,3,7>: Cost 4 vzipr <2,1,6,3>, RHS + 2708001366U, // <6,6,3,u>: Cost 3 vext3 <3,u,5,6>, <6,3,u,5> + 2578481254U, // <6,6,4,0>: Cost 3 vext1 <4,6,6,4>, LHS + 3652223734U, // <6,6,4,1>: Cost 4 vext1 <4,6,6,4>, <1,0,3,2> + 3760951922U, // <6,6,4,2>: Cost 4 vext3 <0,4,2,6>, <6,4,2,6> + 3779089019U, // <6,6,4,3>: Cost 4 vext3 <3,4,5,6>, <6,4,3,6> + 1570540772U, // <6,6,4,4>: Cost 2 vext2 <4,4,6,6>, <4,4,6,6> + 1573195062U, // <6,6,4,5>: Cost 2 vext2 <4,u,6,6>, RHS + 2712572560U, // <6,6,4,6>: Cost 3 vext3 <4,6,4,6>, <6,4,6,0> + 2723410591U, // <6,6,4,7>: Cost 3 vext3 <6,4,7,6>, <6,4,7,6> + 1573195304U, // <6,6,4,u>: Cost 2 vext2 <4,u,6,6>, <4,u,6,6> + 3640287334U, // <6,6,5,0>: Cost 4 vext1 <2,6,6,5>, LHS + 2646937296U, // <6,6,5,1>: Cost 3 vext2 <4,u,6,6>, <5,1,7,3> + 3640289235U, // <6,6,5,2>: Cost 4 vext1 <2,6,6,5>, <2,6,6,5> + 3720679279U, // <6,6,5,3>: Cost 4 vext2 <4,u,6,6>, <5,3,7,0> + 2646937542U, // <6,6,5,4>: Cost 3 vext2 <4,u,6,6>, <5,4,7,6> + 2646937604U, // <6,6,5,5>: Cost 3 vext2 <4,u,6,6>, <5,5,5,5> + 2646937698U, // <6,6,5,6>: Cost 3 vext2 <4,u,6,6>, <5,6,7,0> + 2846788918U, // <6,6,5,7>: Cost 3 vuzpr <4,6,4,6>, RHS + 2846788919U, // <6,6,5,u>: Cost 3 vuzpr <4,6,4,6>, RHS + 1516699750U, // <6,6,6,0>: Cost 2 vext1 <6,6,6,6>, LHS + 2590442230U, // <6,6,6,1>: Cost 3 vext1 <6,6,6,6>, <1,0,3,2> + 2646938106U, // <6,6,6,2>: Cost 3 vext2 <4,u,6,6>, <6,2,7,3> + 2590443670U, // <6,6,6,3>: Cost 3 vext1 <6,6,6,6>, <3,0,1,2> + 1516703030U, // <6,6,6,4>: Cost 2 vext1 <6,6,6,6>, RHS + 2590445264U, // <6,6,6,5>: Cost 3 vext1 <6,6,6,6>, <5,1,7,3> + 296144182U, // <6,6,6,6>: Cost 1 vdup2 RHS + 2712572738U, // <6,6,6,7>: Cost 3 vext3 <4,6,4,6>, <6,6,7,7> + 296144182U, // <6,6,6,u>: Cost 1 vdup2 RHS + 2566561894U, // <6,6,7,0>: Cost 3 vext1 <2,6,6,7>, 
LHS + 3634332924U, // <6,6,7,1>: Cost 4 vext1 <1,6,6,7>, <1,6,6,7> + 2566563797U, // <6,6,7,2>: Cost 3 vext1 <2,6,6,7>, <2,6,6,7> + 2584480258U, // <6,6,7,3>: Cost 3 vext1 <5,6,6,7>, <3,4,5,6> + 2566565174U, // <6,6,7,4>: Cost 3 vext1 <2,6,6,7>, RHS + 2717438846U, // <6,6,7,5>: Cost 3 vext3 <5,4,7,6>, <6,7,5,4> + 2980500280U, // <6,6,7,6>: Cost 3 vzipr RHS, <6,6,6,6> + 1906756918U, // <6,6,7,7>: Cost 2 vzipr RHS, RHS + 1906756919U, // <6,6,7,u>: Cost 2 vzipr RHS, RHS + 1516699750U, // <6,6,u,0>: Cost 2 vext1 <6,6,6,6>, LHS + 1573197614U, // <6,6,u,1>: Cost 2 vext2 <4,u,6,6>, LHS + 2566571990U, // <6,6,u,2>: Cost 3 vext1 <2,6,6,u>, <2,6,6,u> + 2846786205U, // <6,6,u,3>: Cost 3 vuzpr <4,6,4,6>, LHS + 1516703030U, // <6,6,u,4>: Cost 2 vext1 <6,6,6,6>, RHS + 1573197978U, // <6,6,u,5>: Cost 2 vext2 <4,u,6,6>, RHS + 296144182U, // <6,6,u,6>: Cost 1 vdup2 RHS + 1906765110U, // <6,6,u,7>: Cost 2 vzipr RHS, RHS + 296144182U, // <6,6,u,u>: Cost 1 vdup2 RHS + 1571209216U, // <6,7,0,0>: Cost 2 vext2 RHS, <0,0,0,0> + 497467494U, // <6,7,0,1>: Cost 1 vext2 RHS, LHS + 1571209380U, // <6,7,0,2>: Cost 2 vext2 RHS, <0,2,0,2> + 2644951292U, // <6,7,0,3>: Cost 3 vext2 RHS, <0,3,1,0> + 1571209554U, // <6,7,0,4>: Cost 2 vext2 RHS, <0,4,1,5> + 1510756450U, // <6,7,0,5>: Cost 2 vext1 <5,6,7,0>, <5,6,7,0> + 2644951542U, // <6,7,0,6>: Cost 3 vext2 RHS, <0,6,1,7> + 2584499194U, // <6,7,0,7>: Cost 3 vext1 <5,6,7,0>, <7,0,1,2> + 497468061U, // <6,7,0,u>: Cost 1 vext2 RHS, LHS + 1571209974U, // <6,7,1,0>: Cost 2 vext2 RHS, <1,0,3,2> + 1571210036U, // <6,7,1,1>: Cost 2 vext2 RHS, <1,1,1,1> + 1571210134U, // <6,7,1,2>: Cost 2 vext2 RHS, <1,2,3,0> + 1571210200U, // <6,7,1,3>: Cost 2 vext2 RHS, <1,3,1,3> + 2644952098U, // <6,7,1,4>: Cost 3 vext2 RHS, <1,4,0,5> + 1571210384U, // <6,7,1,5>: Cost 2 vext2 RHS, <1,5,3,7> + 2644952271U, // <6,7,1,6>: Cost 3 vext2 RHS, <1,6,1,7> + 2578535418U, // <6,7,1,7>: Cost 3 vext1 <4,6,7,1>, <7,0,1,2> + 1571210605U, // <6,7,1,u>: Cost 2 vext2 RHS, <1,u,1,3> + 2644952509U, // <6,7,2,0>: Cost 3 vext2 RHS, <2,0,1,2> + 2644952582U, // <6,7,2,1>: Cost 3 vext2 RHS, <2,1,0,3> + 1571210856U, // <6,7,2,2>: Cost 2 vext2 RHS, <2,2,2,2> + 1571210918U, // <6,7,2,3>: Cost 2 vext2 RHS, <2,3,0,1> + 2644952828U, // <6,7,2,4>: Cost 3 vext2 RHS, <2,4,0,6> + 2633009028U, // <6,7,2,5>: Cost 3 vext2 <2,5,6,7>, <2,5,6,7> + 1571211194U, // <6,7,2,6>: Cost 2 vext2 RHS, <2,6,3,7> + 2668840938U, // <6,7,2,7>: Cost 3 vext2 RHS, <2,7,0,1> + 1571211323U, // <6,7,2,u>: Cost 2 vext2 RHS, <2,u,0,1> + 1571211414U, // <6,7,3,0>: Cost 2 vext2 RHS, <3,0,1,2> + 2644953311U, // <6,7,3,1>: Cost 3 vext2 RHS, <3,1,0,3> + 2644953390U, // <6,7,3,2>: Cost 3 vext2 RHS, <3,2,0,1> + 1571211676U, // <6,7,3,3>: Cost 2 vext2 RHS, <3,3,3,3> + 1571211778U, // <6,7,3,4>: Cost 2 vext2 RHS, <3,4,5,6> + 2644953648U, // <6,7,3,5>: Cost 3 vext2 RHS, <3,5,1,7> + 2644953720U, // <6,7,3,6>: Cost 3 vext2 RHS, <3,6,0,7> + 2644953795U, // <6,7,3,7>: Cost 3 vext2 RHS, <3,7,0,1> + 1571212062U, // <6,7,3,u>: Cost 2 vext2 RHS, <3,u,1,2> + 1573202834U, // <6,7,4,0>: Cost 2 vext2 RHS, <4,0,5,1> + 2644954058U, // <6,7,4,1>: Cost 3 vext2 RHS, <4,1,2,3> + 2644954166U, // <6,7,4,2>: Cost 3 vext2 RHS, <4,2,5,3> + 2644954258U, // <6,7,4,3>: Cost 3 vext2 RHS, <4,3,6,5> + 1571212496U, // <6,7,4,4>: Cost 2 vext2 RHS, <4,4,4,4> + 497470774U, // <6,7,4,5>: Cost 1 vext2 RHS, RHS + 1573203316U, // <6,7,4,6>: Cost 2 vext2 RHS, <4,6,4,6> + 2646281688U, // <6,7,4,7>: Cost 3 vext2 <4,7,6,7>, <4,7,6,7> + 497471017U, // <6,7,4,u>: Cost 1 vext2 RHS, RHS + 2644954696U, // 
<6,7,5,0>: Cost 3 vext2 RHS, <5,0,1,2> + 1573203664U, // <6,7,5,1>: Cost 2 vext2 RHS, <5,1,7,3> + 2644954878U, // <6,7,5,2>: Cost 3 vext2 RHS, <5,2,3,4> + 2644954991U, // <6,7,5,3>: Cost 3 vext2 RHS, <5,3,7,0> + 1571213254U, // <6,7,5,4>: Cost 2 vext2 RHS, <5,4,7,6> + 1571213316U, // <6,7,5,5>: Cost 2 vext2 RHS, <5,5,5,5> + 1571213410U, // <6,7,5,6>: Cost 2 vext2 RHS, <5,6,7,0> + 1573204136U, // <6,7,5,7>: Cost 2 vext2 RHS, <5,7,5,7> + 1573204217U, // <6,7,5,u>: Cost 2 vext2 RHS, <5,u,5,7> + 2644955425U, // <6,7,6,0>: Cost 3 vext2 RHS, <6,0,1,2> + 2644955561U, // <6,7,6,1>: Cost 3 vext2 RHS, <6,1,7,3> + 1573204474U, // <6,7,6,2>: Cost 2 vext2 RHS, <6,2,7,3> + 2644955698U, // <6,7,6,3>: Cost 3 vext2 RHS, <6,3,4,5> + 2644955789U, // <6,7,6,4>: Cost 3 vext2 RHS, <6,4,5,6> + 2644955889U, // <6,7,6,5>: Cost 3 vext2 RHS, <6,5,7,7> + 1571214136U, // <6,7,6,6>: Cost 2 vext2 RHS, <6,6,6,6> + 1571214158U, // <6,7,6,7>: Cost 2 vext2 RHS, <6,7,0,1> + 1573204895U, // <6,7,6,u>: Cost 2 vext2 RHS, <6,u,0,1> + 1573204986U, // <6,7,7,0>: Cost 2 vext2 RHS, <7,0,1,2> + 2572608656U, // <6,7,7,1>: Cost 3 vext1 <3,6,7,7>, <1,5,3,7> + 2644956362U, // <6,7,7,2>: Cost 3 vext2 RHS, <7,2,6,3> + 2572610231U, // <6,7,7,3>: Cost 3 vext1 <3,6,7,7>, <3,6,7,7> + 1573205350U, // <6,7,7,4>: Cost 2 vext2 RHS, <7,4,5,6> + 2646947220U, // <6,7,7,5>: Cost 3 vext2 RHS, <7,5,1,7> + 1516786498U, // <6,7,7,6>: Cost 2 vext1 <6,6,7,7>, <6,6,7,7> + 1571214956U, // <6,7,7,7>: Cost 2 vext2 RHS, <7,7,7,7> + 1573205634U, // <6,7,7,u>: Cost 2 vext2 RHS, <7,u,1,2> + 1571215059U, // <6,7,u,0>: Cost 2 vext2 RHS, + 497473326U, // <6,7,u,1>: Cost 1 vext2 RHS, LHS + 1571215237U, // <6,7,u,2>: Cost 2 vext2 RHS, + 1571215292U, // <6,7,u,3>: Cost 2 vext2 RHS, + 1571215423U, // <6,7,u,4>: Cost 2 vext2 RHS, + 497473690U, // <6,7,u,5>: Cost 1 vext2 RHS, RHS + 1571215568U, // <6,7,u,6>: Cost 2 vext2 RHS, + 1573206272U, // <6,7,u,7>: Cost 2 vext2 RHS, + 497473893U, // <6,7,u,u>: Cost 1 vext2 RHS, LHS + 1571217408U, // <6,u,0,0>: Cost 2 vext2 RHS, <0,0,0,0> + 497475686U, // <6,u,0,1>: Cost 1 vext2 RHS, LHS + 1571217572U, // <6,u,0,2>: Cost 2 vext2 RHS, <0,2,0,2> + 2689865445U, // <6,u,0,3>: Cost 3 vext3 <0,u,2,6>, + 1571217746U, // <6,u,0,4>: Cost 2 vext2 RHS, <0,4,1,5> + 1510830187U, // <6,u,0,5>: Cost 2 vext1 <5,6,u,0>, <5,6,u,0> + 2644959734U, // <6,u,0,6>: Cost 3 vext2 RHS, <0,6,1,7> + 1193130221U, // <6,u,0,7>: Cost 2 vrev + 497476253U, // <6,u,0,u>: Cost 1 vext2 RHS, LHS + 1571218166U, // <6,u,1,0>: Cost 2 vext2 RHS, <1,0,3,2> + 1571218228U, // <6,u,1,1>: Cost 2 vext2 RHS, <1,1,1,1> + 1612289838U, // <6,u,1,2>: Cost 2 vext3 <0,2,4,6>, LHS + 1571218392U, // <6,u,1,3>: Cost 2 vext2 RHS, <1,3,1,3> + 2566663478U, // <6,u,1,4>: Cost 3 vext1 <2,6,u,1>, RHS + 1571218576U, // <6,u,1,5>: Cost 2 vext2 RHS, <1,5,3,7> + 2644960463U, // <6,u,1,6>: Cost 3 vext2 RHS, <1,6,1,7> + 2717439835U, // <6,u,1,7>: Cost 3 vext3 <5,4,7,6>, + 1612289892U, // <6,u,1,u>: Cost 2 vext3 <0,2,4,6>, LHS + 1504870502U, // <6,u,2,0>: Cost 2 vext1 <4,6,u,2>, LHS + 2644960774U, // <6,u,2,1>: Cost 3 vext2 RHS, <2,1,0,3> + 1571219048U, // <6,u,2,2>: Cost 2 vext2 RHS, <2,2,2,2> + 1571219110U, // <6,u,2,3>: Cost 2 vext2 RHS, <2,3,0,1> + 1504873782U, // <6,u,2,4>: Cost 2 vext1 <4,6,u,2>, RHS + 2633017221U, // <6,u,2,5>: Cost 3 vext2 <2,5,6,u>, <2,5,6,u> + 1571219386U, // <6,u,2,6>: Cost 2 vext2 RHS, <2,6,3,7> + 2712573868U, // <6,u,2,7>: Cost 3 vext3 <4,6,4,6>, + 1571219515U, // <6,u,2,u>: Cost 2 vext2 RHS, <2,u,0,1> + 1571219606U, // <6,u,3,0>: Cost 2 vext2 RHS, <3,0,1,2> + 2644961503U, // 
<6,u,3,1>: Cost 3 vext2 RHS, <3,1,0,3> + 2566678499U, // <6,u,3,2>: Cost 3 vext1 <2,6,u,3>, <2,6,u,3> + 1571219868U, // <6,u,3,3>: Cost 2 vext2 RHS, <3,3,3,3> + 1571219970U, // <6,u,3,4>: Cost 2 vext2 RHS, <3,4,5,6> + 2689865711U, // <6,u,3,5>: Cost 3 vext3 <0,u,2,6>, + 2708002806U, // <6,u,3,6>: Cost 3 vext3 <3,u,5,6>, + 2644961987U, // <6,u,3,7>: Cost 3 vext2 RHS, <3,7,0,1> + 1571220254U, // <6,u,3,u>: Cost 2 vext2 RHS, <3,u,1,2> + 1571220370U, // <6,u,4,0>: Cost 2 vext2 RHS, <4,0,5,1> + 2644962250U, // <6,u,4,1>: Cost 3 vext2 RHS, <4,1,2,3> + 1661245476U, // <6,u,4,2>: Cost 2 vext3 , + 2686031917U, // <6,u,4,3>: Cost 3 vext3 <0,2,4,6>, + 1571220688U, // <6,u,4,4>: Cost 2 vext2 RHS, <4,4,4,4> + 497478967U, // <6,u,4,5>: Cost 1 vext2 RHS, RHS + 1571220852U, // <6,u,4,6>: Cost 2 vext2 RHS, <4,6,4,6> + 1661614161U, // <6,u,4,7>: Cost 2 vext3 , + 497479209U, // <6,u,4,u>: Cost 1 vext2 RHS, RHS + 2566692966U, // <6,u,5,0>: Cost 3 vext1 <2,6,u,5>, LHS + 1571221200U, // <6,u,5,1>: Cost 2 vext2 RHS, <5,1,7,3> + 2566694885U, // <6,u,5,2>: Cost 3 vext1 <2,6,u,5>, <2,6,u,5> + 2689865855U, // <6,u,5,3>: Cost 3 vext3 <0,u,2,6>, + 1571221446U, // <6,u,5,4>: Cost 2 vext2 RHS, <5,4,7,6> + 1571221508U, // <6,u,5,5>: Cost 2 vext2 RHS, <5,5,5,5> + 1612290202U, // <6,u,5,6>: Cost 2 vext3 <0,2,4,6>, RHS + 1571221672U, // <6,u,5,7>: Cost 2 vext2 RHS, <5,7,5,7> + 1612290220U, // <6,u,5,u>: Cost 2 vext3 <0,2,4,6>, RHS + 1504903270U, // <6,u,6,0>: Cost 2 vext1 <4,6,u,6>, LHS + 2644963752U, // <6,u,6,1>: Cost 3 vext2 RHS, <6,1,7,2> + 1571222010U, // <6,u,6,2>: Cost 2 vext2 RHS, <6,2,7,3> + 2686032080U, // <6,u,6,3>: Cost 3 vext3 <0,2,4,6>, + 1504906550U, // <6,u,6,4>: Cost 2 vext1 <4,6,u,6>, RHS + 2644964079U, // <6,u,6,5>: Cost 3 vext2 RHS, <6,5,7,5> + 296144182U, // <6,u,6,6>: Cost 1 vdup2 RHS + 1571222350U, // <6,u,6,7>: Cost 2 vext2 RHS, <6,7,0,1> + 296144182U, // <6,u,6,u>: Cost 1 vdup2 RHS + 1492967526U, // <6,u,7,0>: Cost 2 vext1 <2,6,u,7>, LHS + 2560738574U, // <6,u,7,1>: Cost 3 vext1 <1,6,u,7>, <1,6,u,7> + 1492969447U, // <6,u,7,2>: Cost 2 vext1 <2,6,u,7>, <2,6,u,7> + 1906753692U, // <6,u,7,3>: Cost 2 vzipr RHS, LHS + 1492970806U, // <6,u,7,4>: Cost 2 vext1 <2,6,u,7>, RHS + 2980495761U, // <6,u,7,5>: Cost 3 vzipr RHS, <0,4,u,5> + 1516860235U, // <6,u,7,6>: Cost 2 vext1 <6,6,u,7>, <6,6,u,7> + 1906756936U, // <6,u,7,7>: Cost 2 vzipr RHS, RHS + 1492973358U, // <6,u,7,u>: Cost 2 vext1 <2,6,u,7>, LHS + 1492975718U, // <6,u,u,0>: Cost 2 vext1 <2,6,u,u>, LHS + 497481518U, // <6,u,u,1>: Cost 1 vext2 RHS, LHS + 1612290405U, // <6,u,u,2>: Cost 2 vext3 <0,2,4,6>, LHS + 1571223484U, // <6,u,u,3>: Cost 2 vext2 RHS, + 1492978998U, // <6,u,u,4>: Cost 2 vext1 <2,6,u,u>, RHS + 497481882U, // <6,u,u,5>: Cost 1 vext2 RHS, RHS + 296144182U, // <6,u,u,6>: Cost 1 vdup2 RHS + 1906765128U, // <6,u,u,7>: Cost 2 vzipr RHS, RHS + 497482085U, // <6,u,u,u>: Cost 1 vext2 RHS, LHS + 1638318080U, // <7,0,0,0>: Cost 2 vext3 RHS, <0,0,0,0> + 1638318090U, // <7,0,0,1>: Cost 2 vext3 RHS, <0,0,1,1> + 1638318100U, // <7,0,0,2>: Cost 2 vext3 RHS, <0,0,2,2> + 3646442178U, // <7,0,0,3>: Cost 4 vext1 <3,7,0,0>, <3,7,0,0> + 2712059941U, // <7,0,0,4>: Cost 3 vext3 RHS, <0,0,4,1> + 2651603364U, // <7,0,0,5>: Cost 3 vext2 <5,6,7,0>, <0,5,1,6> + 2590618445U, // <7,0,0,6>: Cost 3 vext1 <6,7,0,0>, <6,7,0,0> + 3785801798U, // <7,0,0,7>: Cost 4 vext3 RHS, <0,0,7,7> + 1638318153U, // <7,0,0,u>: Cost 2 vext3 RHS, <0,0,u,1> + 1516879974U, // <7,0,1,0>: Cost 2 vext1 <6,7,0,1>, LHS + 2693922911U, // <7,0,1,1>: Cost 3 vext3 <1,5,3,7>, <0,1,1,5> + 564576358U, 
// <7,0,1,2>: Cost 1 vext3 RHS, LHS + 2638996480U, // <7,0,1,3>: Cost 3 vext2 <3,5,7,0>, <1,3,5,7> + 1516883254U, // <7,0,1,4>: Cost 2 vext1 <6,7,0,1>, RHS + 2649613456U, // <7,0,1,5>: Cost 3 vext2 <5,3,7,0>, <1,5,3,7> + 1516884814U, // <7,0,1,6>: Cost 2 vext1 <6,7,0,1>, <6,7,0,1> + 2590626808U, // <7,0,1,7>: Cost 3 vext1 <6,7,0,1>, <7,0,1,0> + 564576412U, // <7,0,1,u>: Cost 1 vext3 RHS, LHS + 1638318244U, // <7,0,2,0>: Cost 2 vext3 RHS, <0,2,0,2> + 2692743344U, // <7,0,2,1>: Cost 3 vext3 <1,3,5,7>, <0,2,1,5> + 2712060084U, // <7,0,2,2>: Cost 3 vext3 RHS, <0,2,2,0> + 2712060094U, // <7,0,2,3>: Cost 3 vext3 RHS, <0,2,3,1> + 1638318284U, // <7,0,2,4>: Cost 2 vext3 RHS, <0,2,4,6> + 2712060118U, // <7,0,2,5>: Cost 3 vext3 RHS, <0,2,5,7> + 2651604922U, // <7,0,2,6>: Cost 3 vext2 <5,6,7,0>, <2,6,3,7> + 2686255336U, // <7,0,2,7>: Cost 3 vext3 <0,2,7,7>, <0,2,7,7> + 1638318316U, // <7,0,2,u>: Cost 2 vext3 RHS, <0,2,u,2> + 2651605142U, // <7,0,3,0>: Cost 3 vext2 <5,6,7,0>, <3,0,1,2> + 2712060156U, // <7,0,3,1>: Cost 3 vext3 RHS, <0,3,1,0> + 2712060165U, // <7,0,3,2>: Cost 3 vext3 RHS, <0,3,2,0> + 2651605404U, // <7,0,3,3>: Cost 3 vext2 <5,6,7,0>, <3,3,3,3> + 2651605506U, // <7,0,3,4>: Cost 3 vext2 <5,6,7,0>, <3,4,5,6> + 2638998111U, // <7,0,3,5>: Cost 3 vext2 <3,5,7,0>, <3,5,7,0> + 2639661744U, // <7,0,3,6>: Cost 3 vext2 <3,6,7,0>, <3,6,7,0> + 3712740068U, // <7,0,3,7>: Cost 4 vext2 <3,5,7,0>, <3,7,3,7> + 2640989010U, // <7,0,3,u>: Cost 3 vext2 <3,u,7,0>, <3,u,7,0> + 2712060232U, // <7,0,4,0>: Cost 3 vext3 RHS, <0,4,0,4> + 1638318418U, // <7,0,4,1>: Cost 2 vext3 RHS, <0,4,1,5> + 1638318428U, // <7,0,4,2>: Cost 2 vext3 RHS, <0,4,2,6> + 3646474950U, // <7,0,4,3>: Cost 4 vext1 <3,7,0,4>, <3,7,0,4> + 2712060270U, // <7,0,4,4>: Cost 3 vext3 RHS, <0,4,4,6> + 1577864502U, // <7,0,4,5>: Cost 2 vext2 <5,6,7,0>, RHS + 2651606388U, // <7,0,4,6>: Cost 3 vext2 <5,6,7,0>, <4,6,4,6> + 3787792776U, // <7,0,4,7>: Cost 4 vext3 RHS, <0,4,7,5> + 1638318481U, // <7,0,4,u>: Cost 2 vext3 RHS, <0,4,u,5> + 2590654566U, // <7,0,5,0>: Cost 3 vext1 <6,7,0,5>, LHS + 2651606736U, // <7,0,5,1>: Cost 3 vext2 <5,6,7,0>, <5,1,7,3> + 2712060334U, // <7,0,5,2>: Cost 3 vext3 RHS, <0,5,2,7> + 2649616239U, // <7,0,5,3>: Cost 3 vext2 <5,3,7,0>, <5,3,7,0> + 2651606982U, // <7,0,5,4>: Cost 3 vext2 <5,6,7,0>, <5,4,7,6> + 2651607044U, // <7,0,5,5>: Cost 3 vext2 <5,6,7,0>, <5,5,5,5> + 1577865314U, // <7,0,5,6>: Cost 2 vext2 <5,6,7,0>, <5,6,7,0> + 2651607208U, // <7,0,5,7>: Cost 3 vext2 <5,6,7,0>, <5,7,5,7> + 1579192580U, // <7,0,5,u>: Cost 2 vext2 <5,u,7,0>, <5,u,7,0> + 2688393709U, // <7,0,6,0>: Cost 3 vext3 <0,6,0,7>, <0,6,0,7> + 2712060406U, // <7,0,6,1>: Cost 3 vext3 RHS, <0,6,1,7> + 2688541183U, // <7,0,6,2>: Cost 3 vext3 <0,6,2,7>, <0,6,2,7> + 2655588936U, // <7,0,6,3>: Cost 3 vext2 <6,3,7,0>, <6,3,7,0> + 3762430481U, // <7,0,6,4>: Cost 4 vext3 <0,6,4,7>, <0,6,4,7> + 2651607730U, // <7,0,6,5>: Cost 3 vext2 <5,6,7,0>, <6,5,0,7> + 2651607864U, // <7,0,6,6>: Cost 3 vext2 <5,6,7,0>, <6,6,6,6> + 2651607886U, // <7,0,6,7>: Cost 3 vext2 <5,6,7,0>, <6,7,0,1> + 2688983605U, // <7,0,6,u>: Cost 3 vext3 <0,6,u,7>, <0,6,u,7> + 2651608058U, // <7,0,7,0>: Cost 3 vext2 <5,6,7,0>, <7,0,1,2> + 2932703334U, // <7,0,7,1>: Cost 3 vzipl <7,7,7,7>, LHS + 3066921062U, // <7,0,7,2>: Cost 3 vtrnl <7,7,7,7>, LHS + 3712742678U, // <7,0,7,3>: Cost 4 vext2 <3,5,7,0>, <7,3,5,7> + 2651608422U, // <7,0,7,4>: Cost 3 vext2 <5,6,7,0>, <7,4,5,6> + 2651608513U, // <7,0,7,5>: Cost 3 vext2 <5,6,7,0>, <7,5,6,7> + 2663552532U, // <7,0,7,6>: Cost 3 vext2 <7,6,7,0>, <7,6,7,0> + 
2651608684U, // <7,0,7,7>: Cost 3 vext2 <5,6,7,0>, <7,7,7,7> + 2651608706U, // <7,0,7,u>: Cost 3 vext2 <5,6,7,0>, <7,u,1,2> + 1638318730U, // <7,0,u,0>: Cost 2 vext3 RHS, <0,u,0,2> + 1638318738U, // <7,0,u,1>: Cost 2 vext3 RHS, <0,u,1,1> + 564576925U, // <7,0,u,2>: Cost 1 vext3 RHS, LHS + 2572765898U, // <7,0,u,3>: Cost 3 vext1 <3,7,0,u>, <3,7,0,u> + 1638318770U, // <7,0,u,4>: Cost 2 vext3 RHS, <0,u,4,6> + 1577867418U, // <7,0,u,5>: Cost 2 vext2 <5,6,7,0>, RHS + 1516942165U, // <7,0,u,6>: Cost 2 vext1 <6,7,0,u>, <6,7,0,u> + 2651609344U, // <7,0,u,7>: Cost 3 vext2 <5,6,7,0>, + 564576979U, // <7,0,u,u>: Cost 1 vext3 RHS, LHS + 2590687334U, // <7,1,0,0>: Cost 3 vext1 <6,7,1,0>, LHS + 2639003750U, // <7,1,0,1>: Cost 3 vext2 <3,5,7,1>, LHS + 2793357414U, // <7,1,0,2>: Cost 3 vuzpl <7,0,1,2>, LHS + 1638318838U, // <7,1,0,3>: Cost 2 vext3 RHS, <1,0,3,2> + 2590690614U, // <7,1,0,4>: Cost 3 vext1 <6,7,1,0>, RHS + 2712060679U, // <7,1,0,5>: Cost 3 vext3 RHS, <1,0,5,1> + 2590692182U, // <7,1,0,6>: Cost 3 vext1 <6,7,1,0>, <6,7,1,0> + 3785802521U, // <7,1,0,7>: Cost 4 vext3 RHS, <1,0,7,1> + 1638318883U, // <7,1,0,u>: Cost 2 vext3 RHS, <1,0,u,2> + 2712060715U, // <7,1,1,0>: Cost 3 vext3 RHS, <1,1,0,1> + 1638318900U, // <7,1,1,1>: Cost 2 vext3 RHS, <1,1,1,1> + 3774300994U, // <7,1,1,2>: Cost 4 vext3 <2,6,3,7>, <1,1,2,6> + 1638318920U, // <7,1,1,3>: Cost 2 vext3 RHS, <1,1,3,3> + 2712060755U, // <7,1,1,4>: Cost 3 vext3 RHS, <1,1,4,5> + 2691416926U, // <7,1,1,5>: Cost 3 vext3 <1,1,5,7>, <1,1,5,7> + 2590700375U, // <7,1,1,6>: Cost 3 vext1 <6,7,1,1>, <6,7,1,1> + 3765158766U, // <7,1,1,7>: Cost 4 vext3 <1,1,5,7>, <1,1,7,5> + 1638318965U, // <7,1,1,u>: Cost 2 vext3 RHS, <1,1,u,3> + 2712060796U, // <7,1,2,0>: Cost 3 vext3 RHS, <1,2,0,1> + 2712060807U, // <7,1,2,1>: Cost 3 vext3 RHS, <1,2,1,3> + 3712747112U, // <7,1,2,2>: Cost 4 vext2 <3,5,7,1>, <2,2,2,2> + 1638318998U, // <7,1,2,3>: Cost 2 vext3 RHS, <1,2,3,0> + 2712060836U, // <7,1,2,4>: Cost 3 vext3 RHS, <1,2,4,5> + 2712060843U, // <7,1,2,5>: Cost 3 vext3 RHS, <1,2,5,3> + 2590708568U, // <7,1,2,6>: Cost 3 vext1 <6,7,1,2>, <6,7,1,2> + 2735948730U, // <7,1,2,7>: Cost 3 vext3 RHS, <1,2,7,0> + 1638319043U, // <7,1,2,u>: Cost 2 vext3 RHS, <1,2,u,0> + 2712060876U, // <7,1,3,0>: Cost 3 vext3 RHS, <1,3,0,0> + 1638319064U, // <7,1,3,1>: Cost 2 vext3 RHS, <1,3,1,3> + 2712060894U, // <7,1,3,2>: Cost 3 vext3 RHS, <1,3,2,0> + 2692596718U, // <7,1,3,3>: Cost 3 vext3 <1,3,3,7>, <1,3,3,7> + 2712060917U, // <7,1,3,4>: Cost 3 vext3 RHS, <1,3,4,5> + 1619002368U, // <7,1,3,5>: Cost 2 vext3 <1,3,5,7>, <1,3,5,7> + 2692817929U, // <7,1,3,6>: Cost 3 vext3 <1,3,6,7>, <1,3,6,7> + 2735948814U, // <7,1,3,7>: Cost 3 vext3 RHS, <1,3,7,3> + 1619223579U, // <7,1,3,u>: Cost 2 vext3 <1,3,u,7>, <1,3,u,7> + 2712060962U, // <7,1,4,0>: Cost 3 vext3 RHS, <1,4,0,5> + 2712060971U, // <7,1,4,1>: Cost 3 vext3 RHS, <1,4,1,5> + 2712060980U, // <7,1,4,2>: Cost 3 vext3 RHS, <1,4,2,5> + 2712060989U, // <7,1,4,3>: Cost 3 vext3 RHS, <1,4,3,5> + 3785802822U, // <7,1,4,4>: Cost 4 vext3 RHS, <1,4,4,5> + 2639007030U, // <7,1,4,5>: Cost 3 vext2 <3,5,7,1>, RHS + 2645642634U, // <7,1,4,6>: Cost 3 vext2 <4,6,7,1>, <4,6,7,1> + 3719384520U, // <7,1,4,7>: Cost 4 vext2 <4,6,7,1>, <4,7,5,0> + 2639007273U, // <7,1,4,u>: Cost 3 vext2 <3,5,7,1>, RHS + 2572812390U, // <7,1,5,0>: Cost 3 vext1 <3,7,1,5>, LHS + 2693776510U, // <7,1,5,1>: Cost 3 vext3 <1,5,1,7>, <1,5,1,7> + 3774301318U, // <7,1,5,2>: Cost 4 vext3 <2,6,3,7>, <1,5,2,6> + 1620182160U, // <7,1,5,3>: Cost 2 vext3 <1,5,3,7>, <1,5,3,7> + 2572815670U, // <7,1,5,4>: Cost 
3 vext1 <3,7,1,5>, RHS + 3766486178U, // <7,1,5,5>: Cost 4 vext3 <1,3,5,7>, <1,5,5,7> + 2651615331U, // <7,1,5,6>: Cost 3 vext2 <5,6,7,1>, <5,6,7,1> + 2652278964U, // <7,1,5,7>: Cost 3 vext2 <5,7,7,1>, <5,7,7,1> + 1620550845U, // <7,1,5,u>: Cost 2 vext3 <1,5,u,7>, <1,5,u,7> + 3768108230U, // <7,1,6,0>: Cost 4 vext3 <1,6,0,7>, <1,6,0,7> + 2694440143U, // <7,1,6,1>: Cost 3 vext3 <1,6,1,7>, <1,6,1,7> + 2712061144U, // <7,1,6,2>: Cost 3 vext3 RHS, <1,6,2,7> + 2694587617U, // <7,1,6,3>: Cost 3 vext3 <1,6,3,7>, <1,6,3,7> + 3768403178U, // <7,1,6,4>: Cost 4 vext3 <1,6,4,7>, <1,6,4,7> + 2694735091U, // <7,1,6,5>: Cost 3 vext3 <1,6,5,7>, <1,6,5,7> + 3768550652U, // <7,1,6,6>: Cost 4 vext3 <1,6,6,7>, <1,6,6,7> + 2652279630U, // <7,1,6,7>: Cost 3 vext2 <5,7,7,1>, <6,7,0,1> + 2694956302U, // <7,1,6,u>: Cost 3 vext3 <1,6,u,7>, <1,6,u,7> + 2645644282U, // <7,1,7,0>: Cost 3 vext2 <4,6,7,1>, <7,0,1,2> + 2859062094U, // <7,1,7,1>: Cost 3 vuzpr <6,7,0,1>, <6,7,0,1> + 3779462437U, // <7,1,7,2>: Cost 4 vext3 <3,5,1,7>, <1,7,2,3> + 3121938534U, // <7,1,7,3>: Cost 3 vtrnr <5,7,5,7>, LHS + 2554916150U, // <7,1,7,4>: Cost 3 vext1 <0,7,1,7>, RHS + 3769140548U, // <7,1,7,5>: Cost 4 vext3 <1,7,5,7>, <1,7,5,7> + 3726022164U, // <7,1,7,6>: Cost 4 vext2 <5,7,7,1>, <7,6,7,0> + 2554918508U, // <7,1,7,7>: Cost 3 vext1 <0,7,1,7>, <7,7,7,7> + 3121938539U, // <7,1,7,u>: Cost 3 vtrnr <5,7,5,7>, LHS + 2572836966U, // <7,1,u,0>: Cost 3 vext1 <3,7,1,u>, LHS + 1638319469U, // <7,1,u,1>: Cost 2 vext3 RHS, <1,u,1,3> + 2712061299U, // <7,1,u,2>: Cost 3 vext3 RHS, <1,u,2,0> + 1622173059U, // <7,1,u,3>: Cost 2 vext3 <1,u,3,7>, <1,u,3,7> + 2572840246U, // <7,1,u,4>: Cost 3 vext1 <3,7,1,u>, RHS + 1622320533U, // <7,1,u,5>: Cost 2 vext3 <1,u,5,7>, <1,u,5,7> + 2696136094U, // <7,1,u,6>: Cost 3 vext3 <1,u,6,7>, <1,u,6,7> + 2859060777U, // <7,1,u,7>: Cost 3 vuzpr <6,7,0,1>, RHS + 1622541744U, // <7,1,u,u>: Cost 2 vext3 <1,u,u,7>, <1,u,u,7> + 2712061364U, // <7,2,0,0>: Cost 3 vext3 RHS, <2,0,0,2> + 2712061373U, // <7,2,0,1>: Cost 3 vext3 RHS, <2,0,1,2> + 2712061380U, // <7,2,0,2>: Cost 3 vext3 RHS, <2,0,2,0> + 2712061389U, // <7,2,0,3>: Cost 3 vext3 RHS, <2,0,3,0> + 2712061404U, // <7,2,0,4>: Cost 3 vext3 RHS, <2,0,4,6> + 2696725990U, // <7,2,0,5>: Cost 3 vext3 <2,0,5,7>, <2,0,5,7> + 2712061417U, // <7,2,0,6>: Cost 3 vext3 RHS, <2,0,6,1> + 3785803251U, // <7,2,0,7>: Cost 4 vext3 RHS, <2,0,7,2> + 2696947201U, // <7,2,0,u>: Cost 3 vext3 <2,0,u,7>, <2,0,u,7> + 2712061446U, // <7,2,1,0>: Cost 3 vext3 RHS, <2,1,0,3> + 3785803276U, // <7,2,1,1>: Cost 4 vext3 RHS, <2,1,1,0> + 3785803285U, // <7,2,1,2>: Cost 4 vext3 RHS, <2,1,2,0> + 2712061471U, // <7,2,1,3>: Cost 3 vext3 RHS, <2,1,3,1> + 2712061482U, // <7,2,1,4>: Cost 3 vext3 RHS, <2,1,4,3> + 3766486576U, // <7,2,1,5>: Cost 4 vext3 <1,3,5,7>, <2,1,5,0> + 2712061500U, // <7,2,1,6>: Cost 3 vext3 RHS, <2,1,6,3> + 2602718850U, // <7,2,1,7>: Cost 3 vext1 , <7,u,1,2> + 2712061516U, // <7,2,1,u>: Cost 3 vext3 RHS, <2,1,u,1> + 2712061525U, // <7,2,2,0>: Cost 3 vext3 RHS, <2,2,0,1> + 2712061536U, // <7,2,2,1>: Cost 3 vext3 RHS, <2,2,1,3> + 1638319720U, // <7,2,2,2>: Cost 2 vext3 RHS, <2,2,2,2> + 1638319730U, // <7,2,2,3>: Cost 2 vext3 RHS, <2,2,3,3> + 2712061565U, // <7,2,2,4>: Cost 3 vext3 RHS, <2,2,4,5> + 2698053256U, // <7,2,2,5>: Cost 3 vext3 <2,2,5,7>, <2,2,5,7> + 2712061584U, // <7,2,2,6>: Cost 3 vext3 RHS, <2,2,6,6> + 3771795096U, // <7,2,2,7>: Cost 4 vext3 <2,2,5,7>, <2,2,7,5> + 1638319775U, // <7,2,2,u>: Cost 2 vext3 RHS, <2,2,u,3> + 1638319782U, // <7,2,3,0>: Cost 2 vext3 RHS, <2,3,0,1> + 
2693924531U, // <7,2,3,1>: Cost 3 vext3 <1,5,3,7>, <2,3,1,5> + 2700560061U, // <7,2,3,2>: Cost 3 vext3 <2,6,3,7>, <2,3,2,6> + 2693924551U, // <7,2,3,3>: Cost 3 vext3 <1,5,3,7>, <2,3,3,7> + 1638319822U, // <7,2,3,4>: Cost 2 vext3 RHS, <2,3,4,5> + 2698716889U, // <7,2,3,5>: Cost 3 vext3 <2,3,5,7>, <2,3,5,7> + 2712061665U, // <7,2,3,6>: Cost 3 vext3 RHS, <2,3,6,6> + 2735949540U, // <7,2,3,7>: Cost 3 vext3 RHS, <2,3,7,0> + 1638319854U, // <7,2,3,u>: Cost 2 vext3 RHS, <2,3,u,1> + 2712061692U, // <7,2,4,0>: Cost 3 vext3 RHS, <2,4,0,6> + 2712061698U, // <7,2,4,1>: Cost 3 vext3 RHS, <2,4,1,3> + 2712061708U, // <7,2,4,2>: Cost 3 vext3 RHS, <2,4,2,4> + 2712061718U, // <7,2,4,3>: Cost 3 vext3 RHS, <2,4,3,5> + 2712061728U, // <7,2,4,4>: Cost 3 vext3 RHS, <2,4,4,6> + 2699380522U, // <7,2,4,5>: Cost 3 vext3 <2,4,5,7>, <2,4,5,7> + 2712061740U, // <7,2,4,6>: Cost 3 vext3 RHS, <2,4,6,0> + 3809691445U, // <7,2,4,7>: Cost 4 vext3 RHS, <2,4,7,0> + 2699601733U, // <7,2,4,u>: Cost 3 vext3 <2,4,u,7>, <2,4,u,7> + 2699675470U, // <7,2,5,0>: Cost 3 vext3 <2,5,0,7>, <2,5,0,7> + 3766486867U, // <7,2,5,1>: Cost 4 vext3 <1,3,5,7>, <2,5,1,3> + 2699822944U, // <7,2,5,2>: Cost 3 vext3 <2,5,2,7>, <2,5,2,7> + 2692745065U, // <7,2,5,3>: Cost 3 vext3 <1,3,5,7>, <2,5,3,7> + 2699970418U, // <7,2,5,4>: Cost 3 vext3 <2,5,4,7>, <2,5,4,7> + 3766486907U, // <7,2,5,5>: Cost 4 vext3 <1,3,5,7>, <2,5,5,7> + 2700117892U, // <7,2,5,6>: Cost 3 vext3 <2,5,6,7>, <2,5,6,7> + 3771795334U, // <7,2,5,7>: Cost 4 vext3 <2,2,5,7>, <2,5,7,0> + 2692745110U, // <7,2,5,u>: Cost 3 vext3 <1,3,5,7>, <2,5,u,7> + 2572894310U, // <7,2,6,0>: Cost 3 vext1 <3,7,2,6>, LHS + 2712061860U, // <7,2,6,1>: Cost 3 vext3 RHS, <2,6,1,3> + 2700486577U, // <7,2,6,2>: Cost 3 vext3 <2,6,2,7>, <2,6,2,7> + 1626818490U, // <7,2,6,3>: Cost 2 vext3 <2,6,3,7>, <2,6,3,7> + 2572897590U, // <7,2,6,4>: Cost 3 vext1 <3,7,2,6>, RHS + 2700707788U, // <7,2,6,5>: Cost 3 vext3 <2,6,5,7>, <2,6,5,7> + 2700781525U, // <7,2,6,6>: Cost 3 vext3 <2,6,6,7>, <2,6,6,7> + 3774597086U, // <7,2,6,7>: Cost 4 vext3 <2,6,7,7>, <2,6,7,7> + 1627187175U, // <7,2,6,u>: Cost 2 vext3 <2,6,u,7>, <2,6,u,7> + 2735949802U, // <7,2,7,0>: Cost 3 vext3 RHS, <2,7,0,1> + 3780200434U, // <7,2,7,1>: Cost 4 vext3 <3,6,2,7>, <2,7,1,0> + 3773564928U, // <7,2,7,2>: Cost 4 vext3 <2,5,2,7>, <2,7,2,5> + 2986541158U, // <7,2,7,3>: Cost 3 vzipr <5,5,7,7>, LHS + 2554989878U, // <7,2,7,4>: Cost 3 vext1 <0,7,2,7>, RHS + 3775113245U, // <7,2,7,5>: Cost 4 vext3 <2,7,5,7>, <2,7,5,7> + 4060283228U, // <7,2,7,6>: Cost 4 vzipr <5,5,7,7>, <0,4,2,6> + 2554992236U, // <7,2,7,7>: Cost 3 vext1 <0,7,2,7>, <7,7,7,7> + 2986541163U, // <7,2,7,u>: Cost 3 vzipr <5,5,7,7>, LHS + 1638320187U, // <7,2,u,0>: Cost 2 vext3 RHS, <2,u,0,1> + 2693924936U, // <7,2,u,1>: Cost 3 vext3 <1,5,3,7>, <2,u,1,5> + 1638319720U, // <7,2,u,2>: Cost 2 vext3 RHS, <2,2,2,2> + 1628145756U, // <7,2,u,3>: Cost 2 vext3 <2,u,3,7>, <2,u,3,7> + 1638320227U, // <7,2,u,4>: Cost 2 vext3 RHS, <2,u,4,5> + 2702035054U, // <7,2,u,5>: Cost 3 vext3 <2,u,5,7>, <2,u,5,7> + 2702108791U, // <7,2,u,6>: Cost 3 vext3 <2,u,6,7>, <2,u,6,7> + 2735949945U, // <7,2,u,7>: Cost 3 vext3 RHS, <2,u,7,0> + 1628514441U, // <7,2,u,u>: Cost 2 vext3 <2,u,u,7>, <2,u,u,7> + 2712062091U, // <7,3,0,0>: Cost 3 vext3 RHS, <3,0,0,0> + 1638320278U, // <7,3,0,1>: Cost 2 vext3 RHS, <3,0,1,2> + 2712062109U, // <7,3,0,2>: Cost 3 vext3 RHS, <3,0,2,0> + 2590836886U, // <7,3,0,3>: Cost 3 vext1 <6,7,3,0>, <3,0,1,2> + 2712062128U, // <7,3,0,4>: Cost 3 vext3 RHS, <3,0,4,1> + 2712062138U, // <7,3,0,5>: Cost 3 vext3 RHS, <3,0,5,2> + 
2590839656U, // <7,3,0,6>: Cost 3 vext1 <6,7,3,0>, <6,7,3,0> + 3311414017U, // <7,3,0,7>: Cost 4 vrev <3,7,7,0> + 1638320341U, // <7,3,0,u>: Cost 2 vext3 RHS, <3,0,u,2> + 2237164227U, // <7,3,1,0>: Cost 3 vrev <3,7,0,1> + 2712062182U, // <7,3,1,1>: Cost 3 vext3 RHS, <3,1,1,1> + 2712062193U, // <7,3,1,2>: Cost 3 vext3 RHS, <3,1,2,3> + 2692745468U, // <7,3,1,3>: Cost 3 vext3 <1,3,5,7>, <3,1,3,5> + 2712062214U, // <7,3,1,4>: Cost 3 vext3 RHS, <3,1,4,6> + 2693925132U, // <7,3,1,5>: Cost 3 vext3 <1,5,3,7>, <3,1,5,3> + 3768183059U, // <7,3,1,6>: Cost 4 vext3 <1,6,1,7>, <3,1,6,1> + 2692745504U, // <7,3,1,7>: Cost 3 vext3 <1,3,5,7>, <3,1,7,5> + 2696063273U, // <7,3,1,u>: Cost 3 vext3 <1,u,5,7>, <3,1,u,5> + 2712062254U, // <7,3,2,0>: Cost 3 vext3 RHS, <3,2,0,1> + 2712062262U, // <7,3,2,1>: Cost 3 vext3 RHS, <3,2,1,0> + 2712062273U, // <7,3,2,2>: Cost 3 vext3 RHS, <3,2,2,2> + 2712062280U, // <7,3,2,3>: Cost 3 vext3 RHS, <3,2,3,0> + 2712062294U, // <7,3,2,4>: Cost 3 vext3 RHS, <3,2,4,5> + 2712062302U, // <7,3,2,5>: Cost 3 vext3 RHS, <3,2,5,4> + 2700560742U, // <7,3,2,6>: Cost 3 vext3 <2,6,3,7>, <3,2,6,3> + 2712062319U, // <7,3,2,7>: Cost 3 vext3 RHS, <3,2,7,3> + 2712062325U, // <7,3,2,u>: Cost 3 vext3 RHS, <3,2,u,0> + 2712062335U, // <7,3,3,0>: Cost 3 vext3 RHS, <3,3,0,1> + 2636368158U, // <7,3,3,1>: Cost 3 vext2 <3,1,7,3>, <3,1,7,3> + 2637031791U, // <7,3,3,2>: Cost 3 vext2 <3,2,7,3>, <3,2,7,3> + 1638320540U, // <7,3,3,3>: Cost 2 vext3 RHS, <3,3,3,3> + 2712062374U, // <7,3,3,4>: Cost 3 vext3 RHS, <3,3,4,4> + 2704689586U, // <7,3,3,5>: Cost 3 vext3 <3,3,5,7>, <3,3,5,7> + 2590864235U, // <7,3,3,6>: Cost 3 vext1 <6,7,3,3>, <6,7,3,3> + 2704837060U, // <7,3,3,7>: Cost 3 vext3 <3,3,7,7>, <3,3,7,7> + 1638320540U, // <7,3,3,u>: Cost 2 vext3 RHS, <3,3,3,3> + 2712062416U, // <7,3,4,0>: Cost 3 vext3 RHS, <3,4,0,1> + 2712062426U, // <7,3,4,1>: Cost 3 vext3 RHS, <3,4,1,2> + 2566981640U, // <7,3,4,2>: Cost 3 vext1 <2,7,3,4>, <2,7,3,4> + 2712062447U, // <7,3,4,3>: Cost 3 vext3 RHS, <3,4,3,5> + 2712062456U, // <7,3,4,4>: Cost 3 vext3 RHS, <3,4,4,5> + 1638320642U, // <7,3,4,5>: Cost 2 vext3 RHS, <3,4,5,6> + 2648313204U, // <7,3,4,6>: Cost 3 vext2 <5,1,7,3>, <4,6,4,6> + 3311446789U, // <7,3,4,7>: Cost 4 vrev <3,7,7,4> + 1638320669U, // <7,3,4,u>: Cost 2 vext3 RHS, <3,4,u,6> + 2602819686U, // <7,3,5,0>: Cost 3 vext1 , LHS + 1574571728U, // <7,3,5,1>: Cost 2 vext2 <5,1,7,3>, <5,1,7,3> + 2648977185U, // <7,3,5,2>: Cost 3 vext2 <5,2,7,3>, <5,2,7,3> + 2705869378U, // <7,3,5,3>: Cost 3 vext3 <3,5,3,7>, <3,5,3,7> + 2237491947U, // <7,3,5,4>: Cost 3 vrev <3,7,4,5> + 2706016852U, // <7,3,5,5>: Cost 3 vext3 <3,5,5,7>, <3,5,5,7> + 2648313954U, // <7,3,5,6>: Cost 3 vext2 <5,1,7,3>, <5,6,7,0> + 2692745823U, // <7,3,5,7>: Cost 3 vext3 <1,3,5,7>, <3,5,7,0> + 1579217159U, // <7,3,5,u>: Cost 2 vext2 <5,u,7,3>, <5,u,7,3> + 2706311800U, // <7,3,6,0>: Cost 3 vext3 <3,6,0,7>, <3,6,0,7> + 2654286249U, // <7,3,6,1>: Cost 3 vext2 <6,1,7,3>, <6,1,7,3> + 1581208058U, // <7,3,6,2>: Cost 2 vext2 <6,2,7,3>, <6,2,7,3> + 2706533011U, // <7,3,6,3>: Cost 3 vext3 <3,6,3,7>, <3,6,3,7> + 2706606748U, // <7,3,6,4>: Cost 3 vext3 <3,6,4,7>, <3,6,4,7> + 3780422309U, // <7,3,6,5>: Cost 4 vext3 <3,6,5,7>, <3,6,5,7> + 2712062637U, // <7,3,6,6>: Cost 3 vext3 RHS, <3,6,6,6> + 2706827959U, // <7,3,6,7>: Cost 3 vext3 <3,6,7,7>, <3,6,7,7> + 1585189856U, // <7,3,6,u>: Cost 2 vext2 <6,u,7,3>, <6,u,7,3> + 2693925571U, // <7,3,7,0>: Cost 3 vext3 <1,5,3,7>, <3,7,0,1> + 2693925584U, // <7,3,7,1>: Cost 3 vext3 <1,5,3,7>, <3,7,1,5> + 2700561114U, // <7,3,7,2>: Cost 3 
vext3 <2,6,3,7>, <3,7,2,6> + 2572978916U, // <7,3,7,3>: Cost 3 vext1 <3,7,3,7>, <3,7,3,7> + 2693925611U, // <7,3,7,4>: Cost 3 vext3 <1,5,3,7>, <3,7,4,5> + 2707344118U, // <7,3,7,5>: Cost 3 vext3 <3,7,5,7>, <3,7,5,7> + 2654950894U, // <7,3,7,6>: Cost 3 vext2 <6,2,7,3>, <7,6,2,7> + 2648315500U, // <7,3,7,7>: Cost 3 vext2 <5,1,7,3>, <7,7,7,7> + 2693925643U, // <7,3,7,u>: Cost 3 vext3 <1,5,3,7>, <3,7,u,1> + 2237221578U, // <7,3,u,0>: Cost 3 vrev <3,7,0,u> + 1638320926U, // <7,3,u,1>: Cost 2 vext3 RHS, <3,u,1,2> + 1593153452U, // <7,3,u,2>: Cost 2 vext2 , + 1638320540U, // <7,3,u,3>: Cost 2 vext3 RHS, <3,3,3,3> + 2237516526U, // <7,3,u,4>: Cost 3 vrev <3,7,4,u> + 1638320966U, // <7,3,u,5>: Cost 2 vext3 RHS, <3,u,5,6> + 2712062796U, // <7,3,u,6>: Cost 3 vext3 RHS, <3,u,6,3> + 2692967250U, // <7,3,u,7>: Cost 3 vext3 <1,3,u,7>, <3,u,7,0> + 1638320989U, // <7,3,u,u>: Cost 2 vext3 RHS, <3,u,u,2> + 2651635712U, // <7,4,0,0>: Cost 3 vext2 <5,6,7,4>, <0,0,0,0> + 1577893990U, // <7,4,0,1>: Cost 2 vext2 <5,6,7,4>, LHS + 2651635876U, // <7,4,0,2>: Cost 3 vext2 <5,6,7,4>, <0,2,0,2> + 3785804672U, // <7,4,0,3>: Cost 4 vext3 RHS, <4,0,3,1> + 2651636050U, // <7,4,0,4>: Cost 3 vext2 <5,6,7,4>, <0,4,1,5> + 1638468498U, // <7,4,0,5>: Cost 2 vext3 RHS, <4,0,5,1> + 1638468508U, // <7,4,0,6>: Cost 2 vext3 RHS, <4,0,6,2> + 3787795364U, // <7,4,0,7>: Cost 4 vext3 RHS, <4,0,7,1> + 1640459181U, // <7,4,0,u>: Cost 2 vext3 RHS, <4,0,u,1> + 2651636470U, // <7,4,1,0>: Cost 3 vext2 <5,6,7,4>, <1,0,3,2> + 2651636532U, // <7,4,1,1>: Cost 3 vext2 <5,6,7,4>, <1,1,1,1> + 2712062922U, // <7,4,1,2>: Cost 3 vext3 RHS, <4,1,2,3> + 2639029248U, // <7,4,1,3>: Cost 3 vext2 <3,5,7,4>, <1,3,5,7> + 2712062940U, // <7,4,1,4>: Cost 3 vext3 RHS, <4,1,4,3> + 2712062946U, // <7,4,1,5>: Cost 3 vext3 RHS, <4,1,5,0> + 2712062958U, // <7,4,1,6>: Cost 3 vext3 RHS, <4,1,6,3> + 3785804791U, // <7,4,1,7>: Cost 4 vext3 RHS, <4,1,7,3> + 2712062973U, // <7,4,1,u>: Cost 3 vext3 RHS, <4,1,u,0> + 3785804807U, // <7,4,2,0>: Cost 4 vext3 RHS, <4,2,0,1> + 3785804818U, // <7,4,2,1>: Cost 4 vext3 RHS, <4,2,1,3> + 2651637352U, // <7,4,2,2>: Cost 3 vext2 <5,6,7,4>, <2,2,2,2> + 2651637414U, // <7,4,2,3>: Cost 3 vext2 <5,6,7,4>, <2,3,0,1> + 3716753194U, // <7,4,2,4>: Cost 4 vext2 <4,2,7,4>, <2,4,5,7> + 2712063030U, // <7,4,2,5>: Cost 3 vext3 RHS, <4,2,5,3> + 2712063036U, // <7,4,2,6>: Cost 3 vext3 RHS, <4,2,6,0> + 3773123658U, // <7,4,2,7>: Cost 4 vext3 <2,4,5,7>, <4,2,7,5> + 2712063054U, // <7,4,2,u>: Cost 3 vext3 RHS, <4,2,u,0> + 2651637910U, // <7,4,3,0>: Cost 3 vext2 <5,6,7,4>, <3,0,1,2> + 3712772348U, // <7,4,3,1>: Cost 4 vext2 <3,5,7,4>, <3,1,3,5> + 3785804906U, // <7,4,3,2>: Cost 4 vext3 RHS, <4,3,2,1> + 2651638172U, // <7,4,3,3>: Cost 3 vext2 <5,6,7,4>, <3,3,3,3> + 2651638274U, // <7,4,3,4>: Cost 3 vext2 <5,6,7,4>, <3,4,5,6> + 2639030883U, // <7,4,3,5>: Cost 3 vext2 <3,5,7,4>, <3,5,7,4> + 2712063122U, // <7,4,3,6>: Cost 3 vext3 RHS, <4,3,6,5> + 3712772836U, // <7,4,3,7>: Cost 4 vext2 <3,5,7,4>, <3,7,3,7> + 2641021782U, // <7,4,3,u>: Cost 3 vext2 <3,u,7,4>, <3,u,7,4> + 2714053802U, // <7,4,4,0>: Cost 3 vext3 RHS, <4,4,0,2> + 3785804978U, // <7,4,4,1>: Cost 4 vext3 RHS, <4,4,1,1> + 3716754505U, // <7,4,4,2>: Cost 4 vext2 <4,2,7,4>, <4,2,7,4> + 3785804998U, // <7,4,4,3>: Cost 4 vext3 RHS, <4,4,3,3> + 1638321360U, // <7,4,4,4>: Cost 2 vext3 RHS, <4,4,4,4> + 1638468826U, // <7,4,4,5>: Cost 2 vext3 RHS, <4,4,5,5> + 1638468836U, // <7,4,4,6>: Cost 2 vext3 RHS, <4,4,6,6> + 3785215214U, // <7,4,4,7>: Cost 4 vext3 <4,4,7,7>, <4,4,7,7> + 1640459509U, // <7,4,4,u>: Cost 
2 vext3 RHS, <4,4,u,5> + 1517207654U, // <7,4,5,0>: Cost 2 vext1 <6,7,4,5>, LHS + 2573034640U, // <7,4,5,1>: Cost 3 vext1 <3,7,4,5>, <1,5,3,7> + 2712063246U, // <7,4,5,2>: Cost 3 vext3 RHS, <4,5,2,3> + 2573036267U, // <7,4,5,3>: Cost 3 vext1 <3,7,4,5>, <3,7,4,5> + 1517210934U, // <7,4,5,4>: Cost 2 vext1 <6,7,4,5>, RHS + 2711989549U, // <7,4,5,5>: Cost 3 vext3 <4,5,5,7>, <4,5,5,7> + 564579638U, // <7,4,5,6>: Cost 1 vext3 RHS, RHS + 2651639976U, // <7,4,5,7>: Cost 3 vext2 <5,6,7,4>, <5,7,5,7> + 564579656U, // <7,4,5,u>: Cost 1 vext3 RHS, RHS + 2712063307U, // <7,4,6,0>: Cost 3 vext3 RHS, <4,6,0,1> + 3767668056U, // <7,4,6,1>: Cost 4 vext3 <1,5,3,7>, <4,6,1,5> + 2651640314U, // <7,4,6,2>: Cost 3 vext2 <5,6,7,4>, <6,2,7,3> + 2655621708U, // <7,4,6,3>: Cost 3 vext2 <6,3,7,4>, <6,3,7,4> + 1638468980U, // <7,4,6,4>: Cost 2 vext3 RHS, <4,6,4,6> + 2712063358U, // <7,4,6,5>: Cost 3 vext3 RHS, <4,6,5,7> + 2712063367U, // <7,4,6,6>: Cost 3 vext3 RHS, <4,6,6,7> + 2712210826U, // <7,4,6,7>: Cost 3 vext3 RHS, <4,6,7,1> + 1638469012U, // <7,4,6,u>: Cost 2 vext3 RHS, <4,6,u,2> + 2651640826U, // <7,4,7,0>: Cost 3 vext2 <5,6,7,4>, <7,0,1,2> + 3773713830U, // <7,4,7,1>: Cost 4 vext3 <2,5,4,7>, <4,7,1,2> + 3773713842U, // <7,4,7,2>: Cost 4 vext3 <2,5,4,7>, <4,7,2,5> + 3780349372U, // <7,4,7,3>: Cost 4 vext3 <3,6,4,7>, <4,7,3,6> + 2651641140U, // <7,4,7,4>: Cost 3 vext2 <5,6,7,4>, <7,4,0,1> + 2712210888U, // <7,4,7,5>: Cost 3 vext3 RHS, <4,7,5,0> + 2712210898U, // <7,4,7,6>: Cost 3 vext3 RHS, <4,7,6,1> + 2651641452U, // <7,4,7,7>: Cost 3 vext2 <5,6,7,4>, <7,7,7,7> + 2713538026U, // <7,4,7,u>: Cost 3 vext3 <4,7,u,7>, <4,7,u,7> + 1517232230U, // <7,4,u,0>: Cost 2 vext1 <6,7,4,u>, LHS + 1577899822U, // <7,4,u,1>: Cost 2 vext2 <5,6,7,4>, LHS + 2712063489U, // <7,4,u,2>: Cost 3 vext3 RHS, <4,u,2,3> + 2573060846U, // <7,4,u,3>: Cost 3 vext1 <3,7,4,u>, <3,7,4,u> + 1640312342U, // <7,4,u,4>: Cost 2 vext3 RHS, <4,u,4,6> + 1638469146U, // <7,4,u,5>: Cost 2 vext3 RHS, <4,u,5,1> + 564579881U, // <7,4,u,6>: Cost 1 vext3 RHS, RHS + 2714054192U, // <7,4,u,7>: Cost 3 vext3 RHS, <4,u,7,5> + 564579899U, // <7,4,u,u>: Cost 1 vext3 RHS, RHS + 2579038310U, // <7,5,0,0>: Cost 3 vext1 <4,7,5,0>, LHS + 2636382310U, // <7,5,0,1>: Cost 3 vext2 <3,1,7,5>, LHS + 2796339302U, // <7,5,0,2>: Cost 3 vuzpl <7,4,5,6>, LHS + 3646810719U, // <7,5,0,3>: Cost 4 vext1 <3,7,5,0>, <3,5,7,0> + 2712063586U, // <7,5,0,4>: Cost 3 vext3 RHS, <5,0,4,1> + 2735951467U, // <7,5,0,5>: Cost 3 vext3 RHS, <5,0,5,1> + 2735951476U, // <7,5,0,6>: Cost 3 vext3 RHS, <5,0,6,1> + 2579043322U, // <7,5,0,7>: Cost 3 vext1 <4,7,5,0>, <7,0,1,2> + 2636382877U, // <7,5,0,u>: Cost 3 vext2 <3,1,7,5>, LHS + 2712211087U, // <7,5,1,0>: Cost 3 vext3 RHS, <5,1,0,1> + 3698180916U, // <7,5,1,1>: Cost 4 vext2 <1,1,7,5>, <1,1,1,1> + 3710124950U, // <7,5,1,2>: Cost 4 vext2 <3,1,7,5>, <1,2,3,0> + 2636383232U, // <7,5,1,3>: Cost 3 vext2 <3,1,7,5>, <1,3,5,7> + 2712211127U, // <7,5,1,4>: Cost 3 vext3 RHS, <5,1,4,5> + 2590994128U, // <7,5,1,5>: Cost 3 vext1 <6,7,5,1>, <5,1,7,3> + 2590995323U, // <7,5,1,6>: Cost 3 vext1 <6,7,5,1>, <6,7,5,1> + 1638469328U, // <7,5,1,7>: Cost 2 vext3 RHS, <5,1,7,3> + 1638469337U, // <7,5,1,u>: Cost 2 vext3 RHS, <5,1,u,3> + 3785805536U, // <7,5,2,0>: Cost 4 vext3 RHS, <5,2,0,1> + 3785805544U, // <7,5,2,1>: Cost 4 vext3 RHS, <5,2,1,0> + 3704817288U, // <7,5,2,2>: Cost 4 vext2 <2,2,7,5>, <2,2,5,7> + 2712063742U, // <7,5,2,3>: Cost 3 vext3 RHS, <5,2,3,4> + 3716761386U, // <7,5,2,4>: Cost 4 vext2 <4,2,7,5>, <2,4,5,7> + 2714054415U, // <7,5,2,5>: Cost 3 vext3 RHS, 
<5,2,5,3> + 3774304024U, // <7,5,2,6>: Cost 4 vext3 <2,6,3,7>, <5,2,6,3> + 2712063777U, // <7,5,2,7>: Cost 3 vext3 RHS, <5,2,7,3> + 2712063787U, // <7,5,2,u>: Cost 3 vext3 RHS, <5,2,u,4> + 3634888806U, // <7,5,3,0>: Cost 4 vext1 <1,7,5,3>, LHS + 2636384544U, // <7,5,3,1>: Cost 3 vext2 <3,1,7,5>, <3,1,7,5> + 3710790001U, // <7,5,3,2>: Cost 4 vext2 <3,2,7,5>, <3,2,7,5> + 3710126492U, // <7,5,3,3>: Cost 4 vext2 <3,1,7,5>, <3,3,3,3> + 3634892086U, // <7,5,3,4>: Cost 4 vext1 <1,7,5,3>, RHS + 2639039076U, // <7,5,3,5>: Cost 3 vext2 <3,5,7,5>, <3,5,7,5> + 3713444533U, // <7,5,3,6>: Cost 4 vext2 <3,6,7,5>, <3,6,7,5> + 2693926767U, // <7,5,3,7>: Cost 3 vext3 <1,5,3,7>, <5,3,7,0> + 2712063864U, // <7,5,3,u>: Cost 3 vext3 RHS, <5,3,u,0> + 2579071078U, // <7,5,4,0>: Cost 3 vext1 <4,7,5,4>, LHS + 3646841856U, // <7,5,4,1>: Cost 4 vext1 <3,7,5,4>, <1,3,5,7> + 3716762698U, // <7,5,4,2>: Cost 4 vext2 <4,2,7,5>, <4,2,7,5> + 3646843491U, // <7,5,4,3>: Cost 4 vext1 <3,7,5,4>, <3,5,7,4> + 2579074358U, // <7,5,4,4>: Cost 3 vext1 <4,7,5,4>, RHS + 2636385590U, // <7,5,4,5>: Cost 3 vext2 <3,1,7,5>, RHS + 2645675406U, // <7,5,4,6>: Cost 3 vext2 <4,6,7,5>, <4,6,7,5> + 1638322118U, // <7,5,4,7>: Cost 2 vext3 RHS, <5,4,7,6> + 1638469583U, // <7,5,4,u>: Cost 2 vext3 RHS, <5,4,u,6> + 2714054611U, // <7,5,5,0>: Cost 3 vext3 RHS, <5,5,0,1> + 2652974800U, // <7,5,5,1>: Cost 3 vext2 <5,u,7,5>, <5,1,7,3> + 3710127905U, // <7,5,5,2>: Cost 4 vext2 <3,1,7,5>, <5,2,7,3> + 3785805808U, // <7,5,5,3>: Cost 4 vext3 RHS, <5,5,3,3> + 2712211450U, // <7,5,5,4>: Cost 3 vext3 RHS, <5,5,4,4> + 1638322180U, // <7,5,5,5>: Cost 2 vext3 RHS, <5,5,5,5> + 2712064014U, // <7,5,5,6>: Cost 3 vext3 RHS, <5,5,6,6> + 1638469656U, // <7,5,5,7>: Cost 2 vext3 RHS, <5,5,7,7> + 1638469665U, // <7,5,5,u>: Cost 2 vext3 RHS, <5,5,u,7> + 2712064036U, // <7,5,6,0>: Cost 3 vext3 RHS, <5,6,0,1> + 2714054707U, // <7,5,6,1>: Cost 3 vext3 RHS, <5,6,1,7> + 3785805879U, // <7,5,6,2>: Cost 4 vext3 RHS, <5,6,2,2> + 2712064066U, // <7,5,6,3>: Cost 3 vext3 RHS, <5,6,3,4> + 2712064076U, // <7,5,6,4>: Cost 3 vext3 RHS, <5,6,4,5> + 2714054743U, // <7,5,6,5>: Cost 3 vext3 RHS, <5,6,5,7> + 2712064096U, // <7,5,6,6>: Cost 3 vext3 RHS, <5,6,6,7> + 1638322274U, // <7,5,6,7>: Cost 2 vext3 RHS, <5,6,7,0> + 1638469739U, // <7,5,6,u>: Cost 2 vext3 RHS, <5,6,u,0> + 1511325798U, // <7,5,7,0>: Cost 2 vext1 <5,7,5,7>, LHS + 2692747392U, // <7,5,7,1>: Cost 3 vext3 <1,3,5,7>, <5,7,1,3> + 2585069160U, // <7,5,7,2>: Cost 3 vext1 <5,7,5,7>, <2,2,2,2> + 2573126390U, // <7,5,7,3>: Cost 3 vext1 <3,7,5,7>, <3,7,5,7> + 1511329078U, // <7,5,7,4>: Cost 2 vext1 <5,7,5,7>, RHS + 1638469800U, // <7,5,7,5>: Cost 2 vext3 RHS, <5,7,5,7> + 2712211626U, // <7,5,7,6>: Cost 3 vext3 RHS, <5,7,6,0> + 2712211636U, // <7,5,7,7>: Cost 3 vext3 RHS, <5,7,7,1> + 1638469823U, // <7,5,7,u>: Cost 2 vext3 RHS, <5,7,u,3> + 1511333990U, // <7,5,u,0>: Cost 2 vext1 <5,7,5,u>, LHS + 2636388142U, // <7,5,u,1>: Cost 3 vext2 <3,1,7,5>, LHS + 2712211671U, // <7,5,u,2>: Cost 3 vext3 RHS, <5,u,2,0> + 2573134583U, // <7,5,u,3>: Cost 3 vext1 <3,7,5,u>, <3,7,5,u> + 1511337270U, // <7,5,u,4>: Cost 2 vext1 <5,7,5,u>, RHS + 1638469881U, // <7,5,u,5>: Cost 2 vext3 RHS, <5,u,5,7> + 2712064258U, // <7,5,u,6>: Cost 3 vext3 RHS, <5,u,6,7> + 1638469892U, // <7,5,u,7>: Cost 2 vext3 RHS, <5,u,7,0> + 1638469904U, // <7,5,u,u>: Cost 2 vext3 RHS, <5,u,u,3> + 2650324992U, // <7,6,0,0>: Cost 3 vext2 <5,4,7,6>, <0,0,0,0> + 1576583270U, // <7,6,0,1>: Cost 2 vext2 <5,4,7,6>, LHS + 2712064300U, // <7,6,0,2>: Cost 3 vext3 RHS, <6,0,2,4> + 2255295336U, 
// <7,6,0,3>: Cost 3 vrev <6,7,3,0> + 2712064316U, // <7,6,0,4>: Cost 3 vext3 RHS, <6,0,4,2> + 2585088098U, // <7,6,0,5>: Cost 3 vext1 <5,7,6,0>, <5,6,7,0> + 2735952204U, // <7,6,0,6>: Cost 3 vext3 RHS, <6,0,6,0> + 2712211799U, // <7,6,0,7>: Cost 3 vext3 RHS, <6,0,7,2> + 1576583837U, // <7,6,0,u>: Cost 2 vext2 <5,4,7,6>, LHS + 1181340494U, // <7,6,1,0>: Cost 2 vrev <6,7,0,1> + 2650325812U, // <7,6,1,1>: Cost 3 vext2 <5,4,7,6>, <1,1,1,1> + 2650325910U, // <7,6,1,2>: Cost 3 vext2 <5,4,7,6>, <1,2,3,0> + 2650325976U, // <7,6,1,3>: Cost 3 vext2 <5,4,7,6>, <1,3,1,3> + 2579123510U, // <7,6,1,4>: Cost 3 vext1 <4,7,6,1>, RHS + 2650326160U, // <7,6,1,5>: Cost 3 vext2 <5,4,7,6>, <1,5,3,7> + 2714055072U, // <7,6,1,6>: Cost 3 vext3 RHS, <6,1,6,3> + 2712064425U, // <7,6,1,7>: Cost 3 vext3 RHS, <6,1,7,3> + 1181930390U, // <7,6,1,u>: Cost 2 vrev <6,7,u,1> + 2712211897U, // <7,6,2,0>: Cost 3 vext3 RHS, <6,2,0,1> + 2714055108U, // <7,6,2,1>: Cost 3 vext3 RHS, <6,2,1,3> + 2650326632U, // <7,6,2,2>: Cost 3 vext2 <5,4,7,6>, <2,2,2,2> + 2650326694U, // <7,6,2,3>: Cost 3 vext2 <5,4,7,6>, <2,3,0,1> + 2714055137U, // <7,6,2,4>: Cost 3 vext3 RHS, <6,2,4,5> + 2714055148U, // <7,6,2,5>: Cost 3 vext3 RHS, <6,2,5,7> + 2650326970U, // <7,6,2,6>: Cost 3 vext2 <5,4,7,6>, <2,6,3,7> + 1638470138U, // <7,6,2,7>: Cost 2 vext3 RHS, <6,2,7,3> + 1638470147U, // <7,6,2,u>: Cost 2 vext3 RHS, <6,2,u,3> + 2650327190U, // <7,6,3,0>: Cost 3 vext2 <5,4,7,6>, <3,0,1,2> + 2255172441U, // <7,6,3,1>: Cost 3 vrev <6,7,1,3> + 2255246178U, // <7,6,3,2>: Cost 3 vrev <6,7,2,3> + 2650327452U, // <7,6,3,3>: Cost 3 vext2 <5,4,7,6>, <3,3,3,3> + 2712064562U, // <7,6,3,4>: Cost 3 vext3 RHS, <6,3,4,5> + 2650327627U, // <7,6,3,5>: Cost 3 vext2 <5,4,7,6>, <3,5,4,7> + 3713452726U, // <7,6,3,6>: Cost 4 vext2 <3,6,7,6>, <3,6,7,6> + 2700563016U, // <7,6,3,7>: Cost 3 vext3 <2,6,3,7>, <6,3,7,0> + 2712064593U, // <7,6,3,u>: Cost 3 vext3 RHS, <6,3,u,0> + 2650327954U, // <7,6,4,0>: Cost 3 vext2 <5,4,7,6>, <4,0,5,1> + 2735952486U, // <7,6,4,1>: Cost 3 vext3 RHS, <6,4,1,3> + 2735952497U, // <7,6,4,2>: Cost 3 vext3 RHS, <6,4,2,5> + 2255328108U, // <7,6,4,3>: Cost 3 vrev <6,7,3,4> + 2712212100U, // <7,6,4,4>: Cost 3 vext3 RHS, <6,4,4,6> + 1576586550U, // <7,6,4,5>: Cost 2 vext2 <5,4,7,6>, RHS + 2714055312U, // <7,6,4,6>: Cost 3 vext3 RHS, <6,4,6,0> + 2712212126U, // <7,6,4,7>: Cost 3 vext3 RHS, <6,4,7,5> + 1576586793U, // <7,6,4,u>: Cost 2 vext2 <5,4,7,6>, RHS + 2579152998U, // <7,6,5,0>: Cost 3 vext1 <4,7,6,5>, LHS + 2650328784U, // <7,6,5,1>: Cost 3 vext2 <5,4,7,6>, <5,1,7,3> + 2714055364U, // <7,6,5,2>: Cost 3 vext3 RHS, <6,5,2,7> + 3785806538U, // <7,6,5,3>: Cost 4 vext3 RHS, <6,5,3,4> + 1576587206U, // <7,6,5,4>: Cost 2 vext2 <5,4,7,6>, <5,4,7,6> + 2650329092U, // <7,6,5,5>: Cost 3 vext2 <5,4,7,6>, <5,5,5,5> + 2650329186U, // <7,6,5,6>: Cost 3 vext2 <5,4,7,6>, <5,6,7,0> + 2712064753U, // <7,6,5,7>: Cost 3 vext3 RHS, <6,5,7,7> + 1181963162U, // <7,6,5,u>: Cost 2 vrev <6,7,u,5> + 2714055421U, // <7,6,6,0>: Cost 3 vext3 RHS, <6,6,0,1> + 2714055432U, // <7,6,6,1>: Cost 3 vext3 RHS, <6,6,1,3> + 2650329594U, // <7,6,6,2>: Cost 3 vext2 <5,4,7,6>, <6,2,7,3> + 3785806619U, // <7,6,6,3>: Cost 4 vext3 RHS, <6,6,3,4> + 2712212260U, // <7,6,6,4>: Cost 3 vext3 RHS, <6,6,4,4> + 2714055472U, // <7,6,6,5>: Cost 3 vext3 RHS, <6,6,5,7> + 1638323000U, // <7,6,6,6>: Cost 2 vext3 RHS, <6,6,6,6> + 1638470466U, // <7,6,6,7>: Cost 2 vext3 RHS, <6,6,7,7> + 1638470475U, // <7,6,6,u>: Cost 2 vext3 RHS, <6,6,u,7> + 1638323022U, // <7,6,7,0>: Cost 2 vext3 RHS, <6,7,0,1> + 2712064854U, // 
<7,6,7,1>: Cost 3 vext3 RHS, <6,7,1,0> + 2712064865U, // <7,6,7,2>: Cost 3 vext3 RHS, <6,7,2,2> + 2712064872U, // <7,6,7,3>: Cost 3 vext3 RHS, <6,7,3,0> + 1638323062U, // <7,6,7,4>: Cost 2 vext3 RHS, <6,7,4,5> + 2712064894U, // <7,6,7,5>: Cost 3 vext3 RHS, <6,7,5,4> + 2712064905U, // <7,6,7,6>: Cost 3 vext3 RHS, <6,7,6,6> + 2712064915U, // <7,6,7,7>: Cost 3 vext3 RHS, <6,7,7,7> + 1638323094U, // <7,6,7,u>: Cost 2 vext3 RHS, <6,7,u,1> + 1638470559U, // <7,6,u,0>: Cost 2 vext3 RHS, <6,u,0,1> + 1576589102U, // <7,6,u,1>: Cost 2 vext2 <5,4,7,6>, LHS + 2712212402U, // <7,6,u,2>: Cost 3 vext3 RHS, <6,u,2,2> + 2712212409U, // <7,6,u,3>: Cost 3 vext3 RHS, <6,u,3,0> + 1638470599U, // <7,6,u,4>: Cost 2 vext3 RHS, <6,u,4,5> + 1576589466U, // <7,6,u,5>: Cost 2 vext2 <5,4,7,6>, RHS + 1638323000U, // <7,6,u,6>: Cost 2 vext3 RHS, <6,6,6,6> + 1638470624U, // <7,6,u,7>: Cost 2 vext3 RHS, <6,u,7,3> + 1638470631U, // <7,6,u,u>: Cost 2 vext3 RHS, <6,u,u,1> + 2712065007U, // <7,7,0,0>: Cost 3 vext3 RHS, <7,0,0,0> + 1638323194U, // <7,7,0,1>: Cost 2 vext3 RHS, <7,0,1,2> + 2712065025U, // <7,7,0,2>: Cost 3 vext3 RHS, <7,0,2,0> + 3646958337U, // <7,7,0,3>: Cost 4 vext1 <3,7,7,0>, <3,7,7,0> + 2712065044U, // <7,7,0,4>: Cost 3 vext3 RHS, <7,0,4,1> + 2585161907U, // <7,7,0,5>: Cost 3 vext1 <5,7,7,0>, <5,7,7,0> + 2591134604U, // <7,7,0,6>: Cost 3 vext1 <6,7,7,0>, <6,7,7,0> + 2591134714U, // <7,7,0,7>: Cost 3 vext1 <6,7,7,0>, <7,0,1,2> + 1638323257U, // <7,7,0,u>: Cost 2 vext3 RHS, <7,0,u,2> + 2712065091U, // <7,7,1,0>: Cost 3 vext3 RHS, <7,1,0,3> + 2712065098U, // <7,7,1,1>: Cost 3 vext3 RHS, <7,1,1,1> + 2712065109U, // <7,7,1,2>: Cost 3 vext3 RHS, <7,1,2,3> + 2692748384U, // <7,7,1,3>: Cost 3 vext3 <1,3,5,7>, <7,1,3,5> + 2585169206U, // <7,7,1,4>: Cost 3 vext1 <5,7,7,1>, RHS + 2693928048U, // <7,7,1,5>: Cost 3 vext3 <1,5,3,7>, <7,1,5,3> + 2585170766U, // <7,7,1,6>: Cost 3 vext1 <5,7,7,1>, <6,7,0,1> + 2735953024U, // <7,7,1,7>: Cost 3 vext3 RHS, <7,1,7,1> + 2695918731U, // <7,7,1,u>: Cost 3 vext3 <1,u,3,7>, <7,1,u,3> + 3770471574U, // <7,7,2,0>: Cost 4 vext3 <2,0,5,7>, <7,2,0,5> + 3785807002U, // <7,7,2,1>: Cost 4 vext3 RHS, <7,2,1,0> + 2712065189U, // <7,7,2,2>: Cost 3 vext3 RHS, <7,2,2,2> + 2712065196U, // <7,7,2,3>: Cost 3 vext3 RHS, <7,2,3,0> + 3773125818U, // <7,7,2,4>: Cost 4 vext3 <2,4,5,7>, <7,2,4,5> + 3766490305U, // <7,7,2,5>: Cost 4 vext3 <1,3,5,7>, <7,2,5,3> + 2700563658U, // <7,7,2,6>: Cost 3 vext3 <2,6,3,7>, <7,2,6,3> + 2735953107U, // <7,7,2,7>: Cost 3 vext3 RHS, <7,2,7,3> + 2701890780U, // <7,7,2,u>: Cost 3 vext3 <2,u,3,7>, <7,2,u,3> + 2712065251U, // <7,7,3,0>: Cost 3 vext3 RHS, <7,3,0,1> + 3766490350U, // <7,7,3,1>: Cost 4 vext3 <1,3,5,7>, <7,3,1,3> + 3774305530U, // <7,7,3,2>: Cost 4 vext3 <2,6,3,7>, <7,3,2,6> + 2637728196U, // <7,7,3,3>: Cost 3 vext2 <3,3,7,7>, <3,3,7,7> + 2712065291U, // <7,7,3,4>: Cost 3 vext3 RHS, <7,3,4,5> + 2585186486U, // <7,7,3,5>: Cost 3 vext1 <5,7,7,3>, <5,7,7,3> + 2639719095U, // <7,7,3,6>: Cost 3 vext2 <3,6,7,7>, <3,6,7,7> + 2640382728U, // <7,7,3,7>: Cost 3 vext2 <3,7,7,7>, <3,7,7,7> + 2641046361U, // <7,7,3,u>: Cost 3 vext2 <3,u,7,7>, <3,u,7,7> + 2712212792U, // <7,7,4,0>: Cost 3 vext3 RHS, <7,4,0,5> + 3646989312U, // <7,7,4,1>: Cost 4 vext1 <3,7,7,4>, <1,3,5,7> + 3785807176U, // <7,7,4,2>: Cost 4 vext3 RHS, <7,4,2,3> + 3646991109U, // <7,7,4,3>: Cost 4 vext1 <3,7,7,4>, <3,7,7,4> + 2712065371U, // <7,7,4,4>: Cost 3 vext3 RHS, <7,4,4,4> + 1638323558U, // <7,7,4,5>: Cost 2 vext3 RHS, <7,4,5,6> + 2712212845U, // <7,7,4,6>: Cost 3 vext3 RHS, <7,4,6,4> + 2591167846U, // 
<7,7,4,7>: Cost 3 vext1 <6,7,7,4>, <7,4,5,6> + 1638323585U, // <7,7,4,u>: Cost 2 vext3 RHS, <7,4,u,6> + 2585198694U, // <7,7,5,0>: Cost 3 vext1 <5,7,7,5>, LHS + 2712212884U, // <7,7,5,1>: Cost 3 vext3 RHS, <7,5,1,7> + 3711471393U, // <7,7,5,2>: Cost 4 vext2 <3,3,7,7>, <5,2,7,3> + 2649673590U, // <7,7,5,3>: Cost 3 vext2 <5,3,7,7>, <5,3,7,7> + 2712065455U, // <7,7,5,4>: Cost 3 vext3 RHS, <7,5,4,7> + 1577259032U, // <7,7,5,5>: Cost 2 vext2 <5,5,7,7>, <5,5,7,7> + 2712065473U, // <7,7,5,6>: Cost 3 vext3 RHS, <7,5,6,7> + 2712212936U, // <7,7,5,7>: Cost 3 vext3 RHS, <7,5,7,5> + 1579249931U, // <7,7,5,u>: Cost 2 vext2 <5,u,7,7>, <5,u,7,7> + 2591178854U, // <7,7,6,0>: Cost 3 vext1 <6,7,7,6>, LHS + 2735953374U, // <7,7,6,1>: Cost 3 vext3 RHS, <7,6,1,0> + 2712212974U, // <7,7,6,2>: Cost 3 vext3 RHS, <7,6,2,7> + 2655646287U, // <7,7,6,3>: Cost 3 vext2 <6,3,7,7>, <6,3,7,7> + 2591182134U, // <7,7,6,4>: Cost 3 vext1 <6,7,7,6>, RHS + 2656973553U, // <7,7,6,5>: Cost 3 vext2 <6,5,7,7>, <6,5,7,7> + 1583895362U, // <7,7,6,6>: Cost 2 vext2 <6,6,7,7>, <6,6,7,7> + 2712065556U, // <7,7,6,7>: Cost 3 vext3 RHS, <7,6,7,0> + 1585222628U, // <7,7,6,u>: Cost 2 vext2 <6,u,7,7>, <6,u,7,7> + 1523417190U, // <7,7,7,0>: Cost 2 vext1 <7,7,7,7>, LHS + 2597159670U, // <7,7,7,1>: Cost 3 vext1 <7,7,7,7>, <1,0,3,2> + 2597160552U, // <7,7,7,2>: Cost 3 vext1 <7,7,7,7>, <2,2,2,2> + 2597161110U, // <7,7,7,3>: Cost 3 vext1 <7,7,7,7>, <3,0,1,2> + 1523420470U, // <7,7,7,4>: Cost 2 vext1 <7,7,7,7>, RHS + 2651002296U, // <7,7,7,5>: Cost 3 vext2 <5,5,7,7>, <7,5,5,7> + 2657637906U, // <7,7,7,6>: Cost 3 vext2 <6,6,7,7>, <7,6,6,7> + 363253046U, // <7,7,7,7>: Cost 1 vdup3 RHS + 363253046U, // <7,7,7,u>: Cost 1 vdup3 RHS + 1523417190U, // <7,7,u,0>: Cost 2 vext1 <7,7,7,7>, LHS + 1638471298U, // <7,7,u,1>: Cost 2 vext3 RHS, <7,u,1,2> + 2712213132U, // <7,7,u,2>: Cost 3 vext3 RHS, <7,u,2,3> + 2712213138U, // <7,7,u,3>: Cost 3 vext3 RHS, <7,u,3,0> + 1523420470U, // <7,7,u,4>: Cost 2 vext1 <7,7,7,7>, RHS + 1638471338U, // <7,7,u,5>: Cost 2 vext3 RHS, <7,u,5,6> + 1595840756U, // <7,7,u,6>: Cost 2 vext2 , + 363253046U, // <7,7,u,7>: Cost 1 vdup3 RHS + 363253046U, // <7,7,u,u>: Cost 1 vdup3 RHS + 1638318080U, // <7,u,0,0>: Cost 2 vext3 RHS, <0,0,0,0> + 1638323923U, // <7,u,0,1>: Cost 2 vext3 RHS, + 1662211804U, // <7,u,0,2>: Cost 2 vext3 RHS, + 1638323941U, // <7,u,0,3>: Cost 2 vext3 RHS, + 2712065773U, // <7,u,0,4>: Cost 3 vext3 RHS, + 1662359286U, // <7,u,0,5>: Cost 2 vext3 RHS, + 1662359296U, // <7,u,0,6>: Cost 2 vext3 RHS, + 2987150664U, // <7,u,0,7>: Cost 3 vzipr <5,6,7,0>, RHS + 1638323986U, // <7,u,0,u>: Cost 2 vext3 RHS, + 1517469798U, // <7,u,1,0>: Cost 2 vext1 <6,7,u,1>, LHS + 1638318900U, // <7,u,1,1>: Cost 2 vext3 RHS, <1,1,1,1> + 564582190U, // <7,u,1,2>: Cost 1 vext3 RHS, LHS + 1638324023U, // <7,u,1,3>: Cost 2 vext3 RHS, + 1517473078U, // <7,u,1,4>: Cost 2 vext1 <6,7,u,1>, RHS + 2693928777U, // <7,u,1,5>: Cost 3 vext3 <1,5,3,7>, + 1517474710U, // <7,u,1,6>: Cost 2 vext1 <6,7,u,1>, <6,7,u,1> + 1640462171U, // <7,u,1,7>: Cost 2 vext3 RHS, + 564582244U, // <7,u,1,u>: Cost 1 vext3 RHS, LHS + 1638318244U, // <7,u,2,0>: Cost 2 vext3 RHS, <0,2,0,2> + 2712065907U, // <7,u,2,1>: Cost 3 vext3 RHS, + 1638319720U, // <7,u,2,2>: Cost 2 vext3 RHS, <2,2,2,2> + 1638324101U, // <7,u,2,3>: Cost 2 vext3 RHS, + 1638318284U, // <7,u,2,4>: Cost 2 vext3 RHS, <0,2,4,6> + 2712065947U, // <7,u,2,5>: Cost 3 vext3 RHS, + 2700564387U, // <7,u,2,6>: Cost 3 vext3 <2,6,3,7>, + 1640314796U, // <7,u,2,7>: Cost 2 vext3 RHS, + 1638324146U, // <7,u,2,u>: Cost 2 vext3 RHS, 
+ 1638324156U, // <7,u,3,0>: Cost 2 vext3 RHS, + 1638319064U, // <7,u,3,1>: Cost 2 vext3 RHS, <1,3,1,3> + 2700564435U, // <7,u,3,2>: Cost 3 vext3 <2,6,3,7>, + 1638320540U, // <7,u,3,3>: Cost 2 vext3 RHS, <3,3,3,3> + 1638324196U, // <7,u,3,4>: Cost 2 vext3 RHS, + 1638324207U, // <7,u,3,5>: Cost 2 vext3 RHS, + 2700564472U, // <7,u,3,6>: Cost 3 vext3 <2,6,3,7>, + 2695919610U, // <7,u,3,7>: Cost 3 vext3 <1,u,3,7>, + 1638324228U, // <7,u,3,u>: Cost 2 vext3 RHS, + 2712066061U, // <7,u,4,0>: Cost 3 vext3 RHS, + 1662212122U, // <7,u,4,1>: Cost 2 vext3 RHS, + 1662212132U, // <7,u,4,2>: Cost 2 vext3 RHS, + 2712066092U, // <7,u,4,3>: Cost 3 vext3 RHS, + 1638321360U, // <7,u,4,4>: Cost 2 vext3 RHS, <4,4,4,4> + 1638324287U, // <7,u,4,5>: Cost 2 vext3 RHS, + 1662359624U, // <7,u,4,6>: Cost 2 vext3 RHS, + 1640314961U, // <7,u,4,7>: Cost 2 vext3 RHS, + 1638324314U, // <7,u,4,u>: Cost 2 vext3 RHS, + 1517502566U, // <7,u,5,0>: Cost 2 vext1 <6,7,u,5>, LHS + 1574612693U, // <7,u,5,1>: Cost 2 vext2 <5,1,7,u>, <5,1,7,u> + 2712066162U, // <7,u,5,2>: Cost 3 vext3 RHS, + 1638324351U, // <7,u,5,3>: Cost 2 vext3 RHS, + 1576603592U, // <7,u,5,4>: Cost 2 vext2 <5,4,7,u>, <5,4,7,u> + 1577267225U, // <7,u,5,5>: Cost 2 vext2 <5,5,7,u>, <5,5,7,u> + 564582554U, // <7,u,5,6>: Cost 1 vext3 RHS, RHS + 1640462499U, // <7,u,5,7>: Cost 2 vext3 RHS, + 564582572U, // <7,u,5,u>: Cost 1 vext3 RHS, RHS + 2712066223U, // <7,u,6,0>: Cost 3 vext3 RHS, + 2712066238U, // <7,u,6,1>: Cost 3 vext3 RHS, + 1581249023U, // <7,u,6,2>: Cost 2 vext2 <6,2,7,u>, <6,2,7,u> + 1638324432U, // <7,u,6,3>: Cost 2 vext3 RHS, + 1638468980U, // <7,u,6,4>: Cost 2 vext3 RHS, <4,6,4,6> + 2712066274U, // <7,u,6,5>: Cost 3 vext3 RHS, + 1583903555U, // <7,u,6,6>: Cost 2 vext2 <6,6,7,u>, <6,6,7,u> + 1640315117U, // <7,u,6,7>: Cost 2 vext3 RHS, + 1638324477U, // <7,u,6,u>: Cost 2 vext3 RHS, + 1638471936U, // <7,u,7,0>: Cost 2 vext3 RHS, + 2692970763U, // <7,u,7,1>: Cost 3 vext3 <1,3,u,7>, + 2700933399U, // <7,u,7,2>: Cost 3 vext3 <2,6,u,7>, + 2573347601U, // <7,u,7,3>: Cost 3 vext1 <3,7,u,7>, <3,7,u,7> + 1638471976U, // <7,u,7,4>: Cost 2 vext3 RHS, + 1511551171U, // <7,u,7,5>: Cost 2 vext1 <5,7,u,7>, <5,7,u,7> + 2712213815U, // <7,u,7,6>: Cost 3 vext3 RHS, + 363253046U, // <7,u,7,7>: Cost 1 vdup3 RHS + 363253046U, // <7,u,7,u>: Cost 1 vdup3 RHS + 1638324561U, // <7,u,u,0>: Cost 2 vext3 RHS, + 1638324571U, // <7,u,u,1>: Cost 2 vext3 RHS, + 564582757U, // <7,u,u,2>: Cost 1 vext3 RHS, LHS + 1638324587U, // <7,u,u,3>: Cost 2 vext3 RHS, + 1638324601U, // <7,u,u,4>: Cost 2 vext3 RHS, + 1638324611U, // <7,u,u,5>: Cost 2 vext3 RHS, + 564582797U, // <7,u,u,6>: Cost 1 vext3 RHS, RHS + 363253046U, // <7,u,u,7>: Cost 1 vdup3 RHS + 564582811U, // <7,u,u,u>: Cost 1 vext3 RHS, LHS + 135053414U, // : Cost 1 vdup0 LHS + 1611489290U, // : Cost 2 vext3 LHS, <0,0,1,1> + 1611489300U, // : Cost 2 vext3 LHS, <0,0,2,2> + 2568054923U, // : Cost 3 vext1 <3,0,0,0>, <3,0,0,0> + 1481706806U, // : Cost 2 vext1 <0,u,0,0>, RHS + 2555449040U, // : Cost 3 vext1 <0,u,0,0>, <5,1,7,3> + 2591282078U, // : Cost 3 vext1 <6,u,0,0>, <6,u,0,0> + 2591945711U, // : Cost 3 vext1 <7,0,0,0>, <7,0,0,0> + 135053414U, // : Cost 1 vdup0 LHS + 1493655654U, // : Cost 2 vext1 <2,u,0,1>, LHS + 1860550758U, // : Cost 2 vzipl LHS, LHS + 537747563U, // : Cost 1 vext3 LHS, LHS + 2625135576U, // : Cost 3 vext2 <1,2,u,0>, <1,3,1,3> + 1493658934U, // : Cost 2 vext1 <2,u,0,1>, RHS + 2625135760U, // : Cost 3 vext2 <1,2,u,0>, <1,5,3,7> + 1517548447U, // : Cost 2 vext1 <6,u,0,1>, <6,u,0,1> + 2591290362U, // : Cost 3 vext1 
<6,u,0,1>, <7,0,1,2> + 537747612U, // : Cost 1 vext3 LHS, LHS + 1611489444U, // : Cost 2 vext3 LHS, <0,2,0,2> + 2685231276U, // : Cost 3 vext3 LHS, <0,2,1,1> + 1994768486U, // : Cost 2 vtrnl LHS, LHS + 2685231294U, // : Cost 3 vext3 LHS, <0,2,3,1> + 1611489484U, // : Cost 2 vext3 LHS, <0,2,4,6> + 2712068310U, // : Cost 3 vext3 RHS, <0,2,5,7> + 2625136570U, // : Cost 3 vext2 <1,2,u,0>, <2,6,3,7> + 2591962097U, // : Cost 3 vext1 <7,0,0,2>, <7,0,0,2> + 1611489516U, // : Cost 2 vext3 LHS, <0,2,u,2> + 2954067968U, // : Cost 3 vzipr LHS, <0,0,0,0> + 2685231356U, // : Cost 3 vext3 LHS, <0,3,1,0> + 72589981U, // : Cost 1 vrev LHS + 2625137052U, // : Cost 3 vext2 <1,2,u,0>, <3,3,3,3> + 2625137154U, // : Cost 3 vext2 <1,2,u,0>, <3,4,5,6> + 2639071848U, // : Cost 3 vext2 <3,5,u,0>, <3,5,u,0> + 2639735481U, // : Cost 3 vext2 <3,6,u,0>, <3,6,u,0> + 2597279354U, // : Cost 3 vext1 <7,u,0,3>, <7,u,0,3> + 73032403U, // : Cost 1 vrev LHS + 2687074636U, // : Cost 3 vext3 <0,4,0,u>, <0,4,0,u> + 1611489618U, // : Cost 2 vext3 LHS, <0,4,1,5> + 1611489628U, // : Cost 2 vext3 LHS, <0,4,2,6> + 3629222038U, // : Cost 4 vext1 <0,u,0,4>, <3,0,1,2> + 2555481398U, // : Cost 3 vext1 <0,u,0,4>, RHS + 1551396150U, // : Cost 2 vext2 <1,2,u,0>, RHS + 2651680116U, // : Cost 3 vext2 <5,6,u,0>, <4,6,4,6> + 2646150600U, // : Cost 3 vext2 <4,7,5,0>, <4,7,5,0> + 1611932050U, // : Cost 2 vext3 LHS, <0,4,u,6> + 2561458278U, // : Cost 3 vext1 <1,u,0,5>, LHS + 1863532646U, // : Cost 2 vzipl RHS, LHS + 2712068526U, // : Cost 3 vext3 RHS, <0,5,2,7> + 2649689976U, // : Cost 3 vext2 <5,3,u,0>, <5,3,u,0> + 2220237489U, // : Cost 3 vrev <0,u,4,5> + 2651680772U, // : Cost 3 vext2 <5,6,u,0>, <5,5,5,5> + 1577939051U, // : Cost 2 vext2 <5,6,u,0>, <5,6,u,0> + 2830077238U, // : Cost 3 vuzpr <1,u,3,0>, RHS + 1579266317U, // : Cost 2 vext2 <5,u,u,0>, <5,u,u,0> + 2555494502U, // : Cost 3 vext1 <0,u,0,6>, LHS + 2712068598U, // : Cost 3 vext3 RHS, <0,6,1,7> + 1997750374U, // : Cost 2 vtrnl RHS, LHS + 2655662673U, // : Cost 3 vext2 <6,3,u,0>, <6,3,u,0> + 2555497782U, // : Cost 3 vext1 <0,u,0,6>, RHS + 2651681459U, // : Cost 3 vext2 <5,6,u,0>, <6,5,0,u> + 2651681592U, // : Cost 3 vext2 <5,6,u,0>, <6,6,6,6> + 2651681614U, // : Cost 3 vext2 <5,6,u,0>, <6,7,0,1> + 1997750428U, // : Cost 2 vtrnl RHS, LHS + 2567446630U, // : Cost 3 vext1 <2,u,0,7>, LHS + 2567447446U, // : Cost 3 vext1 <2,u,0,7>, <1,2,3,0> + 2567448641U, // : Cost 3 vext1 <2,u,0,7>, <2,u,0,7> + 2573421338U, // : Cost 3 vext1 <3,u,0,7>, <3,u,0,7> + 2567449910U, // : Cost 3 vext1 <2,u,0,7>, RHS + 2651682242U, // : Cost 3 vext2 <5,6,u,0>, <7,5,6,u> + 2591339429U, // : Cost 3 vext1 <6,u,0,7>, <6,u,0,7> + 2651682412U, // : Cost 3 vext2 <5,6,u,0>, <7,7,7,7> + 2567452462U, // : Cost 3 vext1 <2,u,0,7>, LHS + 135053414U, // : Cost 1 vdup0 LHS + 1611489938U, // : Cost 2 vext3 LHS, <0,u,1,1> + 537748125U, // : Cost 1 vext3 LHS, LHS + 2685674148U, // : Cost 3 vext3 LHS, <0,u,3,1> + 1611932338U, // : Cost 2 vext3 LHS, <0,u,4,6> + 1551399066U, // : Cost 2 vext2 <1,2,u,0>, RHS + 1517605798U, // : Cost 2 vext1 <6,u,0,u>, <6,u,0,u> + 2830077481U, // : Cost 3 vuzpr <1,u,3,0>, RHS + 537748179U, // : Cost 1 vext3 LHS, LHS + 1544101961U, // : Cost 2 vext2 <0,0,u,1>, <0,0,u,1> + 1558036582U, // : Cost 2 vext2 <2,3,u,1>, LHS + 2619171051U, // : Cost 3 vext2 <0,2,u,1>, <0,2,u,1> + 1611490038U, // : Cost 2 vext3 LHS, <1,0,3,2> + 2555522358U, // : Cost 3 vext1 <0,u,1,0>, RHS + 2712068871U, // : Cost 3 vext3 RHS, <1,0,5,1> + 2591355815U, // : Cost 3 vext1 <6,u,1,0>, <6,u,1,0> + 2597328512U, // : Cost 3 vext1 
<7,u,1,0>, <7,u,1,0> + 1611490083U, // : Cost 2 vext3 LHS, <1,0,u,2> + 1481785446U, // : Cost 2 vext1 <0,u,1,1>, LHS + 202162278U, // : Cost 1 vdup1 LHS + 2555528808U, // : Cost 3 vext1 <0,u,1,1>, <2,2,2,2> + 1611490120U, // : Cost 2 vext3 LHS, <1,1,3,3> + 1481788726U, // : Cost 2 vext1 <0,u,1,1>, RHS + 2689876828U, // : Cost 3 vext3 LHS, <1,1,5,5> + 2591364008U, // : Cost 3 vext1 <6,u,1,1>, <6,u,1,1> + 2592691274U, // : Cost 3 vext1 <7,1,1,1>, <7,1,1,1> + 202162278U, // : Cost 1 vdup1 LHS + 1499709542U, // : Cost 2 vext1 <3,u,1,2>, LHS + 2689876871U, // : Cost 3 vext3 LHS, <1,2,1,3> + 2631116445U, // : Cost 3 vext2 <2,2,u,1>, <2,2,u,1> + 835584U, // : Cost 0 copy LHS + 1499712822U, // : Cost 2 vext1 <3,u,1,2>, RHS + 2689876907U, // : Cost 3 vext3 LHS, <1,2,5,3> + 2631780282U, // : Cost 3 vext2 <2,3,u,1>, <2,6,3,7> + 1523603074U, // : Cost 2 vext1 <7,u,1,2>, <7,u,1,2> + 835584U, // : Cost 0 copy LHS + 1487773798U, // : Cost 2 vext1 <1,u,1,3>, LHS + 1611490264U, // : Cost 2 vext3 LHS, <1,3,1,3> + 2685232094U, // : Cost 3 vext3 LHS, <1,3,2,0> + 2018746470U, // : Cost 2 vtrnr LHS, LHS + 1487777078U, // : Cost 2 vext1 <1,u,1,3>, RHS + 1611490304U, // : Cost 2 vext3 LHS, <1,3,5,7> + 2685674505U, // : Cost 3 vext3 LHS, <1,3,6,7> + 2640407307U, // : Cost 3 vext2 <3,7,u,1>, <3,7,u,1> + 1611490327U, // : Cost 2 vext3 LHS, <1,3,u,3> + 1567992749U, // : Cost 2 vext2 <4,0,u,1>, <4,0,u,1> + 2693121070U, // : Cost 3 vext3 <1,4,1,u>, <1,4,1,u> + 2693194807U, // : Cost 3 vext3 <1,4,2,u>, <1,4,2,u> + 1152386432U, // : Cost 2 vrev <1,u,3,4> + 2555555126U, // : Cost 3 vext1 <0,u,1,4>, RHS + 1558039862U, // : Cost 2 vext2 <2,3,u,1>, RHS + 2645716371U, // : Cost 3 vext2 <4,6,u,1>, <4,6,u,1> + 2597361284U, // : Cost 3 vext1 <7,u,1,4>, <7,u,1,4> + 1152755117U, // : Cost 2 vrev <1,u,u,4> + 1481818214U, // : Cost 2 vext1 <0,u,1,5>, LHS + 2555560694U, // : Cost 3 vext1 <0,u,1,5>, <1,0,3,2> + 2555561576U, // : Cost 3 vext1 <0,u,1,5>, <2,2,2,2> + 1611490448U, // : Cost 2 vext3 LHS, <1,5,3,7> + 1481821494U, // : Cost 2 vext1 <0,u,1,5>, RHS + 2651025435U, // : Cost 3 vext2 <5,5,u,1>, <5,5,u,1> + 2651689068U, // : Cost 3 vext2 <5,6,u,1>, <5,6,u,1> + 2823966006U, // : Cost 3 vuzpr <0,u,1,1>, RHS + 1611932861U, // : Cost 2 vext3 LHS, <1,5,u,7> + 2555568230U, // : Cost 3 vext1 <0,u,1,6>, LHS + 2689877199U, // : Cost 3 vext3 LHS, <1,6,1,7> + 2712069336U, // : Cost 3 vext3 RHS, <1,6,2,7> + 2685232353U, // : Cost 3 vext3 LHS, <1,6,3,7> + 2555571510U, // : Cost 3 vext1 <0,u,1,6>, RHS + 2689877235U, // : Cost 3 vext3 LHS, <1,6,5,7> + 2657661765U, // : Cost 3 vext2 <6,6,u,1>, <6,6,u,1> + 1584583574U, // : Cost 2 vext2 <6,7,u,1>, <6,7,u,1> + 1585247207U, // : Cost 2 vext2 <6,u,u,1>, <6,u,u,1> + 2561548390U, // : Cost 3 vext1 <1,u,1,7>, LHS + 2561549681U, // : Cost 3 vext1 <1,u,1,7>, <1,u,1,7> + 2573493926U, // : Cost 3 vext1 <3,u,1,7>, <2,3,0,1> + 2042962022U, // : Cost 2 vtrnr RHS, LHS + 2561551670U, // : Cost 3 vext1 <1,u,1,7>, RHS + 2226300309U, // : Cost 3 vrev <1,u,5,7> + 2658325990U, // : Cost 3 vext2 <6,7,u,1>, <7,6,1,u> + 2658326124U, // : Cost 3 vext2 <6,7,u,1>, <7,7,7,7> + 2042962027U, // : Cost 2 vtrnr RHS, LHS + 1481842790U, // : Cost 2 vext1 <0,u,1,u>, LHS + 202162278U, // : Cost 1 vdup1 LHS + 2685674867U, // : Cost 3 vext3 LHS, <1,u,2,0> + 835584U, // : Cost 0 copy LHS + 1481846070U, // : Cost 2 vext1 <0,u,1,u>, RHS + 1611933077U, // : Cost 2 vext3 LHS, <1,u,5,7> + 2685674910U, // : Cost 3 vext3 LHS, <1,u,6,7> + 1523652232U, // : Cost 2 vext1 <7,u,1,u>, <7,u,1,u> + 835584U, // : Cost 0 copy LHS + 1544110154U, // : 
Cost 2 vext2 <0,0,u,2>, <0,0,u,2> + 1545437286U, // : Cost 2 vext2 <0,2,u,2>, LHS + 1545437420U, // : Cost 2 vext2 <0,2,u,2>, <0,2,u,2> + 2685232589U, // : Cost 3 vext3 LHS, <2,0,3,0> + 2619179346U, // : Cost 3 vext2 <0,2,u,2>, <0,4,1,5> + 2712069606U, // : Cost 3 vext3 RHS, <2,0,5,7> + 2689877484U, // : Cost 3 vext3 LHS, <2,0,6,4> + 2659656273U, // : Cost 3 vext2 <7,0,u,2>, <0,7,2,u> + 1545437853U, // : Cost 2 vext2 <0,2,u,2>, LHS + 1550082851U, // : Cost 2 vext2 <1,0,u,2>, <1,0,u,2> + 2619179828U, // : Cost 3 vext2 <0,2,u,2>, <1,1,1,1> + 2619179926U, // : Cost 3 vext2 <0,2,u,2>, <1,2,3,0> + 2685232671U, // : Cost 3 vext3 LHS, <2,1,3,1> + 2555604278U, // : Cost 3 vext1 <0,u,2,1>, RHS + 2619180176U, // : Cost 3 vext2 <0,2,u,2>, <1,5,3,7> + 2689877564U, // : Cost 3 vext3 LHS, <2,1,6,3> + 2602718850U, // : Cost 3 vext1 , <7,u,1,2> + 1158703235U, // : Cost 2 vrev <2,u,u,1> + 1481867366U, // : Cost 2 vext1 <0,u,2,2>, LHS + 2555609846U, // : Cost 3 vext1 <0,u,2,2>, <1,0,3,2> + 269271142U, // : Cost 1 vdup2 LHS + 1611490930U, // : Cost 2 vext3 LHS, <2,2,3,3> + 1481870646U, // : Cost 2 vext1 <0,u,2,2>, RHS + 2689877640U, // : Cost 3 vext3 LHS, <2,2,5,7> + 2619180986U, // : Cost 3 vext2 <0,2,u,2>, <2,6,3,7> + 2593436837U, // : Cost 3 vext1 <7,2,2,2>, <7,2,2,2> + 269271142U, // : Cost 1 vdup2 LHS + 408134301U, // : Cost 1 vext1 LHS, LHS + 1481876214U, // : Cost 2 vext1 LHS, <1,0,3,2> + 1481877096U, // : Cost 2 vext1 LHS, <2,2,2,2> + 1880326246U, // : Cost 2 vzipr LHS, LHS + 408137014U, // : Cost 1 vext1 LHS, RHS + 1529654992U, // : Cost 2 vext1 LHS, <5,1,7,3> + 1529655802U, // : Cost 2 vext1 LHS, <6,2,7,3> + 1529656314U, // : Cost 2 vext1 LHS, <7,0,1,2> + 408139566U, // : Cost 1 vext1 LHS, LHS + 1567853468U, // : Cost 2 vext2 <4,0,6,2>, <4,0,6,2> + 2561598362U, // : Cost 3 vext1 <1,u,2,4>, <1,2,3,4> + 2555627214U, // : Cost 3 vext1 <0,u,2,4>, <2,3,4,5> + 2685232918U, // : Cost 3 vext3 LHS, <2,4,3,5> + 2555628854U, // : Cost 3 vext1 <0,u,2,4>, RHS + 1545440566U, // : Cost 2 vext2 <0,2,u,2>, RHS + 1571982740U, // : Cost 2 vext2 <4,6,u,2>, <4,6,u,2> + 2592125957U, // : Cost 3 vext1 <7,0,2,4>, <7,0,2,4> + 1545440809U, // : Cost 2 vext2 <0,2,u,2>, RHS + 2555633766U, // : Cost 3 vext1 <0,u,2,5>, LHS + 2561606550U, // : Cost 3 vext1 <1,u,2,5>, <1,2,3,0> + 2689877856U, // : Cost 3 vext3 LHS, <2,5,2,7> + 2685233000U, // : Cost 3 vext3 LHS, <2,5,3,6> + 1158441059U, // : Cost 2 vrev <2,u,4,5> + 2645725188U, // : Cost 3 vext2 <4,6,u,2>, <5,5,5,5> + 2689877892U, // : Cost 3 vext3 LHS, <2,5,6,7> + 2823900470U, // : Cost 3 vuzpr <0,u,0,2>, RHS + 1158736007U, // : Cost 2 vrev <2,u,u,5> + 1481900134U, // : Cost 2 vext1 <0,u,2,6>, LHS + 2555642614U, // : Cost 3 vext1 <0,u,2,6>, <1,0,3,2> + 2555643496U, // : Cost 3 vext1 <0,u,2,6>, <2,2,2,2> + 1611491258U, // : Cost 2 vext3 LHS, <2,6,3,7> + 1481903414U, // : Cost 2 vext1 <0,u,2,6>, RHS + 2689877964U, // : Cost 3 vext3 LHS, <2,6,5,7> + 2689877973U, // : Cost 3 vext3 LHS, <2,6,6,7> + 2645726030U, // : Cost 3 vext2 <4,6,u,2>, <6,7,0,1> + 1611933671U, // : Cost 2 vext3 LHS, <2,6,u,7> + 1585919033U, // : Cost 2 vext2 <7,0,u,2>, <7,0,u,2> + 2573566710U, // : Cost 3 vext1 <3,u,2,7>, <1,0,3,2> + 2567596115U, // : Cost 3 vext1 <2,u,2,7>, <2,u,2,7> + 1906901094U, // : Cost 2 vzipr RHS, LHS + 2555653430U, // : Cost 3 vext1 <0,u,2,7>, RHS + 2800080230U, // : Cost 3 vuzpl LHS, <7,4,5,6> + 2980643164U, // : Cost 3 vzipr RHS, <0,4,2,6> + 2645726828U, // : Cost 3 vext2 <4,6,u,2>, <7,7,7,7> + 1906901099U, // : Cost 2 vzipr RHS, LHS + 408175266U, // : Cost 1 vext1 LHS, LHS + 
1545443118U, // : Cost 2 vext2 <0,2,u,2>, LHS + 269271142U, // : Cost 1 vdup2 LHS + 1611491416U, // : Cost 2 vext3 LHS, <2,u,3,3> + 408177974U, // : Cost 1 vext1 LHS, RHS + 1545443482U, // : Cost 2 vext2 <0,2,u,2>, RHS + 1726339226U, // : Cost 2 vuzpl LHS, RHS + 1529697274U, // : Cost 2 vext1 LHS, <7,0,1,2> + 408180526U, // : Cost 1 vext1 LHS, LHS + 1544781824U, // : Cost 2 vext2 LHS, <0,0,0,0> + 471040156U, // : Cost 1 vext2 LHS, LHS + 1544781988U, // : Cost 2 vext2 LHS, <0,2,0,2> + 2618523900U, // : Cost 3 vext2 LHS, <0,3,1,0> + 1544782162U, // : Cost 2 vext2 LHS, <0,4,1,5> + 2238188352U, // : Cost 3 vrev <3,u,5,0> + 2623169023U, // : Cost 3 vext2 LHS, <0,6,2,7> + 2238335826U, // : Cost 3 vrev <3,u,7,0> + 471040669U, // : Cost 1 vext2 LHS, LHS + 1544782582U, // : Cost 2 vext2 LHS, <1,0,3,2> + 1544782644U, // : Cost 2 vext2 LHS, <1,1,1,1> + 1544782742U, // : Cost 2 vext2 LHS, <1,2,3,0> + 1544782808U, // : Cost 2 vext2 LHS, <1,3,1,3> + 2618524733U, // : Cost 3 vext2 LHS, <1,4,3,5> + 1544782992U, // : Cost 2 vext2 LHS, <1,5,3,7> + 2618524897U, // : Cost 3 vext2 LHS, <1,6,3,7> + 2703517987U, // : Cost 3 vext3 <3,1,7,u>, <3,1,7,u> + 1544783213U, // : Cost 2 vext2 LHS, <1,u,1,3> + 1529716838U, // : Cost 2 vext1 , LHS + 1164167966U, // : Cost 2 vrev <3,u,1,2> + 1544783464U, // : Cost 2 vext2 LHS, <2,2,2,2> + 1544783526U, // : Cost 2 vext2 LHS, <2,3,0,1> + 1529720118U, // : Cost 2 vext1 , RHS + 2618525544U, // : Cost 3 vext2 LHS, <2,5,3,6> + 1544783802U, // : Cost 2 vext2 LHS, <2,6,3,7> + 2704181620U, // : Cost 3 vext3 <3,2,7,u>, <3,2,7,u> + 1544783931U, // : Cost 2 vext2 LHS, <2,u,0,1> + 1544784022U, // : Cost 2 vext2 LHS, <3,0,1,2> + 1487922559U, // : Cost 2 vext1 <1,u,3,3>, <1,u,3,3> + 1493895256U, // : Cost 2 vext1 <2,u,3,3>, <2,u,3,3> + 336380006U, // : Cost 1 vdup3 LHS + 1544784386U, // : Cost 2 vext2 LHS, <3,4,5,6> + 2824054478U, // : Cost 3 vuzpr LHS, <2,3,4,5> + 2238286668U, // : Cost 3 vrev <3,u,6,3> + 2954069136U, // : Cost 3 vzipr LHS, <1,5,3,7> + 336380006U, // : Cost 1 vdup3 LHS + 1487929446U, // : Cost 2 vext1 <1,u,3,4>, LHS + 1487930752U, // : Cost 2 vext1 <1,u,3,4>, <1,u,3,4> + 2623171644U, // : Cost 3 vext2 LHS, <4,2,6,0> + 2561673366U, // : Cost 3 vext1 <1,u,3,4>, <3,0,1,2> + 1487932726U, // : Cost 2 vext1 <1,u,3,4>, RHS + 471043382U, // : Cost 1 vext2 LHS, RHS + 1592561012U, // : Cost 2 vext2 LHS, <4,6,4,6> + 2238368598U, // : Cost 3 vrev <3,u,7,4> + 471043625U, // : Cost 1 vext2 LHS, RHS + 2555707494U, // : Cost 3 vext1 <0,u,3,5>, LHS + 1574645465U, // : Cost 2 vext2 <5,1,u,3>, <5,1,u,3> + 2567653106U, // : Cost 3 vext1 <2,u,3,5>, <2,3,u,5> + 2555709954U, // : Cost 3 vext1 <0,u,3,5>, <3,4,5,6> + 1592561606U, // : Cost 2 vext2 LHS, <5,4,7,6> + 1592561668U, // : Cost 2 vext2 LHS, <5,5,5,5> + 1592561762U, // : Cost 2 vext2 LHS, <5,6,7,0> + 1750314294U, // : Cost 2 vuzpr LHS, RHS + 1750314295U, // : Cost 2 vuzpr LHS, RHS + 2623172897U, // : Cost 3 vext2 LHS, <6,0,1,2> + 2561688962U, // : Cost 3 vext1 <1,u,3,6>, <1,u,3,6> + 1581281795U, // : Cost 2 vext2 <6,2,u,3>, <6,2,u,3> + 2706541204U, // : Cost 3 vext3 <3,6,3,u>, <3,6,3,u> + 2623173261U, // : Cost 3 vext2 LHS, <6,4,5,6> + 1164495686U, // : Cost 2 vrev <3,u,5,6> + 1592562488U, // : Cost 2 vext2 LHS, <6,6,6,6> + 1592562510U, // : Cost 2 vext2 LHS, <6,7,0,1> + 1164716897U, // : Cost 2 vrev <3,u,u,6> + 1487954022U, // : Cost 2 vext1 <1,u,3,7>, LHS + 1487955331U, // : Cost 2 vext1 <1,u,3,7>, <1,u,3,7> + 1493928028U, // : Cost 2 vext1 <2,u,3,7>, <2,u,3,7> + 2561697942U, // : Cost 3 vext1 <1,u,3,7>, <3,0,1,2> + 1487957302U, // : 
Cost 2 vext1 <1,u,3,7>, RHS + 2707352311U, // : Cost 3 vext3 <3,7,5,u>, <3,7,5,u> + 2655024623U, // : Cost 3 vext2 <6,2,u,3>, <7,6,2,u> + 1592563308U, // : Cost 2 vext2 LHS, <7,7,7,7> + 1487959854U, // : Cost 2 vext1 <1,u,3,7>, LHS + 1544787667U, // : Cost 2 vext2 LHS, + 471045934U, // : Cost 1 vext2 LHS, LHS + 1549432709U, // : Cost 2 vext2 LHS, + 336380006U, // : Cost 1 vdup3 LHS + 1544788031U, // : Cost 2 vext2 LHS, + 471046298U, // : Cost 1 vext2 LHS, RHS + 1549433040U, // : Cost 2 vext2 LHS, + 1750314537U, // : Cost 2 vuzpr LHS, RHS + 471046501U, // : Cost 1 vext2 LHS, LHS + 2625167360U, // : Cost 3 vext2 <1,2,u,4>, <0,0,0,0> + 1551425638U, // : Cost 2 vext2 <1,2,u,4>, LHS + 2619195630U, // : Cost 3 vext2 <0,2,u,4>, <0,2,u,4> + 2619343104U, // : Cost 3 vext2 <0,3,1,4>, <0,3,1,4> + 2625167698U, // : Cost 3 vext2 <1,2,u,4>, <0,4,1,5> + 1638329234U, // : Cost 2 vext3 RHS, <4,0,5,1> + 1638329244U, // : Cost 2 vext3 RHS, <4,0,6,2> + 3787803556U, // : Cost 4 vext3 RHS, <4,0,7,1> + 1551426205U, // : Cost 2 vext2 <1,2,u,4>, LHS + 2555748454U, // : Cost 3 vext1 <0,u,4,1>, LHS + 2625168180U, // : Cost 3 vext2 <1,2,u,4>, <1,1,1,1> + 1551426503U, // : Cost 2 vext2 <1,2,u,4>, <1,2,u,4> + 2625168344U, // : Cost 3 vext2 <1,2,u,4>, <1,3,1,3> + 2555751734U, // : Cost 3 vext1 <0,u,4,1>, RHS + 1860554038U, // : Cost 2 vzipl LHS, RHS + 2689879022U, // : Cost 3 vext3 LHS, <4,1,6,3> + 2592248852U, // : Cost 3 vext1 <7,0,4,1>, <7,0,4,1> + 1555408301U, // : Cost 2 vext2 <1,u,u,4>, <1,u,u,4> + 2555756646U, // : Cost 3 vext1 <0,u,4,2>, LHS + 2625168943U, // : Cost 3 vext2 <1,2,u,4>, <2,1,4,u> + 2625169000U, // : Cost 3 vext2 <1,2,u,4>, <2,2,2,2> + 2619197134U, // : Cost 3 vext2 <0,2,u,4>, <2,3,4,5> + 2555759926U, // : Cost 3 vext1 <0,u,4,2>, RHS + 2712071222U, // : Cost 3 vext3 RHS, <4,2,5,3> + 1994771766U, // : Cost 2 vtrnl LHS, RHS + 2592257045U, // : Cost 3 vext1 <7,0,4,2>, <7,0,4,2> + 1994771784U, // : Cost 2 vtrnl LHS, RHS + 2625169558U, // : Cost 3 vext2 <1,2,u,4>, <3,0,1,2> + 2567709594U, // : Cost 3 vext1 <2,u,4,3>, <1,2,3,4> + 2567710817U, // : Cost 3 vext1 <2,u,4,3>, <2,u,4,3> + 2625169820U, // : Cost 3 vext2 <1,2,u,4>, <3,3,3,3> + 2625169922U, // : Cost 3 vext2 <1,2,u,4>, <3,4,5,6> + 2954069710U, // : Cost 3 vzipr LHS, <2,3,4,5> + 2954068172U, // : Cost 3 vzipr LHS, <0,2,4,6> + 3903849472U, // : Cost 4 vuzpr <1,u,3,4>, <1,3,5,7> + 2954068174U, // : Cost 3 vzipr LHS, <0,2,4,u> + 1505919078U, // : Cost 2 vext1 <4,u,4,4>, LHS + 2567717831U, // : Cost 3 vext1 <2,u,4,4>, <1,2,u,4> + 2567719010U, // : Cost 3 vext1 <2,u,4,4>, <2,u,4,4> + 2570373542U, // : Cost 3 vext1 <3,3,4,4>, <3,3,4,4> + 161926454U, // : Cost 1 vdup0 RHS + 1551428918U, // : Cost 2 vext2 <1,2,u,4>, RHS + 1638329572U, // : Cost 2 vext3 RHS, <4,4,6,6> + 2594927963U, // : Cost 3 vext1 <7,4,4,4>, <7,4,4,4> + 161926454U, // : Cost 1 vdup0 RHS + 1493983334U, // : Cost 2 vext1 <2,u,4,5>, LHS + 2689879301U, // : Cost 3 vext3 LHS, <4,5,1,3> + 1493985379U, // : Cost 2 vext1 <2,u,4,5>, <2,u,4,5> + 2567727254U, // : Cost 3 vext1 <2,u,4,5>, <3,0,1,2> + 1493986614U, // : Cost 2 vext1 <2,u,4,5>, RHS + 1863535926U, // : Cost 2 vzipl RHS, RHS + 537750838U, // : Cost 1 vext3 LHS, RHS + 2830110006U, // : Cost 3 vuzpr <1,u,3,4>, RHS + 537750856U, // : Cost 1 vext3 LHS, RHS + 1482047590U, // : Cost 2 vext1 <0,u,4,6>, LHS + 2555790070U, // : Cost 3 vext1 <0,u,4,6>, <1,0,3,2> + 2555790952U, // : Cost 3 vext1 <0,u,4,6>, <2,2,2,2> + 2555791510U, // : Cost 3 vext1 <0,u,4,6>, <3,0,1,2> + 1482050870U, // : Cost 2 vext1 <0,u,4,6>, RHS + 2689879422U, // : Cost 3 
vext3 LHS, <4,6,5,7> + 1997753654U, // : Cost 2 vtrnl RHS, RHS + 2712071562U, // : Cost 3 vext3 RHS, <4,6,7,1> + 1482053422U, // : Cost 2 vext1 <0,u,4,6>, LHS + 2567741542U, // : Cost 3 vext1 <2,u,4,7>, LHS + 2567742362U, // : Cost 3 vext1 <2,u,4,7>, <1,2,3,4> + 2567743589U, // : Cost 3 vext1 <2,u,4,7>, <2,u,4,7> + 2573716286U, // : Cost 3 vext1 <3,u,4,7>, <3,u,4,7> + 2567744822U, // : Cost 3 vext1 <2,u,4,7>, RHS + 2712071624U, // : Cost 3 vext3 RHS, <4,7,5,0> + 96808489U, // : Cost 1 vrev RHS + 2651715180U, // : Cost 3 vext2 <5,6,u,4>, <7,7,7,7> + 96955963U, // : Cost 1 vrev RHS + 1482063974U, // : Cost 2 vext1 <0,u,4,u>, LHS + 1551431470U, // : Cost 2 vext2 <1,2,u,4>, LHS + 1494009958U, // : Cost 2 vext1 <2,u,4,u>, <2,u,4,u> + 2555807894U, // : Cost 3 vext1 <0,u,4,u>, <3,0,1,2> + 161926454U, // : Cost 1 vdup0 RHS + 1551431834U, // : Cost 2 vext2 <1,2,u,4>, RHS + 537751081U, // : Cost 1 vext3 LHS, RHS + 2830110249U, // : Cost 3 vuzpr <1,u,3,4>, RHS + 537751099U, // : Cost 1 vext3 LHS, RHS + 2631811072U, // : Cost 3 vext2 <2,3,u,5>, <0,0,0,0> + 1558069350U, // : Cost 2 vext2 <2,3,u,5>, LHS + 2619203823U, // : Cost 3 vext2 <0,2,u,5>, <0,2,u,5> + 2619867456U, // : Cost 3 vext2 <0,3,u,5>, <0,3,u,5> + 1546273106U, // : Cost 2 vext2 <0,4,1,5>, <0,4,1,5> + 2733010539U, // : Cost 3 vext3 LHS, <5,0,5,1> + 2597622682U, // : Cost 3 vext1 <7,u,5,0>, <6,7,u,5> + 1176539396U, // : Cost 2 vrev <5,u,7,0> + 1558069917U, // : Cost 2 vext2 <2,3,u,5>, LHS + 1505968230U, // : Cost 2 vext1 <4,u,5,1>, LHS + 2624512887U, // : Cost 3 vext2 <1,1,u,5>, <1,1,u,5> + 2631811990U, // : Cost 3 vext2 <2,3,u,5>, <1,2,3,0> + 2618541056U, // : Cost 3 vext2 <0,1,u,5>, <1,3,5,7> + 1505971510U, // : Cost 2 vext1 <4,u,5,1>, RHS + 2627167419U, // : Cost 3 vext2 <1,5,u,5>, <1,5,u,5> + 2579714554U, // : Cost 3 vext1 <4,u,5,1>, <6,2,7,3> + 1638330064U, // : Cost 2 vext3 RHS, <5,1,7,3> + 1638477529U, // : Cost 2 vext3 RHS, <5,1,u,3> + 2561802342U, // : Cost 3 vext1 <1,u,5,2>, LHS + 2561803264U, // : Cost 3 vext1 <1,u,5,2>, <1,3,5,7> + 2631149217U, // : Cost 3 vext2 <2,2,u,5>, <2,2,u,5> + 1558071026U, // : Cost 2 vext2 <2,3,u,5>, <2,3,u,5> + 2561805622U, // : Cost 3 vext1 <1,u,5,2>, RHS + 2714062607U, // : Cost 3 vext3 RHS, <5,2,5,3> + 2631813050U, // : Cost 3 vext2 <2,3,u,5>, <2,6,3,7> + 3092335926U, // : Cost 3 vtrnr <0,u,0,2>, RHS + 1561389191U, // : Cost 2 vext2 <2,u,u,5>, <2,u,u,5> + 2561810534U, // : Cost 3 vext1 <1,u,5,3>, LHS + 2561811857U, // : Cost 3 vext1 <1,u,5,3>, <1,u,5,3> + 2631813474U, // : Cost 3 vext2 <2,3,u,5>, <3,2,5,u> + 2631813532U, // : Cost 3 vext2 <2,3,u,5>, <3,3,3,3> + 2619869698U, // : Cost 3 vext2 <0,3,u,5>, <3,4,5,6> + 3001847002U, // : Cost 3 vzipr LHS, <4,4,5,5> + 2954070530U, // : Cost 3 vzipr LHS, <3,4,5,6> + 2018749750U, // : Cost 2 vtrnr LHS, RHS + 2018749751U, // : Cost 2 vtrnr LHS, RHS + 2573762662U, // : Cost 3 vext1 <3,u,5,4>, LHS + 2620017634U, // : Cost 3 vext2 <0,4,1,5>, <4,1,5,0> + 2573764338U, // : Cost 3 vext1 <3,u,5,4>, <2,3,u,5> + 2573765444U, // : Cost 3 vext1 <3,u,5,4>, <3,u,5,4> + 1570680053U, // : Cost 2 vext2 <4,4,u,5>, <4,4,u,5> + 1558072630U, // : Cost 2 vext2 <2,3,u,5>, RHS + 2645749143U, // : Cost 3 vext2 <4,6,u,5>, <4,6,u,5> + 1638330310U, // : Cost 2 vext3 RHS, <5,4,7,6> + 1558072873U, // : Cost 2 vext2 <2,3,u,5>, RHS + 1506000998U, // : Cost 2 vext1 <4,u,5,5>, LHS + 2561827984U, // : Cost 3 vext1 <1,u,5,5>, <1,5,3,7> + 2579744360U, // : Cost 3 vext1 <4,u,5,5>, <2,2,2,2> + 2579744918U, // : Cost 3 vext1 <4,u,5,5>, <3,0,1,2> + 1506004278U, // : Cost 2 vext1 <4,u,5,5>, RHS + 
229035318U, // : Cost 1 vdup1 RHS + 2712072206U, // : Cost 3 vext3 RHS, <5,5,6,6> + 1638330392U, // : Cost 2 vext3 RHS, <5,5,7,7> + 229035318U, // : Cost 1 vdup1 RHS + 1500037222U, // : Cost 2 vext1 <3,u,5,6>, LHS + 2561836436U, // : Cost 3 vext1 <1,u,5,6>, <1,u,5,6> + 2567809133U, // : Cost 3 vext1 <2,u,5,6>, <2,u,5,6> + 1500040006U, // : Cost 2 vext1 <3,u,5,6>, <3,u,5,6> + 1500040502U, // : Cost 2 vext1 <3,u,5,6>, RHS + 2714062935U, // : Cost 3 vext3 RHS, <5,6,5,7> + 2712072288U, // : Cost 3 vext3 RHS, <5,6,6,7> + 27705344U, // : Cost 0 copy RHS + 27705344U, // : Cost 0 copy RHS + 1488101478U, // : Cost 2 vext1 <1,u,5,7>, LHS + 1488102805U, // : Cost 2 vext1 <1,u,5,7>, <1,u,5,7> + 2561844840U, // : Cost 3 vext1 <1,u,5,7>, <2,2,2,2> + 2561845398U, // : Cost 3 vext1 <1,u,5,7>, <3,0,1,2> + 1488104758U, // : Cost 2 vext1 <1,u,5,7>, RHS + 1638330536U, // : Cost 2 vext3 RHS, <5,7,5,7> + 2712072362U, // : Cost 3 vext3 RHS, <5,7,6,0> + 2042965302U, // : Cost 2 vtrnr RHS, RHS + 1488107310U, // : Cost 2 vext1 <1,u,5,7>, LHS + 1488109670U, // : Cost 2 vext1 <1,u,5,u>, LHS + 1488110998U, // : Cost 2 vext1 <1,u,5,u>, <1,u,5,u> + 2561853032U, // : Cost 3 vext1 <1,u,5,u>, <2,2,2,2> + 1500056392U, // : Cost 2 vext1 <3,u,5,u>, <3,u,5,u> + 1488112950U, // : Cost 2 vext1 <1,u,5,u>, RHS + 229035318U, // : Cost 1 vdup1 RHS + 2954111490U, // : Cost 3 vzipr LHS, <3,4,5,6> + 27705344U, // : Cost 0 copy RHS + 27705344U, // : Cost 0 copy RHS + 2619211776U, // : Cost 3 vext2 <0,2,u,6>, <0,0,0,0> + 1545470054U, // : Cost 2 vext2 <0,2,u,6>, LHS + 1545470192U, // : Cost 2 vext2 <0,2,u,6>, <0,2,u,6> + 2255958969U, // : Cost 3 vrev <6,u,3,0> + 1546797458U, // : Cost 2 vext2 <0,4,u,6>, <0,4,u,6> + 2720624971U, // : Cost 3 vext3 <6,0,5,u>, <6,0,5,u> + 2256180180U, // : Cost 3 vrev <6,u,6,0> + 2960682294U, // : Cost 3 vzipr <1,2,u,0>, RHS + 1545470621U, // : Cost 2 vext2 <0,2,u,6>, LHS + 1182004127U, // : Cost 2 vrev <6,u,0,1> + 2619212596U, // : Cost 3 vext2 <0,2,u,6>, <1,1,1,1> + 2619212694U, // : Cost 3 vext2 <0,2,u,6>, <1,2,3,0> + 2619212760U, // : Cost 3 vext2 <0,2,u,6>, <1,3,1,3> + 2626511979U, // : Cost 3 vext2 <1,4,u,6>, <1,4,u,6> + 2619212944U, // : Cost 3 vext2 <0,2,u,6>, <1,5,3,7> + 2714063264U, // : Cost 3 vext3 RHS, <6,1,6,3> + 2967326006U, // : Cost 3 vzipr <2,3,u,1>, RHS + 1182594023U, // : Cost 2 vrev <6,u,u,1> + 1506050150U, // : Cost 2 vext1 <4,u,6,2>, LHS + 2579792630U, // : Cost 3 vext1 <4,u,6,2>, <1,0,3,2> + 2619213416U, // : Cost 3 vext2 <0,2,u,6>, <2,2,2,2> + 2619213478U, // : Cost 3 vext2 <0,2,u,6>, <2,3,0,1> + 1506053430U, // : Cost 2 vext1 <4,u,6,2>, RHS + 2633148309U, // : Cost 3 vext2 <2,5,u,6>, <2,5,u,6> + 2619213754U, // : Cost 3 vext2 <0,2,u,6>, <2,6,3,7> + 1638330874U, // : Cost 2 vext3 RHS, <6,2,7,3> + 1638478339U, // : Cost 2 vext3 RHS, <6,2,u,3> + 2619213974U, // : Cost 3 vext2 <0,2,u,6>, <3,0,1,2> + 2255836074U, // : Cost 3 vrev <6,u,1,3> + 2255909811U, // : Cost 3 vrev <6,u,2,3> + 2619214236U, // : Cost 3 vext2 <0,2,u,6>, <3,3,3,3> + 1564715549U, // : Cost 2 vext2 <3,4,u,6>, <3,4,u,6> + 2639121006U, // : Cost 3 vext2 <3,5,u,6>, <3,5,u,6> + 3001847012U, // : Cost 3 vzipr LHS, <4,4,6,6> + 1880329526U, // : Cost 2 vzipr LHS, RHS + 1880329527U, // : Cost 2 vzipr LHS, RHS + 2567864422U, // : Cost 3 vext1 <2,u,6,4>, LHS + 2733011558U, // : Cost 3 vext3 LHS, <6,4,1,3> + 2567866484U, // : Cost 3 vext1 <2,u,6,4>, <2,u,6,4> + 2638458005U, // : Cost 3 vext2 <3,4,u,6>, <4,3,6,u> + 1570540772U, // : Cost 2 vext2 <4,4,6,6>, <4,4,6,6> + 1545473334U, // : Cost 2 vext2 <0,2,u,6>, RHS + 1572015512U, // 
: Cost 2 vext2 <4,6,u,6>, <4,6,u,6> + 2960715062U, // : Cost 3 vzipr <1,2,u,4>, RHS + 1545473577U, // : Cost 2 vext2 <0,2,u,6>, RHS + 2567872614U, // : Cost 3 vext1 <2,u,6,5>, LHS + 2645757648U, // : Cost 3 vext2 <4,6,u,6>, <5,1,7,3> + 2567874490U, // : Cost 3 vext1 <2,u,6,5>, <2,6,3,7> + 2576501250U, // : Cost 3 vext1 <4,3,6,5>, <3,4,5,6> + 1576660943U, // : Cost 2 vext2 <5,4,u,6>, <5,4,u,6> + 2645757956U, // : Cost 3 vext2 <4,6,u,6>, <5,5,5,5> + 2645758050U, // : Cost 3 vext2 <4,6,u,6>, <5,6,7,0> + 2824080694U, // : Cost 3 vuzpr <0,u,2,6>, RHS + 1182626795U, // : Cost 2 vrev <6,u,u,5> + 1506082918U, // : Cost 2 vext1 <4,u,6,6>, LHS + 2579825398U, // : Cost 3 vext1 <4,u,6,6>, <1,0,3,2> + 2645758458U, // : Cost 3 vext2 <4,6,u,6>, <6,2,7,3> + 2579826838U, // : Cost 3 vext1 <4,u,6,6>, <3,0,1,2> + 1506086198U, // : Cost 2 vext1 <4,u,6,6>, RHS + 2579828432U, // : Cost 3 vext1 <4,u,6,6>, <5,1,7,3> + 296144182U, // : Cost 1 vdup2 RHS + 1638331202U, // : Cost 2 vext3 RHS, <6,6,7,7> + 296144182U, // : Cost 1 vdup2 RHS + 432349286U, // : Cost 1 vext1 RHS, LHS + 1506091766U, // : Cost 2 vext1 RHS, <1,0,3,2> + 1506092648U, // : Cost 2 vext1 RHS, <2,2,2,2> + 1506093206U, // : Cost 2 vext1 RHS, <3,0,1,2> + 432352809U, // : Cost 1 vext1 RHS, RHS + 1506094800U, // : Cost 2 vext1 RHS, <5,1,7,3> + 1506095610U, // : Cost 2 vext1 RHS, <6,2,7,3> + 1906904374U, // : Cost 2 vzipr RHS, RHS + 432355118U, // : Cost 1 vext1 RHS, LHS + 432357478U, // : Cost 1 vext1 RHS, LHS + 1545475886U, // : Cost 2 vext2 <0,2,u,6>, LHS + 1506100840U, // : Cost 2 vext1 RHS, <2,2,2,2> + 1506101398U, // : Cost 2 vext1 RHS, <3,0,1,2> + 432361002U, // : Cost 1 vext1 RHS, RHS + 1545476250U, // : Cost 2 vext2 <0,2,u,6>, RHS + 296144182U, // : Cost 1 vdup2 RHS + 1880370486U, // : Cost 2 vzipr LHS, RHS + 432363310U, // : Cost 1 vext1 RHS, LHS + 1571356672U, // : Cost 2 vext2 RHS, <0,0,0,0> + 497614950U, // : Cost 1 vext2 RHS, LHS + 1571356836U, // : Cost 2 vext2 RHS, <0,2,0,2> + 2573880146U, // : Cost 3 vext1 <3,u,7,0>, <3,u,7,0> + 1571357010U, // : Cost 2 vext2 RHS, <0,4,1,5> + 1512083716U, // : Cost 2 vext1 <5,u,7,0>, <5,u,7,0> + 2621874741U, // : Cost 3 vext2 <0,6,u,7>, <0,6,u,7> + 2585826298U, // : Cost 3 vext1 <5,u,7,0>, <7,0,1,2> + 497615517U, // : Cost 1 vext2 RHS, LHS + 1571357430U, // : Cost 2 vext2 RHS, <1,0,3,2> + 1571357492U, // : Cost 2 vext2 RHS, <1,1,1,1> + 1571357590U, // : Cost 2 vext2 RHS, <1,2,3,0> + 1552114715U, // : Cost 2 vext2 <1,3,u,7>, <1,3,u,7> + 2573888822U, // : Cost 3 vext1 <3,u,7,1>, RHS + 1553441981U, // : Cost 2 vext2 <1,5,u,7>, <1,5,u,7> + 2627847438U, // : Cost 3 vext2 <1,6,u,7>, <1,6,u,7> + 2727408775U, // : Cost 3 vext3 <7,1,7,u>, <7,1,7,u> + 1555432880U, // : Cost 2 vext2 <1,u,u,7>, <1,u,u,7> + 2629838337U, // : Cost 3 vext2 <2,0,u,7>, <2,0,u,7> + 1188058754U, // : Cost 2 vrev <7,u,1,2> + 1571358312U, // : Cost 2 vext2 RHS, <2,2,2,2> + 1571358374U, // : Cost 2 vext2 RHS, <2,3,0,1> + 2632492869U, // : Cost 3 vext2 <2,4,u,7>, <2,4,u,7> + 2633156502U, // : Cost 3 vext2 <2,5,u,7>, <2,5,u,7> + 1560078311U, // : Cost 2 vext2 <2,6,u,7>, <2,6,u,7> + 2728072408U, // : Cost 3 vext3 <7,2,7,u>, <7,2,7,u> + 1561405577U, // : Cost 2 vext2 <2,u,u,7>, <2,u,u,7> + 1571358870U, // : Cost 2 vext2 RHS, <3,0,1,2> + 2627184913U, // : Cost 3 vext2 <1,5,u,7>, <3,1,5,u> + 2633820523U, // : Cost 3 vext2 <2,6,u,7>, <3,2,6,u> + 1571359132U, // : Cost 2 vext2 RHS, <3,3,3,3> + 1571359234U, // : Cost 2 vext2 RHS, <3,4,5,6> + 1512108295U, // : Cost 2 vext1 <5,u,7,3>, <5,u,7,3> + 1518080992U, // : Cost 2 vext1 <6,u,7,3>, <6,u,7,3> + 
2640456465U, // : Cost 3 vext2 <3,7,u,7>, <3,7,u,7> + 1571359518U, // : Cost 2 vext2 RHS, <3,u,1,2> + 1571359634U, // : Cost 2 vext2 RHS, <4,0,5,1> + 2573911067U, // : Cost 3 vext1 <3,u,7,4>, <1,3,u,7> + 2645101622U, // : Cost 3 vext2 RHS, <4,2,5,3> + 2573912918U, // : Cost 3 vext1 <3,u,7,4>, <3,u,7,4> + 1571359952U, // : Cost 2 vext2 RHS, <4,4,4,4> + 497618248U, // : Cost 1 vext2 RHS, RHS + 1571360116U, // : Cost 2 vext2 RHS, <4,6,4,6> + 2645102024U, // : Cost 3 vext2 RHS, <4,7,5,0> + 497618473U, // : Cost 1 vext2 RHS, RHS + 2645102152U, // : Cost 3 vext2 RHS, <5,0,1,2> + 1571360464U, // : Cost 2 vext2 RHS, <5,1,7,3> + 2645102334U, // : Cost 3 vext2 RHS, <5,2,3,4> + 2645102447U, // : Cost 3 vext2 RHS, <5,3,7,0> + 1571360710U, // : Cost 2 vext2 RHS, <5,4,7,6> + 1571360772U, // : Cost 2 vext2 RHS, <5,5,5,5> + 1571360866U, // : Cost 2 vext2 RHS, <5,6,7,0> + 1571360936U, // : Cost 2 vext2 RHS, <5,7,5,7> + 1571361017U, // : Cost 2 vext2 RHS, <5,u,5,7> + 1530044518U, // : Cost 2 vext1 , LHS + 2645103016U, // : Cost 3 vext2 RHS, <6,1,7,2> + 1571361274U, // : Cost 2 vext2 RHS, <6,2,7,3> + 2645103154U, // : Cost 3 vext2 RHS, <6,3,4,5> + 1530047798U, // : Cost 2 vext1 , RHS + 1188386474U, // : Cost 2 vrev <7,u,5,6> + 1571361592U, // : Cost 2 vext2 RHS, <6,6,6,6> + 1571361614U, // : Cost 2 vext2 RHS, <6,7,0,1> + 1571361695U, // : Cost 2 vext2 RHS, <6,u,0,1> + 1571361786U, // : Cost 2 vext2 RHS, <7,0,1,2> + 2573935616U, // : Cost 3 vext1 <3,u,7,7>, <1,3,5,7> + 2645103781U, // : Cost 3 vext2 RHS, <7,2,2,2> + 2573937497U, // : Cost 3 vext1 <3,u,7,7>, <3,u,7,7> + 1571362150U, // : Cost 2 vext2 RHS, <7,4,5,6> + 1512141067U, // : Cost 2 vext1 <5,u,7,7>, <5,u,7,7> + 1518113764U, // : Cost 2 vext1 <6,u,7,7>, <6,u,7,7> + 363253046U, // : Cost 1 vdup3 RHS + 363253046U, // : Cost 1 vdup3 RHS + 1571362515U, // : Cost 2 vext2 RHS, + 497620782U, // : Cost 1 vext2 RHS, LHS + 1571362693U, // : Cost 2 vext2 RHS, + 1571362748U, // : Cost 2 vext2 RHS, + 1571362879U, // : Cost 2 vext2 RHS, + 497621146U, // : Cost 1 vext2 RHS, RHS + 1571363024U, // : Cost 2 vext2 RHS, + 363253046U, // : Cost 1 vdup3 RHS + 497621349U, // : Cost 1 vext2 RHS, LHS + 135053414U, // : Cost 1 vdup0 LHS + 471081121U, // : Cost 1 vext2 LHS, LHS + 1544822948U, // : Cost 2 vext2 LHS, <0,2,0,2> + 1616140005U, // : Cost 2 vext3 LHS, + 1544823122U, // : Cost 2 vext2 LHS, <0,4,1,5> + 1512157453U, // : Cost 2 vext1 <5,u,u,0>, <5,u,u,0> + 1662220032U, // : Cost 2 vext3 RHS, + 1194457487U, // : Cost 2 vrev + 471081629U, // : Cost 1 vext2 LHS, LHS + 1544823542U, // : Cost 2 vext2 LHS, <1,0,3,2> + 202162278U, // : Cost 1 vdup1 LHS + 537753390U, // : Cost 1 vext3 LHS, LHS + 1544823768U, // : Cost 2 vext2 LHS, <1,3,1,3> + 1494248758U, // : Cost 2 vext1 <2,u,u,1>, RHS + 1544823952U, // : Cost 2 vext2 LHS, <1,5,3,7> + 1518138343U, // : Cost 2 vext1 <6,u,u,1>, <6,u,u,1> + 1640322907U, // : Cost 2 vext3 RHS, + 537753444U, // : Cost 1 vext3 LHS, LHS + 1482309734U, // : Cost 2 vext1 <0,u,u,2>, LHS + 1194031451U, // : Cost 2 vrev + 269271142U, // : Cost 1 vdup2 LHS + 835584U, // : Cost 0 copy LHS + 1482313014U, // : Cost 2 vext1 <0,u,u,2>, RHS + 2618566504U, // : Cost 3 vext2 LHS, <2,5,3,6> + 1544824762U, // : Cost 2 vext2 LHS, <2,6,3,7> + 1638479788U, // : Cost 2 vext3 RHS, + 835584U, // : Cost 0 copy LHS + 408576723U, // : Cost 1 vext1 LHS, LHS + 1482318582U, // : Cost 2 vext1 LHS, <1,0,3,2> + 120371557U, // : Cost 1 vrev LHS + 336380006U, // : Cost 1 vdup3 LHS + 408579382U, // : Cost 1 vext1 LHS, RHS + 1616140271U, // : Cost 2 vext3 LHS, + 1530098170U, // : Cost 
2 vext1 LHS, <6,2,7,3> + 1880329544U, // : Cost 2 vzipr LHS, RHS + 408581934U, // : Cost 1 vext1 LHS, LHS + 1488298086U, // : Cost 2 vext1 <1,u,u,4>, LHS + 1488299437U, // : Cost 2 vext1 <1,u,u,4>, <1,u,u,4> + 1659271204U, // : Cost 2 vext3 LHS, + 1194195311U, // : Cost 2 vrev + 161926454U, // : Cost 1 vdup0 RHS + 471084342U, // : Cost 1 vext2 LHS, RHS + 1571368308U, // : Cost 2 vext2 RHS, <4,6,4,6> + 1640323153U, // : Cost 2 vext3 RHS, + 471084585U, // : Cost 1 vext2 LHS, RHS + 1494278246U, // : Cost 2 vext1 <2,u,u,5>, LHS + 1571368656U, // : Cost 2 vext2 RHS, <5,1,7,3> + 1494280327U, // : Cost 2 vext1 <2,u,u,5>, <2,u,u,5> + 1616140415U, // : Cost 2 vext3 LHS, + 1494281526U, // : Cost 2 vext1 <2,u,u,5>, RHS + 229035318U, // : Cost 1 vdup1 RHS + 537753754U, // : Cost 1 vext3 LHS, RHS + 1750355254U, // : Cost 2 vuzpr LHS, RHS + 537753772U, // : Cost 1 vext3 LHS, RHS + 1482342502U, // : Cost 2 vext1 <0,u,u,6>, LHS + 2556084982U, // : Cost 3 vext1 <0,u,u,6>, <1,0,3,2> + 1571369466U, // : Cost 2 vext2 RHS, <6,2,7,3> + 1611938000U, // : Cost 2 vext3 LHS, + 1482345782U, // : Cost 2 vext1 <0,u,u,6>, RHS + 1194359171U, // : Cost 2 vrev + 296144182U, // : Cost 1 vdup2 RHS + 27705344U, // : Cost 0 copy RHS + 27705344U, // : Cost 0 copy RHS + 432496742U, // : Cost 1 vext1 RHS, LHS + 1488324016U, // : Cost 2 vext1 <1,u,u,7>, <1,u,u,7> + 1494296713U, // : Cost 2 vext1 <2,u,u,7>, <2,u,u,7> + 1906901148U, // : Cost 2 vzipr RHS, LHS + 432500283U, // : Cost 1 vext1 RHS, RHS + 1506242256U, // : Cost 2 vext1 RHS, <5,1,7,3> + 120699277U, // : Cost 1 vrev RHS + 363253046U, // : Cost 1 vdup3 RHS + 432502574U, // : Cost 1 vext1 RHS, LHS + 408617688U, // : Cost 1 vext1 LHS, LHS + 471086894U, // : Cost 1 vext2 LHS, LHS + 537753957U, // : Cost 1 vext3 LHS, LHS + 835584U, // : Cost 0 copy LHS + 408620342U, // : Cost 1 vext1 LHS, RHS + 471087258U, // : Cost 1 vext2 LHS, RHS + 537753997U, // : Cost 1 vext3 LHS, RHS + 27705344U, // : Cost 0 copy RHS + 835584U, // : Cost 0 copy LHS + 0 +}; diff --git a/lib/Target/AArch64/AArch64PromoteConstant.cpp b/lib/Target/AArch64/AArch64PromoteConstant.cpp new file mode 100644 index 00000000000..4723cc4978e --- /dev/null +++ b/lib/Target/AArch64/AArch64PromoteConstant.cpp @@ -0,0 +1,578 @@ +//=- AArch64PromoteConstant.cpp --- Promote constant to global for AArch64 -==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the AArch64PromoteConstant pass which promotes constants +// to global variables when this is likely to be more efficient. Currently only +// types related to constant vector (i.e., constant vector, array of constant +// vectors, constant structure with a constant vector field, etc.) are promoted +// to global variables. Constant vectors are likely to be lowered in target +// constant pool during instruction selection already; therefore, the access +// will remain the same (memory load), but the structure types are not split +// into different constant pool accesses for each field. A bonus side effect is +// that created globals may be merged by the global merge pass. +// +// FIXME: This pass may be useful for other targets too. 
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-promote-const"
+
+// Stress testing mode - disable heuristics.
+static cl::opt<bool> Stress("aarch64-stress-promote-const", cl::Hidden,
+                            cl::desc("Promote all vector constants"));
+
+STATISTIC(NumPromoted, "Number of promoted constants");
+STATISTIC(NumPromotedUses, "Number of promoted constants uses");
+
+//===----------------------------------------------------------------------===//
+//                       AArch64PromoteConstant
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// Promotes interesting constants into global variables.
+/// The motivating example is:
+///   static const uint16_t TableA[32] = {
+///     41944, 40330, 38837, 37450, 36158, 34953, 33826, 32768,
+///     31776, 30841, 29960, 29128, 28340, 27595, 26887, 26215,
+///     25576, 24967, 24386, 23832, 23302, 22796, 22311, 21846,
+///     21400, 20972, 20561, 20165, 19785, 19419, 19066, 18725,
+///   };
+///
+///   uint8x16x4_t LoadStatic(void) {
+///     uint8x16x4_t ret;
+///     ret.val[0] = vld1q_u16(TableA +  0);
+///     ret.val[1] = vld1q_u16(TableA +  8);
+///     ret.val[2] = vld1q_u16(TableA + 16);
+///     ret.val[3] = vld1q_u16(TableA + 24);
+///     return ret;
+///   }
+///
+/// The constants in this example are folded into the uses. Thus, 4 different
+/// constants are created.
+///
+/// As their type is vector, the cheapest way to create them is to load them
+/// from memory.
+///
+/// Therefore the final assembly has 4 different loads. With this pass
+/// enabled, only one load is issued for the constants.
+class AArch64PromoteConstant : public ModulePass {
+
+public:
+  static char ID;
+  AArch64PromoteConstant() : ModulePass(ID) {}
+
+  const char *getPassName() const override { return "AArch64 Promote Constant"; }
+
+  /// Iterate over the functions and promote the interesting constants into
+  /// global variables with module scope.
+  bool runOnModule(Module &M) override {
+    DEBUG(dbgs() << getPassName() << '\n');
+    bool Changed = false;
+    for (auto &MF : M) {
+      Changed |= runOnFunction(MF);
+    }
+    return Changed;
+  }
+
+private:
+  /// Look for interesting constants used within the given function.
+  /// Promote them into global variables and load these global variables within
+  /// the related function, so that the number of inserted loads is minimal.
+  bool runOnFunction(Function &F);
+
+  // This transformation requires dominator info.
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addPreserved<DominatorTreeWrapperPass>();
+  }
+
+  /// Type to store a list of Users.
+  typedef SmallVector<Value::user_iterator, 4> Users;
+  /// Map an insertion point to all the uses it dominates.
+  typedef DenseMap<Instruction *, Users> InsertionPoints;
+  /// Map a function to the required insertion point of load for a
+  /// global variable.
+  typedef DenseMap<Function *, InsertionPoints> InsertionPointsPerFunc;
+
+  /// Find the closest point that dominates the given Use.
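+  /// For illustration (an added note, not from the original patch): if the
+  /// use is an operand of a phi such as
+  ///   %p = phi <4 x i16> [ %cst, %bb1 ], [ %other, %bb2 ]
+  /// the returned point is the terminator of the matching incoming block
+  /// (%bb1 here) rather than the phi itself.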
+  Instruction *findInsertionPoint(Value::user_iterator &Use);
+
+  /// Check if the given insertion point is dominated by an existing
+  /// insertion point.
+  /// If true, the given use is added to the list of dominated uses for
+  /// the related existing point.
+  /// \param NewPt the insertion point to be checked
+  /// \param UseIt the use to be added into the list of dominated uses
+  /// \param InsertPts existing insertion points
+  /// \pre NewPt and all instructions in InsertPts belong to the same function
+  /// \return true if one of the insertion points in InsertPts dominates NewPt,
+  ///         false otherwise
+  bool isDominated(Instruction *NewPt, Value::user_iterator &UseIt,
+                   InsertionPoints &InsertPts);
+
+  /// Check if the given insertion point can be merged with an existing
+  /// insertion point in a common dominator.
+  /// If true, the given use is added to the list of the created insertion
+  /// point.
+  /// \param NewPt the insertion point to be checked
+  /// \param UseIt the use to be added into the list of dominated uses
+  /// \param InsertPts existing insertion points
+  /// \pre NewPt and all instructions in InsertPts belong to the same function
+  /// \pre isDominated returns false for the exact same parameters.
+  /// \return true if there exists an insertion point in InsertPts that could
+  ///         have been merged with NewPt in a common dominator,
+  ///         false otherwise
+  bool tryAndMerge(Instruction *NewPt, Value::user_iterator &UseIt,
+                   InsertionPoints &InsertPts);
+
+  /// Compute the minimal insertion points to dominate all the interesting
+  /// uses of Val.
+  /// Insertion points are grouped per function, and each insertion point
+  /// contains a list of all the uses it dominates within the related function.
+  /// \param Val constant to be examined
+  /// \param[out] InsPtsPerFunc output storage of the analysis
+  void computeInsertionPoints(Constant *Val,
+                              InsertionPointsPerFunc &InsPtsPerFunc);
+
+  /// Insert a definition of a new global variable at each point contained in
+  /// InsPtsPerFunc and update the related uses (also contained in
+  /// InsPtsPerFunc).
+  bool insertDefinitions(Constant *Cst, InsertionPointsPerFunc &InsPtsPerFunc);
+
+  /// Compute the minimal insertion points to dominate all the interesting
+  /// uses of Val and insert a definition of a new global variable
+  /// at these points.
+  /// Also update the uses of Val accordingly.
+  /// Currently a use of Val is considered interesting if:
+  /// - Val is not UndefValue
+  /// - Val is not zeroinitialized
+  /// - Replacing Val by a load of a global variable is valid.
+  /// \see shouldConvert for more details
+  bool computeAndInsertDefinitions(Constant *Val);
+
+  /// Promote the given constant into a global variable if it is expected to
+  /// be profitable.
+  /// \return true if Cst has been promoted
+  bool promoteConstant(Constant *Cst);
+
+  /// Transfer the list of dominated uses of IPI to NewPt in InsertPts.
+  /// Append UseIt to this list and delete the entry of IPI in InsertPts.
+  static void appendAndTransferDominatedUses(Instruction *NewPt,
+                                             Value::user_iterator &UseIt,
+                                             InsertionPoints::iterator &IPI,
+                                             InsertionPoints &InsertPts) {
+    // Record the dominated use.
+    IPI->second.push_back(UseIt);
+    // Transfer the dominated uses of IPI to NewPt.
+    // Inserting into the DenseMap may invalidate existing iterator.
+    // Keep a copy of the key to find the iterator to erase.
+    Instruction *OldInstr = IPI->first;
+    InsertPts.insert(InsertionPoints::value_type(NewPt, IPI->second));
+    // Erase IPI.
+    IPI = InsertPts.find(OldInstr);
+    InsertPts.erase(IPI);
+  }
+};
+} // end anonymous namespace
+
+char AArch64PromoteConstant::ID = 0;
+
+namespace llvm {
+void initializeAArch64PromoteConstantPass(PassRegistry &);
+}
+
+INITIALIZE_PASS_BEGIN(AArch64PromoteConstant, "aarch64-promote-const",
+                      "AArch64 Promote Constant Pass", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(AArch64PromoteConstant, "aarch64-promote-const",
+                    "AArch64 Promote Constant Pass", false, false)
+
+ModulePass *llvm::createAArch64PromoteConstantPass() {
+  return new AArch64PromoteConstant();
+}
+
+/// Check if the given type uses a vector type.
+static bool isConstantUsingVectorTy(const Type *CstTy) {
+  if (CstTy->isVectorTy())
+    return true;
+  if (CstTy->isStructTy()) {
+    for (unsigned EltIdx = 0, EndEltIdx = CstTy->getStructNumElements();
+         EltIdx < EndEltIdx; ++EltIdx)
+      if (isConstantUsingVectorTy(CstTy->getStructElementType(EltIdx)))
+        return true;
+  } else if (CstTy->isArrayTy())
+    return isConstantUsingVectorTy(CstTy->getArrayElementType());
+  return false;
+}
+
+/// Check if the given use (Instruction + OpIdx) of Cst should be converted
+/// into a load of a global variable initialized with Cst.
+/// A use should be converted if it is legal to do so.
+/// For instance, it is not legal to turn the mask operand of a shuffle vector
+/// into a load of a global variable.
+static bool shouldConvertUse(const Constant *Cst, const Instruction *Instr,
+                             unsigned OpIdx) {
+  // shufflevector instruction expects a const for the mask argument, i.e., the
+  // third argument. Do not promote this use in that case.
+  if (isa<ShuffleVectorInst>(Instr) && OpIdx == 2)
+    return false;
+
+  // extractvalue instruction expects a const idx.
+  if (isa<ExtractValueInst>(Instr) && OpIdx > 0)
+    return false;
+
+  // insertvalue instruction expects a const idx.
+  if (isa<InsertValueInst>(Instr) && OpIdx > 1)
+    return false;
+
+  if (isa<AllocaInst>(Instr) && OpIdx > 0)
+    return false;
+
+  // Alignment argument must be constant.
+  if (isa<LoadInst>(Instr) && OpIdx > 0)
+    return false;
+
+  // Alignment argument must be constant.
+  if (isa<StoreInst>(Instr) && OpIdx > 1)
+    return false;
+
+  // Index must be constant.
+  if (isa<GetElementPtrInst>(Instr) && OpIdx > 0)
+    return false;
+
+  // Personality function and filters must be constant.
+  // Give up on that instruction.
+  if (isa<LandingPadInst>(Instr))
+    return false;
+
+  // Switch instruction expects constants to compare to.
+  if (isa<SwitchInst>(Instr))
+    return false;
+
+  // Expected address must be a constant.
+  if (isa<IndirectBrInst>(Instr))
+    return false;
+
+  // Do not mess with intrinsics.
+  if (isa<IntrinsicInst>(Instr))
+    return false;
+
+  // Do not mess with inline asm.
+  const CallInst *CI = dyn_cast<CallInst>(Instr);
+  if (CI && isa<InlineAsm>(CI->getCalledValue()))
+    return false;
+
+  return true;
+}
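+
+// Example of the filtering above (an added sketch, not from the original
+// patch): in
+//   %s = shufflevector <4 x i16> %a, <4 x i16> %b,
+//                      <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// the mask operand (OpIdx == 2) must stay a literal constant, so that use is
+// rejected, whereas the same vector constant used as a stored value could be
+// promoted.
+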
+/// Check if the given Cst should be converted into
+/// a load of a global variable initialized with Cst.
+/// A constant should be converted if it is likely that the materialization of
+/// the constant will be tricky. Thus, we give up on zero or undef values.
+///
+/// \todo Currently, accept only vector related types.
+/// Also we give up on all simple vector types to keep the existing
+/// behavior. Otherwise, we should push all the checks of the BUILD_VECTOR
+/// lowering here. By giving up, we lose the potential benefit of merging
+/// constants via global merge and the fact that the same constant is stored
+/// only once with this method (versus once per function that uses the
+/// constant with the regular approach, even for float).
+/// Again, the simplest solution would be to promote every
+/// constant and rematerialize them when they are actually cheap to create.
+static bool shouldConvert(const Constant *Cst) {
+  if (isa<UndefValue>(Cst))
+    return false;
+
+  // FIXME: In some cases, it may be interesting to promote in memory
+  // a zero initialized constant.
+  // E.g., when the type of Cst requires more instructions than the
+  // adrp/add/load sequence or when this sequence can be shared by several
+  // instances of Cst.
+  // Ideally, we could promote this into a global and rematerialize the
+  // constant when doing so turns out to be a bad idea.
+  if (Cst->isZeroValue())
+    return false;
+
+  if (Stress)
+    return true;
+
+  // FIXME: see function \todo
+  if (Cst->getType()->isVectorTy())
+    return false;
+  return isConstantUsingVectorTy(Cst->getType());
+}
+
+Instruction *
+AArch64PromoteConstant::findInsertionPoint(Value::user_iterator &Use) {
+  // If this user is a phi, the insertion point is in the related
+  // incoming basic block.
+  PHINode *PhiInst = dyn_cast<PHINode>(*Use);
+  Instruction *InsertionPoint;
+  if (PhiInst)
+    InsertionPoint =
+        PhiInst->getIncomingBlock(Use.getOperandNo())->getTerminator();
+  else
+    InsertionPoint = dyn_cast<Instruction>(*Use);
+  assert(InsertionPoint && "User is not an instruction!");
+  return InsertionPoint;
+}
+
+bool AArch64PromoteConstant::isDominated(Instruction *NewPt,
+                                         Value::user_iterator &UseIt,
+                                         InsertionPoints &InsertPts) {
+
+  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>(
+      *NewPt->getParent()->getParent()).getDomTree();
+
+  // Traverse all the existing insertion points and check if one is dominating
+  // NewPt. If it is, remember that.
+  for (auto &IPI : InsertPts) {
+    if (NewPt == IPI.first || DT.dominates(IPI.first, NewPt) ||
+        // When IPI.first is a terminator instruction, DT may think that
+        // the result is defined on the edge.
+        // Here we are testing the insertion point, not the definition.
+        (IPI.first->getParent() != NewPt->getParent() &&
+         DT.dominates(IPI.first->getParent(), NewPt->getParent()))) {
+      // No need to insert this point. Just record the dominated use.
+      DEBUG(dbgs() << "Insertion point dominated by:\n");
+      DEBUG(IPI.first->print(dbgs()));
+      DEBUG(dbgs() << '\n');
+      IPI.second.push_back(UseIt);
+      return true;
+    }
+  }
+  return false;
+}
+
+bool AArch64PromoteConstant::tryAndMerge(Instruction *NewPt,
+                                         Value::user_iterator &UseIt,
+                                         InsertionPoints &InsertPts) {
+  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>(
+      *NewPt->getParent()->getParent()).getDomTree();
+  BasicBlock *NewBB = NewPt->getParent();
+
+  // Traverse all the existing insertion points and check if one is dominated
+  // by NewPt and thus useless or can be combined with NewPt into a common
+  // dominator.
+  for (InsertionPoints::iterator IPI = InsertPts.begin(),
+                                 EndIPI = InsertPts.end();
+       IPI != EndIPI; ++IPI) {
+    BasicBlock *CurBB = IPI->first->getParent();
+    if (NewBB == CurBB) {
+      // Instructions are in the same block.
+      // By construction, NewPt is dominating the other.
+      // Indeed, isDominated returned false with the exact same arguments.
+      DEBUG(dbgs() << "Merge insertion point with:\n");
+      DEBUG(IPI->first->print(dbgs()));
+      DEBUG(dbgs() << "\nat considered insertion point.\n");
+      appendAndTransferDominatedUses(NewPt, UseIt, IPI, InsertPts);
+      return true;
+    }
+
+    // Look for a common dominator.
+    BasicBlock *CommonDominator = DT.findNearestCommonDominator(NewBB, CurBB);
+    // If none exists, we cannot merge these two points.
+    if (!CommonDominator)
+      continue;
+
+    if (CommonDominator != NewBB) {
+      // By construction, the CommonDominator cannot be CurBB.
+      assert(CommonDominator != CurBB &&
+             "Instruction has not been rejected during isDominated check!");
+      // Take the last instruction of the CommonDominator as insertion point.
+      NewPt = CommonDominator->getTerminator();
+    }
+    // else, CommonDominator is the block of NewBB, hence NewBB is the last
+    // possible insertion point in that block.
+    DEBUG(dbgs() << "Merge insertion point with:\n");
+    DEBUG(IPI->first->print(dbgs()));
+    DEBUG(dbgs() << '\n');
+    DEBUG(NewPt->print(dbgs()));
+    DEBUG(dbgs() << '\n');
+    appendAndTransferDominatedUses(NewPt, UseIt, IPI, InsertPts);
+    return true;
+  }
+  return false;
+}
+
+void AArch64PromoteConstant::computeInsertionPoints(
+    Constant *Val, InsertionPointsPerFunc &InsPtsPerFunc) {
+  DEBUG(dbgs() << "** Compute insertion points **\n");
+  for (Value::user_iterator UseIt = Val->user_begin(),
+                            EndUseIt = Val->user_end();
+       UseIt != EndUseIt; ++UseIt) {
+    // If the user is not an Instruction, we cannot modify it.
+    if (!isa<Instruction>(*UseIt))
+      continue;
+
+    // Filter out uses that should not be converted.
+    if (!shouldConvertUse(Val, cast<Instruction>(*UseIt), UseIt.getOperandNo()))
+      continue;
+
+    DEBUG(dbgs() << "Considered use, opidx " << UseIt.getOperandNo() << ":\n");
+    DEBUG((*UseIt)->print(dbgs()));
+    DEBUG(dbgs() << '\n');
+
+    Instruction *InsertionPoint = findInsertionPoint(UseIt);
+
+    DEBUG(dbgs() << "Considered insertion point:\n");
+    DEBUG(InsertionPoint->print(dbgs()));
+    DEBUG(dbgs() << '\n');
+
+    // Check if the current insertion point is useless, i.e., it is dominated
+    // by another one.
+    InsertionPoints &InsertPts =
+        InsPtsPerFunc[InsertionPoint->getParent()->getParent()];
+    if (isDominated(InsertionPoint, UseIt, InsertPts))
+      continue;
+    // This insertion point is useful, check if we can merge some insertion
+    // point in a common dominator or if NewPt dominates an existing one.
+    if (tryAndMerge(InsertionPoint, UseIt, InsertPts))
+      continue;
+
+    DEBUG(dbgs() << "Keep considered insertion point\n");
+
+    // It is definitely useful on its own.
+    InsertPts[InsertionPoint].push_back(UseIt);
+  }
+}
+
+bool AArch64PromoteConstant::insertDefinitions(
+    Constant *Cst, InsertionPointsPerFunc &InsPtsPerFunc) {
+  // We will create one global variable per Module.
+  DenseMap<Module *, GlobalVariable *> ModuleToMergedGV;
+  bool HasChanged = false;
+
+  // Traverse all insertion points in all the functions.
+  for (InsertionPointsPerFunc::iterator FctToInstPtsIt = InsPtsPerFunc.begin(),
+                                        EndIt = InsPtsPerFunc.end();
+       FctToInstPtsIt != EndIt; ++FctToInstPtsIt) {
+    InsertionPoints &InsertPts = FctToInstPtsIt->second;
+// Do more checking for debug purposes.
+#ifndef NDEBUG
+    DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>(
+        *FctToInstPtsIt->first).getDomTree();
+#endif
+    GlobalVariable *PromotedGV;
+    assert(!InsertPts.empty() && "Empty uses does not need a definition");
+
+    Module *M = FctToInstPtsIt->first->getParent();
+    DenseMap<Module *, GlobalVariable *>::iterator MapIt =
+        ModuleToMergedGV.find(M);
+    if (MapIt == ModuleToMergedGV.end()) {
+      PromotedGV = new GlobalVariable(
+          *M, Cst->getType(), true, GlobalValue::InternalLinkage, nullptr,
+          "_PromotedConst", nullptr, GlobalVariable::NotThreadLocal);
+      PromotedGV->setInitializer(Cst);
+      ModuleToMergedGV[M] = PromotedGV;
+      DEBUG(dbgs() << "Global replacement: ");
+      DEBUG(PromotedGV->print(dbgs()));
+      DEBUG(dbgs() << '\n');
+      ++NumPromoted;
+      HasChanged = true;
+    } else {
+      PromotedGV = MapIt->second;
+    }
+
+    for (InsertionPoints::iterator IPI = InsertPts.begin(),
+                                   EndIPI = InsertPts.end();
+         IPI != EndIPI; ++IPI) {
+      // Create the load of the global variable.
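+      // Illustrative note (not part of the original patch): for a promoted
+      // <8 x i16> constant the resulting IR is roughly
+      //   @_PromotedConst = internal constant <8 x i16> <...>
+      //   %promoted = load <8 x i16>* @_PromotedConst
+      // and every use dominated by this point is rewritten to %promoted.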
+      IRBuilder<> Builder(IPI->first->getParent(), IPI->first);
+      LoadInst *LoadedCst = Builder.CreateLoad(PromotedGV);
+      DEBUG(dbgs() << "**********\n");
+      DEBUG(dbgs() << "New def: ");
+      DEBUG(LoadedCst->print(dbgs()));
+      DEBUG(dbgs() << '\n');
+
+      // Update the dominated uses.
+      Users &DominatedUsers = IPI->second;
+      for (Value::user_iterator Use : DominatedUsers) {
+#ifndef NDEBUG
+        assert((DT.dominates(LoadedCst, cast<Instruction>(*Use)) ||
+                (isa<PHINode>(*Use) &&
+                 DT.dominates(LoadedCst, findInsertionPoint(Use)))) &&
+               "Inserted definition does not dominate all its uses!");
+#endif
+        DEBUG(dbgs() << "Use to update " << Use.getOperandNo() << ":");
+        DEBUG(Use->print(dbgs()));
+        DEBUG(dbgs() << '\n');
+        Use->setOperand(Use.getOperandNo(), LoadedCst);
+        ++NumPromotedUses;
+      }
+    }
+  }
+  return HasChanged;
+}
+
+bool AArch64PromoteConstant::computeAndInsertDefinitions(Constant *Val) {
+  InsertionPointsPerFunc InsertPtsPerFunc;
+  computeInsertionPoints(Val, InsertPtsPerFunc);
+  return insertDefinitions(Val, InsertPtsPerFunc);
+}
+
+bool AArch64PromoteConstant::promoteConstant(Constant *Cst) {
+  assert(Cst && "Given variable is not a valid constant.");
+
+  if (!shouldConvert(Cst))
+    return false;
+
+  DEBUG(dbgs() << "******************************\n");
+  DEBUG(dbgs() << "Candidate constant: ");
+  DEBUG(Cst->print(dbgs()));
+  DEBUG(dbgs() << '\n');
+
+  return computeAndInsertDefinitions(Cst);
+}
+
+bool AArch64PromoteConstant::runOnFunction(Function &F) {
+  // Look for instructions using constant vectors. Promote those constants to a
+  // global variable. Create as few loads of this variable as possible and
+  // update the uses accordingly.
+  bool LocalChange = false;
+  SmallSet<Constant *, 8> AlreadyChecked;
+
+  for (auto &MBB : F) {
+    for (auto &MI : MBB) {
+      // Traverse the operands, looking for constant vectors. Replace them by a
+      // load of a global variable of constant vector type.
+      for (unsigned OpIdx = 0, EndOpIdx = MI.getNumOperands();
+           OpIdx != EndOpIdx; ++OpIdx) {
+        Constant *Cst = dyn_cast<Constant>(MI.getOperand(OpIdx));
+        // There is no point in promoting global values as they are already
+        // global. Do not promote constant expressions either, as they may
+        // require some code expansion.
+        if (Cst && !isa<GlobalValue>(Cst) && !isa<ConstantExpr>(Cst) &&
+            AlreadyChecked.insert(Cst))
+          LocalChange |= promoteConstant(Cst);
+      }
+    }
+  }
+  return LocalChange;
+}
diff --git a/lib/Target/AArch64/AArch64RegisterInfo.cpp b/lib/Target/AArch64/AArch64RegisterInfo.cpp
new file mode 100644
index 00000000000..48a361d50e5
--- /dev/null
+++ b/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -0,0 +1,404 @@
+//===- AArch64RegisterInfo.cpp - AArch64 Register Information ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the AArch64 implementation of the TargetRegisterInfo
+// class.
+// +//===----------------------------------------------------------------------===// + +#include "AArch64RegisterInfo.h" +#include "AArch64FrameLowering.h" +#include "AArch64InstrInfo.h" +#include "AArch64Subtarget.h" +#include "MCTargetDesc/AArch64AddressingModes.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/IR/Function.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/Target/TargetOptions.h" + +using namespace llvm; + +#define GET_REGINFO_TARGET_DESC +#include "AArch64GenRegisterInfo.inc" + +AArch64RegisterInfo::AArch64RegisterInfo(const AArch64InstrInfo *tii, + const AArch64Subtarget *sti) + : AArch64GenRegisterInfo(AArch64::LR), TII(tii), STI(sti) {} + +const MCPhysReg * +AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { + assert(MF && "Invalid MachineFunction pointer."); + if (MF->getFunction()->getCallingConv() == CallingConv::AnyReg) + return CSR_AArch64_AllRegs_SaveList; + else + return CSR_AArch64_AAPCS_SaveList; +} + +const uint32_t * +AArch64RegisterInfo::getCallPreservedMask(CallingConv::ID CC) const { + if (CC == CallingConv::AnyReg) + return CSR_AArch64_AllRegs_RegMask; + else + return CSR_AArch64_AAPCS_RegMask; +} + +const uint32_t *AArch64RegisterInfo::getTLSCallPreservedMask() const { + if (STI->isTargetDarwin()) + return CSR_AArch64_TLS_Darwin_RegMask; + + assert(STI->isTargetELF() && "only expect Darwin or ELF TLS"); + return CSR_AArch64_TLS_ELF_RegMask; +} + +const uint32_t * +AArch64RegisterInfo::getThisReturnPreservedMask(CallingConv::ID) const { + // This should return a register mask that is the same as that returned by + // getCallPreservedMask but that additionally preserves the register used for + // the first i64 argument (which must also be the register used to return a + // single i64 return value) + // + // In case that the calling convention does not use the same register for + // both, the function should return NULL (does not currently apply) + return CSR_AArch64_AAPCS_ThisReturn_RegMask; +} + +BitVector +AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const { + const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + + // FIXME: avoid re-calculating this every time. 
+ BitVector Reserved(getNumRegs()); + Reserved.set(AArch64::SP); + Reserved.set(AArch64::XZR); + Reserved.set(AArch64::WSP); + Reserved.set(AArch64::WZR); + + if (TFI->hasFP(MF) || STI->isTargetDarwin()) { + Reserved.set(AArch64::FP); + Reserved.set(AArch64::W29); + } + + if (STI->isTargetDarwin()) { + Reserved.set(AArch64::X18); // Platform register + Reserved.set(AArch64::W18); + } + + if (hasBasePointer(MF)) { + Reserved.set(AArch64::X19); + Reserved.set(AArch64::W19); + } + + return Reserved; +} + +bool AArch64RegisterInfo::isReservedReg(const MachineFunction &MF, + unsigned Reg) const { + const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + + switch (Reg) { + default: + break; + case AArch64::SP: + case AArch64::XZR: + case AArch64::WSP: + case AArch64::WZR: + return true; + case AArch64::X18: + case AArch64::W18: + return STI->isTargetDarwin(); + case AArch64::FP: + case AArch64::W29: + return TFI->hasFP(MF) || STI->isTargetDarwin(); + case AArch64::W19: + case AArch64::X19: + return hasBasePointer(MF); + } + + return false; +} + +const TargetRegisterClass * +AArch64RegisterInfo::getPointerRegClass(const MachineFunction &MF, + unsigned Kind) const { + return &AArch64::GPR64RegClass; +} + +const TargetRegisterClass * +AArch64RegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const { + if (RC == &AArch64::CCRRegClass) + return nullptr; // Can't copy NZCV. + return RC; +} + +unsigned AArch64RegisterInfo::getBaseRegister() const { return AArch64::X19; } + +bool AArch64RegisterInfo::hasBasePointer(const MachineFunction &MF) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + + // In the presence of variable sized objects, if the fixed stack size is + // large enough that referencing from the FP won't result in things being + // in range relatively often, we can use a base pointer to allow access + // from the other direction like the SP normally works. + if (MFI->hasVarSizedObjects()) { + // Conservatively estimate whether the negative offset from the frame + // pointer will be sufficient to reach. If a function has a smallish + // frame, it's less likely to have lots of spills and callee saved + // space, so it's all more likely to be within range of the frame pointer. + // If it's wrong, we'll materialize the constant and still get to the + // object; it's just suboptimal. Negative offsets use the unscaled + // load/store instructions, which have a 9-bit signed immediate. + if (MFI->getLocalFrameSize() < 256) + return false; + return true; + } + + return false; +} + +unsigned +AArch64RegisterInfo::getFrameRegister(const MachineFunction &MF) const { + const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + + return TFI->hasFP(MF) ? AArch64::FP : AArch64::SP; +} + +bool AArch64RegisterInfo::requiresRegisterScavenging( + const MachineFunction &MF) const { + return true; +} + +bool AArch64RegisterInfo::requiresVirtualBaseRegisters( + const MachineFunction &MF) const { + return true; +} + +bool +AArch64RegisterInfo::useFPForScavengingIndex(const MachineFunction &MF) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + // AArch64FrameLowering::resolveFrameIndexReference() can always fall back + // to the stack pointer, so only put the emergency spill slot next to the + // FP when there's no better way to access it (SP or base pointer). 
+ return MFI->hasVarSizedObjects() && !hasBasePointer(MF); +} + +bool AArch64RegisterInfo::requiresFrameIndexScavenging( + const MachineFunction &MF) const { + return true; +} + +bool +AArch64RegisterInfo::cannotEliminateFrame(const MachineFunction &MF) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + // Only consider eliminating leaf frames. + if (MFI->hasCalls() || (MF.getTarget().Options.DisableFramePointerElim(MF) && + MFI->adjustsStack())) + return true; + return MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken(); +} + +/// needsFrameBaseReg - Returns true if the instruction's frame index +/// reference would be better served by a base register other than FP +/// or SP. Used by LocalStackFrameAllocation to determine which frame index +/// references it should create new base registers for. +bool AArch64RegisterInfo::needsFrameBaseReg(MachineInstr *MI, + int64_t Offset) const { + for (unsigned i = 0; !MI->getOperand(i).isFI(); ++i) + assert(i < MI->getNumOperands() && + "Instr doesn't have FrameIndex operand!"); + + // It's the load/store FI references that cause issues, as it can be difficult + // to materialize the offset if it won't fit in the literal field. Estimate + // based on the size of the local frame and some conservative assumptions + // about the rest of the stack frame (note, this is pre-regalloc, so + // we don't know everything for certain yet) whether this offset is likely + // to be out of range of the immediate. Return true if so. + + // We only generate virtual base registers for loads and stores, so + // return false for everything else. + if (!MI->mayLoad() && !MI->mayStore()) + return false; + + // Without a virtual base register, if the function has variable sized + // objects, all fixed-size local references will be via the frame pointer, + // Approximate the offset and see if it's legal for the instruction. + // Note that the incoming offset is based on the SP value at function entry, + // so it'll be negative. + MachineFunction &MF = *MI->getParent()->getParent(); + const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + + // Estimate an offset from the frame pointer. + // Conservatively assume all GPR callee-saved registers get pushed. + // FP, LR, X19-X28, D8-D15. 64-bits each. + int64_t FPOffset = Offset - 16 * 20; + // Estimate an offset from the stack pointer. + // The incoming offset is relating to the SP at the start of the function, + // but when we access the local it'll be relative to the SP after local + // allocation, so adjust our SP-relative offset by that allocation size. + Offset += MFI->getLocalFrameSize(); + // Assume that we'll have at least some spill slots allocated. + // FIXME: This is a total SWAG number. We should run some statistics + // and pick a real one. + Offset += 128; // 128 bytes of spill slots + + // If there is a frame pointer, try using it. + // The FP is only available if there is no dynamic realignment. We + // don't know for sure yet whether we'll need that, so we guess based + // on whether there are any local variables that would trigger it. + if (TFI->hasFP(MF) && isFrameOffsetLegal(MI, FPOffset)) + return false; + + // If we can reference via the stack pointer or base pointer, try that. + // FIXME: This (and the code that resolves the references) can be improved + // to only disallow SP relative references in the live range of + // the VLA(s). In practice, it's unclear how much difference that + // would make, but it may be worth doing. 
+ if (isFrameOffsetLegal(MI, Offset)) + return false; + + // The offset likely isn't legal; we want to allocate a virtual base register. + return true; +} + +bool AArch64RegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, + int64_t Offset) const { + assert(Offset <= INT_MAX && "Offset too big to fit in int."); + assert(MI && "Unable to get the legal offset for nil instruction."); + int SaveOffset = Offset; + return isAArch64FrameOffsetLegal(*MI, SaveOffset) & AArch64FrameOffsetIsLegal; +} + +/// Insert defining instruction(s) for BaseReg to be a pointer to FrameIdx +/// at the beginning of the basic block. +void AArch64RegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, + unsigned BaseReg, + int FrameIdx, + int64_t Offset) const { + MachineBasicBlock::iterator Ins = MBB->begin(); + DebugLoc DL; // Defaults to "unknown" + if (Ins != MBB->end()) + DL = Ins->getDebugLoc(); + + const MCInstrDesc &MCID = TII->get(AArch64::ADDXri); + MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + const MachineFunction &MF = *MBB->getParent(); + MRI.constrainRegClass(BaseReg, TII->getRegClass(MCID, 0, this, MF)); + unsigned Shifter = AArch64_AM::getShifterImm(AArch64_AM::LSL, 0); + + BuildMI(*MBB, Ins, DL, MCID, BaseReg) + .addFrameIndex(FrameIdx) + .addImm(Offset) + .addImm(Shifter); +} + +void AArch64RegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, + int64_t Offset) const { + int Off = Offset; // ARM doesn't need the general 64-bit offsets + unsigned i = 0; + + while (!MI.getOperand(i).isFI()) { + ++i; + assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!"); + } + bool Done = rewriteAArch64FrameIndex(MI, i, BaseReg, Off, TII); + assert(Done && "Unable to resolve frame index!"); + (void)Done; +} + +void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, + int SPAdj, unsigned FIOperandNum, + RegScavenger *RS) const { + assert(SPAdj == 0 && "Unexpected"); + + MachineInstr &MI = *II; + MachineBasicBlock &MBB = *MI.getParent(); + MachineFunction &MF = *MBB.getParent(); + const AArch64FrameLowering *TFI = static_cast( + MF.getTarget().getFrameLowering()); + + int FrameIndex = MI.getOperand(FIOperandNum).getIndex(); + unsigned FrameReg; + int Offset; + + // Special handling of dbg_value, stackmap and patchpoint instructions. + if (MI.isDebugValue() || MI.getOpcode() == TargetOpcode::STACKMAP || + MI.getOpcode() == TargetOpcode::PATCHPOINT) { + Offset = TFI->resolveFrameIndexReference(MF, FrameIndex, FrameReg, + /*PreferFP=*/true); + Offset += MI.getOperand(FIOperandNum + 1).getImm(); + MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false /*isDef*/); + MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset); + return; + } + + // Modify MI as necessary to handle as much of 'Offset' as possible + Offset = TFI->resolveFrameIndexReference(MF, FrameIndex, FrameReg); + if (rewriteAArch64FrameIndex(MI, FIOperandNum, FrameReg, Offset, TII)) + return; + + assert((!RS || !RS->isScavengingFrameIndex(FrameIndex)) && + "Emergency spill slot is out of reach"); + + // If we get here, the immediate doesn't fit into the instruction. We folded + // as much as possible above. Handle the rest, providing a register that is + // SP+LargeImm. 
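Note: the estimation in needsFrameBaseReg is conservative arithmetic on the entry-SP-relative offset: subtract a guessed callee-save area for the FP-relative view (the 16 * 20 term covering FP, LR, X19-X28 and D8-D15), and add the local frame size plus a guessed 128 bytes of spill slots for the SP-relative view; a virtual base register is requested only if neither estimate is encodable. A small sketch of that arithmetic, with isFrameOffsetLegal abstracted to booleans (everything below is an illustration, not the in-tree code):

#include <cstdint>

// Sketch of the offset estimates computed in needsFrameBaseReg() above.
// entrySPOffset is the incoming frame-index offset (SP-relative at function
// entry, so negative for locals); the constants mirror the patch's guesses.
struct OffsetEstimates {
  int64_t fromFP;   // offset if the access ends up FP-relative
  int64_t fromSP;   // offset if the access ends up SP-relative
};

OffsetEstimates estimate(int64_t entrySPOffset, int64_t localFrameSize) {
  OffsetEstimates E;
  E.fromFP = entrySPOffset - 16 * 20;               // assumed callee-save area
  E.fromSP = entrySPOffset + localFrameSize + 128;  // locals + guessed spills
  return E;
}

// A virtual base register is requested only when neither estimate would be
// encodable in the instruction (the two isFrameOffsetLegal checks above).
bool wantsVirtualBaseReg(bool hasFP, bool fpOffsetLegal, bool spOffsetLegal) {
  if (hasFP && fpOffsetLegal)
    return false;
  return !spOffsetLegal;
}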
+ unsigned ScratchReg = + MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass); + emitFrameOffset(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, Offset, TII); + MI.getOperand(FIOperandNum).ChangeToRegister(ScratchReg, false, false, true); +} + +namespace llvm { + +unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, + MachineFunction &MF) const { + const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + + switch (RC->getID()) { + default: + return 0; + case AArch64::GPR32RegClassID: + case AArch64::GPR32spRegClassID: + case AArch64::GPR32allRegClassID: + case AArch64::GPR64spRegClassID: + case AArch64::GPR64allRegClassID: + case AArch64::GPR64RegClassID: + case AArch64::GPR32commonRegClassID: + case AArch64::GPR64commonRegClassID: + return 32 - 1 // XZR/SP + - (TFI->hasFP(MF) || STI->isTargetDarwin()) // FP + - STI->isTargetDarwin() // X18 reserved as platform register + - hasBasePointer(MF); // X19 + case AArch64::FPR8RegClassID: + case AArch64::FPR16RegClassID: + case AArch64::FPR32RegClassID: + case AArch64::FPR64RegClassID: + case AArch64::FPR128RegClassID: + return 32; + + case AArch64::DDRegClassID: + case AArch64::DDDRegClassID: + case AArch64::DDDDRegClassID: + case AArch64::QQRegClassID: + case AArch64::QQQRegClassID: + case AArch64::QQQQRegClassID: + return 32; + + case AArch64::FPR128_loRegClassID: + return 16; + } +} + +} // namespace llvm diff --git a/lib/Target/AArch64/AArch64RegisterInfo.h b/lib/Target/AArch64/AArch64RegisterInfo.h new file mode 100644 index 00000000000..76af1edce72 --- /dev/null +++ b/lib/Target/AArch64/AArch64RegisterInfo.h @@ -0,0 +1,101 @@ +//==- AArch64RegisterInfo.h - AArch64 Register Information Impl --*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the AArch64 implementation of the MRegisterInfo class. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TARGET_AArch64REGISTERINFO_H +#define LLVM_TARGET_AArch64REGISTERINFO_H + +#define GET_REGINFO_HEADER +#include "AArch64GenRegisterInfo.inc" + +namespace llvm { + +class AArch64InstrInfo; +class AArch64Subtarget; +class MachineFunction; +class RegScavenger; +class TargetRegisterClass; + +struct AArch64RegisterInfo : public AArch64GenRegisterInfo { +private: + const AArch64InstrInfo *TII; + const AArch64Subtarget *STI; + +public: + AArch64RegisterInfo(const AArch64InstrInfo *tii, const AArch64Subtarget *sti); + + bool isReservedReg(const MachineFunction &MF, unsigned Reg) const; + + /// Code Generation virtual methods... + const MCPhysReg * + getCalleeSavedRegs(const MachineFunction *MF = nullptr) const override; + const uint32_t *getCallPreservedMask(CallingConv::ID) const override; + + unsigned getCSRFirstUseCost() const override { + // The cost will be compared against BlockFrequency where entry has the + // value of 1 << 14. A value of 5 will choose to spill or split really + // cold path instead of using a callee-saved register. + return 5; + } + + // Calls involved in thread-local variable lookup save more registers than + // normal calls, so they need a different mask to represent this. 
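Note: for the GPR classes, getRegPressureLimit above starts from the 32 architectural registers, drops the always-reserved encoding 31 (XZR/SP), and then drops FP, X18 and X19 whenever getReservedRegs would reserve them. A compact restatement of that count (the booleans stand in for the TFI/STI/hasBasePointer queries; illustrative only):

// Restatement of the GPR branch of getRegPressureLimit() in the patch.
unsigned gprPressureLimit(bool hasFP, bool isDarwin, bool hasBaseReg) {
  unsigned Limit = 32 - 1;                 // encoding 31 is XZR/SP
  Limit -= (hasFP || isDarwin) ? 1 : 0;    // FP (X29) reserved
  Limit -= isDarwin ? 1 : 0;               // X18 is the Darwin platform register
  Limit -= hasBaseReg ? 1 : 0;             // X19 reserved as the base pointer
  return Limit;                            // FPR classes simply return 32
}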
+ const uint32_t *getTLSCallPreservedMask() const; + + /// getThisReturnPreservedMask - Returns a call preserved mask specific to the + /// case that 'returned' is on an i64 first argument if the calling convention + /// is one that can (partially) model this attribute with a preserved mask + /// (i.e. it is a calling convention that uses the same register for the first + /// i64 argument and an i64 return value) + /// + /// Should return NULL in the case that the calling convention does not have + /// this property + const uint32_t *getThisReturnPreservedMask(CallingConv::ID) const; + + BitVector getReservedRegs(const MachineFunction &MF) const override; + const TargetRegisterClass * + getPointerRegClass(const MachineFunction &MF, + unsigned Kind = 0) const override; + const TargetRegisterClass * + getCrossCopyRegClass(const TargetRegisterClass *RC) const override; + + bool requiresRegisterScavenging(const MachineFunction &MF) const override; + bool useFPForScavengingIndex(const MachineFunction &MF) const override; + bool requiresFrameIndexScavenging(const MachineFunction &MF) const override; + + bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const override; + bool isFrameOffsetLegal(const MachineInstr *MI, + int64_t Offset) const override; + void materializeFrameBaseRegister(MachineBasicBlock *MBB, unsigned BaseReg, + int FrameIdx, + int64_t Offset) const override; + void resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, + int64_t Offset) const override; + void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, + unsigned FIOperandNum, + RegScavenger *RS = nullptr) const override; + bool cannotEliminateFrame(const MachineFunction &MF) const; + + bool requiresVirtualBaseRegisters(const MachineFunction &MF) const override; + bool hasBasePointer(const MachineFunction &MF) const; + unsigned getBaseRegister() const; + + // Debug information queries. + unsigned getFrameRegister(const MachineFunction &MF) const override; + + unsigned getRegPressureLimit(const TargetRegisterClass *RC, + MachineFunction &MF) const override; +}; + +} // end namespace llvm + +#endif // LLVM_TARGET_AArch64REGISTERINFO_H diff --git a/lib/Target/AArch64/AArch64RegisterInfo.td b/lib/Target/AArch64/AArch64RegisterInfo.td new file mode 100644 index 00000000000..21c927f2385 --- /dev/null +++ b/lib/Target/AArch64/AArch64RegisterInfo.td @@ -0,0 +1,593 @@ +//=- AArch64RegisterInfo.td - Describe the AArch64 Regisers --*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + + +class AArch64Reg enc, string n, list subregs = [], + list altNames = []> + : Register { + let HWEncoding = enc; + let Namespace = "AArch64"; + let SubRegs = subregs; +} + +let Namespace = "AArch64" in { + def sub_32 : SubRegIndex<32>; + + def bsub : SubRegIndex<8>; + def hsub : SubRegIndex<16>; + def ssub : SubRegIndex<32>; + def dsub : SubRegIndex<32>; + def qhisub : SubRegIndex<64>; + def qsub : SubRegIndex<64>; + // Note: Code depends on these having consecutive numbers + def dsub0 : SubRegIndex<64>; + def dsub1 : SubRegIndex<64>; + def dsub2 : SubRegIndex<64>; + def dsub3 : SubRegIndex<64>; + // Note: Code depends on these having consecutive numbers + def qsub0 : SubRegIndex<128>; + def qsub1 : SubRegIndex<128>; + def qsub2 : SubRegIndex<128>; + def qsub3 : SubRegIndex<128>; +} + +let Namespace = "AArch64" in { + def vreg : RegAltNameIndex; + def vlist1 : RegAltNameIndex; +} + +//===----------------------------------------------------------------------===// +// Registers +//===----------------------------------------------------------------------===// +def W0 : AArch64Reg<0, "w0" >, DwarfRegNum<[0]>; +def W1 : AArch64Reg<1, "w1" >, DwarfRegNum<[1]>; +def W2 : AArch64Reg<2, "w2" >, DwarfRegNum<[2]>; +def W3 : AArch64Reg<3, "w3" >, DwarfRegNum<[3]>; +def W4 : AArch64Reg<4, "w4" >, DwarfRegNum<[4]>; +def W5 : AArch64Reg<5, "w5" >, DwarfRegNum<[5]>; +def W6 : AArch64Reg<6, "w6" >, DwarfRegNum<[6]>; +def W7 : AArch64Reg<7, "w7" >, DwarfRegNum<[7]>; +def W8 : AArch64Reg<8, "w8" >, DwarfRegNum<[8]>; +def W9 : AArch64Reg<9, "w9" >, DwarfRegNum<[9]>; +def W10 : AArch64Reg<10, "w10">, DwarfRegNum<[10]>; +def W11 : AArch64Reg<11, "w11">, DwarfRegNum<[11]>; +def W12 : AArch64Reg<12, "w12">, DwarfRegNum<[12]>; +def W13 : AArch64Reg<13, "w13">, DwarfRegNum<[13]>; +def W14 : AArch64Reg<14, "w14">, DwarfRegNum<[14]>; +def W15 : AArch64Reg<15, "w15">, DwarfRegNum<[15]>; +def W16 : AArch64Reg<16, "w16">, DwarfRegNum<[16]>; +def W17 : AArch64Reg<17, "w17">, DwarfRegNum<[17]>; +def W18 : AArch64Reg<18, "w18">, DwarfRegNum<[18]>; +def W19 : AArch64Reg<19, "w19">, DwarfRegNum<[19]>; +def W20 : AArch64Reg<20, "w20">, DwarfRegNum<[20]>; +def W21 : AArch64Reg<21, "w21">, DwarfRegNum<[21]>; +def W22 : AArch64Reg<22, "w22">, DwarfRegNum<[22]>; +def W23 : AArch64Reg<23, "w23">, DwarfRegNum<[23]>; +def W24 : AArch64Reg<24, "w24">, DwarfRegNum<[24]>; +def W25 : AArch64Reg<25, "w25">, DwarfRegNum<[25]>; +def W26 : AArch64Reg<26, "w26">, DwarfRegNum<[26]>; +def W27 : AArch64Reg<27, "w27">, DwarfRegNum<[27]>; +def W28 : AArch64Reg<28, "w28">, DwarfRegNum<[28]>; +def W29 : AArch64Reg<29, "w29">, DwarfRegNum<[29]>; +def W30 : AArch64Reg<30, "w30">, DwarfRegNum<[30]>; +def WSP : AArch64Reg<31, "wsp">, DwarfRegNum<[31]>; +def WZR : AArch64Reg<31, "wzr">, DwarfRegAlias; + +let SubRegIndices = [sub_32] in { +def X0 : AArch64Reg<0, "x0", [W0]>, DwarfRegAlias; +def X1 : AArch64Reg<1, "x1", [W1]>, DwarfRegAlias; +def X2 : AArch64Reg<2, "x2", [W2]>, DwarfRegAlias; +def X3 : AArch64Reg<3, "x3", [W3]>, DwarfRegAlias; +def X4 : AArch64Reg<4, "x4", [W4]>, DwarfRegAlias; +def X5 : AArch64Reg<5, "x5", [W5]>, DwarfRegAlias; +def X6 : AArch64Reg<6, "x6", [W6]>, DwarfRegAlias; +def X7 : AArch64Reg<7, "x7", [W7]>, DwarfRegAlias; +def X8 : AArch64Reg<8, "x8", [W8]>, DwarfRegAlias; +def X9 : AArch64Reg<9, "x9", [W9]>, DwarfRegAlias; +def X10 : AArch64Reg<10, 
"x10", [W10]>, DwarfRegAlias; +def X11 : AArch64Reg<11, "x11", [W11]>, DwarfRegAlias; +def X12 : AArch64Reg<12, "x12", [W12]>, DwarfRegAlias; +def X13 : AArch64Reg<13, "x13", [W13]>, DwarfRegAlias; +def X14 : AArch64Reg<14, "x14", [W14]>, DwarfRegAlias; +def X15 : AArch64Reg<15, "x15", [W15]>, DwarfRegAlias; +def X16 : AArch64Reg<16, "x16", [W16]>, DwarfRegAlias; +def X17 : AArch64Reg<17, "x17", [W17]>, DwarfRegAlias; +def X18 : AArch64Reg<18, "x18", [W18]>, DwarfRegAlias; +def X19 : AArch64Reg<19, "x19", [W19]>, DwarfRegAlias; +def X20 : AArch64Reg<20, "x20", [W20]>, DwarfRegAlias; +def X21 : AArch64Reg<21, "x21", [W21]>, DwarfRegAlias; +def X22 : AArch64Reg<22, "x22", [W22]>, DwarfRegAlias; +def X23 : AArch64Reg<23, "x23", [W23]>, DwarfRegAlias; +def X24 : AArch64Reg<24, "x24", [W24]>, DwarfRegAlias; +def X25 : AArch64Reg<25, "x25", [W25]>, DwarfRegAlias; +def X26 : AArch64Reg<26, "x26", [W26]>, DwarfRegAlias; +def X27 : AArch64Reg<27, "x27", [W27]>, DwarfRegAlias; +def X28 : AArch64Reg<28, "x28", [W28]>, DwarfRegAlias; +def FP : AArch64Reg<29, "x29", [W29]>, DwarfRegAlias; +def LR : AArch64Reg<30, "x30", [W30]>, DwarfRegAlias; +def SP : AArch64Reg<31, "sp", [WSP]>, DwarfRegAlias; +def XZR : AArch64Reg<31, "xzr", [WZR]>, DwarfRegAlias; +} + +// Condition code register. +def NZCV : AArch64Reg<0, "nzcv">; + +// GPR register classes with the intersections of GPR32/GPR32sp and +// GPR64/GPR64sp for use by the coalescer. +def GPR32common : RegisterClass<"AArch64", [i32], 32, (sequence "W%u", 0, 30)> { + let AltOrders = [(rotl GPR32common, 8)]; + let AltOrderSelect = [{ return 1; }]; +} +def GPR64common : RegisterClass<"AArch64", [i64], 64, + (add (sequence "X%u", 0, 28), FP, LR)> { + let AltOrders = [(rotl GPR64common, 8)]; + let AltOrderSelect = [{ return 1; }]; +} +// GPR register classes which exclude SP/WSP. +def GPR32 : RegisterClass<"AArch64", [i32], 32, (add GPR32common, WZR)> { + let AltOrders = [(rotl GPR32, 8)]; + let AltOrderSelect = [{ return 1; }]; +} +def GPR64 : RegisterClass<"AArch64", [i64], 64, (add GPR64common, XZR)> { + let AltOrders = [(rotl GPR64, 8)]; + let AltOrderSelect = [{ return 1; }]; +} + +// GPR register classes which include SP/WSP. +def GPR32sp : RegisterClass<"AArch64", [i32], 32, (add GPR32common, WSP)> { + let AltOrders = [(rotl GPR32sp, 8)]; + let AltOrderSelect = [{ return 1; }]; +} +def GPR64sp : RegisterClass<"AArch64", [i64], 64, (add GPR64common, SP)> { + let AltOrders = [(rotl GPR64sp, 8)]; + let AltOrderSelect = [{ return 1; }]; +} + +def GPR32sponly : RegisterClass<"AArch64", [i32], 32, (add WSP)>; +def GPR64sponly : RegisterClass<"AArch64", [i64], 64, (add SP)>; + +def GPR64spPlus0Operand : AsmOperandClass { + let Name = "GPR64sp0"; + let RenderMethod = "addRegOperands"; + let ParserMethod = "tryParseGPR64sp0Operand"; +} + +def GPR64sp0 : RegisterOperand { + let ParserMatchClass = GPR64spPlus0Operand; +} + +// GPR register classes which include WZR/XZR AND SP/WSP. This is not a +// constraint used by any instructions, it is used as a common super-class. +def GPR32all : RegisterClass<"AArch64", [i32], 32, (add GPR32common, WZR, WSP)>; +def GPR64all : RegisterClass<"AArch64", [i64], 64, (add GPR64common, XZR, SP)>; + +// For tail calls, we can't use callee-saved registers, as they are restored +// to the saved value before the tail call, which would clobber a call address. +// This is for indirect tail calls to store the address of the destination. 
+def tcGPR64 : RegisterClass<"AArch64", [i64], 64, (sub GPR64common, X19, X20, X21, + X22, X23, X24, X25, X26, + X27, X28)>; + +// GPR register classes for post increment amount of vector load/store that +// has alternate printing when Rm=31 and prints a constant immediate value +// equal to the total number of bytes transferred. + +// FIXME: TableGen *should* be able to do these itself now. There appears to be +// a bug in counting how many operands a Post-indexed MCInst should have which +// means the aliases don't trigger. +def GPR64pi1 : RegisterOperand">; +def GPR64pi2 : RegisterOperand">; +def GPR64pi3 : RegisterOperand">; +def GPR64pi4 : RegisterOperand">; +def GPR64pi6 : RegisterOperand">; +def GPR64pi8 : RegisterOperand">; +def GPR64pi12 : RegisterOperand">; +def GPR64pi16 : RegisterOperand">; +def GPR64pi24 : RegisterOperand">; +def GPR64pi32 : RegisterOperand">; +def GPR64pi48 : RegisterOperand">; +def GPR64pi64 : RegisterOperand">; + +// Condition code regclass. +def CCR : RegisterClass<"AArch64", [i32], 32, (add NZCV)> { + let CopyCost = -1; // Don't allow copying of status registers. + + // CCR is not allocatable. + let isAllocatable = 0; +} + +//===----------------------------------------------------------------------===// +// Floating Point Scalar Registers +//===----------------------------------------------------------------------===// + +def B0 : AArch64Reg<0, "b0">, DwarfRegNum<[64]>; +def B1 : AArch64Reg<1, "b1">, DwarfRegNum<[65]>; +def B2 : AArch64Reg<2, "b2">, DwarfRegNum<[66]>; +def B3 : AArch64Reg<3, "b3">, DwarfRegNum<[67]>; +def B4 : AArch64Reg<4, "b4">, DwarfRegNum<[68]>; +def B5 : AArch64Reg<5, "b5">, DwarfRegNum<[69]>; +def B6 : AArch64Reg<6, "b6">, DwarfRegNum<[70]>; +def B7 : AArch64Reg<7, "b7">, DwarfRegNum<[71]>; +def B8 : AArch64Reg<8, "b8">, DwarfRegNum<[72]>; +def B9 : AArch64Reg<9, "b9">, DwarfRegNum<[73]>; +def B10 : AArch64Reg<10, "b10">, DwarfRegNum<[74]>; +def B11 : AArch64Reg<11, "b11">, DwarfRegNum<[75]>; +def B12 : AArch64Reg<12, "b12">, DwarfRegNum<[76]>; +def B13 : AArch64Reg<13, "b13">, DwarfRegNum<[77]>; +def B14 : AArch64Reg<14, "b14">, DwarfRegNum<[78]>; +def B15 : AArch64Reg<15, "b15">, DwarfRegNum<[79]>; +def B16 : AArch64Reg<16, "b16">, DwarfRegNum<[80]>; +def B17 : AArch64Reg<17, "b17">, DwarfRegNum<[81]>; +def B18 : AArch64Reg<18, "b18">, DwarfRegNum<[82]>; +def B19 : AArch64Reg<19, "b19">, DwarfRegNum<[83]>; +def B20 : AArch64Reg<20, "b20">, DwarfRegNum<[84]>; +def B21 : AArch64Reg<21, "b21">, DwarfRegNum<[85]>; +def B22 : AArch64Reg<22, "b22">, DwarfRegNum<[86]>; +def B23 : AArch64Reg<23, "b23">, DwarfRegNum<[87]>; +def B24 : AArch64Reg<24, "b24">, DwarfRegNum<[88]>; +def B25 : AArch64Reg<25, "b25">, DwarfRegNum<[89]>; +def B26 : AArch64Reg<26, "b26">, DwarfRegNum<[90]>; +def B27 : AArch64Reg<27, "b27">, DwarfRegNum<[91]>; +def B28 : AArch64Reg<28, "b28">, DwarfRegNum<[92]>; +def B29 : AArch64Reg<29, "b29">, DwarfRegNum<[93]>; +def B30 : AArch64Reg<30, "b30">, DwarfRegNum<[94]>; +def B31 : AArch64Reg<31, "b31">, DwarfRegNum<[95]>; + +let SubRegIndices = [bsub] in { +def H0 : AArch64Reg<0, "h0", [B0]>, DwarfRegAlias; +def H1 : AArch64Reg<1, "h1", [B1]>, DwarfRegAlias; +def H2 : AArch64Reg<2, "h2", [B2]>, DwarfRegAlias; +def H3 : AArch64Reg<3, "h3", [B3]>, DwarfRegAlias; +def H4 : AArch64Reg<4, "h4", [B4]>, DwarfRegAlias; +def H5 : AArch64Reg<5, "h5", [B5]>, DwarfRegAlias; +def H6 : AArch64Reg<6, "h6", [B6]>, DwarfRegAlias; +def H7 : AArch64Reg<7, "h7", [B7]>, DwarfRegAlias; +def H8 : AArch64Reg<8, "h8", [B8]>, DwarfRegAlias; +def H9 
: AArch64Reg<9, "h9", [B9]>, DwarfRegAlias; +def H10 : AArch64Reg<10, "h10", [B10]>, DwarfRegAlias; +def H11 : AArch64Reg<11, "h11", [B11]>, DwarfRegAlias; +def H12 : AArch64Reg<12, "h12", [B12]>, DwarfRegAlias; +def H13 : AArch64Reg<13, "h13", [B13]>, DwarfRegAlias; +def H14 : AArch64Reg<14, "h14", [B14]>, DwarfRegAlias; +def H15 : AArch64Reg<15, "h15", [B15]>, DwarfRegAlias; +def H16 : AArch64Reg<16, "h16", [B16]>, DwarfRegAlias; +def H17 : AArch64Reg<17, "h17", [B17]>, DwarfRegAlias; +def H18 : AArch64Reg<18, "h18", [B18]>, DwarfRegAlias; +def H19 : AArch64Reg<19, "h19", [B19]>, DwarfRegAlias; +def H20 : AArch64Reg<20, "h20", [B20]>, DwarfRegAlias; +def H21 : AArch64Reg<21, "h21", [B21]>, DwarfRegAlias; +def H22 : AArch64Reg<22, "h22", [B22]>, DwarfRegAlias; +def H23 : AArch64Reg<23, "h23", [B23]>, DwarfRegAlias; +def H24 : AArch64Reg<24, "h24", [B24]>, DwarfRegAlias; +def H25 : AArch64Reg<25, "h25", [B25]>, DwarfRegAlias; +def H26 : AArch64Reg<26, "h26", [B26]>, DwarfRegAlias; +def H27 : AArch64Reg<27, "h27", [B27]>, DwarfRegAlias; +def H28 : AArch64Reg<28, "h28", [B28]>, DwarfRegAlias; +def H29 : AArch64Reg<29, "h29", [B29]>, DwarfRegAlias; +def H30 : AArch64Reg<30, "h30", [B30]>, DwarfRegAlias; +def H31 : AArch64Reg<31, "h31", [B31]>, DwarfRegAlias; +} + +let SubRegIndices = [hsub] in { +def S0 : AArch64Reg<0, "s0", [H0]>, DwarfRegAlias; +def S1 : AArch64Reg<1, "s1", [H1]>, DwarfRegAlias; +def S2 : AArch64Reg<2, "s2", [H2]>, DwarfRegAlias; +def S3 : AArch64Reg<3, "s3", [H3]>, DwarfRegAlias; +def S4 : AArch64Reg<4, "s4", [H4]>, DwarfRegAlias; +def S5 : AArch64Reg<5, "s5", [H5]>, DwarfRegAlias; +def S6 : AArch64Reg<6, "s6", [H6]>, DwarfRegAlias; +def S7 : AArch64Reg<7, "s7", [H7]>, DwarfRegAlias; +def S8 : AArch64Reg<8, "s8", [H8]>, DwarfRegAlias; +def S9 : AArch64Reg<9, "s9", [H9]>, DwarfRegAlias; +def S10 : AArch64Reg<10, "s10", [H10]>, DwarfRegAlias; +def S11 : AArch64Reg<11, "s11", [H11]>, DwarfRegAlias; +def S12 : AArch64Reg<12, "s12", [H12]>, DwarfRegAlias; +def S13 : AArch64Reg<13, "s13", [H13]>, DwarfRegAlias; +def S14 : AArch64Reg<14, "s14", [H14]>, DwarfRegAlias; +def S15 : AArch64Reg<15, "s15", [H15]>, DwarfRegAlias; +def S16 : AArch64Reg<16, "s16", [H16]>, DwarfRegAlias; +def S17 : AArch64Reg<17, "s17", [H17]>, DwarfRegAlias; +def S18 : AArch64Reg<18, "s18", [H18]>, DwarfRegAlias; +def S19 : AArch64Reg<19, "s19", [H19]>, DwarfRegAlias; +def S20 : AArch64Reg<20, "s20", [H20]>, DwarfRegAlias; +def S21 : AArch64Reg<21, "s21", [H21]>, DwarfRegAlias; +def S22 : AArch64Reg<22, "s22", [H22]>, DwarfRegAlias; +def S23 : AArch64Reg<23, "s23", [H23]>, DwarfRegAlias; +def S24 : AArch64Reg<24, "s24", [H24]>, DwarfRegAlias; +def S25 : AArch64Reg<25, "s25", [H25]>, DwarfRegAlias; +def S26 : AArch64Reg<26, "s26", [H26]>, DwarfRegAlias; +def S27 : AArch64Reg<27, "s27", [H27]>, DwarfRegAlias; +def S28 : AArch64Reg<28, "s28", [H28]>, DwarfRegAlias; +def S29 : AArch64Reg<29, "s29", [H29]>, DwarfRegAlias; +def S30 : AArch64Reg<30, "s30", [H30]>, DwarfRegAlias; +def S31 : AArch64Reg<31, "s31", [H31]>, DwarfRegAlias; +} + +let SubRegIndices = [ssub], RegAltNameIndices = [vreg, vlist1] in { +def D0 : AArch64Reg<0, "d0", [S0], ["v0", ""]>, DwarfRegAlias; +def D1 : AArch64Reg<1, "d1", [S1], ["v1", ""]>, DwarfRegAlias; +def D2 : AArch64Reg<2, "d2", [S2], ["v2", ""]>, DwarfRegAlias; +def D3 : AArch64Reg<3, "d3", [S3], ["v3", ""]>, DwarfRegAlias; +def D4 : AArch64Reg<4, "d4", [S4], ["v4", ""]>, DwarfRegAlias; +def D5 : AArch64Reg<5, "d5", [S5], ["v5", ""]>, DwarfRegAlias; +def D6 : AArch64Reg<6, "d6", 
[S6], ["v6", ""]>, DwarfRegAlias; +def D7 : AArch64Reg<7, "d7", [S7], ["v7", ""]>, DwarfRegAlias; +def D8 : AArch64Reg<8, "d8", [S8], ["v8", ""]>, DwarfRegAlias; +def D9 : AArch64Reg<9, "d9", [S9], ["v9", ""]>, DwarfRegAlias; +def D10 : AArch64Reg<10, "d10", [S10], ["v10", ""]>, DwarfRegAlias; +def D11 : AArch64Reg<11, "d11", [S11], ["v11", ""]>, DwarfRegAlias; +def D12 : AArch64Reg<12, "d12", [S12], ["v12", ""]>, DwarfRegAlias; +def D13 : AArch64Reg<13, "d13", [S13], ["v13", ""]>, DwarfRegAlias; +def D14 : AArch64Reg<14, "d14", [S14], ["v14", ""]>, DwarfRegAlias; +def D15 : AArch64Reg<15, "d15", [S15], ["v15", ""]>, DwarfRegAlias; +def D16 : AArch64Reg<16, "d16", [S16], ["v16", ""]>, DwarfRegAlias; +def D17 : AArch64Reg<17, "d17", [S17], ["v17", ""]>, DwarfRegAlias; +def D18 : AArch64Reg<18, "d18", [S18], ["v18", ""]>, DwarfRegAlias; +def D19 : AArch64Reg<19, "d19", [S19], ["v19", ""]>, DwarfRegAlias; +def D20 : AArch64Reg<20, "d20", [S20], ["v20", ""]>, DwarfRegAlias; +def D21 : AArch64Reg<21, "d21", [S21], ["v21", ""]>, DwarfRegAlias; +def D22 : AArch64Reg<22, "d22", [S22], ["v22", ""]>, DwarfRegAlias; +def D23 : AArch64Reg<23, "d23", [S23], ["v23", ""]>, DwarfRegAlias; +def D24 : AArch64Reg<24, "d24", [S24], ["v24", ""]>, DwarfRegAlias; +def D25 : AArch64Reg<25, "d25", [S25], ["v25", ""]>, DwarfRegAlias; +def D26 : AArch64Reg<26, "d26", [S26], ["v26", ""]>, DwarfRegAlias; +def D27 : AArch64Reg<27, "d27", [S27], ["v27", ""]>, DwarfRegAlias; +def D28 : AArch64Reg<28, "d28", [S28], ["v28", ""]>, DwarfRegAlias; +def D29 : AArch64Reg<29, "d29", [S29], ["v29", ""]>, DwarfRegAlias; +def D30 : AArch64Reg<30, "d30", [S30], ["v30", ""]>, DwarfRegAlias; +def D31 : AArch64Reg<31, "d31", [S31], ["v31", ""]>, DwarfRegAlias; +} + +let SubRegIndices = [dsub], RegAltNameIndices = [vreg, vlist1] in { +def Q0 : AArch64Reg<0, "q0", [D0], ["v0", ""]>, DwarfRegAlias; +def Q1 : AArch64Reg<1, "q1", [D1], ["v1", ""]>, DwarfRegAlias; +def Q2 : AArch64Reg<2, "q2", [D2], ["v2", ""]>, DwarfRegAlias; +def Q3 : AArch64Reg<3, "q3", [D3], ["v3", ""]>, DwarfRegAlias; +def Q4 : AArch64Reg<4, "q4", [D4], ["v4", ""]>, DwarfRegAlias; +def Q5 : AArch64Reg<5, "q5", [D5], ["v5", ""]>, DwarfRegAlias; +def Q6 : AArch64Reg<6, "q6", [D6], ["v6", ""]>, DwarfRegAlias; +def Q7 : AArch64Reg<7, "q7", [D7], ["v7", ""]>, DwarfRegAlias; +def Q8 : AArch64Reg<8, "q8", [D8], ["v8", ""]>, DwarfRegAlias; +def Q9 : AArch64Reg<9, "q9", [D9], ["v9", ""]>, DwarfRegAlias; +def Q10 : AArch64Reg<10, "q10", [D10], ["v10", ""]>, DwarfRegAlias; +def Q11 : AArch64Reg<11, "q11", [D11], ["v11", ""]>, DwarfRegAlias; +def Q12 : AArch64Reg<12, "q12", [D12], ["v12", ""]>, DwarfRegAlias; +def Q13 : AArch64Reg<13, "q13", [D13], ["v13", ""]>, DwarfRegAlias; +def Q14 : AArch64Reg<14, "q14", [D14], ["v14", ""]>, DwarfRegAlias; +def Q15 : AArch64Reg<15, "q15", [D15], ["v15", ""]>, DwarfRegAlias; +def Q16 : AArch64Reg<16, "q16", [D16], ["v16", ""]>, DwarfRegAlias; +def Q17 : AArch64Reg<17, "q17", [D17], ["v17", ""]>, DwarfRegAlias; +def Q18 : AArch64Reg<18, "q18", [D18], ["v18", ""]>, DwarfRegAlias; +def Q19 : AArch64Reg<19, "q19", [D19], ["v19", ""]>, DwarfRegAlias; +def Q20 : AArch64Reg<20, "q20", [D20], ["v20", ""]>, DwarfRegAlias; +def Q21 : AArch64Reg<21, "q21", [D21], ["v21", ""]>, DwarfRegAlias; +def Q22 : AArch64Reg<22, "q22", [D22], ["v22", ""]>, DwarfRegAlias; +def Q23 : AArch64Reg<23, "q23", [D23], ["v23", ""]>, DwarfRegAlias; +def Q24 : AArch64Reg<24, "q24", [D24], ["v24", ""]>, DwarfRegAlias; +def Q25 : AArch64Reg<25, "q25", [D25], ["v25", ""]>, 
DwarfRegAlias; +def Q26 : AArch64Reg<26, "q26", [D26], ["v26", ""]>, DwarfRegAlias; +def Q27 : AArch64Reg<27, "q27", [D27], ["v27", ""]>, DwarfRegAlias; +def Q28 : AArch64Reg<28, "q28", [D28], ["v28", ""]>, DwarfRegAlias; +def Q29 : AArch64Reg<29, "q29", [D29], ["v29", ""]>, DwarfRegAlias; +def Q30 : AArch64Reg<30, "q30", [D30], ["v30", ""]>, DwarfRegAlias; +def Q31 : AArch64Reg<31, "q31", [D31], ["v31", ""]>, DwarfRegAlias; +} + +def FPR8 : RegisterClass<"AArch64", [untyped], 8, (sequence "B%u", 0, 31)> { + let Size = 8; +} +def FPR16 : RegisterClass<"AArch64", [f16], 16, (sequence "H%u", 0, 31)> { + let Size = 16; +} +def FPR32 : RegisterClass<"AArch64", [f32, i32], 32,(sequence "S%u", 0, 31)>; +def FPR64 : RegisterClass<"AArch64", [f64, i64, v2f32, v1f64, v8i8, v4i16, v2i32, + v1i64], + 64, (sequence "D%u", 0, 31)>; +// We don't (yet) have an f128 legal type, so don't use that here. We +// normalize 128-bit vectors to v2f64 for arg passing and such, so use +// that here. +def FPR128 : RegisterClass<"AArch64", + [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, f128], + 128, (sequence "Q%u", 0, 31)>; + +// The lower 16 vector registers. Some instructions can only take registers +// in this range. +def FPR128_lo : RegisterClass<"AArch64", + [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + 128, (trunc FPR128, 16)>; + +// Pairs, triples, and quads of 64-bit vector registers. +def DSeqPairs : RegisterTuples<[dsub0, dsub1], [(rotl FPR64, 0), (rotl FPR64, 1)]>; +def DSeqTriples : RegisterTuples<[dsub0, dsub1, dsub2], + [(rotl FPR64, 0), (rotl FPR64, 1), + (rotl FPR64, 2)]>; +def DSeqQuads : RegisterTuples<[dsub0, dsub1, dsub2, dsub3], + [(rotl FPR64, 0), (rotl FPR64, 1), + (rotl FPR64, 2), (rotl FPR64, 3)]>; +def DD : RegisterClass<"AArch64", [untyped], 64, (add DSeqPairs)> { + let Size = 128; +} +def DDD : RegisterClass<"AArch64", [untyped], 64, (add DSeqTriples)> { + let Size = 196; +} +def DDDD : RegisterClass<"AArch64", [untyped], 64, (add DSeqQuads)> { + let Size = 256; +} + +// Pairs, triples, and quads of 128-bit vector registers. +def QSeqPairs : RegisterTuples<[qsub0, qsub1], [(rotl FPR128, 0), (rotl FPR128, 1)]>; +def QSeqTriples : RegisterTuples<[qsub0, qsub1, qsub2], + [(rotl FPR128, 0), (rotl FPR128, 1), + (rotl FPR128, 2)]>; +def QSeqQuads : RegisterTuples<[qsub0, qsub1, qsub2, qsub3], + [(rotl FPR128, 0), (rotl FPR128, 1), + (rotl FPR128, 2), (rotl FPR128, 3)]>; +def QQ : RegisterClass<"AArch64", [untyped], 128, (add QSeqPairs)> { + let Size = 256; +} +def QQQ : RegisterClass<"AArch64", [untyped], 128, (add QSeqTriples)> { + let Size = 384; +} +def QQQQ : RegisterClass<"AArch64", [untyped], 128, (add QSeqQuads)> { + let Size = 512; +} + + +// Vector operand versions of the FP registers. Alternate name printing and +// assmebler matching. 
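Note: the bsub/hsub/ssub/dsub chain used for the scalar FP registers above nests each narrower view inside the next: Bn is the low byte of Hn, which sits in the low part of Sn, of Dn, of Qn, and the DD/QQ tuple classes are essentially bundles of consecutive D or Q registers built from the same pool. A tiny value-level model of the nesting (purely illustrative):

#include <cstdint>

// One 128-bit Qn register and its nested scalar views, mirroring the
// SubRegIndices declared above (bsub inside hsub inside ssub inside dsub).
struct VReg {
  uint64_t lo = 0, hi = 0;                                   // Qn
  uint64_t d() const { return lo; }                          // Dn
  uint32_t s() const { return static_cast<uint32_t>(lo); }   // Sn
  uint16_t h() const { return static_cast<uint16_t>(lo); }   // Hn
  uint8_t  b() const { return static_cast<uint8_t>(lo); }    // Bn
};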
+def VectorReg64AsmOperand : AsmOperandClass { + let Name = "VectorReg64"; + let PredicateMethod = "isVectorReg"; +} +def VectorReg128AsmOperand : AsmOperandClass { + let Name = "VectorReg128"; + let PredicateMethod = "isVectorReg"; +} + +def V64 : RegisterOperand { + let ParserMatchClass = VectorReg64AsmOperand; +} + +def V128 : RegisterOperand { + let ParserMatchClass = VectorReg128AsmOperand; +} + +def VectorRegLoAsmOperand : AsmOperandClass { let Name = "VectorRegLo"; } +def V128_lo : RegisterOperand { + let ParserMatchClass = VectorRegLoAsmOperand; +} + +class TypedVecListAsmOperand + : AsmOperandClass { + let Name = "TypedVectorList" # count # "_" # lanes # kind; + + let PredicateMethod + = "isTypedVectorList<" # count # ", " # lanes # ", '" # kind # "'>"; + let RenderMethod = "addVectorList" # regsize # "Operands<" # count # ">"; +} + +class TypedVecListRegOperand + : RegisterOperand">; + +multiclass VectorList { + // With implicit types (probably on instruction instead). E.g. { v0, v1 } + def _64AsmOperand : AsmOperandClass { + let Name = NAME # "64"; + let PredicateMethod = "isImplicitlyTypedVectorList<" # count # ">"; + let RenderMethod = "addVectorList64Operands<" # count # ">"; + } + + def "64" : RegisterOperand { + let ParserMatchClass = !cast(NAME # "_64AsmOperand"); + } + + def _128AsmOperand : AsmOperandClass { + let Name = NAME # "128"; + let PredicateMethod = "isImplicitlyTypedVectorList<" # count # ">"; + let RenderMethod = "addVectorList128Operands<" # count # ">"; + } + + def "128" : RegisterOperand { + let ParserMatchClass = !cast(NAME # "_128AsmOperand"); + } + + // 64-bit register lists with explicit type. + + // { v0.8b, v1.8b } + def _8bAsmOperand : TypedVecListAsmOperand; + def "8b" : TypedVecListRegOperand { + let ParserMatchClass = !cast(NAME # "_8bAsmOperand"); + } + + // { v0.4h, v1.4h } + def _4hAsmOperand : TypedVecListAsmOperand; + def "4h" : TypedVecListRegOperand { + let ParserMatchClass = !cast(NAME # "_4hAsmOperand"); + } + + // { v0.2s, v1.2s } + def _2sAsmOperand : TypedVecListAsmOperand; + def "2s" : TypedVecListRegOperand { + let ParserMatchClass = !cast(NAME # "_2sAsmOperand"); + } + + // { v0.1d, v1.1d } + def _1dAsmOperand : TypedVecListAsmOperand; + def "1d" : TypedVecListRegOperand { + let ParserMatchClass = !cast(NAME # "_1dAsmOperand"); + } + + // 128-bit register lists with explicit type + + // { v0.16b, v1.16b } + def _16bAsmOperand : TypedVecListAsmOperand; + def "16b" : TypedVecListRegOperand { + let ParserMatchClass = !cast(NAME # "_16bAsmOperand"); + } + + // { v0.8h, v1.8h } + def _8hAsmOperand : TypedVecListAsmOperand; + def "8h" : TypedVecListRegOperand { + let ParserMatchClass = !cast(NAME # "_8hAsmOperand"); + } + + // { v0.4s, v1.4s } + def _4sAsmOperand : TypedVecListAsmOperand; + def "4s" : TypedVecListRegOperand { + let ParserMatchClass = !cast(NAME # "_4sAsmOperand"); + } + + // { v0.2d, v1.2d } + def _2dAsmOperand : TypedVecListAsmOperand; + def "2d" : TypedVecListRegOperand { + let ParserMatchClass = !cast(NAME # "_2dAsmOperand"); + } + + // { v0.b, v1.b } + def _bAsmOperand : TypedVecListAsmOperand; + def "b" : TypedVecListRegOperand { + let ParserMatchClass = !cast(NAME # "_bAsmOperand"); + } + + // { v0.h, v1.h } + def _hAsmOperand : TypedVecListAsmOperand; + def "h" : TypedVecListRegOperand { + let ParserMatchClass = !cast(NAME # "_hAsmOperand"); + } + + // { v0.s, v1.s } + def _sAsmOperand : TypedVecListAsmOperand; + def "s" : TypedVecListRegOperand { + let ParserMatchClass = !cast(NAME # "_sAsmOperand"); + } + + // { 
v0.d, v1.d } + def _dAsmOperand : TypedVecListAsmOperand; + def "d" : TypedVecListRegOperand { + let ParserMatchClass = !cast(NAME # "_dAsmOperand"); + } + + +} + +defm VecListOne : VectorList<1, FPR64, FPR128>; +defm VecListTwo : VectorList<2, DD, QQ>; +defm VecListThree : VectorList<3, DDD, QQQ>; +defm VecListFour : VectorList<4, DDDD, QQQQ>; + + +// Register operand versions of the scalar FP registers. +def FPR16Op : RegisterOperand; +def FPR32Op : RegisterOperand; +def FPR64Op : RegisterOperand; +def FPR128Op : RegisterOperand; diff --git a/lib/Target/AArch64/AArch64SchedA53.td b/lib/Target/AArch64/AArch64SchedA53.td new file mode 100644 index 00000000000..0c3949ecfc1 --- /dev/null +++ b/lib/Target/AArch64/AArch64SchedA53.td @@ -0,0 +1,291 @@ +//==- AArch64SchedA53.td - Cortex-A53 Scheduling Definitions -*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the itinerary class data for the ARM Cortex A53 processors. +// +//===----------------------------------------------------------------------===// + +// ===---------------------------------------------------------------------===// +// The following definitions describe the simpler per-operand machine model. +// This works with MachineScheduler. See MCSchedModel.h for details. + +// Cortex-A53 machine model for scheduling and other instruction cost heuristics. +def CortexA53Model : SchedMachineModel { + let MicroOpBufferSize = 0; // Explicitly set to zero since A53 is in-order. + let IssueWidth = 2; // 2 micro-ops are dispatched per cycle. + let MinLatency = 1 ; // OperandCycles are interpreted as MinLatency. + let LoadLatency = 3; // Optimistic load latency assuming bypass. + // This is overriden by OperandCycles if the + // Itineraries are queried instead. + let MispredictPenalty = 9; // Based on "Cortex-A53 Software Optimisation + // Specification - Instruction Timings" + // v 1.0 Spreadsheet +} + + +//===----------------------------------------------------------------------===// +// Define each kind of processor resource and number available. + +// Modeling each pipeline as a ProcResource using the BufferSize = 0 since +// Cortex-A53 is in-order. + +def A53UnitALU : ProcResource<2> { let BufferSize = 0; } // Int ALU +def A53UnitMAC : ProcResource<1> { let BufferSize = 0; } // Int MAC +def A53UnitDiv : ProcResource<1> { let BufferSize = 0; } // Int Division +def A53UnitLdSt : ProcResource<1> { let BufferSize = 0; } // Load/Store +def A53UnitB : ProcResource<1> { let BufferSize = 0; } // Branch +def A53UnitFPALU : ProcResource<1> { let BufferSize = 0; } // FP ALU +def A53UnitFPMDS : ProcResource<1> { let BufferSize = 0; } // FP Mult/Div/Sqrt + + +//===----------------------------------------------------------------------===// +// Subtarget-specific SchedWrite types which both map the ProcResources and +// set the latency. + +let SchedModel = CortexA53Model in { + +// ALU - Despite having a full latency of 4, most of the ALU instructions can +// forward a cycle earlier and then two cycles earlier in the case of a +// shift-only instruction. These latencies will be incorrect when the +// result cannot be forwarded, but modeling isn't rocket surgery. 
+def : WriteRes { let Latency = 3; } +def : WriteRes { let Latency = 3; } +def : WriteRes { let Latency = 3; } +def : WriteRes { let Latency = 3; } +def : WriteRes { let Latency = 2; } +def : WriteRes { let Latency = 3; } + +// MAC +def : WriteRes { let Latency = 4; } +def : WriteRes { let Latency = 4; } + +// Div +def : WriteRes { let Latency = 4; } +def : WriteRes { let Latency = 4; } + +// Load +def : WriteRes { let Latency = 4; } +def : WriteRes { let Latency = 4; } +def : WriteRes { let Latency = 4; } + +// Vector Load - Vector loads take 1-5 cycles to issue. For the WriteVecLd +// below, choosing the median of 3 which makes the latency 6. +// May model this more carefully in the future. The remaining +// A53WriteVLD# types represent the 1-5 cycle issues explicitly. +def : WriteRes { let Latency = 6; + let ResourceCycles = [3]; } +def A53WriteVLD1 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 4; } +def A53WriteVLD2 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 5; + let ResourceCycles = [2]; } +def A53WriteVLD3 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 6; + let ResourceCycles = [3]; } +def A53WriteVLD4 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 7; + let ResourceCycles = [4]; } +def A53WriteVLD5 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 8; + let ResourceCycles = [5]; } + +// Pre/Post Indexing - Performed as part of address generation which is already +// accounted for in the WriteST* latencies below +def : WriteRes { let Latency = 0; } + +// Store +def : WriteRes { let Latency = 4; } +def : WriteRes { let Latency = 4; } +def : WriteRes { let Latency = 4; } +def : WriteRes { let Latency = 4; } + +// Vector Store - Similar to vector loads, can take 1-3 cycles to issue. +def : WriteRes { let Latency = 5; + let ResourceCycles = [2];} +def A53WriteVST1 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 4; } +def A53WriteVST2 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 5; + let ResourceCycles = [2]; } +def A53WriteVST3 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 6; + let ResourceCycles = [3]; } + +// Branch +def : WriteRes; +def : WriteRes; +def : WriteRes; +def : WriteRes; +def : WriteRes; + +// FP ALU +def : WriteRes { let Latency = 6; } +def : WriteRes { let Latency = 6; } +def : WriteRes { let Latency = 6; } +def : WriteRes { let Latency = 6; } +def : WriteRes { let Latency = 6; } +def : WriteRes { let Latency = 6; } + +// FP Mul, Div, Sqrt +def : WriteRes { let Latency = 6; } +def : WriteRes { let Latency = 33; + let ResourceCycles = [29]; } +def A53WriteFMAC : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 10; } +def A53WriteFDivSP : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 18; + let ResourceCycles = [14]; } +def A53WriteFDivDP : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 33; + let ResourceCycles = [29]; } +def A53WriteFSqrtSP : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 17; + let ResourceCycles = [13]; } +def A53WriteFSqrtDP : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 32; + let ResourceCycles = [28]; } + +//===----------------------------------------------------------------------===// +// Subtarget-specific SchedRead types. + +// No forwarding for these reads. +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; + +// ALU - Most operands in the ALU pipes are not needed for two cycles. Shiftable +// operands are needed one cycle later if and only if they are to be +// shifted. Otherwise, they too are needed two cycle later. This same +// ReadAdvance applies to Extended registers as well, even though there is +// a seperate SchedPredicate for them. 
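Note: read together, the A53 entries above describe an in-order, dual-issue machine. The WriteRes latencies say how long a dependent consumer waits, the BufferSize = 0 pipes make stalls visible, and the ReadAdvance entries defined just below let some operands be read late, so the effective dependence can be shorter than the raw latency. A toy estimate along those lines, using the 3-cycle ALU and 4-cycle MAC latencies and the 2-cycle "not shifted" read advance (a rough illustration only, not the MachineScheduler algorithm):

#include <cstdio>

// Effective dependence latency after a SchedReadAdvance: the consumer reads
// the operand N cycles after issue, so the producer's latency shrinks by N.
static unsigned effectiveLatency(unsigned writeLatency, unsigned readAdvance) {
  return writeLatency > readAdvance ? writeLatency - readAdvance : 0;
}

int main() {
  const unsigned IssueWidth = 2;   // per the CortexA53Model definition above
  const unsigned ALULat = 3, MACLat = 4, NotShiftedAdv = 2;

  // MUL feeding an ADD through a non-shifted operand: the ADD sees the MUL
  // result 2 cycles early, so the chain lower bound drops from 4+3 to 2+3.
  std::printf("mul->add chain: %u cycles (vs %u without the ReadAdvance)\n",
              effectiveLatency(MACLat, NotShiftedAdv) + ALULat, MACLat + ALULat);

  // Six independent ALU ops are issue-limited on a dual-issue in-order core.
  std::printf("6 independent ALU ops: >= %u cycles\n",
              (6 + IssueWidth - 1) / IssueWidth);
}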
+def : ReadAdvance; +def A53ReadShifted : SchedReadAdvance<1, [WriteImm,WriteI, + WriteISReg, WriteIEReg,WriteIS, + WriteID32,WriteID64, + WriteIM32,WriteIM64]>; +def A53ReadNotShifted : SchedReadAdvance<2, [WriteImm,WriteI, + WriteISReg, WriteIEReg,WriteIS, + WriteID32,WriteID64, + WriteIM32,WriteIM64]>; +def A53ReadISReg : SchedReadVariant<[ + SchedVar, + SchedVar]>; +def : SchedAlias; + +def A53ReadIEReg : SchedReadVariant<[ + SchedVar, + SchedVar]>; +def : SchedAlias; + +// MAC - Operands are generally needed one cycle later in the MAC pipe. +// Accumulator operands are needed two cycles later. +def : ReadAdvance; +def : ReadAdvance; + +// Div +def : ReadAdvance; + +//===----------------------------------------------------------------------===// +// Subtarget-specific InstRWs. + +//--- +// Miscellaneous +//--- +def : InstRW<[WriteI], (instrs COPY)>; + +//--- +// Vector Loads +//--- +def : InstRW<[A53WriteVLD1], (instregex "LD1i(8|16|32|64)$")>; +def : InstRW<[A53WriteVLD1], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[A53WriteVLD1], (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[A53WriteVLD2], (instregex "LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[A53WriteVLD3], (instregex "LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[A53WriteVLD4], (instregex "LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD1i(8|16|32|64)_POST$")>; +def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[A53WriteVLD3, WriteAdr], (instregex "LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[A53WriteVLD4, WriteAdr], (instregex "LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +def : InstRW<[A53WriteVLD1], (instregex "LD2i(8|16|32|64)$")>; +def : InstRW<[A53WriteVLD1], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[A53WriteVLD2], (instregex "LD2Twov(8b|4h|2s)$")>; +def : InstRW<[A53WriteVLD4], (instregex "LD2Twov(16b|8h|4s|2d)$")>; +def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD2i(8|16|32|64)(_POST)?$")>; +def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)(_POST)?$")>; +def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD2Twov(8b|4h|2s)(_POST)?$")>; +def : InstRW<[A53WriteVLD4, WriteAdr], (instregex "LD2Twov(16b|8h|4s|2d)(_POST)?$")>; + +def : InstRW<[A53WriteVLD2], (instregex "LD3i(8|16|32|64)$")>; +def : InstRW<[A53WriteVLD2], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[A53WriteVLD4], (instregex "LD3Threev(8b|4h|2s|1d|16b|8h|4s)$")>; +def : InstRW<[A53WriteVLD3], (instregex "LD3Threev(2d)$")>; +def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD3i(8|16|32|64)_POST$")>; +def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[A53WriteVLD4, WriteAdr], (instregex "LD3Threev(8b|4h|2s|1d|16b|8h|4s)_POST$")>; +def : InstRW<[A53WriteVLD3, WriteAdr], (instregex "LD3Threev(2d)_POST$")>; + +def : InstRW<[A53WriteVLD2], (instregex "LD4i(8|16|32|64)$")>; +def : InstRW<[A53WriteVLD2], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[A53WriteVLD5], (instregex "LD4Fourv(8b|4h|2s|1d|16b|8h|4s)$")>; +def : InstRW<[A53WriteVLD4], (instregex "LD4Fourv(2d)$")>; +def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD4i(8|16|32|64)_POST$")>; +def 
: InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[A53WriteVLD5, WriteAdr], (instregex "LD4Fourv(8b|4h|2s|1d|16b|8h|4s)_POST$")>; +def : InstRW<[A53WriteVLD4, WriteAdr], (instregex "LD4Fourv(2d)_POST$")>; + +//--- +// Vector Stores +//--- +def : InstRW<[A53WriteVST1], (instregex "ST1i(8|16|32|64)$")>; +def : InstRW<[A53WriteVST1], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[A53WriteVST1], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[A53WriteVST2], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[A53WriteVST2], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[A53WriteVST1, WriteAdr], (instregex "ST1i(8|16|32|64)_POST$")>; +def : InstRW<[A53WriteVST1, WriteAdr], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[A53WriteVST1, WriteAdr], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +def : InstRW<[A53WriteVST1], (instregex "ST2i(8|16|32|64)$")>; +def : InstRW<[A53WriteVST1], (instregex "ST2Twov(8b|4h|2s)$")>; +def : InstRW<[A53WriteVST2], (instregex "ST2Twov(16b|8h|4s|2d)$")>; +def : InstRW<[A53WriteVST1, WriteAdr], (instregex "ST2i(8|16|32|64)_POST$")>; +def : InstRW<[A53WriteVST1, WriteAdr], (instregex "ST2Twov(8b|4h|2s)_POST$")>; +def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST2Twov(16b|8h|4s|2d)_POST$")>; + +def : InstRW<[A53WriteVST2], (instregex "ST3i(8|16|32|64)$")>; +def : InstRW<[A53WriteVST3], (instregex "ST3Threev(8b|4h|2s|1d|16b|8h|4s)$")>; +def : InstRW<[A53WriteVST2], (instregex "ST3Threev(2d)$")>; +def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST3i(8|16|32|64)_POST$")>; +def : InstRW<[A53WriteVST3, WriteAdr], (instregex "ST3Threev(8b|4h|2s|1d|16b|8h|4s)_POST$")>; +def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST3Threev(2d)_POST$")>; + +def : InstRW<[A53WriteVST2], (instregex "ST4i(8|16|32|64)$")>; +def : InstRW<[A53WriteVST3], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s)$")>; +def : InstRW<[A53WriteVST2], (instregex "ST4Fourv(2d)$")>; +def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST4i(8|16|32|64)_POST$")>; +def : InstRW<[A53WriteVST3, WriteAdr], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s)_POST$")>; +def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST4Fourv(2d)_POST$")>; + +//--- +// Floating Point MAC, DIV, SQRT +//--- +def : InstRW<[A53WriteFMAC], (instregex "^FN?M(ADD|SUB).*")>; +def : InstRW<[A53WriteFMAC], (instregex "^FML(A|S).*")>; +def : InstRW<[A53WriteFDivSP], (instrs FDIVSrr)>; +def : InstRW<[A53WriteFDivDP], (instrs FDIVDrr)>; +def : InstRW<[A53WriteFDivSP], (instregex "^FDIVv.*32$")>; +def : InstRW<[A53WriteFDivDP], (instregex "^FDIVv.*64$")>; +def : InstRW<[A53WriteFSqrtSP], (instregex "^.*SQRT.*32$")>; +def : InstRW<[A53WriteFSqrtDP], (instregex "^.*SQRT.*64$")>; + +} diff --git a/lib/Target/AArch64/AArch64SchedCyclone.td b/lib/Target/AArch64/AArch64SchedCyclone.td new file mode 100644 index 00000000000..a2a18023778 --- /dev/null +++ b/lib/Target/AArch64/AArch64SchedCyclone.td @@ -0,0 +1,865 @@ +//=- ARMSchedCyclone.td - AArch64 Cyclone Scheduling Defs ----*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for AArch64 Cyclone to support +// instruction scheduling and other instruction cost heuristics. +// +//===----------------------------------------------------------------------===// + +def CycloneModel : SchedMachineModel { + let IssueWidth = 6; // 6 micro-ops are dispatched per cycle. + let MicroOpBufferSize = 192; // Based on the reorder buffer. + let LoadLatency = 4; // Optimistic load latency. + let MispredictPenalty = 16; // 14-19 cycles are typical. +} + +//===----------------------------------------------------------------------===// +// Define each kind of processor resource and number available on Cyclone. + +// 4 integer pipes +def CyUnitI : ProcResource<4> { + let BufferSize = 48; +} + +// 2 branch units: I[0..1] +def CyUnitB : ProcResource<2> { + let Super = CyUnitI; + let BufferSize = 24; +} + +// 1 indirect-branch unit: I[0] +def CyUnitBR : ProcResource<1> { + let Super = CyUnitB; +} + +// 2 shifter pipes: I[2..3] +// When an instruction consumes a CyUnitIS, it also consumes a CyUnitI +def CyUnitIS : ProcResource<2> { + let Super = CyUnitI; + let BufferSize = 24; +} + +// 1 mul pipe: I[0] +def CyUnitIM : ProcResource<1> { + let Super = CyUnitBR; + let BufferSize = 32; +} + +// 1 div pipe: I[1] +def CyUnitID : ProcResource<1> { + let Super = CyUnitB; + let BufferSize = 16; +} + +// 1 integer division unit. This is driven by the ID pipe, but only +// consumes the pipe for one cycle at issue and another cycle at writeback. +def CyUnitIntDiv : ProcResource<1>; + +// 2 ld/st pipes. +def CyUnitLS : ProcResource<2> { + let BufferSize = 28; +} + +// 3 fp/vector pipes. +def CyUnitV : ProcResource<3> { + let BufferSize = 48; +} +// 2 fp/vector arithmetic and multiply pipes: V[0-1] +def CyUnitVM : ProcResource<2> { + let Super = CyUnitV; + let BufferSize = 32; +} +// 1 fp/vector division/sqrt pipe: V[2] +def CyUnitVD : ProcResource<1> { + let Super = CyUnitV; + let BufferSize = 16; +} +// 1 fp compare pipe: V[0] +def CyUnitVC : ProcResource<1> { + let Super = CyUnitVM; + let BufferSize = 16; +} + +// 2 fp division/square-root units. These are driven by the VD pipe, +// but only consume the pipe for one cycle at issue and a cycle at writeback. +def CyUnitFloatDiv : ProcResource<2>; + +//===----------------------------------------------------------------------===// +// Define scheduler read/write resources and latency on Cyclone. +// This mirrors sections 7.7-7.9 of the Tuning Guide v1.0.1. + +let SchedModel = CycloneModel in { + +//--- +// 7.8.1. Moves +//--- + +// A single nop micro-op (uX). +def WriteX : SchedWriteRes<[]> { let Latency = 0; } + +// Move zero is a register rename (to machine register zero). +// The move is replaced by a single nop micro-op. +// MOVZ Rd, #0 +// AND Rd, Rzr, #imm +def WriteZPred : SchedPredicate<[{TII->isGPRZero(MI)}]>; +def WriteImmZ : SchedWriteVariant<[ + SchedVar, + SchedVar]>; +def : InstRW<[WriteImmZ], (instrs MOVZWi,MOVZXi,ANDWri,ANDXri)>; + +// Move GPR is a register rename and single nop micro-op. +// ORR Xd, XZR, Xm +// ADD Xd, Xn, #0 +def WriteIMovPred : SchedPredicate<[{TII->isGPRCopy(MI)}]>; +def WriteVMovPred : SchedPredicate<[{TII->isFPRCopy(MI)}]>; +def WriteMov : SchedWriteVariant<[ + SchedVar, + SchedVar, + SchedVar]>; +def : InstRW<[WriteMov], (instrs COPY,ORRXrr,ADDXrr)>; + +// Move non-zero immediate is an integer ALU op. +// MOVN,MOVZ,MOVK +def : WriteRes; + +//--- +// 7.8.2-7.8.5. 
Arithmetic and Logical, Comparison, Conditional, +// Shifts and Bitfield Operations +//--- + +// ADR,ADRP +// ADD(S)ri,SUB(S)ri,AND(S)ri,EORri,ORRri +// ADD(S)rr,SUB(S)rr,AND(S)rr,BIC(S)rr,EONrr,EORrr,ORNrr,ORRrr +// ADC(S),SBC(S) +// Aliases: CMN, CMP, TST +// +// Conditional operations. +// CCMNi,CCMPi,CCMNr,CCMPr, +// CSEL,CSINC,CSINV,CSNEG +// +// Bit counting and reversal operations. +// CLS,CLZ,RBIT,REV,REV16,REV32 +def : WriteRes; + +// ADD with shifted register operand is a single micro-op that +// consumes a shift pipeline for two cycles. +// ADD(S)rs,SUB(S)rs,AND(S)rs,BIC(S)rs,EONrs,EORrs,ORNrs,ORRrs +// EXAMPLE: ADDrs Xn, Xm LSL #imm +def : WriteRes { + let Latency = 2; + let ResourceCycles = [2]; +} + +// ADD with extended register operand is the same as shifted reg operand. +// ADD(S)re,SUB(S)re +// EXAMPLE: ADDXre Xn, Xm, UXTB #1 +def : WriteRes { + let Latency = 2; + let ResourceCycles = [2]; +} + +// Variable shift and bitfield operations. +// ASRV,LSLV,LSRV,RORV,BFM,SBFM,UBFM +def : WriteRes; + +// EXTR Shifts a pair of registers and requires two micro-ops. +// The second micro-op is delayed, as modeled by ReadExtrHi. +// EXTR Xn, Xm, #imm +def : WriteRes { + let Latency = 2; + let NumMicroOps = 2; +} + +// EXTR's first register read is delayed by one cycle, effectively +// shortening its writer's latency. +// EXTR Xn, Xm, #imm +def : ReadAdvance; + +//--- +// 7.8.6. Multiplies +//--- + +// MUL/MNEG are aliases for MADD/MSUB. +// MADDW,MSUBW,SMADDL,SMSUBL,UMADDL,UMSUBL +def : WriteRes { + let Latency = 4; +} +// MADDX,MSUBX,SMULH,UMULH +def : WriteRes { + let Latency = 5; +} + +//--- +// 7.8.7. Divide +//--- + +// 32-bit divide takes 7-13 cycles. 10 cycles covers a 20-bit quotient. +// The ID pipe is consumed for 2 cycles: issue and writeback. +// SDIVW,UDIVW +def : WriteRes { + let Latency = 10; + let ResourceCycles = [2, 10]; +} +// 64-bit divide takes 7-21 cycles. 13 cycles covers a 32-bit quotient. +// The ID pipe is consumed for 2 cycles: issue and writeback. +// SDIVX,UDIVX +def : WriteRes { + let Latency = 13; + let ResourceCycles = [2, 13]; +} + +//--- +// 7.8.8,7.8.10. Load/Store, single element +//--- + +// Integer loads take 4 cycles and use one LS unit for one cycle. +def : WriteRes { + let Latency = 4; +} + +// Store-load forwarding is 4 cycles. +// +// Note: The store-exclusive sequence incorporates this +// latency. However, general heuristics should not model the +// dependence between a store and subsequent may-alias load because +// hardware speculation works. +def : WriteRes { + let Latency = 4; +} + +// Load from base address plus an optionally scaled register offset. +// Rt latency is latency WriteIS + WriteLD. +// EXAMPLE: LDR Xn, Xm [, lsl 3] +def CyWriteLDIdx : SchedWriteVariant<[ + SchedVar, // Load from scaled register. + SchedVar]>; // Load from register offset. +def : SchedAlias; // Map AArch64->Cyclone type. + +// EXAMPLE: STR Xn, Xm [, lsl 3] +def CyWriteSTIdx : SchedWriteVariant<[ + SchedVar, // Store to scaled register. + SchedVar]>; // Store to register offset. +def : SchedAlias; // Map AArch64->Cyclone type. + +// Read the (unshifted) base register Xn in the second micro-op one cycle later. +// EXAMPLE: LDR Xn, Xm [, lsl 3] +def ReadBaseRS : SchedReadAdvance<1>; +def CyReadAdrBase : SchedReadVariant<[ + SchedVar, // Read base reg after shifting offset. + SchedVar]>; // Read base reg with no shift. +def : SchedAlias; // Map AArch64->Cyclone type. + +//--- +// 7.8.9,7.8.11. 
Load/Store, paired +//--- + +// Address pre/post increment is a simple ALU op with one cycle latency. +def : WriteRes; + +// LDP high register write is fused with the load, but a nop micro-op remains. +def : WriteRes { + let Latency = 4; +} + +// STP is a vector op and store, except for QQ, which is just two stores. +def : SchedAlias; +def : InstRW<[WriteST, WriteST], (instrs STPQi)>; + +//--- +// 7.8.13. Branches +//--- + +// Branches take a single micro-op. +// The misprediction penalty is defined as a SchedMachineModel property. +def : WriteRes {let Latency = 0;} +def : WriteRes {let Latency = 0;} + +//--- +// 7.8.14. Never-issued Instructions, Barrier and Hint Operations +//--- + +// NOP,SEV,SEVL,WFE,WFI,YIELD +def : WriteRes {let Latency = 0;} +// ISB +def : InstRW<[WriteI], (instrs ISB)>; +// SLREX,DMB,DSB +def : WriteRes; + +// System instructions get an invalid latency because the latency of +// other operations across them is meaningless. +def : WriteRes {let Latency = -1;} + +//===----------------------------------------------------------------------===// +// 7.9 Vector Unit Instructions + +// Simple vector operations take 2 cycles. +def : WriteRes {let Latency = 2;} + +// Define some longer latency vector op types for Cyclone. +def CyWriteV3 : SchedWriteRes<[CyUnitV]> {let Latency = 3;} +def CyWriteV4 : SchedWriteRes<[CyUnitV]> {let Latency = 4;} +def CyWriteV5 : SchedWriteRes<[CyUnitV]> {let Latency = 5;} +def CyWriteV6 : SchedWriteRes<[CyUnitV]> {let Latency = 6;} + +// Simple floating-point operations take 2 cycles. +def : WriteRes {let Latency = 2;} + +//--- +// 7.9.1 Vector Moves +//--- + +// TODO: Add Cyclone-specific zero-cycle zeros. LLVM currently +// generates expensive int-float conversion instead: +// FMOVDi Dd, #0.0 +// FMOVv2f64ns Vd.2d, #0.0 + +// FMOVSi,FMOVDi +def : WriteRes {let Latency = 2;} + +// MOVI,MVNI are WriteV +// FMOVv2f32ns,FMOVv2f64ns,FMOVv4f32ns are WriteV + +// Move FPR is a register rename and single nop micro-op. +// ORR.16b Vd,Vn,Vn +// COPY is handled above in the WriteMov Variant. +def WriteVMov : SchedWriteVariant<[ + SchedVar, + SchedVar]>; +def : InstRW<[WriteVMov], (instrs ORRv16i8)>; + +// FMOVSr,FMOVDr are WriteF. + +// MOV V,V is a WriteV. + +// CPY D,V[x] is a WriteV + +// INS V[x],V[y] is a WriteV. + +// FMOVWSr,FMOVXDr,FMOVXDHighr +def : WriteRes { + let Latency = 5; +} + +// FMOVSWr,FMOVDXr +def : InstRW<[WriteLD], (instrs FMOVSWr,FMOVDXr,FMOVDXHighr)>; + +// INS V[x],R +def CyWriteCopyToFPR : WriteSequence<[WriteVLD, WriteV]>; +def : InstRW<[CyWriteCopyToFPR], (instregex "INSv")>; + +// SMOV,UMOV R,V[x] +def CyWriteCopyToGPR : WriteSequence<[WriteLD, WriteI]>; +def : InstRW<[CyWriteCopyToGPR], (instregex "SMOVv","UMOVv")>; + +// DUP V,R +def : InstRW<[CyWriteCopyToFPR], (instregex "DUPv")>; + +// DUP V,V[x] is a WriteV. + +//--- +// 7.9.2 Integer Arithmetic, Logical, and Comparisons +//--- + +// BIC,ORR V,#imm are WriteV + +def : InstRW<[CyWriteV3], (instregex "ABSv")>; + +// MVN,NEG,NOT are WriteV + +def : InstRW<[CyWriteV3], (instregex "SQABSv","SQNEGv")>; + +// ADDP is a WriteV. +def CyWriteVADDLP : SchedWriteRes<[CyUnitV]> {let Latency = 2;} +def : InstRW<[CyWriteVADDLP], (instregex "SADDLPv","UADDLPv")>; + +def : InstRW<[CyWriteV3], + (instregex "ADDVv","SMAXVv","UMAXVv","SMINVv","UMINVv")>; + +def : InstRW<[CyWriteV3], (instregex "SADDLV","UADDLV")>; + +// ADD,SUB are WriteV + +// Forward declare. +def CyWriteVABD : SchedWriteRes<[CyUnitV]> {let Latency = 3;} + +// Add/Diff and accumulate uses the vector multiply unit. 
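The accumulate patterns defined just below rely on SchedReadAdvance to model a late read of the accumulator operand. As a rough illustration of the arithmetic this implies (a minimal sketch with a hypothetical helper name, not LLVM API):

    #include <cstdio>

    // effectiveLatency: hypothetical helper showing how a ReadAdvance shortens
    // the producer's visible latency for the operand that carries it -- here,
    // the accumulator input of SADALP/SABA-style instructions.
    unsigned effectiveLatency(unsigned writeLatency, unsigned readAdvance) {
      return writeLatency > readAdvance ? writeLatency - readAdvance : 0;
    }

    int main() {
      // CyWriteVAccum has latency 3 and CyReadVAccum advances the read by 1,
      // so a chain of accumulates sees 3 - 1 = 2 cycles per link under this model.
      std::printf("%u\n", effectiveLatency(3, 1));
      return 0;
    }

The clamping at zero is the generic scheduler's behaviour; this file only supplies the latency and advance values.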
+def CyWriteVAccum : SchedWriteRes<[CyUnitVM]> {let Latency = 3;} +def CyReadVAccum : SchedReadAdvance<1, + [CyWriteVAccum, CyWriteVADDLP, CyWriteVABD]>; + +def : InstRW<[CyWriteVAccum, CyReadVAccum], + (instregex "SADALP","UADALP")>; + +def : InstRW<[CyWriteVAccum, CyReadVAccum], + (instregex "SABAv","UABAv","SABALv","UABALv")>; + +def : InstRW<[CyWriteV3], (instregex "SQADDv","SQSUBv","UQADDv","UQSUBv")>; + +def : InstRW<[CyWriteV3], (instregex "SUQADDv","USQADDv")>; + +def : InstRW<[CyWriteV4], (instregex "ADDHNv","RADDHNv", "RSUBHNv", "SUBHNv")>; + +// WriteV includes: +// AND,BIC,CMTST,EOR,ORN,ORR +// ADDP +// SHADD,SHSUB,SRHADD,UHADD,UHSUB,URHADD +// SADDL,SSUBL,UADDL,USUBL +// SADDW,SSUBW,UADDW,USUBW + +def : InstRW<[CyWriteV3], (instregex "CMEQv","CMGEv","CMGTv", + "CMLEv","CMLTv", + "CMHIv","CMHSv")>; + +def : InstRW<[CyWriteV3], (instregex "SMAXv","SMINv","UMAXv","UMINv", + "SMAXPv","SMINPv","UMAXPv","UMINPv")>; + +def : InstRW<[CyWriteVABD], (instregex "SABDv","UABDv", + "SABDLv","UABDLv")>; + +//--- +// 7.9.3 Floating Point Arithmetic and Comparisons +//--- + +// FABS,FNEG are WriteF + +def : InstRW<[CyWriteV4], (instrs FADDPv2i32p)>; +def : InstRW<[CyWriteV5], (instrs FADDPv2i64p)>; + +def : InstRW<[CyWriteV3], (instregex "FMAXPv2i","FMAXNMPv2i", + "FMINPv2i","FMINNMPv2i")>; + +def : InstRW<[CyWriteV4], (instregex "FMAXVv","FMAXNMVv","FMINVv","FMINNMVv")>; + +def : InstRW<[CyWriteV4], (instrs FADDSrr,FADDv2f32,FADDv4f32, + FSUBSrr,FSUBv2f32,FSUBv4f32, + FADDPv2f32,FADDPv4f32, + FABD32,FABDv2f32,FABDv4f32)>; +def : InstRW<[CyWriteV5], (instrs FADDDrr,FADDv2f64, + FSUBDrr,FSUBv2f64, + FADDPv2f64, + FABD64,FABDv2f64)>; + +def : InstRW<[CyWriteV3], (instregex "FCMEQ","FCMGT","FCMLE","FCMLT")>; + +def : InstRW<[CyWriteV3], (instregex "FACGE","FACGT", + "FMAXS","FMAXD","FMAXv", + "FMINS","FMIND","FMINv", + "FMAXNMS","FMAXNMD","FMAXNMv", + "FMINNMS","FMINNMD","FMINNMv", + "FMAXPv2f","FMAXPv4f", + "FMINPv2f","FMINPv4f", + "FMAXNMPv2f","FMAXNMPv4f", + "FMINNMPv2f","FMINNMPv4f")>; + +// FCMP,FCMPE,FCCMP,FCCMPE +def : WriteRes {let Latency = 4;} + +// FCSEL is a WriteF. + +//--- +// 7.9.4 Shifts and Bitfield Operations +//--- + +// SHL is a WriteV + +def CyWriteVSHR : SchedWriteRes<[CyUnitV]> {let Latency = 2;} +def : InstRW<[CyWriteVSHR], (instregex "SSHRv","USHRv")>; + +def CyWriteVSRSHR : SchedWriteRes<[CyUnitV]> {let Latency = 3;} +def : InstRW<[CyWriteVSRSHR], (instregex "SRSHRv","URSHRv")>; + +// Shift and accumulate uses the vector multiply unit. +def CyWriteVShiftAcc : SchedWriteRes<[CyUnitVM]> {let Latency = 3;} +def CyReadVShiftAcc : SchedReadAdvance<1, + [CyWriteVShiftAcc, CyWriteVSHR, CyWriteVSRSHR]>; +def : InstRW<[CyWriteVShiftAcc, CyReadVShiftAcc], + (instregex "SRSRAv","SSRAv","URSRAv","USRAv")>; + +// SSHL,USHL are WriteV. + +def : InstRW<[CyWriteV3], (instregex "SRSHLv","URSHLv")>; + +// SQSHL,SQSHLU,UQSHL are WriteV. + +def : InstRW<[CyWriteV3], (instregex "SQRSHLv","UQRSHLv")>; + +// WriteV includes: +// SHLL,SSHLL,USHLL +// SLI,SRI +// BIF,BIT,BSL +// EXT +// CLS,CLZ,CNT,RBIT,REV16,REV32,REV64,XTN +// XTN2 + +def : InstRW<[CyWriteV4], + (instregex "RSHRNv","SHRNv", + "SQRSHRNv","SQRSHRUNv","SQSHRNv","SQSHRUNv", + "UQRSHRNv","UQSHRNv","SQXTNv","SQXTUNv","UQXTNv")>; + +//--- +// 7.9.5 Multiplication +//--- + +def CyWriteVMul : SchedWriteRes<[CyUnitVM]> { let Latency = 4;} +def : InstRW<[CyWriteVMul], (instregex "MULv","SMULLv","UMULLv", + "SQDMULLv","SQDMULHv","SQRDMULHv")>; + +// FMUL,FMULX,FNMUL default to WriteFMul. 
+def : WriteRes { let Latency = 4;} + +def CyWriteV64Mul : SchedWriteRes<[CyUnitVM]> { let Latency = 5;} +def : InstRW<[CyWriteV64Mul], (instrs FMULDrr,FMULv2f64,FMULv2i64_indexed, + FNMULDrr,FMULX64,FMULXv2f64,FMULXv2i64_indexed)>; + +def CyReadVMulAcc : SchedReadAdvance<1, [CyWriteVMul, CyWriteV64Mul]>; +def : InstRW<[CyWriteVMul, CyReadVMulAcc], + (instregex "MLA","MLS","SMLAL","SMLSL","UMLAL","UMLSL", + "SQDMLAL","SQDMLSL")>; + +def CyWriteSMul : SchedWriteRes<[CyUnitVM]> { let Latency = 8;} +def CyWriteDMul : SchedWriteRes<[CyUnitVM]> { let Latency = 10;} +def CyReadSMul : SchedReadAdvance<4, [CyWriteSMul]>; +def CyReadDMul : SchedReadAdvance<5, [CyWriteDMul]>; + +def : InstRW<[CyWriteSMul, CyReadSMul], + (instrs FMADDSrrr,FMSUBSrrr,FNMADDSrrr,FNMSUBSrrr, + FMLAv2f32,FMLAv4f32, + FMLAv1i32_indexed,FMLAv1i64_indexed,FMLAv2i32_indexed)>; +def : InstRW<[CyWriteDMul, CyReadDMul], + (instrs FMADDDrrr,FMSUBDrrr,FNMADDDrrr,FNMSUBDrrr, + FMLAv2f64,FMLAv2i64_indexed, + FMLSv2f64,FMLSv2i64_indexed)>; + +def CyWritePMUL : SchedWriteRes<[CyUnitVD]> { let Latency = 3; } +def : InstRW<[CyWritePMUL], (instregex "PMULv", "PMULLv")>; + +//--- +// 7.9.6 Divide and Square Root +//--- + +// FDIV,FSQRT +// TODO: Add 64-bit variant with 19 cycle latency. +// TODO: Specialize FSQRT for longer latency. +def : WriteRes { + let Latency = 17; + let ResourceCycles = [2, 17]; +} + +def : InstRW<[CyWriteV4], (instregex "FRECPEv","FRECPXv","URECPEv","URSQRTEv")>; + +def WriteFRSQRTE : SchedWriteRes<[CyUnitVM]> { let Latency = 4; } +def : InstRW<[WriteFRSQRTE], (instregex "FRSQRTEv")>; + +def WriteFRECPS : SchedWriteRes<[CyUnitVM]> { let Latency = 8; } +def WriteFRSQRTS : SchedWriteRes<[CyUnitVM]> { let Latency = 10; } +def : InstRW<[WriteFRECPS], (instregex "FRECPSv")>; +def : InstRW<[WriteFRSQRTS], (instregex "FRSQRTSv")>; + +//--- +// 7.9.7 Integer-FP Conversions +//--- + +// FCVT lengthen f16/s32 +def : InstRW<[WriteV], (instrs FCVTSHr,FCVTDHr,FCVTDSr)>; + +// FCVT,FCVTN,FCVTXN +// SCVTF,UCVTF V,V +// FRINT(AIMNPXZ) V,V +def : WriteRes {let Latency = 4;} + +// SCVT/UCVT S/D, Rd = VLD5+V4: 9 cycles. +def CyWriteCvtToFPR : WriteSequence<[WriteVLD, CyWriteV4]>; +def : InstRW<[CyWriteCopyToFPR], (instregex "FCVT[AMNPZ][SU][SU][WX][SD]r")>; + +// FCVT Rd, S/D = V6+LD4: 10 cycles +def CyWriteCvtToGPR : WriteSequence<[CyWriteV6, WriteLD]>; +def : InstRW<[CyWriteCvtToGPR], (instregex "[SU]CVTF[SU][WX][SD]r")>; + +// FCVTL is a WriteV + +//--- +// 7.9.8-7.9.10 Cryptography, Data Transposition, Table Lookup +//--- + +def CyWriteCrypto2 : SchedWriteRes<[CyUnitVD]> {let Latency = 2;} +def : InstRW<[CyWriteCrypto2], (instrs AESIMCrr, AESMCrr, SHA1Hrr, + AESDrr, AESErr, SHA1SU1rr, SHA256SU0rr, + SHA1SU0rrr)>; + +def CyWriteCrypto3 : SchedWriteRes<[CyUnitVD]> {let Latency = 3;} +def : InstRW<[CyWriteCrypto3], (instrs SHA256SU1rrr)>; + +def CyWriteCrypto6 : SchedWriteRes<[CyUnitVD]> {let Latency = 6;} +def : InstRW<[CyWriteCrypto6], (instrs SHA1Crrr, SHA1Mrrr, SHA1Prrr, + SHA256Hrrr,SHA256H2rrr)>; + +// TRN,UZP,ZUP are WriteV. + +// TBL,TBX are WriteV. + +//--- +// 7.9.11-7.9.14 Load/Store, single element and paired +//--- + +// Loading into the vector unit takes 5 cycles vs 4 for integer loads. +def : WriteRes { + let Latency = 5; +} + +// Store-load forwarding is 4 cycles. +def : WriteRes { + let Latency = 4; +} + +// WriteVLDPair/VSTPair sequences are expanded by the target description. + +//--- +// 7.9.15 Load, element operations +//--- + +// Only the first WriteVLD and WriteAdr for writeback matches def operands. 
+// Subsequent WriteVLDs consume resources. Since all loaded values have the +// same latency, this is acceptable. + +// Vd is read 5 cycles after issuing the vector load. +def : ReadAdvance; + +def : InstRW<[WriteVLD], + (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[WriteVLD, WriteAdr], + (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>; + +// Register writes from the load's high half are fused micro-ops. +def : InstRW<[WriteVLD], + (instregex "LD1Twov(8b|4h|2s|1d)$")>; +def : InstRW<[WriteVLD, WriteAdr], + (instregex "LD1Twov(8b|4h|2s|1d)_POST")>; +def : InstRW<[WriteVLD, WriteVLD], + (instregex "LD1Twov(16b|8h|4s|2d)$")>; +def : InstRW<[WriteVLD, WriteAdr, WriteVLD], + (instregex "LD1Twov(16b|8h|4s|2d)_POST")>; + +def : InstRW<[WriteVLD, WriteVLD], + (instregex "LD1Threev(8b|4h|2s|1d)$")>; +def : InstRW<[WriteVLD, WriteAdr, WriteVLD], + (instregex "LD1Threev(8b|4h|2s|1d)_POST")>; +def : InstRW<[WriteVLD, WriteVLD, WriteVLD], + (instregex "LD1Threev(16b|8h|4s|2d)$")>; +def : InstRW<[WriteVLD, WriteAdr, WriteVLD, WriteVLD], + (instregex "LD1Threev(16b|8h|4s|2d)_POST")>; + +def : InstRW<[WriteVLD, WriteVLD], + (instregex "LD1Fourv(8b|4h|2s|1d)$")>; +def : InstRW<[WriteVLD, WriteAdr, WriteVLD], + (instregex "LD1Fourv(8b|4h|2s|1d)_POST")>; +def : InstRW<[WriteVLD, WriteVLD, WriteVLD, WriteVLD], + (instregex "LD1Fourv(16b|8h|4s|2d)$")>; +def : InstRW<[WriteVLD, WriteAdr, WriteVLD, WriteVLD, WriteVLD], + (instregex "LD1Fourv(16b|8h|4s|2d)_POST")>; + +def : InstRW<[WriteVLDShuffle, ReadVLD], + (instregex "LD1i(8|16|32)$")>; +def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr], + (instregex "LD1i(8|16|32)_POST")>; + +def : InstRW<[WriteVLDShuffle, ReadVLD], (instrs LD1i64)>; +def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr],(instrs LD1i64_POST)>; + +def : InstRW<[WriteVLDShuffle], + (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[WriteVLDShuffle, WriteAdr], + (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +def : InstRW<[WriteVLDShuffle, WriteV], + (instregex "LD2Twov(8b|4h|2s)$")>; +def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV], + (instregex "LD2Twov(8b|4h|2s)_POST$")>; +def : InstRW<[WriteVLDShuffle, WriteVLDShuffle], + (instregex "LD2Twov(16b|8h|4s|2d)$")>; +def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle], + (instregex "LD2Twov(16b|8h|4s|2d)_POST")>; + +def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV], + (instregex "LD2i(8|16|32)$")>; +def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV], + (instregex "LD2i(8|16|32)_POST")>; +def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV], + (instregex "LD2i64$")>; +def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV], + (instregex "LD2i64_POST")>; + +def : InstRW<[WriteVLDShuffle, WriteV], + (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV], + (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>; + +def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV], + (instregex "LD3Threev(8b|4h|2s)$")>; +def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV], + (instregex "LD3Threev(8b|4h|2s)_POST")>; +def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteVLDShuffle], + (instregex "LD3Threev(16b|8h|4s|2d)$")>; +def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteVLDShuffle], + (instregex "LD3Threev(16b|8h|4s|2d)_POST")>; + +def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV, WriteV], + (instregex "LD3i(8|16|32)$")>; +def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV, WriteV], + (instregex "LD3i(8|16|32)_POST")>; + +def : 
InstRW<[WriteVLDShuffle, ReadVLD, WriteVLDShuffle, WriteV], + (instregex "LD3i64$")>; +def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteVLDShuffle, WriteV], + (instregex "LD3i64_POST")>; + +def : InstRW<[WriteVLDShuffle, WriteV, WriteV], + (instregex "LD3Rv(8b|4h|2s|16b|8h|4s)$")>; +def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV, WriteV], + (instregex "LD3Rv(8b|4h|2s|16b|8h|4s)_POST")>; + +def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV], + (instrs LD3Rv1d,LD3Rv2d)>; +def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV], + (instrs LD3Rv2d_POST,LD3Rv2d_POST)>; + +def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV, WriteV], + (instregex "LD4Fourv(8b|4h|2s)$")>; +def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV, WriteV], + (instregex "LD4Fourv(8b|4h|2s)_POST")>; +def : InstRW<[WriteVLDPairShuffle, WriteVLDPairShuffle, + WriteVLDPairShuffle, WriteVLDPairShuffle], + (instregex "LD4Fourv(16b|8h|4s|2d)$")>; +def : InstRW<[WriteVLDPairShuffle, WriteAdr, WriteVLDPairShuffle, + WriteVLDPairShuffle, WriteVLDPairShuffle], + (instregex "LD4Fourv(16b|8h|4s|2d)_POST")>; + +def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV, WriteV, WriteV], + (instregex "LD4i(8|16|32)$")>; +def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV, WriteV, WriteV], + (instregex "LD4i(8|16|32)_POST")>; + + +def : InstRW<[WriteVLDShuffle, ReadVLD, WriteVLDShuffle, WriteV, WriteV], + (instrs LD4i64)>; +def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteVLDShuffle, WriteV], + (instrs LD4i64_POST)>; + +def : InstRW<[WriteVLDShuffle, WriteV, WriteV, WriteV], + (instregex "LD4Rv(8b|4h|2s|16b|8h|4s)$")>; +def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV, WriteV, WriteV], + (instregex "LD4Rv(8b|4h|2s|16b|8h|4s)_POST")>; + +def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV, WriteV], + (instrs LD4Rv1d,LD4Rv2d)>; +def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV, WriteV], + (instrs LD4Rv1d_POST,LD4Rv2d_POST)>; + +//--- +// 7.9.16 Store, element operations +//--- + +// Only the WriteAdr for writeback matches a def operands. +// Subsequent WriteVLDs only consume resources. 
+ +def : InstRW<[WriteVST], + (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[WriteAdr, WriteVST], + (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>; + +def : InstRW<[WriteVSTShuffle], + (instregex "ST1Twov(8b|4h|2s|1d)$")>; +def : InstRW<[WriteAdr, WriteVSTShuffle], + (instregex "ST1Twov(8b|4h|2s|1d)_POST")>; +def : InstRW<[WriteVST, WriteVST], + (instregex "ST1Twov(16b|8h|4s|2d)$")>; +def : InstRW<[WriteAdr, WriteVST, WriteVST], + (instregex "ST1Twov(16b|8h|4s|2d)_POST")>; + +def : InstRW<[WriteVSTShuffle, WriteVST], + (instregex "ST1Threev(8b|4h|2s|1d)$")>; +def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVST], + (instregex "ST1Threev(8b|4h|2s|1d)_POST")>; +def : InstRW<[WriteVST, WriteVST, WriteVST], + (instregex "ST1Threev(16b|8h|4s|2d)$")>; +def : InstRW<[WriteAdr, WriteVST, WriteVST, WriteVST], + (instregex "ST1Threev(16b|8h|4s|2d)_POST")>; + +def : InstRW<[WriteVSTShuffle, WriteVSTShuffle], + (instregex "ST1Fourv(8b|4h|2s|1d)$")>; +def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle], + (instregex "ST1Fourv(8b|4h|2s|1d)_POST")>; +def : InstRW<[WriteVST, WriteVST, WriteVST, WriteVST], + (instregex "ST1Fourv(16b|8h|4s|2d)$")>; +def : InstRW<[WriteAdr, WriteVST, WriteVST, WriteVST, WriteVST], + (instregex "ST1Fourv(16b|8h|4s|2d)_POST")>; + +def : InstRW<[WriteVSTShuffle], (instregex "ST1i(8|16|32)$")>; +def : InstRW<[WriteAdr, WriteVSTShuffle], (instregex "ST1i(8|16|32)_POST")>; + +def : InstRW<[WriteVSTShuffle], (instrs ST1i64)>; +def : InstRW<[WriteAdr, WriteVSTShuffle], (instrs ST1i64_POST)>; + +def : InstRW<[WriteVSTShuffle], + (instregex "ST2Twov(8b|4h|2s)$")>; +def : InstRW<[WriteAdr, WriteVSTShuffle], + (instregex "ST2Twov(8b|4h|2s)_POST")>; +def : InstRW<[WriteVSTShuffle, WriteVSTShuffle], + (instregex "ST2Twov(16b|8h|4s|2d)$")>; +def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle], + (instregex "ST2Twov(16b|8h|4s|2d)_POST")>; + +def : InstRW<[WriteVSTShuffle], (instregex "ST2i(8|16|32)$")>; +def : InstRW<[WriteAdr, WriteVSTShuffle], (instregex "ST2i(8|16|32)_POST")>; +def : InstRW<[WriteVSTShuffle], (instrs ST2i64)>; +def : InstRW<[WriteAdr, WriteVSTShuffle], (instrs ST2i64_POST)>; + +def : InstRW<[WriteVSTShuffle, WriteVSTShuffle], + (instregex "ST3Threev(8b|4h|2s)$")>; +def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle], + (instregex "ST3Threev(8b|4h|2s)_POST")>; +def : InstRW<[WriteVSTShuffle, WriteVSTShuffle, WriteVSTShuffle], + (instregex "ST3Threev(16b|8h|4s|2d)$")>; +def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle, WriteVSTShuffle], + (instregex "ST3Threev(16b|8h|4s|2d)_POST")>; + +def : InstRW<[WriteVSTShuffle], (instregex "ST3i(8|16|32)$")>; +def : InstRW<[WriteAdr, WriteVSTShuffle], (instregex "ST3i(8|16|32)_POST")>; + +def :InstRW<[WriteVSTShuffle, WriteVSTShuffle], (instrs ST3i64)>; +def :InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle], (instrs ST3i64_POST)>; + +def : InstRW<[WriteVSTPairShuffle, WriteVSTPairShuffle], + (instregex "ST4Fourv(8b|4h|2s|1d)$")>; +def : InstRW<[WriteAdr, WriteVSTPairShuffle, WriteVSTPairShuffle], + (instregex "ST4Fourv(8b|4h|2s|1d)_POST")>; +def : InstRW<[WriteVSTPairShuffle, WriteVSTPairShuffle, + WriteVSTPairShuffle, WriteVSTPairShuffle], + (instregex "ST4Fourv(16b|8h|4s|2d)$")>; +def : InstRW<[WriteAdr, WriteVSTPairShuffle, WriteVSTPairShuffle, + WriteVSTPairShuffle, WriteVSTPairShuffle], + (instregex "ST4Fourv(16b|8h|4s|2d)_POST")>; + +def : InstRW<[WriteVSTPairShuffle], (instregex "ST4i(8|16|32)$")>; +def : InstRW<[WriteAdr, WriteVSTPairShuffle], (instregex 
"ST4i(8|16|32)_POST")>; + +def : InstRW<[WriteVSTShuffle, WriteVSTShuffle], (instrs ST4i64)>; +def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],(instrs ST4i64_POST)>; + +//--- +// Unused SchedRead types +//--- + +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; + +} // SchedModel = CycloneModel diff --git a/lib/Target/AArch64/AArch64Schedule.td b/lib/Target/AArch64/AArch64Schedule.td new file mode 100644 index 00000000000..eaa9110ab1b --- /dev/null +++ b/lib/Target/AArch64/AArch64Schedule.td @@ -0,0 +1,104 @@ +//==-- AArch64Schedule.td - AArch64 Scheduling Definitions -*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +// Define TII for use in SchedVariant Predicates. +// const MachineInstr *MI and const TargetSchedModel *SchedModel +// are defined by default. +def : PredicateProlog<[{ + const AArch64InstrInfo *TII = + static_cast(SchedModel->getInstrInfo()); + (void)TII; +}]>; + +// AArch64 Scheduler Definitions + +def WriteImm : SchedWrite; // MOVN, MOVZ +// TODO: Provide variants for MOV32/64imm Pseudos that dynamically +// select the correct sequence of WriteImms. + +def WriteI : SchedWrite; // ALU +def WriteISReg : SchedWrite; // ALU of Shifted-Reg +def WriteIEReg : SchedWrite; // ALU of Extended-Reg +def ReadI : SchedRead; // ALU +def ReadISReg : SchedRead; // ALU of Shifted-Reg +def ReadIEReg : SchedRead; // ALU of Extended-Reg +def WriteExtr : SchedWrite; // EXTR shifts a reg pair +def ReadExtrHi : SchedRead; // Read the high reg of the EXTR pair +def WriteIS : SchedWrite; // Shift/Scale +def WriteID32 : SchedWrite; // 32-bit Divide +def WriteID64 : SchedWrite; // 64-bit Divide +def ReadID : SchedRead; // 32/64-bit Divide +def WriteIM32 : SchedWrite; // 32-bit Multiply +def WriteIM64 : SchedWrite; // 64-bit Multiply +def ReadIM : SchedRead; // 32/64-bit Multiply +def ReadIMA : SchedRead; // 32/64-bit Multiply Accumulate +def WriteBr : SchedWrite; // Branch +def WriteBrReg : SchedWrite; // Indirect Branch + +def WriteLD : SchedWrite; // Load from base addr plus immediate offset +def WriteST : SchedWrite; // Store to base addr plus immediate offset +def WriteSTP : SchedWrite; // Store a register pair. +def WriteAdr : SchedWrite; // Address pre/post increment. + +def WriteLDIdx : SchedWrite; // Load from a register index (maybe scaled). +def WriteSTIdx : SchedWrite; // Store to a register index (maybe scaled). +def ReadAdrBase : SchedRead; // Read the base resister of a reg-offset LD/ST. + +// Predicate for determining when a shiftable register is shifted. +def RegShiftedPred : SchedPredicate<[{TII->hasShiftedReg(MI)}]>; + +// Predicate for determining when a extendedable register is extended. +def RegExtendedPred : SchedPredicate<[{TII->hasExtendedReg(MI)}]>; + +// ScaledIdxPred is true if a WriteLDIdx operand will be +// scaled. Subtargets can use this to dynamically select resources and +// latency for WriteLDIdx and ReadAdrBase. +def ScaledIdxPred : SchedPredicate<[{TII->isScaledAddr(MI)}]>; + +// Serialized two-level address load. +// EXAMPLE: LOADGot +def WriteLDAdr : WriteSequence<[WriteAdr, WriteLD]>; + +// Serialized two-level address lookup. +// EXAMPLE: MOVaddr... +def WriteAdrAdr : WriteSequence<[WriteAdr, WriteAdr]>; + +// The second register of a load-pair. 
+// LDP,LDPSW,LDNP,LDXP,LDAXP +def WriteLDHi : SchedWrite; + +// Store-exclusive is a store followed by a dependent load. +def WriteSTX : WriteSequence<[WriteST, WriteLD]>; + +def WriteSys : SchedWrite; // Long, variable latency system ops. +def WriteBarrier : SchedWrite; // Memory barrier. +def WriteHint : SchedWrite; // Hint instruction. + +def WriteF : SchedWrite; // General floating-point ops. +def WriteFCmp : SchedWrite; // Floating-point compare. +def WriteFCvt : SchedWrite; // Float conversion. +def WriteFCopy : SchedWrite; // Float-int register copy. +def WriteFImm : SchedWrite; // Floating-point immediate. +def WriteFMul : SchedWrite; // Floating-point multiply. +def WriteFDiv : SchedWrite; // Floating-point division. + +def WriteV : SchedWrite; // Vector ops. +def WriteVLD : SchedWrite; // Vector loads. +def WriteVST : SchedWrite; // Vector stores. + +// Read the unwritten lanes of the VLD's destination registers. +def ReadVLD : SchedRead; + +// Sequential vector load and shuffle. +def WriteVLDShuffle : WriteSequence<[WriteVLD, WriteV]>; +def WriteVLDPairShuffle : WriteSequence<[WriteVLD, WriteV, WriteV]>; + +// Store a shuffled vector. +def WriteVSTShuffle : WriteSequence<[WriteV, WriteVST]>; +def WriteVSTPairShuffle : WriteSequence<[WriteV, WriteV, WriteVST]>; diff --git a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp new file mode 100644 index 00000000000..5c65b750ee5 --- /dev/null +++ b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp @@ -0,0 +1,59 @@ +//===-- AArch64SelectionDAGInfo.cpp - AArch64 SelectionDAG Info -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the AArch64SelectionDAGInfo class. +// +//===----------------------------------------------------------------------===// + +#include "AArch64TargetMachine.h" +using namespace llvm; + +#define DEBUG_TYPE "aarch64-selectiondag-info" + +AArch64SelectionDAGInfo::AArch64SelectionDAGInfo(const TargetMachine &TM) + : TargetSelectionDAGInfo(TM), + Subtarget(&TM.getSubtarget()) {} + +AArch64SelectionDAGInfo::~AArch64SelectionDAGInfo() {} + +SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset( + SelectionDAG &DAG, SDLoc dl, SDValue Chain, SDValue Dst, SDValue Src, + SDValue Size, unsigned Align, bool isVolatile, + MachinePointerInfo DstPtrInfo) const { + // Check to see if there is a specialized entry-point for memory zeroing. + ConstantSDNode *V = dyn_cast(Src); + ConstantSDNode *SizeValue = dyn_cast(Size); + const char *bzeroEntry = + (V && V->isNullValue()) ? Subtarget->getBZeroEntry() : nullptr; + // For small size (< 256), it is not beneficial to use bzero + // instead of memset. 
+ if (bzeroEntry && (!SizeValue || SizeValue->getZExtValue() > 256)) { + const AArch64TargetLowering &TLI = + *static_cast( + DAG.getTarget().getTargetLowering()); + + EVT IntPtr = TLI.getPointerTy(); + Type *IntPtrTy = getDataLayout()->getIntPtrType(*DAG.getContext()); + TargetLowering::ArgListTy Args; + TargetLowering::ArgListEntry Entry; + Entry.Node = Dst; + Entry.Ty = IntPtrTy; + Args.push_back(Entry); + Entry.Node = Size; + Args.push_back(Entry); + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(dl).setChain(Chain) + .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), + DAG.getExternalSymbol(bzeroEntry, IntPtr), &Args, 0) + .setDiscardResult(); + std::pair CallResult = TLI.LowerCallTo(CLI); + return CallResult.second; + } + return SDValue(); +} diff --git a/lib/Target/AArch64/AArch64SelectionDAGInfo.h b/lib/Target/AArch64/AArch64SelectionDAGInfo.h new file mode 100644 index 00000000000..8381f9916a8 --- /dev/null +++ b/lib/Target/AArch64/AArch64SelectionDAGInfo.h @@ -0,0 +1,37 @@ +//===-- AArch64SelectionDAGInfo.h - AArch64 SelectionDAG Info ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the AArch64 subclass for TargetSelectionDAGInfo. +// +//===----------------------------------------------------------------------===// + +#ifndef AArch64SELECTIONDAGINFO_H +#define AArch64SELECTIONDAGINFO_H + +#include "llvm/Target/TargetSelectionDAGInfo.h" + +namespace llvm { + +class AArch64SelectionDAGInfo : public TargetSelectionDAGInfo { + /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can + /// make the right decision when generating code for different targets. + const AArch64Subtarget *Subtarget; + +public: + explicit AArch64SelectionDAGInfo(const TargetMachine &TM); + ~AArch64SelectionDAGInfo(); + + SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, SDValue Chain, + SDValue Dst, SDValue Src, SDValue Size, + unsigned Align, bool isVolatile, + MachinePointerInfo DstPtrInfo) const override; +}; +} + +#endif diff --git a/lib/Target/AArch64/AArch64StorePairSuppress.cpp b/lib/Target/AArch64/AArch64StorePairSuppress.cpp new file mode 100644 index 00000000000..45f8ddbd2d8 --- /dev/null +++ b/lib/Target/AArch64/AArch64StorePairSuppress.cpp @@ -0,0 +1,168 @@ +//===--- AArch64StorePairSuppress.cpp --- Suppress store pair formation ---===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass identifies floating point stores that should not be combined into +// store pairs. Later we may do the same for floating point loads. 
+// ===---------------------------------------------------------------------===// + +#include "AArch64InstrInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineTraceMetrics.h" +#include "llvm/CodeGen/TargetSchedule.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-stp-suppress" + +namespace { +class AArch64StorePairSuppress : public MachineFunctionPass { + const AArch64InstrInfo *TII; + const TargetRegisterInfo *TRI; + const MachineRegisterInfo *MRI; + MachineFunction *MF; + TargetSchedModel SchedModel; + MachineTraceMetrics *Traces; + MachineTraceMetrics::Ensemble *MinInstr; + +public: + static char ID; + AArch64StorePairSuppress() : MachineFunctionPass(ID) {} + + virtual const char *getPassName() const override { + return "AArch64 Store Pair Suppression"; + } + + bool runOnMachineFunction(MachineFunction &F) override; + +private: + bool shouldAddSTPToBlock(const MachineBasicBlock *BB); + + bool isNarrowFPStore(const MachineInstr &MI); + + virtual void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired(); + AU.addPreserved(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; +char AArch64StorePairSuppress::ID = 0; +} // anonymous + +FunctionPass *llvm::createAArch64StorePairSuppressPass() { + return new AArch64StorePairSuppress(); +} + +/// Return true if an STP can be added to this block without increasing the +/// critical resource height. STP is good to form in Ld/St limited blocks and +/// bad to form in float-point limited blocks. This is true independent of the +/// critical path. If the critical path is longer than the resource height, the +/// extra vector ops can limit physreg renaming. Otherwise, it could simply +/// oversaturate the vector units. +bool AArch64StorePairSuppress::shouldAddSTPToBlock(const MachineBasicBlock *BB) { + if (!MinInstr) + MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount); + + MachineTraceMetrics::Trace BBTrace = MinInstr->getTrace(BB); + unsigned ResLength = BBTrace.getResourceLength(); + + // Get the machine model's scheduling class for STPQi. + // Bypass TargetSchedule's SchedClass resolution since we only have an opcode. + unsigned SCIdx = TII->get(AArch64::STPDi).getSchedClass(); + const MCSchedClassDesc *SCDesc = + SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdx); + + // If a subtarget does not define resources for STPQi, bail here. + if (SCDesc->isValid() && !SCDesc->isVariant()) { + unsigned ResLenWithSTP = BBTrace.getResourceLength( + ArrayRef(), SCDesc); + if (ResLenWithSTP > ResLength) { + DEBUG(dbgs() << " Suppress STP in BB: " << BB->getNumber() + << " resources " << ResLength << " -> " << ResLenWithSTP + << "\n"); + return false; + } + } + return true; +} + +/// Return true if this is a floating-point store smaller than the V reg. On +/// cyclone, these require a vector shuffle before storing a pair. +/// Ideally we would call getMatchingPairOpcode() and have the machine model +/// tell us if it's profitable with no cpu knowledge here. +/// +/// FIXME: We plan to develop a decent Target abstraction for simple loads and +/// stores. Until then use a nasty switch similar to AArch64LoadStoreOptimizer. 
+bool AArch64StorePairSuppress::isNarrowFPStore(const MachineInstr &MI) { + switch (MI.getOpcode()) { + default: + return false; + case AArch64::STRSui: + case AArch64::STRDui: + case AArch64::STURSi: + case AArch64::STURDi: + return true; + } +} + +bool AArch64StorePairSuppress::runOnMachineFunction(MachineFunction &mf) { + MF = &mf; + TII = static_cast(MF->getTarget().getInstrInfo()); + TRI = MF->getTarget().getRegisterInfo(); + MRI = &MF->getRegInfo(); + const TargetSubtargetInfo &ST = + MF->getTarget().getSubtarget(); + SchedModel.init(*ST.getSchedModel(), &ST, TII); + + Traces = &getAnalysis(); + MinInstr = nullptr; + + DEBUG(dbgs() << "*** " << getPassName() << ": " << MF->getName() << '\n'); + + if (!SchedModel.hasInstrSchedModel()) { + DEBUG(dbgs() << " Skipping pass: no machine model present.\n"); + return false; + } + + // Check for a sequence of stores to the same base address. We don't need to + // precisely determine whether a store pair can be formed. But we do want to + // filter out most situations where we can't form store pairs to avoid + // computing trace metrics in those cases. + for (auto &MBB : *MF) { + bool SuppressSTP = false; + unsigned PrevBaseReg = 0; + for (auto &MI : MBB) { + if (!isNarrowFPStore(MI)) + continue; + unsigned BaseReg; + unsigned Offset; + if (TII->getLdStBaseRegImmOfs(&MI, BaseReg, Offset, TRI)) { + if (PrevBaseReg == BaseReg) { + // If this block can take STPs, skip ahead to the next block. + if (!SuppressSTP && shouldAddSTPToBlock(MI.getParent())) + break; + // Otherwise, continue unpairing the stores in this block. + DEBUG(dbgs() << "Unpairing store " << MI << "\n"); + SuppressSTP = true; + TII->suppressLdStPair(&MI); + } + PrevBaseReg = BaseReg; + } else + PrevBaseReg = 0; + } + } + // This pass just sets some internal MachineMemOperand flags. It can't really + // invalidate anything. + return false; +} diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp new file mode 100644 index 00000000000..cd69994620d --- /dev/null +++ b/lib/Target/AArch64/AArch64Subtarget.cpp @@ -0,0 +1,116 @@ +//===-- AArch64Subtarget.cpp - AArch64 Subtarget Information ----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the AArch64 specific subclass of TargetSubtarget. 
+// +//===----------------------------------------------------------------------===// + +#include "AArch64InstrInfo.h" +#include "AArch64Subtarget.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/MachineScheduler.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/Support/TargetRegistry.h" + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-subtarget" + +#define GET_SUBTARGETINFO_CTOR +#define GET_SUBTARGETINFO_TARGET_DESC +#include "AArch64GenSubtargetInfo.inc" + +static cl::opt +EnableEarlyIfConvert("aarch64-early-ifcvt", cl::desc("Enable the early if " + "converter pass"), cl::init(true), cl::Hidden); + +AArch64Subtarget::AArch64Subtarget(const std::string &TT, + const std::string &CPU, + const std::string &FS, bool LittleEndian) + : AArch64GenSubtargetInfo(TT, CPU, FS), ARMProcFamily(Others), + HasFPARMv8(false), HasNEON(false), HasCrypto(false), HasCRC(false), + HasZeroCycleRegMove(false), HasZeroCycleZeroing(false), CPUString(CPU), + TargetTriple(TT), IsLittleEndian(LittleEndian) { + // Determine default and user-specified characteristics + + if (CPUString.empty()) + CPUString = "generic"; + + ParseSubtargetFeatures(CPUString, FS); +} + +/// ClassifyGlobalReference - Find the target operand flags that describe +/// how a global value should be referenced for the current subtarget. +unsigned char +AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV, + const TargetMachine &TM) const { + + // Determine whether this is a reference to a definition or a declaration. + // Materializable GVs (in JIT lazy compilation mode) do not require an extra + // load from stub. + bool isDecl = GV->hasAvailableExternallyLinkage(); + if (GV->isDeclaration() && !GV->isMaterializable()) + isDecl = true; + + // MachO large model always goes via a GOT, simply to get a single 8-byte + // absolute relocation on all global addresses. + if (TM.getCodeModel() == CodeModel::Large && isTargetMachO()) + return AArch64II::MO_GOT; + + // The small code mode's direct accesses use ADRP, which cannot necessarily + // produce the value 0 (if the code is above 4GB). Therefore they must use the + // GOT. + if (TM.getCodeModel() == CodeModel::Small && GV->isWeakForLinker() && isDecl) + return AArch64II::MO_GOT; + + // If symbol visibility is hidden, the extra load is not needed if + // the symbol is definitely defined in the current translation unit. + + // The handling of non-hidden symbols in PIC mode is rather target-dependent: + // + On MachO, if the symbol is defined in this module the GOT can be + // skipped. + // + On ELF, the R_AARCH64_COPY relocation means that even symbols actually + // defined could end up in unexpected places. Use a GOT. + if (TM.getRelocationModel() != Reloc::Static && GV->hasDefaultVisibility()) { + if (isTargetMachO()) + return (isDecl || GV->isWeakForLinker()) ? AArch64II::MO_GOT + : AArch64II::MO_NO_FLAG; + else + // No need to go through the GOT for local symbols on ELF. + return GV->hasLocalLinkage() ? AArch64II::MO_NO_FLAG : AArch64II::MO_GOT; + } + + return AArch64II::MO_NO_FLAG; +} + +/// This function returns the name of a function which has an interface +/// like the non-standard bzero function, if such a function exists on +/// the current subtarget and it is considered prefereable over +/// memset with zero passed as the second argument. Otherwise it +/// returns null. +const char *AArch64Subtarget::getBZeroEntry() const { + // Prefer bzero on Darwin only. 
+ if(isTargetDarwin()) + return "bzero"; + + return nullptr; +} + +void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, + MachineInstr *begin, MachineInstr *end, + unsigned NumRegionInstrs) const { + // LNT run (at least on Cyclone) showed reasonably significant gains for + // bi-directional scheduling. 253.perlbmk. + Policy.OnlyTopDown = false; + Policy.OnlyBottomUp = false; +} + +bool AArch64Subtarget::enableEarlyIfConversion() const { + return EnableEarlyIfConvert; +} diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h new file mode 100644 index 00000000000..590ea0580ea --- /dev/null +++ b/lib/Target/AArch64/AArch64Subtarget.h @@ -0,0 +1,110 @@ +//===--- AArch64Subtarget.h - Define Subtarget for the AArch64 -*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares the AArch64 specific subclass of TargetSubtarget. +// +//===----------------------------------------------------------------------===// + +#ifndef AArch64SUBTARGET_H +#define AArch64SUBTARGET_H + +#include "llvm/Target/TargetSubtargetInfo.h" +#include "AArch64RegisterInfo.h" +#include + +#define GET_SUBTARGETINFO_HEADER +#include "AArch64GenSubtargetInfo.inc" + +namespace llvm { +class GlobalValue; +class StringRef; + +class AArch64Subtarget : public AArch64GenSubtargetInfo { +protected: + enum ARMProcFamilyEnum {Others, CortexA53, CortexA57, Cyclone}; + + /// ARMProcFamily - ARM processor family: Cortex-A53, Cortex-A57, and others. + ARMProcFamilyEnum ARMProcFamily; + + bool HasFPARMv8; + bool HasNEON; + bool HasCrypto; + bool HasCRC; + + // HasZeroCycleRegMove - Has zero-cycle register mov instructions. + bool HasZeroCycleRegMove; + + // HasZeroCycleZeroing - Has zero-cycle zeroing instructions. + bool HasZeroCycleZeroing; + + /// CPUString - String name of used CPU. + std::string CPUString; + + /// TargetTriple - What processor and OS we're targeting. + Triple TargetTriple; + + /// IsLittleEndian - Is the target little endian? + bool IsLittleEndian; + +public: + /// This constructor initializes the data members to match that + /// of the specified triple. + AArch64Subtarget(const std::string &TT, const std::string &CPU, + const std::string &FS, bool LittleEndian); + + bool enableMachineScheduler() const override { return true; } + + bool hasZeroCycleRegMove() const { return HasZeroCycleRegMove; } + + bool hasZeroCycleZeroing() const { return HasZeroCycleZeroing; } + + bool hasFPARMv8() const { return HasFPARMv8; } + bool hasNEON() const { return HasNEON; } + bool hasCrypto() const { return HasCrypto; } + bool hasCRC() const { return HasCRC; } + + bool isLittleEndian() const { return IsLittleEndian; } + + bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); } + + bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); } + + bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); } + + bool isCyclone() const { return CPUString == "cyclone"; } + + /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size + /// that still makes it profitable to inline the call. + unsigned getMaxInlineSizeThreshold() const { return 64; } + + /// ParseSubtargetFeatures - Parses features string setting specified + /// subtarget options. Definition of function is auto generated by tblgen. 
+ void ParseSubtargetFeatures(StringRef CPU, StringRef FS); + + /// ClassifyGlobalReference - Find the target operand flags that describe + /// how a global value should be referenced for the current subtarget. + unsigned char ClassifyGlobalReference(const GlobalValue *GV, + const TargetMachine &TM) const; + + /// This function returns the name of a function which has an interface + /// like the non-standard bzero function, if such a function exists on + /// the current subtarget and it is considered prefereable over + /// memset with zero passed as the second argument. Otherwise it + /// returns null. + const char *getBZeroEntry() const; + + void overrideSchedPolicy(MachineSchedPolicy &Policy, MachineInstr *begin, + MachineInstr *end, + unsigned NumRegionInstrs) const override; + + bool enableEarlyIfConversion() const override; +}; +} // End llvm namespace + +#endif // AArch64SUBTARGET_H diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp new file mode 100644 index 00000000000..0b5dd2f067e --- /dev/null +++ b/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -0,0 +1,208 @@ +//===-- AArch64TargetMachine.cpp - Define TargetMachine for AArch64 -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +#include "AArch64.h" +#include "AArch64TargetMachine.h" +#include "llvm/PassManager.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/Transforms/Scalar.h" +using namespace llvm; + +static cl::opt +EnableCCMP("aarch64-ccmp", cl::desc("Enable the CCMP formation pass"), + cl::init(true), cl::Hidden); + +static cl::opt +EnableStPairSuppress("aarch64-stp-suppress", cl::desc("Suppress STP for AArch64"), + cl::init(true), cl::Hidden); + +static cl::opt +EnableAdvSIMDScalar("aarch64-simd-scalar", cl::desc("Enable use of AdvSIMD scalar" + " integer instructions"), cl::init(false), cl::Hidden); + +static cl::opt +EnablePromoteConstant("aarch64-promote-const", cl::desc("Enable the promote " + "constant pass"), cl::init(true), cl::Hidden); + +static cl::opt +EnableCollectLOH("aarch64-collect-loh", cl::desc("Enable the pass that emits the" + " linker optimization hints (LOH)"), cl::init(true), + cl::Hidden); + +static cl::opt +EnableDeadRegisterElimination("aarch64-dead-def-elimination", cl::Hidden, + cl::desc("Enable the pass that removes dead" + " definitons and replaces stores to" + " them with stores to the zero" + " register"), + cl::init(true)); + +static cl::opt +EnableLoadStoreOpt("aarch64-load-store-opt", cl::desc("Enable the load/store pair" + " optimization pass"), cl::init(true), cl::Hidden); + +extern "C" void LLVMInitializeAArch64Target() { + // Register the target. + RegisterTargetMachine X(TheAArch64leTarget); + RegisterTargetMachine Y(TheAArch64beTarget); + + RegisterTargetMachine Z(TheARM64leTarget); + RegisterTargetMachine W(TheARM64beTarget); +} + +/// TargetMachine ctor - Create an AArch64 architecture model. 
+/// +AArch64TargetMachine::AArch64TargetMachine(const Target &T, StringRef TT, + StringRef CPU, StringRef FS, + const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL, + bool LittleEndian) + : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), + Subtarget(TT, CPU, FS, LittleEndian), + // This nested ternary is horrible, but DL needs to be properly + // initialized + // before TLInfo is constructed. + DL(Subtarget.isTargetMachO() + ? "e-m:o-i64:64-i128:128-n32:64-S128" + : (LittleEndian ? "e-m:e-i64:64-i128:128-n32:64-S128" + : "E-m:e-i64:64-i128:128-n32:64-S128")), + InstrInfo(Subtarget), TLInfo(*this), FrameLowering(*this, Subtarget), + TSInfo(*this) { + initAsmInfo(); +} + +void AArch64leTargetMachine::anchor() { } + +AArch64leTargetMachine:: +AArch64leTargetMachine(const Target &T, StringRef TT, + StringRef CPU, StringRef FS, const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL) + : AArch64TargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {} + +void AArch64beTargetMachine::anchor() { } + +AArch64beTargetMachine:: +AArch64beTargetMachine(const Target &T, StringRef TT, + StringRef CPU, StringRef FS, const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL) + : AArch64TargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {} + +namespace { +/// AArch64 Code Generator Pass Configuration Options. +class AArch64PassConfig : public TargetPassConfig { +public: + AArch64PassConfig(AArch64TargetMachine *TM, PassManagerBase &PM) + : TargetPassConfig(TM, PM) {} + + AArch64TargetMachine &getAArch64TargetMachine() const { + return getTM(); + } + + bool addPreISel() override; + bool addInstSelector() override; + bool addILPOpts() override; + bool addPreRegAlloc() override; + bool addPostRegAlloc() override; + bool addPreSched2() override; + bool addPreEmitPass() override; +}; +} // namespace + +void AArch64TargetMachine::addAnalysisPasses(PassManagerBase &PM) { + // Add first the target-independent BasicTTI pass, then our AArch64 pass. This + // allows the AArch64 pass to delegate to the target independent layer when + // appropriate. + PM.add(createBasicTargetTransformInfoPass(this)); + PM.add(createAArch64TargetTransformInfoPass(this)); +} + +TargetPassConfig *AArch64TargetMachine::createPassConfig(PassManagerBase &PM) { + return new AArch64PassConfig(this, PM); +} + +// Pass Pipeline Configuration +bool AArch64PassConfig::addPreISel() { + // Run promote constant before global merge, so that the promoted constants + // get a chance to be merged + if (TM->getOptLevel() != CodeGenOpt::None && EnablePromoteConstant) + addPass(createAArch64PromoteConstantPass()); + if (TM->getOptLevel() != CodeGenOpt::None) + addPass(createGlobalMergePass(TM)); + if (TM->getOptLevel() != CodeGenOpt::None) + addPass(createAArch64AddressTypePromotionPass()); + + // Always expand atomic operations, we don't deal with atomicrmw or cmpxchg + // ourselves. + addPass(createAtomicExpandLoadLinkedPass(TM)); + + return false; +} + +bool AArch64PassConfig::addInstSelector() { + addPass(createAArch64ISelDag(getAArch64TargetMachine(), getOptLevel())); + + // For ELF, cleanup any local-dynamic TLS accesses (i.e. combine as many + // references to _TLS_MODULE_BASE_ as possible. 
+ if (TM->getSubtarget().isTargetELF() && + getOptLevel() != CodeGenOpt::None) + addPass(createAArch64CleanupLocalDynamicTLSPass()); + + return false; +} + +bool AArch64PassConfig::addILPOpts() { + if (EnableCCMP) + addPass(createAArch64ConditionalCompares()); + addPass(&EarlyIfConverterID); + if (EnableStPairSuppress) + addPass(createAArch64StorePairSuppressPass()); + return true; +} + +bool AArch64PassConfig::addPreRegAlloc() { + // Use AdvSIMD scalar instructions whenever profitable. + if (TM->getOptLevel() != CodeGenOpt::None && EnableAdvSIMDScalar) + addPass(createAArch64AdvSIMDScalar()); + return true; +} + +bool AArch64PassConfig::addPostRegAlloc() { + // Change dead register definitions to refer to the zero register. + if (TM->getOptLevel() != CodeGenOpt::None && EnableDeadRegisterElimination) + addPass(createAArch64DeadRegisterDefinitions()); + return true; +} + +bool AArch64PassConfig::addPreSched2() { + // Expand some pseudo instructions to allow proper scheduling. + addPass(createAArch64ExpandPseudoPass()); + // Use load/store pair instructions when possible. + if (TM->getOptLevel() != CodeGenOpt::None && EnableLoadStoreOpt) + addPass(createAArch64LoadStoreOptimizationPass()); + return true; +} + +bool AArch64PassConfig::addPreEmitPass() { + // Relax conditional branch instructions if they're otherwise out of + // range of their destination. + addPass(createAArch64BranchRelaxation()); + if (TM->getOptLevel() != CodeGenOpt::None && EnableCollectLOH && + TM->getSubtarget().isTargetMachO()) + addPass(createAArch64CollectLOHPass()); + return true; +} diff --git a/lib/Target/AArch64/AArch64TargetMachine.h b/lib/Target/AArch64/AArch64TargetMachine.h new file mode 100644 index 00000000000..079b19b23bb --- /dev/null +++ b/lib/Target/AArch64/AArch64TargetMachine.h @@ -0,0 +1,94 @@ +//==-- AArch64TargetMachine.h - Define TargetMachine for AArch64 -*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares the AArch64 specific subclass of TargetMachine. 
+// +//===----------------------------------------------------------------------===// + +#ifndef AArch64TARGETMACHINE_H +#define AArch64TARGETMACHINE_H + +#include "AArch64InstrInfo.h" +#include "AArch64ISelLowering.h" +#include "AArch64Subtarget.h" +#include "AArch64FrameLowering.h" +#include "AArch64SelectionDAGInfo.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/MC/MCStreamer.h" + +namespace llvm { + +class AArch64TargetMachine : public LLVMTargetMachine { +protected: + AArch64Subtarget Subtarget; + +private: + const DataLayout DL; + AArch64InstrInfo InstrInfo; + AArch64TargetLowering TLInfo; + AArch64FrameLowering FrameLowering; + AArch64SelectionDAGInfo TSInfo; + +public: + AArch64TargetMachine(const Target &T, StringRef TT, StringRef CPU, + StringRef FS, const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL, bool IsLittleEndian); + + const AArch64Subtarget *getSubtargetImpl() const override { + return &Subtarget; + } + const AArch64TargetLowering *getTargetLowering() const override { + return &TLInfo; + } + const DataLayout *getDataLayout() const override { return &DL; } + const AArch64FrameLowering *getFrameLowering() const override { + return &FrameLowering; + } + const AArch64InstrInfo *getInstrInfo() const override { return &InstrInfo; } + const AArch64RegisterInfo *getRegisterInfo() const override { + return &InstrInfo.getRegisterInfo(); + } + const AArch64SelectionDAGInfo *getSelectionDAGInfo() const override { + return &TSInfo; + } + + // Pass Pipeline Configuration + TargetPassConfig *createPassConfig(PassManagerBase &PM) override; + + /// \brief Register AArch64 analysis passes with a pass manager. + void addAnalysisPasses(PassManagerBase &PM) override; +}; + +// AArch64leTargetMachine - AArch64 little endian target machine. +// +class AArch64leTargetMachine : public AArch64TargetMachine { + virtual void anchor(); +public: + AArch64leTargetMachine(const Target &T, StringRef TT, StringRef CPU, + StringRef FS, const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL); +}; + +// AArch64beTargetMachine - AArch64 big endian target machine. +// +class AArch64beTargetMachine : public AArch64TargetMachine { + virtual void anchor(); +public: + AArch64beTargetMachine(const Target &T, StringRef TT, StringRef CPU, + StringRef FS, const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL); +}; + +} // end namespace llvm + +#endif diff --git a/lib/Target/AArch64/AArch64TargetObjectFile.cpp b/lib/Target/AArch64/AArch64TargetObjectFile.cpp new file mode 100644 index 00000000000..4069038dffe --- /dev/null +++ b/lib/Target/AArch64/AArch64TargetObjectFile.cpp @@ -0,0 +1,52 @@ +//===-- AArch64TargetObjectFile.cpp - AArch64 Object Info -----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "AArch64TargetObjectFile.h" +#include "AArch64TargetMachine.h" +#include "llvm/IR/Mangler.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/Support/Dwarf.h" +using namespace llvm; +using namespace dwarf; + +void AArch64_ELFTargetObjectFile::Initialize(MCContext &Ctx, + const TargetMachine &TM) { + TargetLoweringObjectFileELF::Initialize(Ctx, TM); + InitializeELF(TM.Options.UseInitArray); +} + +const MCExpr *AArch64_MachoTargetObjectFile::getTTypeGlobalReference( + const GlobalValue *GV, unsigned Encoding, Mangler &Mang, + const TargetMachine &TM, MachineModuleInfo *MMI, + MCStreamer &Streamer) const { + // On Darwin, we can reference dwarf symbols with foo@GOT-., which + // is an indirect pc-relative reference. The default implementation + // won't reference using the GOT, so we need this target-specific + // version. + if (Encoding & (DW_EH_PE_indirect | DW_EH_PE_pcrel)) { + const MCSymbol *Sym = TM.getSymbol(GV, Mang); + const MCExpr *Res = + MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOT, getContext()); + MCSymbol *PCSym = getContext().CreateTempSymbol(); + Streamer.EmitLabel(PCSym); + const MCExpr *PC = MCSymbolRefExpr::Create(PCSym, getContext()); + return MCBinaryExpr::CreateSub(Res, PC, getContext()); + } + + return TargetLoweringObjectFileMachO::getTTypeGlobalReference( + GV, Encoding, Mang, TM, MMI, Streamer); +} + +MCSymbol *AArch64_MachoTargetObjectFile::getCFIPersonalitySymbol( + const GlobalValue *GV, Mangler &Mang, const TargetMachine &TM, + MachineModuleInfo *MMI) const { + return TM.getSymbol(GV, Mang); +} diff --git a/lib/Target/AArch64/AArch64TargetObjectFile.h b/lib/Target/AArch64/AArch64TargetObjectFile.h new file mode 100644 index 00000000000..de63cb42542 --- /dev/null +++ b/lib/Target/AArch64/AArch64TargetObjectFile.h @@ -0,0 +1,40 @@ +//===-- AArch64TargetObjectFile.h - AArch64 Object Info -*- C++ ---------*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TARGET_AArch64_TARGETOBJECTFILE_H +#define LLVM_TARGET_AArch64_TARGETOBJECTFILE_H + +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/Target/TargetLoweringObjectFile.h" + +namespace llvm { +class AArch64TargetMachine; + +/// This implementation is used for AArch64 ELF targets (Linux in particular). +class AArch64_ELFTargetObjectFile : public TargetLoweringObjectFileELF { + void Initialize(MCContext &Ctx, const TargetMachine &TM) override; +}; + +/// AArch64_MachoTargetObjectFile - This TLOF implementation is used for Darwin. 
+class AArch64_MachoTargetObjectFile : public TargetLoweringObjectFileMachO { +public: + const MCExpr *getTTypeGlobalReference(const GlobalValue *GV, + unsigned Encoding, Mangler &Mang, + const TargetMachine &TM, + MachineModuleInfo *MMI, + MCStreamer &Streamer) const override; + + MCSymbol *getCFIPersonalitySymbol(const GlobalValue *GV, Mangler &Mang, + const TargetMachine &TM, + MachineModuleInfo *MMI) const override; +}; + +} // end namespace llvm + +#endif diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp new file mode 100644 index 00000000000..33e482a53a4 --- /dev/null +++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -0,0 +1,464 @@ +//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI pass --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file implements a TargetTransformInfo analysis pass specific to the +/// AArch64 target machine. It uses the target's detailed information to provide +/// more precise answers to certain TTI queries, while letting the target +/// independent and default TTI implementations handle the rest. +/// +//===----------------------------------------------------------------------===// + +#include "AArch64.h" +#include "AArch64TargetMachine.h" +#include "MCTargetDesc/AArch64AddressingModes.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Support/Debug.h" +#include "llvm/Target/CostTable.h" +#include "llvm/Target/TargetLowering.h" +#include +using namespace llvm; + +#define DEBUG_TYPE "aarch64tti" + +// Declare the pass initialization routine locally as target-specific passes +// don't have a target-wide initialization entry point, and so we rely on the +// pass constructor initialization. +namespace llvm { +void initializeAArch64TTIPass(PassRegistry &); +} + +namespace { + +class AArch64TTI final : public ImmutablePass, public TargetTransformInfo { + const AArch64TargetMachine *TM; + const AArch64Subtarget *ST; + const AArch64TargetLowering *TLI; + + /// Estimate the overhead of scalarizing an instruction. Insert and Extract + /// are set if the result needs to be inserted and/or extracted from vectors. + unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const; + +public: + AArch64TTI() : ImmutablePass(ID), TM(nullptr), ST(nullptr), TLI(nullptr) { + llvm_unreachable("This pass cannot be directly constructed"); + } + + AArch64TTI(const AArch64TargetMachine *TM) + : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()), + TLI(TM->getTargetLowering()) { + initializeAArch64TTIPass(*PassRegistry::getPassRegistry()); + } + + void initializePass() override { pushTTIStack(this); } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + TargetTransformInfo::getAnalysisUsage(AU); + } + + /// Pass identification. + static char ID; + + /// Provide necessary pointer adjustments for the two base classes. 
+  void *getAdjustedAnalysisPointer(const void *ID) override {
+    if (ID == &TargetTransformInfo::ID)
+      return (TargetTransformInfo *)this;
+    return this;
+  }
+
+  /// \name Scalar TTI Implementations
+  /// @{
+  unsigned getIntImmCost(int64_t Val) const;
+  unsigned getIntImmCost(const APInt &Imm, Type *Ty) const override;
+  unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
+                         Type *Ty) const override;
+  unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
+                         Type *Ty) const override;
+  PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override;
+
+  /// @}
+
+  /// \name Vector TTI Implementations
+  /// @{
+
+  unsigned getNumberOfRegisters(bool Vector) const override {
+    if (Vector) {
+      if (ST->hasNEON())
+        return 32;
+      return 0;
+    }
+    return 31;
+  }
+
+  unsigned getRegisterBitWidth(bool Vector) const override {
+    if (Vector) {
+      if (ST->hasNEON())
+        return 128;
+      return 0;
+    }
+    return 64;
+  }
+
+  unsigned getMaximumUnrollFactor() const override { return 2; }
+
+  unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const
+      override;
+
+  unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) const
+      override;
+
+  unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
+                                  OperandValueKind Opd1Info = OK_AnyValue,
+                                  OperandValueKind Opd2Info = OK_AnyValue) const
+      override;
+
+  unsigned getAddressComputationCost(Type *Ty, bool IsComplex) const override;
+
+  unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) const
+      override;
+
+  unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+                           unsigned AddressSpace) const override;
+  /// @}
+};
+
+} // end anonymous namespace
+
+INITIALIZE_AG_PASS(AArch64TTI, TargetTransformInfo, "aarch64tti",
+                   "AArch64 Target Transform Info", true, true, false)
+char AArch64TTI::ID = 0;
+
+ImmutablePass *
+llvm::createAArch64TargetTransformInfoPass(const AArch64TargetMachine *TM) {
+  return new AArch64TTI(TM);
+}
+
+/// \brief Calculate the cost of materializing a 64-bit value. This helper
+/// method might only calculate a fraction of a larger immediate. Therefore it
+/// is valid to return a cost of ZERO.
+unsigned AArch64TTI::getIntImmCost(int64_t Val) const {
+  // Check if the immediate can be encoded within an instruction.
+  if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
+    return 0;
+
+  if (Val < 0)
+    Val = ~Val;
+
+  // Calculate how many moves we will need to materialize this constant.
+  unsigned LZ = countLeadingZeros((uint64_t)Val);
+  return (64 - LZ + 15) / 16;
+}
+
+/// \brief Calculate the cost of materializing the given constant.
+unsigned AArch64TTI::getIntImmCost(const APInt &Imm, Type *Ty) const {
+  assert(Ty->isIntegerTy());
+
+  unsigned BitSize = Ty->getPrimitiveSizeInBits();
+  if (BitSize == 0)
+    return ~0U;
+
+  // Sign-extend all constants to a multiple of 64-bit.
+  APInt ImmVal = Imm;
+  if (BitSize & 0x3f)
+    ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
+
+  // Split the constant into 64-bit chunks and calculate the cost for each
+  // chunk.
+  unsigned Cost = 0;
+  for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
+    APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
+    int64_t Val = Tmp.getSExtValue();
+    Cost += getIntImmCost(Val);
+  }
+  // We need at least one instruction to materialize the constant.
+  return std::max(1U, Cost);
+}
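The chunk counting above is easier to see in isolation. The following is an
illustrative sketch only (not part of the patch, and halfwordMoveCost is a
made-up name); it ignores the logical-immediate check and the MOVN handling
of negative values that getIntImmCost performs first:

    #include <cstdint>

    // Cost of building a plain positive 64-bit constant out of 16-bit
    // pieces: one MOVZ for the top chunk plus one MOVK per remaining
    // chunk, i.e. ceil(significant_bits / 16).
    static unsigned halfwordMoveCost(uint64_t Val) {
      unsigned LZ = 0;
      for (uint64_t Bit = 1ULL << 63; Bit && !(Val & Bit); Bit >>= 1)
        ++LZ;                              // count leading zero bits
      return (64 - LZ + 15) / 16;
    }
    // halfwordMoveCost(0x1234567890abcdefULL) == 4  (MOVZ + three MOVKs)
    // halfwordMoveCost(0xffff)                == 1  (a single MOVZ)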
+unsigned AArch64TTI::getIntImmCost(unsigned Opcode, unsigned Idx,
+                                   const APInt &Imm, Type *Ty) const {
+  assert(Ty->isIntegerTy());
+
+  unsigned BitSize = Ty->getPrimitiveSizeInBits();
+  // There is no cost model for constants with a bit size of 0. Return TCC_Free
+  // here, so that constant hoisting will ignore this constant.
+  if (BitSize == 0)
+    return TCC_Free;
+
+  unsigned ImmIdx = ~0U;
+  switch (Opcode) {
+  default:
+    return TCC_Free;
+  case Instruction::GetElementPtr:
+    // Always hoist the base address of a GetElementPtr.
+    if (Idx == 0)
+      return 2 * TCC_Basic;
+    return TCC_Free;
+  case Instruction::Store:
+    ImmIdx = 0;
+    break;
+  case Instruction::Add:
+  case Instruction::Sub:
+  case Instruction::Mul:
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::URem:
+  case Instruction::SRem:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+  case Instruction::ICmp:
+    ImmIdx = 1;
+    break;
+  // Always return TCC_Free for the shift value of a shift instruction.
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+    if (Idx == 1)
+      return TCC_Free;
+    break;
+  case Instruction::Trunc:
+  case Instruction::ZExt:
+  case Instruction::SExt:
+  case Instruction::IntToPtr:
+  case Instruction::PtrToInt:
+  case Instruction::BitCast:
+  case Instruction::PHI:
+  case Instruction::Call:
+  case Instruction::Select:
+  case Instruction::Ret:
+  case Instruction::Load:
+    break;
+  }
+
+  if (Idx == ImmIdx) {
+    unsigned NumConstants = (BitSize + 63) / 64;
+    unsigned Cost = AArch64TTI::getIntImmCost(Imm, Ty);
+    return (Cost <= NumConstants * TCC_Basic)
+               ? static_cast<unsigned>(TCC_Free) : Cost;
+  }
+  return AArch64TTI::getIntImmCost(Imm, Ty);
+}
+
+unsigned AArch64TTI::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
+                                   const APInt &Imm, Type *Ty) const {
+  assert(Ty->isIntegerTy());
+
+  unsigned BitSize = Ty->getPrimitiveSizeInBits();
+  // There is no cost model for constants with a bit size of 0. Return TCC_Free
+  // here, so that constant hoisting will ignore this constant.
+  if (BitSize == 0)
+    return TCC_Free;
+
+  switch (IID) {
+  default:
+    return TCC_Free;
+  case Intrinsic::sadd_with_overflow:
+  case Intrinsic::uadd_with_overflow:
+  case Intrinsic::ssub_with_overflow:
+  case Intrinsic::usub_with_overflow:
+  case Intrinsic::smul_with_overflow:
+  case Intrinsic::umul_with_overflow:
+    if (Idx == 1) {
+      unsigned NumConstants = (BitSize + 63) / 64;
+      unsigned Cost = AArch64TTI::getIntImmCost(Imm, Ty);
+      return (Cost <= NumConstants * TCC_Basic)
+                 ? static_cast<unsigned>(TCC_Free) : Cost;
+    }
+    break;
+  case Intrinsic::experimental_stackmap:
+    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
+      return TCC_Free;
+    break;
+  case Intrinsic::experimental_patchpoint_void:
+  case Intrinsic::experimental_patchpoint_i64:
+    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
+      return TCC_Free;
+    break;
+  }
+  return AArch64TTI::getIntImmCost(Imm, Ty);
+}
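Both overloads above apply the same rule once the immediate sits in a
hoistable operand position: report it as free when it can be rematerialized
in at most one instruction per 64-bit chunk, otherwise report its real cost
so that constant hoisting considers pulling it out. As a standalone sketch
(classifyImmCost is an illustrative name; TCC_Free and TCC_Basic take the
values defined in TargetTransformInfo):

    static unsigned classifyImmCost(unsigned MaterializeCost, unsigned BitSize) {
      const unsigned TCC_Free = 0, TCC_Basic = 1;  // generic TTI cost buckets
      unsigned NumChunks = (BitSize + 63) / 64;    // 64-bit pieces of the constant
      return MaterializeCost <= NumChunks * TCC_Basic ? TCC_Free
                                                      : MaterializeCost;
    }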
+AArch64TTI::PopcntSupportKind
+AArch64TTI::getPopcntSupport(unsigned TyWidth) const {
+  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
+  if (TyWidth == 32 || TyWidth == 64)
+    return PSK_FastHardware;
+  // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
+  return PSK_Software;
+}
+
+unsigned AArch64TTI::getCastInstrCost(unsigned Opcode, Type *Dst,
+                                      Type *Src) const {
+  int ISD = TLI->InstructionOpcodeToISD(Opcode);
+  assert(ISD && "Invalid opcode");
+
+  EVT SrcTy = TLI->getValueType(Src);
+  EVT DstTy = TLI->getValueType(Dst);
+
+  if (!SrcTy.isSimple() || !DstTy.isSimple())
+    return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
+
+  static const TypeConversionCostTblEntry<MVT::SimpleValueType>
+  ConversionTbl[] = {
+    // LowerVectorINT_TO_FP:
+    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
+    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 1 },
+    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 1 },
+    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 },
+    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
+    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
+    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 1 },
+    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 1 },
+    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 },
+    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
+    // LowerVectorFP_TO_INT
+    { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
+    { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
+    { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
+    { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
+    { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 1 },
+    { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 1 },
+    { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 4 },
+    { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 4 },
+    { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 4 },
+    { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 4 },
+    { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 4 },
+    { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 4 },
+  };
+
+  int Idx = ConvertCostTableLookup(
+      ConversionTbl, array_lengthof(ConversionTbl), ISD, DstTy.getSimpleVT(),
+      SrcTy.getSimpleVT());
+  if (Idx != -1)
+    return ConversionTbl[Idx].Cost;
+
+  return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
+}
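The cast costs are answered entirely from the static table above, keyed on
(ISD opcode, destination type, source type); anything not listed falls back
to the generic implementation. A minimal equivalent of the lookup helper is
shown below for illustration only (the real ConvertCostTableLookup comes from
llvm/Target/CostTable.h, included at the top of this file):

    struct ConvEntry { int ISD; int Dst; int Src; unsigned Cost; };

    static int lookupConversion(const ConvEntry *Tbl, unsigned Len,
                                int ISD, int Dst, int Src) {
      for (unsigned I = 0; I != Len; ++I)
        if (Tbl[I].ISD == ISD && Tbl[I].Dst == Dst && Tbl[I].Src == Src)
          return I;              // index of the matching entry
      return -1;                 // caller falls back to the default cost
    }

Reading straight from the table: a v2i32 -> v2f64 sint_to_fp is modelled as
cost 1, while a v2f32 -> v2i64 fp_to_sint is modelled as cost 4.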
+unsigned AArch64TTI::getVectorInstrCost(unsigned Opcode, Type *Val,
+                                        unsigned Index) const {
+  assert(Val->isVectorTy() && "This must be a vector type");
+
+  if (Index != -1U) {
+    // Legalize the type.
+    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Val);
+
+    // This type is legalized to a scalar type.
+    if (!LT.second.isVector())
+      return 0;
+
+    // The type may be split. Normalize the index to the new type.
+    unsigned Width = LT.second.getVectorNumElements();
+    Index = Index % Width;
+
+    // The element at index zero is already inside the vector.
+    if (Index == 0)
+      return 0;
+  }
+
+  // All other insert/extracts cost this much.
+  return 2;
+}
+
+unsigned AArch64TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
+                                            OperandValueKind Opd1Info,
+                                            OperandValueKind Opd2Info) const {
+  // Legalize the type.
+  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty);
+
+  int ISD = TLI->InstructionOpcodeToISD(Opcode);
+
+  switch (ISD) {
+  default:
+    return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Opd1Info,
+                                                       Opd2Info);
+  case ISD::ADD:
+  case ISD::MUL:
+  case ISD::XOR:
+  case ISD::OR:
+  case ISD::AND:
+    // These nodes are marked as 'custom' for combining purposes only.
+    // We know that they are legal. See LowerAdd in ISelLowering.
+    return 1 * LT.first;
+  }
+}
+
+unsigned AArch64TTI::getAddressComputationCost(Type *Ty, bool IsComplex) const {
+  // Address computations in vectorized code with non-consecutive addresses
+  // will likely result in more instructions compared to scalar code where
+  // the computation can more often be merged into the index mode. The
+  // resulting extra micro-ops can significantly decrease throughput.
+  unsigned NumVectorInstToHideOverhead = 10;
+
+  if (Ty->isVectorTy() && IsComplex)
+    return NumVectorInstToHideOverhead;
+
+  // In many cases the address computation is not merged into the instruction
+  // addressing mode.
+  return 1;
+}
+
+unsigned AArch64TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
+                                        Type *CondTy) const {
+
+  int ISD = TLI->InstructionOpcodeToISD(Opcode);
+  // We don't lower vector selects well that are wider than the register width.
+  if (ValTy->isVectorTy() && ISD == ISD::SELECT) {
+    // We would need this many instructions to hide the scalarization happening.
+    unsigned AmortizationCost = 20;
+    static const TypeConversionCostTblEntry<MVT::SimpleValueType>
+    VectorSelectTbl[] = {
+      { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 * AmortizationCost },
+      { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 * AmortizationCost },
+      { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 * AmortizationCost },
+      { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
+      { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
+      { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
+    };
+
+    EVT SelCondTy = TLI->getValueType(CondTy);
+    EVT SelValTy = TLI->getValueType(ValTy);
+    if (SelCondTy.isSimple() && SelValTy.isSimple()) {
+      int Idx =
+          ConvertCostTableLookup(VectorSelectTbl, ISD, SelCondTy.getSimpleVT(),
+                                 SelValTy.getSimpleVT());
+      if (Idx != -1)
+        return VectorSelectTbl[Idx].Cost;
+    }
+  }
+  return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy);
+}
+
+unsigned AArch64TTI::getMemoryOpCost(unsigned Opcode, Type *Src,
+                                     unsigned Alignment,
+                                     unsigned AddressSpace) const {
+  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);
+
+  if (Opcode == Instruction::Store && Src->isVectorTy() && Alignment != 16 &&
+      Src->getVectorElementType()->isIntegerTy(64)) {
+    // Unaligned stores are extremely inefficient. We don't split unaligned
+    // v2i64 stores because of the negative impact that has shown in practice
+    // on inlined memcpy code.
+    // We make v2i64 stores expensive so that we will only vectorize if there
+    // are 6 other instructions getting vectorized.
+    unsigned AmortizationCost = 6;
+
+    return LT.first * 2 * AmortizationCost;
+  }
+
+  if (Src->isVectorTy() && Src->getVectorElementType()->isIntegerTy(8) &&
+      Src->getVectorNumElements() < 8) {
+    // We scalarize the loads/stores because there is no v.4b register and we
+    // have to promote the elements to v.4h.
+    unsigned NumVecElts = Src->getVectorNumElements();
+    unsigned NumVectorizableInstsToAmortize = NumVecElts * 2;
+    // We generate 2 instructions per vector element.
+    return NumVectorizableInstsToAmortize * NumVecElts * 2;
+  }
+
+  return LT.first;
+}
diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
new file mode 100644
index 00000000000..65b77c547dc
--- /dev/null
+++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -0,0 +1,4047 @@
+//==- AArch64AsmParser.cpp - Parse AArch64 assembly to MCInst instructions -==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/AArch64AddressingModes.h" +#include "MCTargetDesc/AArch64MCExpr.h" +#include "Utils/AArch64BaseInfo.h" +#include "llvm/MC/MCParser/MCAsmLexer.h" +#include "llvm/MC/MCParser/MCAsmParser.h" +#include "llvm/MC/MCParser/MCParsedAsmOperand.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/MC/MCTargetAsmParser.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/ADT/Twine.h" +#include +using namespace llvm; + +namespace { + +class AArch64Operand; + +class AArch64AsmParser : public MCTargetAsmParser { +public: + typedef SmallVectorImpl OperandVector; + +private: + StringRef Mnemonic; ///< Instruction mnemonic. + MCSubtargetInfo &STI; + MCAsmParser &Parser; + + MCAsmParser &getParser() const { return Parser; } + MCAsmLexer &getLexer() const { return Parser.getLexer(); } + + SMLoc getLoc() const { return Parser.getTok().getLoc(); } + + bool parseSysAlias(StringRef Name, SMLoc NameLoc, OperandVector &Operands); + AArch64CC::CondCode parseCondCodeString(StringRef Cond); + bool parseCondCode(OperandVector &Operands, bool invertCondCode); + int tryParseRegister(); + int tryMatchVectorRegister(StringRef &Kind, bool expected); + bool parseRegister(OperandVector &Operands); + bool parseSymbolicImmVal(const MCExpr *&ImmVal); + bool parseVectorList(OperandVector &Operands); + bool parseOperand(OperandVector &Operands, bool isCondCode, + bool invertCondCode); + + void Warning(SMLoc L, const Twine &Msg) { Parser.Warning(L, Msg); } + bool Error(SMLoc L, const Twine &Msg) { return Parser.Error(L, Msg); } + bool showMatchError(SMLoc Loc, unsigned ErrCode); + + bool parseDirectiveWord(unsigned Size, SMLoc L); + bool parseDirectiveTLSDescCall(SMLoc L); + + bool parseDirectiveLOH(StringRef LOH, SMLoc L); + + bool validateInstruction(MCInst &Inst, SmallVectorImpl &Loc); + bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + OperandVector &Operands, MCStreamer &Out, + unsigned &ErrorInfo, + bool MatchingInlineAsm) override; +/// @name Auto-generated Match Functions +/// { + +#define GET_ASSEMBLER_HEADER +#include "AArch64GenAsmMatcher.inc" + + /// } + + OperandMatchResultTy tryParseOptionalShiftExtend(OperandVector &Operands); + OperandMatchResultTy tryParseBarrierOperand(OperandVector &Operands); + OperandMatchResultTy tryParseMRSSystemRegister(OperandVector &Operands); + OperandMatchResultTy tryParseSysReg(OperandVector &Operands); + OperandMatchResultTy tryParseSysCROperand(OperandVector &Operands); + OperandMatchResultTy tryParsePrefetch(OperandVector &Operands); + OperandMatchResultTy tryParseAdrpLabel(OperandVector &Operands); + OperandMatchResultTy tryParseAdrLabel(OperandVector &Operands); + OperandMatchResultTy tryParseFPImm(OperandVector &Operands); + OperandMatchResultTy tryParseAddSubImm(OperandVector &Operands); + OperandMatchResultTy tryParseGPR64sp0Operand(OperandVector &Operands); + bool tryParseVectorRegister(OperandVector &Operands); + +public: + enum AArch64MatchResultTy { + Match_InvalidSuffix = FIRST_TARGET_MATCH_RESULT_TY, 
+#define GET_OPERAND_DIAGNOSTIC_TYPES +#include "AArch64GenAsmMatcher.inc" + }; + AArch64AsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser, + const MCInstrInfo &MII, + const MCTargetOptions &Options) + : MCTargetAsmParser(), STI(_STI), Parser(_Parser) { + MCAsmParserExtension::Initialize(_Parser); + + // Initialize the set of available features. + setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); + } + + bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, + SMLoc NameLoc, OperandVector &Operands) override; + bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override; + bool ParseDirective(AsmToken DirectiveID) override; + unsigned validateTargetOperandClass(MCParsedAsmOperand *Op, + unsigned Kind) override; + + static bool classifySymbolRef(const MCExpr *Expr, + AArch64MCExpr::VariantKind &ELFRefKind, + MCSymbolRefExpr::VariantKind &DarwinRefKind, + int64_t &Addend); +}; +} // end anonymous namespace + +namespace { + +/// AArch64Operand - Instances of this class represent a parsed AArch64 machine +/// instruction. +class AArch64Operand : public MCParsedAsmOperand { +private: + enum KindTy { + k_Immediate, + k_ShiftedImm, + k_CondCode, + k_Register, + k_VectorList, + k_VectorIndex, + k_Token, + k_SysReg, + k_SysCR, + k_Prefetch, + k_ShiftExtend, + k_FPImm, + k_Barrier + } Kind; + + SMLoc StartLoc, EndLoc; + + struct TokOp { + const char *Data; + unsigned Length; + bool IsSuffix; // Is the operand actually a suffix on the mnemonic. + }; + + struct RegOp { + unsigned RegNum; + bool isVector; + }; + + struct VectorListOp { + unsigned RegNum; + unsigned Count; + unsigned NumElements; + unsigned ElementKind; + }; + + struct VectorIndexOp { + unsigned Val; + }; + + struct ImmOp { + const MCExpr *Val; + }; + + struct ShiftedImmOp { + const MCExpr *Val; + unsigned ShiftAmount; + }; + + struct CondCodeOp { + AArch64CC::CondCode Code; + }; + + struct FPImmOp { + unsigned Val; // Encoded 8-bit representation. + }; + + struct BarrierOp { + unsigned Val; // Not the enum since not all values have names. + }; + + struct SysRegOp { + const char *Data; + unsigned Length; + uint64_t FeatureBits; // We need to pass through information about which + // core we are compiling for so that the SysReg + // Mappers can appropriately conditionalize. + }; + + struct SysCRImmOp { + unsigned Val; + }; + + struct PrefetchOp { + unsigned Val; + }; + + struct ShiftExtendOp { + AArch64_AM::ShiftExtendType Type; + unsigned Amount; + bool HasExplicitAmount; + }; + + struct ExtendOp { + unsigned Val; + }; + + union { + struct TokOp Tok; + struct RegOp Reg; + struct VectorListOp VectorList; + struct VectorIndexOp VectorIndex; + struct ImmOp Imm; + struct ShiftedImmOp ShiftedImm; + struct CondCodeOp CondCode; + struct FPImmOp FPImm; + struct BarrierOp Barrier; + struct SysRegOp SysReg; + struct SysCRImmOp SysCRImm; + struct PrefetchOp Prefetch; + struct ShiftExtendOp ShiftExtend; + }; + + // Keep the MCContext around as the MCExprs may need manipulated during + // the add<>Operands() calls. 
+ MCContext &Ctx; + + AArch64Operand(KindTy K, MCContext &_Ctx) + : MCParsedAsmOperand(), Kind(K), Ctx(_Ctx) {} + +public: + AArch64Operand(const AArch64Operand &o) : MCParsedAsmOperand(), Ctx(o.Ctx) { + Kind = o.Kind; + StartLoc = o.StartLoc; + EndLoc = o.EndLoc; + switch (Kind) { + case k_Token: + Tok = o.Tok; + break; + case k_Immediate: + Imm = o.Imm; + break; + case k_ShiftedImm: + ShiftedImm = o.ShiftedImm; + break; + case k_CondCode: + CondCode = o.CondCode; + break; + case k_FPImm: + FPImm = o.FPImm; + break; + case k_Barrier: + Barrier = o.Barrier; + break; + case k_Register: + Reg = o.Reg; + break; + case k_VectorList: + VectorList = o.VectorList; + break; + case k_VectorIndex: + VectorIndex = o.VectorIndex; + break; + case k_SysReg: + SysReg = o.SysReg; + break; + case k_SysCR: + SysCRImm = o.SysCRImm; + break; + case k_Prefetch: + Prefetch = o.Prefetch; + break; + case k_ShiftExtend: + ShiftExtend = o.ShiftExtend; + break; + } + } + + /// getStartLoc - Get the location of the first token of this operand. + SMLoc getStartLoc() const override { return StartLoc; } + /// getEndLoc - Get the location of the last token of this operand. + SMLoc getEndLoc() const override { return EndLoc; } + + StringRef getToken() const { + assert(Kind == k_Token && "Invalid access!"); + return StringRef(Tok.Data, Tok.Length); + } + + bool isTokenSuffix() const { + assert(Kind == k_Token && "Invalid access!"); + return Tok.IsSuffix; + } + + const MCExpr *getImm() const { + assert(Kind == k_Immediate && "Invalid access!"); + return Imm.Val; + } + + const MCExpr *getShiftedImmVal() const { + assert(Kind == k_ShiftedImm && "Invalid access!"); + return ShiftedImm.Val; + } + + unsigned getShiftedImmShift() const { + assert(Kind == k_ShiftedImm && "Invalid access!"); + return ShiftedImm.ShiftAmount; + } + + AArch64CC::CondCode getCondCode() const { + assert(Kind == k_CondCode && "Invalid access!"); + return CondCode.Code; + } + + unsigned getFPImm() const { + assert(Kind == k_FPImm && "Invalid access!"); + return FPImm.Val; + } + + unsigned getBarrier() const { + assert(Kind == k_Barrier && "Invalid access!"); + return Barrier.Val; + } + + unsigned getReg() const override { + assert(Kind == k_Register && "Invalid access!"); + return Reg.RegNum; + } + + unsigned getVectorListStart() const { + assert(Kind == k_VectorList && "Invalid access!"); + return VectorList.RegNum; + } + + unsigned getVectorListCount() const { + assert(Kind == k_VectorList && "Invalid access!"); + return VectorList.Count; + } + + unsigned getVectorIndex() const { + assert(Kind == k_VectorIndex && "Invalid access!"); + return VectorIndex.Val; + } + + StringRef getSysReg() const { + assert(Kind == k_SysReg && "Invalid access!"); + return StringRef(SysReg.Data, SysReg.Length); + } + + uint64_t getSysRegFeatureBits() const { + assert(Kind == k_SysReg && "Invalid access!"); + return SysReg.FeatureBits; + } + + unsigned getSysCR() const { + assert(Kind == k_SysCR && "Invalid access!"); + return SysCRImm.Val; + } + + unsigned getPrefetch() const { + assert(Kind == k_Prefetch && "Invalid access!"); + return Prefetch.Val; + } + + AArch64_AM::ShiftExtendType getShiftExtendType() const { + assert(Kind == k_ShiftExtend && "Invalid access!"); + return ShiftExtend.Type; + } + + unsigned getShiftExtendAmount() const { + assert(Kind == k_ShiftExtend && "Invalid access!"); + return ShiftExtend.Amount; + } + + bool hasShiftExtendAmount() const { + assert(Kind == k_ShiftExtend && "Invalid access!"); + return ShiftExtend.HasExplicitAmount; + } + + bool 
isImm() const override { return Kind == k_Immediate; } + bool isMem() const override { return false; } + bool isSImm9() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= -256 && Val < 256); + } + bool isSImm7s4() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= -256 && Val <= 252 && (Val & 3) == 0); + } + bool isSImm7s8() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= -512 && Val <= 504 && (Val & 7) == 0); + } + bool isSImm7s16() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= -1024 && Val <= 1008 && (Val & 15) == 0); + } + + bool isSymbolicUImm12Offset(const MCExpr *Expr, unsigned Scale) const { + AArch64MCExpr::VariantKind ELFRefKind; + MCSymbolRefExpr::VariantKind DarwinRefKind; + int64_t Addend; + if (!AArch64AsmParser::classifySymbolRef(Expr, ELFRefKind, DarwinRefKind, + Addend)) { + // If we don't understand the expression, assume the best and + // let the fixup and relocation code deal with it. + return true; + } + + if (DarwinRefKind == MCSymbolRefExpr::VK_PAGEOFF || + ELFRefKind == AArch64MCExpr::VK_LO12 || + ELFRefKind == AArch64MCExpr::VK_GOT_LO12 || + ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12 || + ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12_NC || + ELFRefKind == AArch64MCExpr::VK_TPREL_LO12 || + ELFRefKind == AArch64MCExpr::VK_TPREL_LO12_NC || + ELFRefKind == AArch64MCExpr::VK_GOTTPREL_LO12_NC || + ELFRefKind == AArch64MCExpr::VK_TLSDESC_LO12) { + // Note that we don't range-check the addend. It's adjusted modulo page + // size when converted, so there is no "out of range" condition when using + // @pageoff. + return Addend >= 0 && (Addend % Scale) == 0; + } else if (DarwinRefKind == MCSymbolRefExpr::VK_GOTPAGEOFF || + DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGEOFF) { + // @gotpageoff/@tlvppageoff can only be used directly, not with an addend. 
+ return Addend == 0; + } + + return false; + } + + template bool isUImm12Offset() const { + if (!isImm()) + return false; + + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return isSymbolicUImm12Offset(getImm(), Scale); + + int64_t Val = MCE->getValue(); + return (Val % Scale) == 0 && Val >= 0 && (Val / Scale) < 0x1000; + } + + bool isImm0_7() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= 0 && Val < 8); + } + bool isImm1_8() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val > 0 && Val < 9); + } + bool isImm0_15() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= 0 && Val < 16); + } + bool isImm1_16() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val > 0 && Val < 17); + } + bool isImm0_31() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= 0 && Val < 32); + } + bool isImm1_31() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= 1 && Val < 32); + } + bool isImm1_32() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= 1 && Val < 33); + } + bool isImm0_63() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= 0 && Val < 64); + } + bool isImm1_63() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= 1 && Val < 64); + } + bool isImm1_64() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= 1 && Val < 65); + } + bool isImm0_127() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= 0 && Val < 128); + } + bool isImm0_255() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= 0 && Val < 256); + } + bool isImm0_65535() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= 0 && Val < 65536); + } + bool isImm32_63() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= 32 && Val < 64); + } + bool isLogicalImm32() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + return AArch64_AM::isLogicalImmediate(MCE->getValue(), 32); + } + bool isLogicalImm64() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = 
dyn_cast(getImm()); + if (!MCE) + return false; + return AArch64_AM::isLogicalImmediate(MCE->getValue(), 64); + } + bool isShiftedImm() const { return Kind == k_ShiftedImm; } + bool isAddSubImm() const { + if (!isShiftedImm() && !isImm()) + return false; + + const MCExpr *Expr; + + // An ADD/SUB shifter is either 'lsl #0' or 'lsl #12'. + if (isShiftedImm()) { + unsigned Shift = ShiftedImm.ShiftAmount; + Expr = ShiftedImm.Val; + if (Shift != 0 && Shift != 12) + return false; + } else { + Expr = getImm(); + } + + AArch64MCExpr::VariantKind ELFRefKind; + MCSymbolRefExpr::VariantKind DarwinRefKind; + int64_t Addend; + if (AArch64AsmParser::classifySymbolRef(Expr, ELFRefKind, + DarwinRefKind, Addend)) { + return DarwinRefKind == MCSymbolRefExpr::VK_PAGEOFF + || DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGEOFF + || (DarwinRefKind == MCSymbolRefExpr::VK_GOTPAGEOFF && Addend == 0) + || ELFRefKind == AArch64MCExpr::VK_LO12 + || ELFRefKind == AArch64MCExpr::VK_DTPREL_HI12 + || ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12 + || ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12_NC + || ELFRefKind == AArch64MCExpr::VK_TPREL_HI12 + || ELFRefKind == AArch64MCExpr::VK_TPREL_LO12 + || ELFRefKind == AArch64MCExpr::VK_TPREL_LO12_NC + || ELFRefKind == AArch64MCExpr::VK_TLSDESC_LO12; + } + + // Otherwise it should be a real immediate in range: + const MCConstantExpr *CE = cast(Expr); + return CE->getValue() >= 0 && CE->getValue() <= 0xfff; + } + bool isCondCode() const { return Kind == k_CondCode; } + bool isSIMDImmType10() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + return AArch64_AM::isAdvSIMDModImmType10(MCE->getValue()); + } + bool isBranchTarget26() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return true; + int64_t Val = MCE->getValue(); + if (Val & 0x3) + return false; + return (Val >= -(0x2000000 << 2) && Val <= (0x1ffffff << 2)); + } + bool isPCRelLabel19() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return true; + int64_t Val = MCE->getValue(); + if (Val & 0x3) + return false; + return (Val >= -(0x40000 << 2) && Val <= (0x3ffff << 2)); + } + bool isBranchTarget14() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return true; + int64_t Val = MCE->getValue(); + if (Val & 0x3) + return false; + return (Val >= -(0x2000 << 2) && Val <= (0x1fff << 2)); + } + + bool + isMovWSymbol(ArrayRef AllowedModifiers) const { + if (!isImm()) + return false; + + AArch64MCExpr::VariantKind ELFRefKind; + MCSymbolRefExpr::VariantKind DarwinRefKind; + int64_t Addend; + if (!AArch64AsmParser::classifySymbolRef(getImm(), ELFRefKind, + DarwinRefKind, Addend)) { + return false; + } + if (DarwinRefKind != MCSymbolRefExpr::VK_None) + return false; + + for (unsigned i = 0; i != AllowedModifiers.size(); ++i) { + if (ELFRefKind == AllowedModifiers[i]) + return Addend == 0; + } + + return false; + } + + bool isMovZSymbolG3() const { + static AArch64MCExpr::VariantKind Variants[] = { AArch64MCExpr::VK_ABS_G3 }; + return isMovWSymbol(Variants); + } + + bool isMovZSymbolG2() const { + static AArch64MCExpr::VariantKind Variants[] = { + AArch64MCExpr::VK_ABS_G2, AArch64MCExpr::VK_ABS_G2_S, + AArch64MCExpr::VK_TPREL_G2, AArch64MCExpr::VK_DTPREL_G2}; + return isMovWSymbol(Variants); + } + + bool isMovZSymbolG1() const { + static AArch64MCExpr::VariantKind Variants[] = { + AArch64MCExpr::VK_ABS_G1, 
AArch64MCExpr::VK_ABS_G1_S, + AArch64MCExpr::VK_GOTTPREL_G1, AArch64MCExpr::VK_TPREL_G1, + AArch64MCExpr::VK_DTPREL_G1, + }; + return isMovWSymbol(Variants); + } + + bool isMovZSymbolG0() const { + static AArch64MCExpr::VariantKind Variants[] = { + AArch64MCExpr::VK_ABS_G0, AArch64MCExpr::VK_ABS_G0_S, + AArch64MCExpr::VK_TPREL_G0, AArch64MCExpr::VK_DTPREL_G0}; + return isMovWSymbol(Variants); + } + + bool isMovKSymbolG3() const { + static AArch64MCExpr::VariantKind Variants[] = { AArch64MCExpr::VK_ABS_G3 }; + return isMovWSymbol(Variants); + } + + bool isMovKSymbolG2() const { + static AArch64MCExpr::VariantKind Variants[] = { + AArch64MCExpr::VK_ABS_G2_NC}; + return isMovWSymbol(Variants); + } + + bool isMovKSymbolG1() const { + static AArch64MCExpr::VariantKind Variants[] = { + AArch64MCExpr::VK_ABS_G1_NC, AArch64MCExpr::VK_TPREL_G1_NC, + AArch64MCExpr::VK_DTPREL_G1_NC + }; + return isMovWSymbol(Variants); + } + + bool isMovKSymbolG0() const { + static AArch64MCExpr::VariantKind Variants[] = { + AArch64MCExpr::VK_ABS_G0_NC, AArch64MCExpr::VK_GOTTPREL_G0_NC, + AArch64MCExpr::VK_TPREL_G0_NC, AArch64MCExpr::VK_DTPREL_G0_NC + }; + return isMovWSymbol(Variants); + } + + template + bool isMOVZMovAlias() const { + if (!isImm()) return false; + + const MCConstantExpr *CE = dyn_cast(getImm()); + if (!CE) return false; + uint64_t Value = CE->getValue(); + + if (RegWidth == 32) + Value &= 0xffffffffULL; + + // "lsl #0" takes precedence: in practice this only affects "#0, lsl #0". + if (Value == 0 && Shift != 0) + return false; + + return (Value & ~(0xffffULL << Shift)) == 0; + } + + template + bool isMOVNMovAlias() const { + if (!isImm()) return false; + + const MCConstantExpr *CE = dyn_cast(getImm()); + if (!CE) return false; + uint64_t Value = CE->getValue(); + + // MOVZ takes precedence over MOVN. 
+ for (int MOVZShift = 0; MOVZShift <= 48; MOVZShift += 16) + if ((Value & ~(0xffffULL << MOVZShift)) == 0) + return false; + + Value = ~Value; + if (RegWidth == 32) + Value &= 0xffffffffULL; + + return (Value & ~(0xffffULL << Shift)) == 0; + } + + bool isFPImm() const { return Kind == k_FPImm; } + bool isBarrier() const { return Kind == k_Barrier; } + bool isSysReg() const { return Kind == k_SysReg; } + bool isMRSSystemRegister() const { + if (!isSysReg()) return false; + + bool IsKnownRegister; + auto Mapper = AArch64SysReg::MRSMapper(getSysRegFeatureBits()); + Mapper.fromString(getSysReg(), IsKnownRegister); + + return IsKnownRegister; + } + bool isMSRSystemRegister() const { + if (!isSysReg()) return false; + + bool IsKnownRegister; + auto Mapper = AArch64SysReg::MSRMapper(getSysRegFeatureBits()); + Mapper.fromString(getSysReg(), IsKnownRegister); + + return IsKnownRegister; + } + bool isSystemPStateField() const { + if (!isSysReg()) return false; + + bool IsKnownRegister; + AArch64PState::PStateMapper().fromString(getSysReg(), IsKnownRegister); + + return IsKnownRegister; + } + bool isReg() const override { return Kind == k_Register && !Reg.isVector; } + bool isVectorReg() const { return Kind == k_Register && Reg.isVector; } + bool isVectorRegLo() const { + return Kind == k_Register && Reg.isVector && + AArch64MCRegisterClasses[AArch64::FPR128_loRegClassID].contains( + Reg.RegNum); + } + bool isGPR32as64() const { + return Kind == k_Register && !Reg.isVector && + AArch64MCRegisterClasses[AArch64::GPR64RegClassID].contains(Reg.RegNum); + } + + bool isGPR64sp0() const { + return Kind == k_Register && !Reg.isVector && + AArch64MCRegisterClasses[AArch64::GPR64spRegClassID].contains(Reg.RegNum); + } + + /// Is this a vector list with the type implicit (presumably attached to the + /// instruction itself)? 
+ template bool isImplicitlyTypedVectorList() const { + return Kind == k_VectorList && VectorList.Count == NumRegs && + !VectorList.ElementKind; + } + + template + bool isTypedVectorList() const { + if (Kind != k_VectorList) + return false; + if (VectorList.Count != NumRegs) + return false; + if (VectorList.ElementKind != ElementKind) + return false; + return VectorList.NumElements == NumElements; + } + + bool isVectorIndex1() const { + return Kind == k_VectorIndex && VectorIndex.Val == 1; + } + bool isVectorIndexB() const { + return Kind == k_VectorIndex && VectorIndex.Val < 16; + } + bool isVectorIndexH() const { + return Kind == k_VectorIndex && VectorIndex.Val < 8; + } + bool isVectorIndexS() const { + return Kind == k_VectorIndex && VectorIndex.Val < 4; + } + bool isVectorIndexD() const { + return Kind == k_VectorIndex && VectorIndex.Val < 2; + } + bool isToken() const override { return Kind == k_Token; } + bool isTokenEqual(StringRef Str) const { + return Kind == k_Token && getToken() == Str; + } + bool isSysCR() const { return Kind == k_SysCR; } + bool isPrefetch() const { return Kind == k_Prefetch; } + bool isShiftExtend() const { return Kind == k_ShiftExtend; } + bool isShifter() const { + if (!isShiftExtend()) + return false; + + AArch64_AM::ShiftExtendType ST = getShiftExtendType(); + return (ST == AArch64_AM::LSL || ST == AArch64_AM::LSR || + ST == AArch64_AM::ASR || ST == AArch64_AM::ROR || + ST == AArch64_AM::MSL); + } + bool isExtend() const { + if (!isShiftExtend()) + return false; + + AArch64_AM::ShiftExtendType ET = getShiftExtendType(); + return (ET == AArch64_AM::UXTB || ET == AArch64_AM::SXTB || + ET == AArch64_AM::UXTH || ET == AArch64_AM::SXTH || + ET == AArch64_AM::UXTW || ET == AArch64_AM::SXTW || + ET == AArch64_AM::UXTX || ET == AArch64_AM::SXTX || + ET == AArch64_AM::LSL) && + getShiftExtendAmount() <= 4; + } + + bool isExtend64() const { + if (!isExtend()) + return false; + // UXTX and SXTX require a 64-bit source register (the ExtendLSL64 class). + AArch64_AM::ShiftExtendType ET = getShiftExtendType(); + return ET != AArch64_AM::UXTX && ET != AArch64_AM::SXTX; + } + bool isExtendLSL64() const { + if (!isExtend()) + return false; + AArch64_AM::ShiftExtendType ET = getShiftExtendType(); + return (ET == AArch64_AM::UXTX || ET == AArch64_AM::SXTX || + ET == AArch64_AM::LSL) && + getShiftExtendAmount() <= 4; + } + + template bool isMemXExtend() const { + if (!isExtend()) + return false; + AArch64_AM::ShiftExtendType ET = getShiftExtendType(); + return (ET == AArch64_AM::LSL || ET == AArch64_AM::SXTX) && + (getShiftExtendAmount() == Log2_32(Width / 8) || + getShiftExtendAmount() == 0); + } + + template bool isMemWExtend() const { + if (!isExtend()) + return false; + AArch64_AM::ShiftExtendType ET = getShiftExtendType(); + return (ET == AArch64_AM::UXTW || ET == AArch64_AM::SXTW) && + (getShiftExtendAmount() == Log2_32(Width / 8) || + getShiftExtendAmount() == 0); + } + + template + bool isArithmeticShifter() const { + if (!isShifter()) + return false; + + // An arithmetic shifter is LSL, LSR, or ASR. + AArch64_AM::ShiftExtendType ST = getShiftExtendType(); + return (ST == AArch64_AM::LSL || ST == AArch64_AM::LSR || + ST == AArch64_AM::ASR) && getShiftExtendAmount() < width; + } + + template + bool isLogicalShifter() const { + if (!isShifter()) + return false; + + // A logical shifter is LSL, LSR, ASR or ROR. 
+ AArch64_AM::ShiftExtendType ST = getShiftExtendType(); + return (ST == AArch64_AM::LSL || ST == AArch64_AM::LSR || + ST == AArch64_AM::ASR || ST == AArch64_AM::ROR) && + getShiftExtendAmount() < width; + } + + bool isMovImm32Shifter() const { + if (!isShifter()) + return false; + + // A MOVi shifter is LSL of 0, 16, 32, or 48. + AArch64_AM::ShiftExtendType ST = getShiftExtendType(); + if (ST != AArch64_AM::LSL) + return false; + uint64_t Val = getShiftExtendAmount(); + return (Val == 0 || Val == 16); + } + + bool isMovImm64Shifter() const { + if (!isShifter()) + return false; + + // A MOVi shifter is LSL of 0 or 16. + AArch64_AM::ShiftExtendType ST = getShiftExtendType(); + if (ST != AArch64_AM::LSL) + return false; + uint64_t Val = getShiftExtendAmount(); + return (Val == 0 || Val == 16 || Val == 32 || Val == 48); + } + + bool isLogicalVecShifter() const { + if (!isShifter()) + return false; + + // A logical vector shifter is a left shift by 0, 8, 16, or 24. + unsigned Shift = getShiftExtendAmount(); + return getShiftExtendType() == AArch64_AM::LSL && + (Shift == 0 || Shift == 8 || Shift == 16 || Shift == 24); + } + + bool isLogicalVecHalfWordShifter() const { + if (!isLogicalVecShifter()) + return false; + + // A logical vector shifter is a left shift by 0 or 8. + unsigned Shift = getShiftExtendAmount(); + return getShiftExtendType() == AArch64_AM::LSL && + (Shift == 0 || Shift == 8); + } + + bool isMoveVecShifter() const { + if (!isShiftExtend()) + return false; + + // A logical vector shifter is a left shift by 8 or 16. + unsigned Shift = getShiftExtendAmount(); + return getShiftExtendType() == AArch64_AM::MSL && + (Shift == 8 || Shift == 16); + } + + // Fallback unscaled operands are for aliases of LDR/STR that fall back + // to LDUR/STUR when the offset is not legal for the former but is for + // the latter. As such, in addition to checking for being a legal unscaled + // address, also check that it is not a legal scaled address. This avoids + // ambiguity in the matcher. + template + bool isSImm9OffsetFB() const { + return isSImm9() && !isUImm12Offset(); + } + + bool isAdrpLabel() const { + // Validation was handled during parsing, so we just sanity check that + // something didn't go haywire. + if (!isImm()) + return false; + + if (const MCConstantExpr *CE = dyn_cast(Imm.Val)) { + int64_t Val = CE->getValue(); + int64_t Min = - (4096 * (1LL << (21 - 1))); + int64_t Max = 4096 * ((1LL << (21 - 1)) - 1); + return (Val % 4096) == 0 && Val >= Min && Val <= Max; + } + + return true; + } + + bool isAdrLabel() const { + // Validation was handled during parsing, so we just sanity check that + // something didn't go haywire. + if (!isImm()) + return false; + + if (const MCConstantExpr *CE = dyn_cast(Imm.Val)) { + int64_t Val = CE->getValue(); + int64_t Min = - (1LL << (21 - 1)); + int64_t Max = ((1LL << (21 - 1)) - 1); + return Val >= Min && Val <= Max; + } + + return true; + } + + void addExpr(MCInst &Inst, const MCExpr *Expr) const { + // Add as immediates when possible. Null MCExpr = 0. 
+ if (!Expr) + Inst.addOperand(MCOperand::CreateImm(0)); + else if (const MCConstantExpr *CE = dyn_cast(Expr)) + Inst.addOperand(MCOperand::CreateImm(CE->getValue())); + else + Inst.addOperand(MCOperand::CreateExpr(Expr)); + } + + void addRegOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateReg(getReg())); + } + + void addGPR32as64Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + assert( + AArch64MCRegisterClasses[AArch64::GPR64RegClassID].contains(getReg())); + + const MCRegisterInfo *RI = Ctx.getRegisterInfo(); + uint32_t Reg = RI->getRegClass(AArch64::GPR32RegClassID).getRegister( + RI->getEncodingValue(getReg())); + + Inst.addOperand(MCOperand::CreateReg(Reg)); + } + + void addVectorReg64Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + assert( + AArch64MCRegisterClasses[AArch64::FPR128RegClassID].contains(getReg())); + Inst.addOperand(MCOperand::CreateReg(AArch64::D0 + getReg() - AArch64::Q0)); + } + + void addVectorReg128Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + assert( + AArch64MCRegisterClasses[AArch64::FPR128RegClassID].contains(getReg())); + Inst.addOperand(MCOperand::CreateReg(getReg())); + } + + void addVectorRegLoOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateReg(getReg())); + } + + template + void addVectorList64Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + static unsigned FirstRegs[] = { AArch64::D0, AArch64::D0_D1, + AArch64::D0_D1_D2, AArch64::D0_D1_D2_D3 }; + unsigned FirstReg = FirstRegs[NumRegs - 1]; + + Inst.addOperand( + MCOperand::CreateReg(FirstReg + getVectorListStart() - AArch64::Q0)); + } + + template + void addVectorList128Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + static unsigned FirstRegs[] = { AArch64::Q0, AArch64::Q0_Q1, + AArch64::Q0_Q1_Q2, AArch64::Q0_Q1_Q2_Q3 }; + unsigned FirstReg = FirstRegs[NumRegs - 1]; + + Inst.addOperand( + MCOperand::CreateReg(FirstReg + getVectorListStart() - AArch64::Q0)); + } + + void addVectorIndex1Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateImm(getVectorIndex())); + } + + void addVectorIndexBOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateImm(getVectorIndex())); + } + + void addVectorIndexHOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateImm(getVectorIndex())); + } + + void addVectorIndexSOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateImm(getVectorIndex())); + } + + void addVectorIndexDOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateImm(getVectorIndex())); + } + + void addImmOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + // If this is a pageoff symrefexpr with an addend, adjust the addend + // to be only the page-offset portion. Otherwise, just add the expr + // as-is. 
+ addExpr(Inst, getImm()); + } + + void addAddSubImmOperands(MCInst &Inst, unsigned N) const { + assert(N == 2 && "Invalid number of operands!"); + if (isShiftedImm()) { + addExpr(Inst, getShiftedImmVal()); + Inst.addOperand(MCOperand::CreateImm(getShiftedImmShift())); + } else { + addExpr(Inst, getImm()); + Inst.addOperand(MCOperand::CreateImm(0)); + } + } + + void addCondCodeOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateImm(getCondCode())); + } + + void addAdrpLabelOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + addExpr(Inst, getImm()); + else + Inst.addOperand(MCOperand::CreateImm(MCE->getValue() >> 12)); + } + + void addAdrLabelOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + + template + void addUImm12OffsetOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + + if (!MCE) { + Inst.addOperand(MCOperand::CreateExpr(getImm())); + return; + } + Inst.addOperand(MCOperand::CreateImm(MCE->getValue() / Scale)); + } + + void addSImm9Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); + } + + void addSImm7s4Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue() / 4)); + } + + void addSImm7s8Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue() / 8)); + } + + void addSImm7s16Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue() / 16)); + } + + void addImm0_7Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); + } + + void addImm1_8Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); + } + + void addImm0_15Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); + } + + void addImm1_16Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); + } + + void addImm0_31Operands(MCInst &Inst, unsigned N) 
const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); + } + + void addImm1_31Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); + } + + void addImm1_32Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); + } + + void addImm0_63Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); + } + + void addImm1_63Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); + } + + void addImm1_64Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); + } + + void addImm0_127Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); + } + + void addImm0_255Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); + } + + void addImm0_65535Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); + } + + void addImm32_63Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); + } + + void addLogicalImm32Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid logical immediate operand!"); + uint64_t encoding = AArch64_AM::encodeLogicalImmediate(MCE->getValue(), 32); + Inst.addOperand(MCOperand::CreateImm(encoding)); + } + + void addLogicalImm64Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid logical immediate operand!"); + uint64_t encoding = AArch64_AM::encodeLogicalImmediate(MCE->getValue(), 64); + Inst.addOperand(MCOperand::CreateImm(encoding)); + } + + void addSIMDImmType10Operands(MCInst &Inst, unsigned N) const { + assert(N 
== 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid immediate operand!"); + uint64_t encoding = AArch64_AM::encodeAdvSIMDModImmType10(MCE->getValue()); + Inst.addOperand(MCOperand::CreateImm(encoding)); + } + + void addBranchTarget26Operands(MCInst &Inst, unsigned N) const { + // Branch operands don't encode the low bits, so shift them off + // here. If it's a label, however, just put it on directly as there's + // not enough information now to do anything. + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) { + addExpr(Inst, getImm()); + return; + } + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue() >> 2)); + } + + void addPCRelLabel19Operands(MCInst &Inst, unsigned N) const { + // Branch operands don't encode the low bits, so shift them off + // here. If it's a label, however, just put it on directly as there's + // not enough information now to do anything. + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) { + addExpr(Inst, getImm()); + return; + } + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue() >> 2)); + } + + void addBranchTarget14Operands(MCInst &Inst, unsigned N) const { + // Branch operands don't encode the low bits, so shift them off + // here. If it's a label, however, just put it on directly as there's + // not enough information now to do anything. + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) { + addExpr(Inst, getImm()); + return; + } + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue() >> 2)); + } + + void addFPImmOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateImm(getFPImm())); + } + + void addBarrierOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateImm(getBarrier())); + } + + void addMRSSystemRegisterOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + + bool Valid; + auto Mapper = AArch64SysReg::MRSMapper(getSysRegFeatureBits()); + uint32_t Bits = Mapper.fromString(getSysReg(), Valid); + + Inst.addOperand(MCOperand::CreateImm(Bits)); + } + + void addMSRSystemRegisterOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + + bool Valid; + auto Mapper = AArch64SysReg::MSRMapper(getSysRegFeatureBits()); + uint32_t Bits = Mapper.fromString(getSysReg(), Valid); + + Inst.addOperand(MCOperand::CreateImm(Bits)); + } + + void addSystemPStateFieldOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + + bool Valid; + uint32_t Bits = + AArch64PState::PStateMapper().fromString(getSysReg(), Valid); + + Inst.addOperand(MCOperand::CreateImm(Bits)); + } + + void addSysCROperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateImm(getSysCR())); + } + + void addPrefetchOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateImm(getPrefetch())); + } + + void addShifterOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number 
of operands!"); + unsigned Imm = + AArch64_AM::getShifterImm(getShiftExtendType(), getShiftExtendAmount()); + Inst.addOperand(MCOperand::CreateImm(Imm)); + } + + void addExtendOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + AArch64_AM::ShiftExtendType ET = getShiftExtendType(); + if (ET == AArch64_AM::LSL) ET = AArch64_AM::UXTW; + unsigned Imm = AArch64_AM::getArithExtendImm(ET, getShiftExtendAmount()); + Inst.addOperand(MCOperand::CreateImm(Imm)); + } + + void addExtend64Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + AArch64_AM::ShiftExtendType ET = getShiftExtendType(); + if (ET == AArch64_AM::LSL) ET = AArch64_AM::UXTX; + unsigned Imm = AArch64_AM::getArithExtendImm(ET, getShiftExtendAmount()); + Inst.addOperand(MCOperand::CreateImm(Imm)); + } + + void addMemExtendOperands(MCInst &Inst, unsigned N) const { + assert(N == 2 && "Invalid number of operands!"); + AArch64_AM::ShiftExtendType ET = getShiftExtendType(); + bool IsSigned = ET == AArch64_AM::SXTW || ET == AArch64_AM::SXTX; + Inst.addOperand(MCOperand::CreateImm(IsSigned)); + Inst.addOperand(MCOperand::CreateImm(getShiftExtendAmount() != 0)); + } + + // For 8-bit load/store instructions with a register offset, both the + // "DoShift" and "NoShift" variants have a shift of 0. Because of this, + // they're disambiguated by whether the shift was explicit or implicit rather + // than its size. + void addMemExtend8Operands(MCInst &Inst, unsigned N) const { + assert(N == 2 && "Invalid number of operands!"); + AArch64_AM::ShiftExtendType ET = getShiftExtendType(); + bool IsSigned = ET == AArch64_AM::SXTW || ET == AArch64_AM::SXTX; + Inst.addOperand(MCOperand::CreateImm(IsSigned)); + Inst.addOperand(MCOperand::CreateImm(hasShiftExtendAmount())); + } + + template + void addMOVZMovAliasOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + + const MCConstantExpr *CE = cast(getImm()); + uint64_t Value = CE->getValue(); + Inst.addOperand(MCOperand::CreateImm((Value >> Shift) & 0xffff)); + } + + template + void addMOVNMovAliasOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + + const MCConstantExpr *CE = cast(getImm()); + uint64_t Value = CE->getValue(); + Inst.addOperand(MCOperand::CreateImm((~Value >> Shift) & 0xffff)); + } + + void print(raw_ostream &OS) const override; + + static AArch64Operand *CreateToken(StringRef Str, bool IsSuffix, SMLoc S, + MCContext &Ctx) { + AArch64Operand *Op = new AArch64Operand(k_Token, Ctx); + Op->Tok.Data = Str.data(); + Op->Tok.Length = Str.size(); + Op->Tok.IsSuffix = IsSuffix; + Op->StartLoc = S; + Op->EndLoc = S; + return Op; + } + + static AArch64Operand *CreateReg(unsigned RegNum, bool isVector, SMLoc S, + SMLoc E, MCContext &Ctx) { + AArch64Operand *Op = new AArch64Operand(k_Register, Ctx); + Op->Reg.RegNum = RegNum; + Op->Reg.isVector = isVector; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } + + static AArch64Operand *CreateVectorList(unsigned RegNum, unsigned Count, + unsigned NumElements, char ElementKind, + SMLoc S, SMLoc E, MCContext &Ctx) { + AArch64Operand *Op = new AArch64Operand(k_VectorList, Ctx); + Op->VectorList.RegNum = RegNum; + Op->VectorList.Count = Count; + Op->VectorList.NumElements = NumElements; + Op->VectorList.ElementKind = ElementKind; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } + + static AArch64Operand *CreateVectorIndex(unsigned Idx, SMLoc S, SMLoc E, + MCContext &Ctx) { + 
AArch64Operand *Op = new AArch64Operand(k_VectorIndex, Ctx); + Op->VectorIndex.Val = Idx; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } + + static AArch64Operand *CreateImm(const MCExpr *Val, SMLoc S, SMLoc E, + MCContext &Ctx) { + AArch64Operand *Op = new AArch64Operand(k_Immediate, Ctx); + Op->Imm.Val = Val; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } + + static AArch64Operand *CreateShiftedImm(const MCExpr *Val, + unsigned ShiftAmount, SMLoc S, + SMLoc E, MCContext &Ctx) { + AArch64Operand *Op = new AArch64Operand(k_ShiftedImm, Ctx); + Op->ShiftedImm .Val = Val; + Op->ShiftedImm.ShiftAmount = ShiftAmount; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } + + static AArch64Operand *CreateCondCode(AArch64CC::CondCode Code, SMLoc S, + SMLoc E, MCContext &Ctx) { + AArch64Operand *Op = new AArch64Operand(k_CondCode, Ctx); + Op->CondCode.Code = Code; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } + + static AArch64Operand *CreateFPImm(unsigned Val, SMLoc S, MCContext &Ctx) { + AArch64Operand *Op = new AArch64Operand(k_FPImm, Ctx); + Op->FPImm.Val = Val; + Op->StartLoc = S; + Op->EndLoc = S; + return Op; + } + + static AArch64Operand *CreateBarrier(unsigned Val, SMLoc S, MCContext &Ctx) { + AArch64Operand *Op = new AArch64Operand(k_Barrier, Ctx); + Op->Barrier.Val = Val; + Op->StartLoc = S; + Op->EndLoc = S; + return Op; + } + + static AArch64Operand *CreateSysReg(StringRef Str, SMLoc S, + uint64_t FeatureBits, MCContext &Ctx) { + AArch64Operand *Op = new AArch64Operand(k_SysReg, Ctx); + Op->SysReg.Data = Str.data(); + Op->SysReg.Length = Str.size(); + Op->SysReg.FeatureBits = FeatureBits; + Op->StartLoc = S; + Op->EndLoc = S; + return Op; + } + + static AArch64Operand *CreateSysCR(unsigned Val, SMLoc S, SMLoc E, + MCContext &Ctx) { + AArch64Operand *Op = new AArch64Operand(k_SysCR, Ctx); + Op->SysCRImm.Val = Val; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } + + static AArch64Operand *CreatePrefetch(unsigned Val, SMLoc S, MCContext &Ctx) { + AArch64Operand *Op = new AArch64Operand(k_Prefetch, Ctx); + Op->Prefetch.Val = Val; + Op->StartLoc = S; + Op->EndLoc = S; + return Op; + } + + static AArch64Operand *CreateShiftExtend(AArch64_AM::ShiftExtendType ShOp, + unsigned Val, bool HasExplicitAmount, + SMLoc S, SMLoc E, MCContext &Ctx) { + AArch64Operand *Op = new AArch64Operand(k_ShiftExtend, Ctx); + Op->ShiftExtend.Type = ShOp; + Op->ShiftExtend.Amount = Val; + Op->ShiftExtend.HasExplicitAmount = HasExplicitAmount; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } +}; + +} // end anonymous namespace. 
+
+void AArch64Operand::print(raw_ostream &OS) const {
+  switch (Kind) {
+  case k_FPImm:
+    OS << "<fpimm " << getFPImm() << "("
+       << AArch64_AM::getFPImmFloat(getFPImm()) << ") >";
+    break;
+  case k_Barrier: {
+    bool Valid;
+    StringRef Name = AArch64DB::DBarrierMapper().toString(getBarrier(), Valid);
+    if (Valid)
+      OS << "<barrier " << Name << ">";
+    else
+      OS << "<barrier invalid #" << getBarrier() << ">";
+    break;
+  }
+  case k_Immediate:
+    getImm()->print(OS);
+    break;
+  case k_ShiftedImm: {
+    unsigned Shift = getShiftedImmShift();
+    OS << "<shiftedimm "; getShiftedImmVal()->print(OS);
+    OS << ", lsl #" << AArch64_AM::getShiftValue(Shift) << ">";
+    break;
+  }
+  case k_CondCode:
+    OS << "<condcode " << getCondCode() << ">";
+    break;
+  case k_Register:
+    OS << "<register " << getReg() << ">";
+    break;
+  case k_VectorList: {
+    OS << "<vectorlist ";
+    unsigned Reg = getVectorListStart();
+    for (unsigned i = 0, e = getVectorListCount(); i != e; ++i)
+      OS << Reg + i << " ";
+    OS << ">";
+    break;
+  }
+  case k_VectorIndex:
+    OS << "<vectorindex " << getVectorIndex() << ">";
+    break;
+  case k_SysReg:
+    OS << "<sysreg: " << getSysReg() << '>';
+    break;
+  case k_Token:
+    OS << "'" << getToken() << "'";
+    break;
+  case k_SysCR:
+    OS << "c" << getSysCR();
+    break;
+  case k_Prefetch: {
+    bool Valid;
+    StringRef Name = AArch64PRFM::PRFMMapper().toString(getPrefetch(), Valid);
+    if (Valid)
+      OS << "<prfop " << Name << ">";
+    else
+      OS << "<prfop invalid #" << getPrefetch() << ">";
+    break;
+  }
+  case k_ShiftExtend: {
+    OS << "<" << AArch64_AM::getShiftExtendName(getShiftExtendType()) << " #"
+       << getShiftExtendAmount();
+    if (!hasShiftExtendAmount())
+      OS << "<imp>";
+    OS << '>';
+    break;
+  }
+  }
+}
+
+/// @name Auto-generated Match Functions
+/// {
+
+static unsigned MatchRegisterName(StringRef Name);
+
+/// }
+
+static unsigned matchVectorRegName(StringRef Name) {
+  return StringSwitch<unsigned>(Name)
+      .Case("v0", AArch64::Q0)
+      .Case("v1", AArch64::Q1)
+      .Case("v2", AArch64::Q2)
+      .Case("v3", AArch64::Q3)
+      .Case("v4", AArch64::Q4)
+      .Case("v5", AArch64::Q5)
+      .Case("v6", AArch64::Q6)
+      .Case("v7", AArch64::Q7)
+      .Case("v8", AArch64::Q8)
+      .Case("v9", AArch64::Q9)
+      .Case("v10", AArch64::Q10)
+      .Case("v11", AArch64::Q11)
+      .Case("v12", AArch64::Q12)
+      .Case("v13", AArch64::Q13)
+      .Case("v14", AArch64::Q14)
+      .Case("v15", AArch64::Q15)
+      .Case("v16", AArch64::Q16)
+      .Case("v17", AArch64::Q17)
+      .Case("v18", AArch64::Q18)
+      .Case("v19", AArch64::Q19)
+      .Case("v20", AArch64::Q20)
+      .Case("v21", AArch64::Q21)
+      .Case("v22", AArch64::Q22)
+      .Case("v23", AArch64::Q23)
+      .Case("v24", AArch64::Q24)
+      .Case("v25", AArch64::Q25)
+      .Case("v26", AArch64::Q26)
+      .Case("v27", AArch64::Q27)
+      .Case("v28", AArch64::Q28)
+      .Case("v29", AArch64::Q29)
+      .Case("v30", AArch64::Q30)
+      .Case("v31", AArch64::Q31)
+      .Default(0);
+}
+
+static bool isValidVectorKind(StringRef Name) {
+  return StringSwitch<bool>(Name.lower())
+      .Case(".8b", true)
+      .Case(".16b", true)
+      .Case(".4h", true)
+      .Case(".8h", true)
+      .Case(".2s", true)
+      .Case(".4s", true)
+      .Case(".1d", true)
+      .Case(".2d", true)
+      .Case(".1q", true)
+      // Accept the width neutral ones, too, for verbose syntax. If those
+      // aren't used in the right places, the token operand won't match so
+      // all will work out.
+ .Case(".b", true) + .Case(".h", true) + .Case(".s", true) + .Case(".d", true) + .Default(false); +} + +static void parseValidVectorKind(StringRef Name, unsigned &NumElements, + char &ElementKind) { + assert(isValidVectorKind(Name)); + + ElementKind = Name.lower()[Name.size() - 1]; + NumElements = 0; + + if (Name.size() == 2) + return; + + // Parse the lane count + Name = Name.drop_front(); + while (isdigit(Name.front())) { + NumElements = 10 * NumElements + (Name.front() - '0'); + Name = Name.drop_front(); + } +} + +bool AArch64AsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, + SMLoc &EndLoc) { + StartLoc = getLoc(); + RegNo = tryParseRegister(); + EndLoc = SMLoc::getFromPointer(getLoc().getPointer() - 1); + return (RegNo == (unsigned)-1); +} + +/// tryParseRegister - Try to parse a register name. The token must be an +/// Identifier when called, and if it is a register name the token is eaten and +/// the register is added to the operand list. +int AArch64AsmParser::tryParseRegister() { + const AsmToken &Tok = Parser.getTok(); + assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier"); + + std::string lowerCase = Tok.getString().lower(); + unsigned RegNum = MatchRegisterName(lowerCase); + // Also handle a few aliases of registers. + if (RegNum == 0) + RegNum = StringSwitch(lowerCase) + .Case("fp", AArch64::FP) + .Case("lr", AArch64::LR) + .Case("x31", AArch64::XZR) + .Case("w31", AArch64::WZR) + .Default(0); + + if (RegNum == 0) + return -1; + + Parser.Lex(); // Eat identifier token. + return RegNum; +} + +/// tryMatchVectorRegister - Try to parse a vector register name with optional +/// kind specifier. If it is a register specifier, eat the token and return it. +int AArch64AsmParser::tryMatchVectorRegister(StringRef &Kind, bool expected) { + if (Parser.getTok().isNot(AsmToken::Identifier)) { + TokError("vector register expected"); + return -1; + } + + StringRef Name = Parser.getTok().getString(); + // If there is a kind specifier, it's separated from the register name by + // a '.'. + size_t Start = 0, Next = Name.find('.'); + StringRef Head = Name.slice(Start, Next); + unsigned RegNum = matchVectorRegName(Head); + if (RegNum) { + if (Next != StringRef::npos) { + Kind = Name.slice(Next, StringRef::npos); + if (!isValidVectorKind(Kind)) { + TokError("invalid vector kind qualifier"); + return -1; + } + } + Parser.Lex(); // Eat the register token. + return RegNum; + } + + if (expected) + TokError("vector register expected"); + return -1; +} + +/// tryParseSysCROperand - Try to parse a system instruction CR operand name. +AArch64AsmParser::OperandMatchResultTy +AArch64AsmParser::tryParseSysCROperand(OperandVector &Operands) { + SMLoc S = getLoc(); + + if (Parser.getTok().isNot(AsmToken::Identifier)) { + Error(S, "Expected cN operand where 0 <= N <= 15"); + return MatchOperand_ParseFail; + } + + StringRef Tok = Parser.getTok().getIdentifier(); + if (Tok[0] != 'c' && Tok[0] != 'C') { + Error(S, "Expected cN operand where 0 <= N <= 15"); + return MatchOperand_ParseFail; + } + + uint32_t CRNum; + bool BadNum = Tok.drop_front().getAsInteger(10, CRNum); + if (BadNum || CRNum > 15) { + Error(S, "Expected cN operand where 0 <= N <= 15"); + return MatchOperand_ParseFail; + } + + Parser.Lex(); // Eat identifier token. + Operands.push_back( + AArch64Operand::CreateSysCR(CRNum, S, getLoc(), getContext())); + return MatchOperand_Success; +} + +/// tryParsePrefetch - Try to parse a prefetch operand. 
+AArch64AsmParser::OperandMatchResultTy +AArch64AsmParser::tryParsePrefetch(OperandVector &Operands) { + SMLoc S = getLoc(); + const AsmToken &Tok = Parser.getTok(); + // Either an identifier for named values or a 5-bit immediate. + bool Hash = Tok.is(AsmToken::Hash); + if (Hash || Tok.is(AsmToken::Integer)) { + if (Hash) + Parser.Lex(); // Eat hash token. + const MCExpr *ImmVal; + if (getParser().parseExpression(ImmVal)) + return MatchOperand_ParseFail; + + const MCConstantExpr *MCE = dyn_cast(ImmVal); + if (!MCE) { + TokError("immediate value expected for prefetch operand"); + return MatchOperand_ParseFail; + } + unsigned prfop = MCE->getValue(); + if (prfop > 31) { + TokError("prefetch operand out of range, [0,31] expected"); + return MatchOperand_ParseFail; + } + + Operands.push_back(AArch64Operand::CreatePrefetch(prfop, S, getContext())); + return MatchOperand_Success; + } + + if (Tok.isNot(AsmToken::Identifier)) { + TokError("pre-fetch hint expected"); + return MatchOperand_ParseFail; + } + + bool Valid; + unsigned prfop = AArch64PRFM::PRFMMapper().fromString(Tok.getString(), Valid); + if (!Valid) { + TokError("pre-fetch hint expected"); + return MatchOperand_ParseFail; + } + + Parser.Lex(); // Eat identifier token. + Operands.push_back(AArch64Operand::CreatePrefetch(prfop, S, getContext())); + return MatchOperand_Success; +} + +/// tryParseAdrpLabel - Parse and validate a source label for the ADRP +/// instruction. +AArch64AsmParser::OperandMatchResultTy +AArch64AsmParser::tryParseAdrpLabel(OperandVector &Operands) { + SMLoc S = getLoc(); + const MCExpr *Expr; + + if (Parser.getTok().is(AsmToken::Hash)) { + Parser.Lex(); // Eat hash token. + } + + if (parseSymbolicImmVal(Expr)) + return MatchOperand_ParseFail; + + AArch64MCExpr::VariantKind ELFRefKind; + MCSymbolRefExpr::VariantKind DarwinRefKind; + int64_t Addend; + if (classifySymbolRef(Expr, ELFRefKind, DarwinRefKind, Addend)) { + if (DarwinRefKind == MCSymbolRefExpr::VK_None && + ELFRefKind == AArch64MCExpr::VK_INVALID) { + // No modifier was specified at all; this is the syntax for an ELF basic + // ADRP relocation (unfortunately). + Expr = + AArch64MCExpr::Create(Expr, AArch64MCExpr::VK_ABS_PAGE, getContext()); + } else if ((DarwinRefKind == MCSymbolRefExpr::VK_GOTPAGE || + DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGE) && + Addend != 0) { + Error(S, "gotpage label reference not allowed an addend"); + return MatchOperand_ParseFail; + } else if (DarwinRefKind != MCSymbolRefExpr::VK_PAGE && + DarwinRefKind != MCSymbolRefExpr::VK_GOTPAGE && + DarwinRefKind != MCSymbolRefExpr::VK_TLVPPAGE && + ELFRefKind != AArch64MCExpr::VK_GOT_PAGE && + ELFRefKind != AArch64MCExpr::VK_GOTTPREL_PAGE && + ELFRefKind != AArch64MCExpr::VK_TLSDESC_PAGE) { + // The operand must be an @page or @gotpage qualified symbolref. + Error(S, "page or gotpage label reference expected"); + return MatchOperand_ParseFail; + } + } + + // We have either a label reference possibly with addend or an immediate. The + // addend is a raw value here. The linker will adjust it to only reference the + // page. + SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1); + Operands.push_back(AArch64Operand::CreateImm(Expr, S, E, getContext())); + + return MatchOperand_Success; +} + +/// tryParseAdrLabel - Parse and validate a source label for the ADR +/// instruction. 
+AArch64AsmParser::OperandMatchResultTy +AArch64AsmParser::tryParseAdrLabel(OperandVector &Operands) { + SMLoc S = getLoc(); + const MCExpr *Expr; + + if (Parser.getTok().is(AsmToken::Hash)) { + Parser.Lex(); // Eat hash token. + } + + if (getParser().parseExpression(Expr)) + return MatchOperand_ParseFail; + + SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1); + Operands.push_back(AArch64Operand::CreateImm(Expr, S, E, getContext())); + + return MatchOperand_Success; +} + +/// tryParseFPImm - A floating point immediate expression operand. +AArch64AsmParser::OperandMatchResultTy +AArch64AsmParser::tryParseFPImm(OperandVector &Operands) { + SMLoc S = getLoc(); + + bool Hash = false; + if (Parser.getTok().is(AsmToken::Hash)) { + Parser.Lex(); // Eat '#' + Hash = true; + } + + // Handle negation, as that still comes through as a separate token. + bool isNegative = false; + if (Parser.getTok().is(AsmToken::Minus)) { + isNegative = true; + Parser.Lex(); + } + const AsmToken &Tok = Parser.getTok(); + if (Tok.is(AsmToken::Real)) { + APFloat RealVal(APFloat::IEEEdouble, Tok.getString()); + uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue(); + // If we had a '-' in front, toggle the sign bit. + IntVal ^= (uint64_t)isNegative << 63; + int Val = AArch64_AM::getFP64Imm(APInt(64, IntVal)); + Parser.Lex(); // Eat the token. + // Check for out of range values. As an exception, we let Zero through, + // as we handle that special case in post-processing before matching in + // order to use the zero register for it. + if (Val == -1 && !RealVal.isZero()) { + TokError("expected compatible register or floating-point constant"); + return MatchOperand_ParseFail; + } + Operands.push_back(AArch64Operand::CreateFPImm(Val, S, getContext())); + return MatchOperand_Success; + } + if (Tok.is(AsmToken::Integer)) { + int64_t Val; + if (!isNegative && Tok.getString().startswith("0x")) { + Val = Tok.getIntVal(); + if (Val > 255 || Val < 0) { + TokError("encoded floating point value out of range"); + return MatchOperand_ParseFail; + } + } else { + APFloat RealVal(APFloat::IEEEdouble, Tok.getString()); + uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue(); + // If we had a '-' in front, toggle the sign bit. + IntVal ^= (uint64_t)isNegative << 63; + Val = AArch64_AM::getFP64Imm(APInt(64, IntVal)); + } + Parser.Lex(); // Eat the token. + Operands.push_back(AArch64Operand::CreateFPImm(Val, S, getContext())); + return MatchOperand_Success; + } + + if (!Hash) + return MatchOperand_NoMatch; + + TokError("invalid floating point immediate"); + return MatchOperand_ParseFail; +} + +/// tryParseAddSubImm - Parse ADD/SUB shifted immediate operand +AArch64AsmParser::OperandMatchResultTy +AArch64AsmParser::tryParseAddSubImm(OperandVector &Operands) { + SMLoc S = getLoc(); + + if (Parser.getTok().is(AsmToken::Hash)) + Parser.Lex(); // Eat '#' + else if (Parser.getTok().isNot(AsmToken::Integer)) + // Operand should start from # or should be integer, emit error otherwise. 
+ return MatchOperand_NoMatch; + + const MCExpr *Imm; + if (parseSymbolicImmVal(Imm)) + return MatchOperand_ParseFail; + else if (Parser.getTok().isNot(AsmToken::Comma)) { + uint64_t ShiftAmount = 0; + const MCConstantExpr *MCE = dyn_cast(Imm); + if (MCE) { + int64_t Val = MCE->getValue(); + if (Val > 0xfff && (Val & 0xfff) == 0) { + Imm = MCConstantExpr::Create(Val >> 12, getContext()); + ShiftAmount = 12; + } + } + SMLoc E = Parser.getTok().getLoc(); + Operands.push_back(AArch64Operand::CreateShiftedImm(Imm, ShiftAmount, S, E, + getContext())); + return MatchOperand_Success; + } + + // Eat ',' + Parser.Lex(); + + // The optional operand must be "lsl #N" where N is non-negative. + if (!Parser.getTok().is(AsmToken::Identifier) || + !Parser.getTok().getIdentifier().equals_lower("lsl")) { + Error(Parser.getTok().getLoc(), "only 'lsl #+N' valid after immediate"); + return MatchOperand_ParseFail; + } + + // Eat 'lsl' + Parser.Lex(); + + if (Parser.getTok().is(AsmToken::Hash)) { + Parser.Lex(); + } + + if (Parser.getTok().isNot(AsmToken::Integer)) { + Error(Parser.getTok().getLoc(), "only 'lsl #+N' valid after immediate"); + return MatchOperand_ParseFail; + } + + int64_t ShiftAmount = Parser.getTok().getIntVal(); + + if (ShiftAmount < 0) { + Error(Parser.getTok().getLoc(), "positive shift amount required"); + return MatchOperand_ParseFail; + } + Parser.Lex(); // Eat the number + + SMLoc E = Parser.getTok().getLoc(); + Operands.push_back(AArch64Operand::CreateShiftedImm(Imm, ShiftAmount, + S, E, getContext())); + return MatchOperand_Success; +} + +/// parseCondCodeString - Parse a Condition Code string. +AArch64CC::CondCode AArch64AsmParser::parseCondCodeString(StringRef Cond) { + AArch64CC::CondCode CC = StringSwitch(Cond.lower()) + .Case("eq", AArch64CC::EQ) + .Case("ne", AArch64CC::NE) + .Case("cs", AArch64CC::HS) + .Case("hs", AArch64CC::HS) + .Case("cc", AArch64CC::LO) + .Case("lo", AArch64CC::LO) + .Case("mi", AArch64CC::MI) + .Case("pl", AArch64CC::PL) + .Case("vs", AArch64CC::VS) + .Case("vc", AArch64CC::VC) + .Case("hi", AArch64CC::HI) + .Case("ls", AArch64CC::LS) + .Case("ge", AArch64CC::GE) + .Case("lt", AArch64CC::LT) + .Case("gt", AArch64CC::GT) + .Case("le", AArch64CC::LE) + .Case("al", AArch64CC::AL) + .Case("nv", AArch64CC::NV) + .Default(AArch64CC::Invalid); + return CC; +} + +/// parseCondCode - Parse a Condition Code operand. +bool AArch64AsmParser::parseCondCode(OperandVector &Operands, + bool invertCondCode) { + SMLoc S = getLoc(); + const AsmToken &Tok = Parser.getTok(); + assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier"); + + StringRef Cond = Tok.getString(); + AArch64CC::CondCode CC = parseCondCodeString(Cond); + if (CC == AArch64CC::Invalid) + return TokError("invalid condition code"); + Parser.Lex(); // Eat identifier token. + + if (invertCondCode) + CC = AArch64CC::getInvertedCondCode(AArch64CC::CondCode(CC)); + + Operands.push_back( + AArch64Operand::CreateCondCode(CC, S, getLoc(), getContext())); + return false; +} + +/// tryParseOptionalShift - Some operands take an optional shift argument. Parse +/// them if present. 
+AArch64AsmParser::OperandMatchResultTy +AArch64AsmParser::tryParseOptionalShiftExtend(OperandVector &Operands) { + const AsmToken &Tok = Parser.getTok(); + std::string LowerID = Tok.getString().lower(); + AArch64_AM::ShiftExtendType ShOp = + StringSwitch(LowerID) + .Case("lsl", AArch64_AM::LSL) + .Case("lsr", AArch64_AM::LSR) + .Case("asr", AArch64_AM::ASR) + .Case("ror", AArch64_AM::ROR) + .Case("msl", AArch64_AM::MSL) + .Case("uxtb", AArch64_AM::UXTB) + .Case("uxth", AArch64_AM::UXTH) + .Case("uxtw", AArch64_AM::UXTW) + .Case("uxtx", AArch64_AM::UXTX) + .Case("sxtb", AArch64_AM::SXTB) + .Case("sxth", AArch64_AM::SXTH) + .Case("sxtw", AArch64_AM::SXTW) + .Case("sxtx", AArch64_AM::SXTX) + .Default(AArch64_AM::InvalidShiftExtend); + + if (ShOp == AArch64_AM::InvalidShiftExtend) + return MatchOperand_NoMatch; + + SMLoc S = Tok.getLoc(); + Parser.Lex(); + + bool Hash = getLexer().is(AsmToken::Hash); + if (!Hash && getLexer().isNot(AsmToken::Integer)) { + if (ShOp == AArch64_AM::LSL || ShOp == AArch64_AM::LSR || + ShOp == AArch64_AM::ASR || ShOp == AArch64_AM::ROR || + ShOp == AArch64_AM::MSL) { + // We expect a number here. + TokError("expected #imm after shift specifier"); + return MatchOperand_ParseFail; + } + + // "extend" type operatoins don't need an immediate, #0 is implicit. + SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1); + Operands.push_back( + AArch64Operand::CreateShiftExtend(ShOp, 0, false, S, E, getContext())); + return MatchOperand_Success; + } + + if (Hash) + Parser.Lex(); // Eat the '#'. + + // Make sure we do actually have a number + if (!Parser.getTok().is(AsmToken::Integer)) { + Error(Parser.getTok().getLoc(), + "expected integer shift amount"); + return MatchOperand_ParseFail; + } + + const MCExpr *ImmVal; + if (getParser().parseExpression(ImmVal)) + return MatchOperand_ParseFail; + + const MCConstantExpr *MCE = dyn_cast(ImmVal); + if (!MCE) { + TokError("expected #imm after shift specifier"); + return MatchOperand_ParseFail; + } + + SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1); + Operands.push_back(AArch64Operand::CreateShiftExtend( + ShOp, MCE->getValue(), true, S, E, getContext())); + return MatchOperand_Success; +} + +/// parseSysAlias - The IC, DC, AT, and TLBI instructions are simple aliases for +/// the SYS instruction. Parse them specially so that we create a SYS MCInst. 
+bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc, + OperandVector &Operands) { + if (Name.find('.') != StringRef::npos) + return TokError("invalid operand"); + + Mnemonic = Name; + Operands.push_back( + AArch64Operand::CreateToken("sys", false, NameLoc, getContext())); + + const AsmToken &Tok = Parser.getTok(); + StringRef Op = Tok.getString(); + SMLoc S = Tok.getLoc(); + + const MCExpr *Expr = nullptr; + +#define SYS_ALIAS(op1, Cn, Cm, op2) \ + do { \ + Expr = MCConstantExpr::Create(op1, getContext()); \ + Operands.push_back( \ + AArch64Operand::CreateImm(Expr, S, getLoc(), getContext())); \ + Operands.push_back( \ + AArch64Operand::CreateSysCR(Cn, S, getLoc(), getContext())); \ + Operands.push_back( \ + AArch64Operand::CreateSysCR(Cm, S, getLoc(), getContext())); \ + Expr = MCConstantExpr::Create(op2, getContext()); \ + Operands.push_back( \ + AArch64Operand::CreateImm(Expr, S, getLoc(), getContext())); \ + } while (0) + + if (Mnemonic == "ic") { + if (!Op.compare_lower("ialluis")) { + // SYS #0, C7, C1, #0 + SYS_ALIAS(0, 7, 1, 0); + } else if (!Op.compare_lower("iallu")) { + // SYS #0, C7, C5, #0 + SYS_ALIAS(0, 7, 5, 0); + } else if (!Op.compare_lower("ivau")) { + // SYS #3, C7, C5, #1 + SYS_ALIAS(3, 7, 5, 1); + } else { + return TokError("invalid operand for IC instruction"); + } + } else if (Mnemonic == "dc") { + if (!Op.compare_lower("zva")) { + // SYS #3, C7, C4, #1 + SYS_ALIAS(3, 7, 4, 1); + } else if (!Op.compare_lower("ivac")) { + // SYS #3, C7, C6, #1 + SYS_ALIAS(0, 7, 6, 1); + } else if (!Op.compare_lower("isw")) { + // SYS #0, C7, C6, #2 + SYS_ALIAS(0, 7, 6, 2); + } else if (!Op.compare_lower("cvac")) { + // SYS #3, C7, C10, #1 + SYS_ALIAS(3, 7, 10, 1); + } else if (!Op.compare_lower("csw")) { + // SYS #0, C7, C10, #2 + SYS_ALIAS(0, 7, 10, 2); + } else if (!Op.compare_lower("cvau")) { + // SYS #3, C7, C11, #1 + SYS_ALIAS(3, 7, 11, 1); + } else if (!Op.compare_lower("civac")) { + // SYS #3, C7, C14, #1 + SYS_ALIAS(3, 7, 14, 1); + } else if (!Op.compare_lower("cisw")) { + // SYS #0, C7, C14, #2 + SYS_ALIAS(0, 7, 14, 2); + } else { + return TokError("invalid operand for DC instruction"); + } + } else if (Mnemonic == "at") { + if (!Op.compare_lower("s1e1r")) { + // SYS #0, C7, C8, #0 + SYS_ALIAS(0, 7, 8, 0); + } else if (!Op.compare_lower("s1e2r")) { + // SYS #4, C7, C8, #0 + SYS_ALIAS(4, 7, 8, 0); + } else if (!Op.compare_lower("s1e3r")) { + // SYS #6, C7, C8, #0 + SYS_ALIAS(6, 7, 8, 0); + } else if (!Op.compare_lower("s1e1w")) { + // SYS #0, C7, C8, #1 + SYS_ALIAS(0, 7, 8, 1); + } else if (!Op.compare_lower("s1e2w")) { + // SYS #4, C7, C8, #1 + SYS_ALIAS(4, 7, 8, 1); + } else if (!Op.compare_lower("s1e3w")) { + // SYS #6, C7, C8, #1 + SYS_ALIAS(6, 7, 8, 1); + } else if (!Op.compare_lower("s1e0r")) { + // SYS #0, C7, C8, #3 + SYS_ALIAS(0, 7, 8, 2); + } else if (!Op.compare_lower("s1e0w")) { + // SYS #0, C7, C8, #3 + SYS_ALIAS(0, 7, 8, 3); + } else if (!Op.compare_lower("s12e1r")) { + // SYS #4, C7, C8, #4 + SYS_ALIAS(4, 7, 8, 4); + } else if (!Op.compare_lower("s12e1w")) { + // SYS #4, C7, C8, #5 + SYS_ALIAS(4, 7, 8, 5); + } else if (!Op.compare_lower("s12e0r")) { + // SYS #4, C7, C8, #6 + SYS_ALIAS(4, 7, 8, 6); + } else if (!Op.compare_lower("s12e0w")) { + // SYS #4, C7, C8, #7 + SYS_ALIAS(4, 7, 8, 7); + } else { + return TokError("invalid operand for AT instruction"); + } + } else if (Mnemonic == "tlbi") { + if (!Op.compare_lower("vmalle1is")) { + // SYS #0, C8, C3, #0 + SYS_ALIAS(0, 8, 3, 0); + } else if (!Op.compare_lower("alle2is")) { + // SYS #4, C8, C3, 
#0 + SYS_ALIAS(4, 8, 3, 0); + } else if (!Op.compare_lower("alle3is")) { + // SYS #6, C8, C3, #0 + SYS_ALIAS(6, 8, 3, 0); + } else if (!Op.compare_lower("vae1is")) { + // SYS #0, C8, C3, #1 + SYS_ALIAS(0, 8, 3, 1); + } else if (!Op.compare_lower("vae2is")) { + // SYS #4, C8, C3, #1 + SYS_ALIAS(4, 8, 3, 1); + } else if (!Op.compare_lower("vae3is")) { + // SYS #6, C8, C3, #1 + SYS_ALIAS(6, 8, 3, 1); + } else if (!Op.compare_lower("aside1is")) { + // SYS #0, C8, C3, #2 + SYS_ALIAS(0, 8, 3, 2); + } else if (!Op.compare_lower("vaae1is")) { + // SYS #0, C8, C3, #3 + SYS_ALIAS(0, 8, 3, 3); + } else if (!Op.compare_lower("alle1is")) { + // SYS #4, C8, C3, #4 + SYS_ALIAS(4, 8, 3, 4); + } else if (!Op.compare_lower("vale1is")) { + // SYS #0, C8, C3, #5 + SYS_ALIAS(0, 8, 3, 5); + } else if (!Op.compare_lower("vaale1is")) { + // SYS #0, C8, C3, #7 + SYS_ALIAS(0, 8, 3, 7); + } else if (!Op.compare_lower("vmalle1")) { + // SYS #0, C8, C7, #0 + SYS_ALIAS(0, 8, 7, 0); + } else if (!Op.compare_lower("alle2")) { + // SYS #4, C8, C7, #0 + SYS_ALIAS(4, 8, 7, 0); + } else if (!Op.compare_lower("vale2is")) { + // SYS #4, C8, C3, #5 + SYS_ALIAS(4, 8, 3, 5); + } else if (!Op.compare_lower("vale3is")) { + // SYS #6, C8, C3, #5 + SYS_ALIAS(6, 8, 3, 5); + } else if (!Op.compare_lower("alle3")) { + // SYS #6, C8, C7, #0 + SYS_ALIAS(6, 8, 7, 0); + } else if (!Op.compare_lower("vae1")) { + // SYS #0, C8, C7, #1 + SYS_ALIAS(0, 8, 7, 1); + } else if (!Op.compare_lower("vae2")) { + // SYS #4, C8, C7, #1 + SYS_ALIAS(4, 8, 7, 1); + } else if (!Op.compare_lower("vae3")) { + // SYS #6, C8, C7, #1 + SYS_ALIAS(6, 8, 7, 1); + } else if (!Op.compare_lower("aside1")) { + // SYS #0, C8, C7, #2 + SYS_ALIAS(0, 8, 7, 2); + } else if (!Op.compare_lower("vaae1")) { + // SYS #0, C8, C7, #3 + SYS_ALIAS(0, 8, 7, 3); + } else if (!Op.compare_lower("alle1")) { + // SYS #4, C8, C7, #4 + SYS_ALIAS(4, 8, 7, 4); + } else if (!Op.compare_lower("vale1")) { + // SYS #0, C8, C7, #5 + SYS_ALIAS(0, 8, 7, 5); + } else if (!Op.compare_lower("vale2")) { + // SYS #4, C8, C7, #5 + SYS_ALIAS(4, 8, 7, 5); + } else if (!Op.compare_lower("vale3")) { + // SYS #6, C8, C7, #5 + SYS_ALIAS(6, 8, 7, 5); + } else if (!Op.compare_lower("vaale1")) { + // SYS #0, C8, C7, #7 + SYS_ALIAS(0, 8, 7, 7); + } else if (!Op.compare_lower("ipas2e1")) { + // SYS #4, C8, C4, #1 + SYS_ALIAS(4, 8, 4, 1); + } else if (!Op.compare_lower("ipas2le1")) { + // SYS #4, C8, C4, #5 + SYS_ALIAS(4, 8, 4, 5); + } else if (!Op.compare_lower("ipas2e1is")) { + // SYS #4, C8, C4, #1 + SYS_ALIAS(4, 8, 0, 1); + } else if (!Op.compare_lower("ipas2le1is")) { + // SYS #4, C8, C4, #5 + SYS_ALIAS(4, 8, 0, 5); + } else if (!Op.compare_lower("vmalls12e1")) { + // SYS #4, C8, C7, #6 + SYS_ALIAS(4, 8, 7, 6); + } else if (!Op.compare_lower("vmalls12e1is")) { + // SYS #4, C8, C3, #6 + SYS_ALIAS(4, 8, 3, 6); + } else { + return TokError("invalid operand for TLBI instruction"); + } + } + +#undef SYS_ALIAS + + Parser.Lex(); // Eat operand. + + bool ExpectRegister = (Op.lower().find("all") == StringRef::npos); + bool HasRegister = false; + + // Check for the optional register operand. + if (getLexer().is(AsmToken::Comma)) { + Parser.Lex(); // Eat comma. 
+ + if (Tok.isNot(AsmToken::Identifier) || parseRegister(Operands)) + return TokError("expected register operand"); + + HasRegister = true; + } + + if (getLexer().isNot(AsmToken::EndOfStatement)) { + Parser.eatToEndOfStatement(); + return TokError("unexpected token in argument list"); + } + + if (ExpectRegister && !HasRegister) { + return TokError("specified " + Mnemonic + " op requires a register"); + } + else if (!ExpectRegister && HasRegister) { + return TokError("specified " + Mnemonic + " op does not use a register"); + } + + Parser.Lex(); // Consume the EndOfStatement + return false; +} + +AArch64AsmParser::OperandMatchResultTy +AArch64AsmParser::tryParseBarrierOperand(OperandVector &Operands) { + const AsmToken &Tok = Parser.getTok(); + + // Can be either a #imm style literal or an option name + bool Hash = Tok.is(AsmToken::Hash); + if (Hash || Tok.is(AsmToken::Integer)) { + // Immediate operand. + if (Hash) + Parser.Lex(); // Eat the '#' + const MCExpr *ImmVal; + SMLoc ExprLoc = getLoc(); + if (getParser().parseExpression(ImmVal)) + return MatchOperand_ParseFail; + const MCConstantExpr *MCE = dyn_cast(ImmVal); + if (!MCE) { + Error(ExprLoc, "immediate value expected for barrier operand"); + return MatchOperand_ParseFail; + } + if (MCE->getValue() < 0 || MCE->getValue() > 15) { + Error(ExprLoc, "barrier operand out of range"); + return MatchOperand_ParseFail; + } + Operands.push_back( + AArch64Operand::CreateBarrier(MCE->getValue(), ExprLoc, getContext())); + return MatchOperand_Success; + } + + if (Tok.isNot(AsmToken::Identifier)) { + TokError("invalid operand for instruction"); + return MatchOperand_ParseFail; + } + + bool Valid; + unsigned Opt = AArch64DB::DBarrierMapper().fromString(Tok.getString(), Valid); + if (!Valid) { + TokError("invalid barrier option name"); + return MatchOperand_ParseFail; + } + + // The only valid named option for ISB is 'sy' + if (Mnemonic == "isb" && Opt != AArch64DB::SY) { + TokError("'sy' or #imm operand expected"); + return MatchOperand_ParseFail; + } + + Operands.push_back( + AArch64Operand::CreateBarrier(Opt, getLoc(), getContext())); + Parser.Lex(); // Consume the option + + return MatchOperand_Success; +} + +AArch64AsmParser::OperandMatchResultTy +AArch64AsmParser::tryParseSysReg(OperandVector &Operands) { + const AsmToken &Tok = Parser.getTok(); + + if (Tok.isNot(AsmToken::Identifier)) + return MatchOperand_NoMatch; + + Operands.push_back(AArch64Operand::CreateSysReg(Tok.getString(), getLoc(), + STI.getFeatureBits(), getContext())); + Parser.Lex(); // Eat identifier + + return MatchOperand_Success; +} + +/// tryParseVectorRegister - Parse a vector register operand. +bool AArch64AsmParser::tryParseVectorRegister(OperandVector &Operands) { + if (Parser.getTok().isNot(AsmToken::Identifier)) + return true; + + SMLoc S = getLoc(); + // Check for a vector register specifier first. + StringRef Kind; + int64_t Reg = tryMatchVectorRegister(Kind, false); + if (Reg == -1) + return true; + Operands.push_back( + AArch64Operand::CreateReg(Reg, true, S, getLoc(), getContext())); + // If there was an explicit qualifier, that goes on as a literal text + // operand. + if (!Kind.empty()) + Operands.push_back( + AArch64Operand::CreateToken(Kind, false, S, getContext())); + + // If there is an index specifier following the register, parse that too. + if (Parser.getTok().is(AsmToken::LBrac)) { + SMLoc SIdx = getLoc(); + Parser.Lex(); // Eat left bracket token. 
+ + const MCExpr *ImmVal; + if (getParser().parseExpression(ImmVal)) + return false; + const MCConstantExpr *MCE = dyn_cast(ImmVal); + if (!MCE) { + TokError("immediate value expected for vector index"); + return false; + } + + SMLoc E = getLoc(); + if (Parser.getTok().isNot(AsmToken::RBrac)) { + Error(E, "']' expected"); + return false; + } + + Parser.Lex(); // Eat right bracket token. + + Operands.push_back(AArch64Operand::CreateVectorIndex(MCE->getValue(), SIdx, + E, getContext())); + } + + return false; +} + +/// parseRegister - Parse a non-vector register operand. +bool AArch64AsmParser::parseRegister(OperandVector &Operands) { + SMLoc S = getLoc(); + // Try for a vector register. + if (!tryParseVectorRegister(Operands)) + return false; + + // Try for a scalar register. + int64_t Reg = tryParseRegister(); + if (Reg == -1) + return true; + Operands.push_back( + AArch64Operand::CreateReg(Reg, false, S, getLoc(), getContext())); + + // A small number of instructions (FMOVXDhighr, for example) have "[1]" + // as a string token in the instruction itself. + if (getLexer().getKind() == AsmToken::LBrac) { + SMLoc LBracS = getLoc(); + Parser.Lex(); + const AsmToken &Tok = Parser.getTok(); + if (Tok.is(AsmToken::Integer)) { + SMLoc IntS = getLoc(); + int64_t Val = Tok.getIntVal(); + if (Val == 1) { + Parser.Lex(); + if (getLexer().getKind() == AsmToken::RBrac) { + SMLoc RBracS = getLoc(); + Parser.Lex(); + Operands.push_back( + AArch64Operand::CreateToken("[", false, LBracS, getContext())); + Operands.push_back( + AArch64Operand::CreateToken("1", false, IntS, getContext())); + Operands.push_back( + AArch64Operand::CreateToken("]", false, RBracS, getContext())); + return false; + } + } + } + } + + return false; +} + +bool AArch64AsmParser::parseSymbolicImmVal(const MCExpr *&ImmVal) { + bool HasELFModifier = false; + AArch64MCExpr::VariantKind RefKind; + + if (Parser.getTok().is(AsmToken::Colon)) { + Parser.Lex(); // Eat ':" + HasELFModifier = true; + + if (Parser.getTok().isNot(AsmToken::Identifier)) { + Error(Parser.getTok().getLoc(), + "expect relocation specifier in operand after ':'"); + return true; + } + + std::string LowerCase = Parser.getTok().getIdentifier().lower(); + RefKind = StringSwitch(LowerCase) + .Case("lo12", AArch64MCExpr::VK_LO12) + .Case("abs_g3", AArch64MCExpr::VK_ABS_G3) + .Case("abs_g2", AArch64MCExpr::VK_ABS_G2) + .Case("abs_g2_s", AArch64MCExpr::VK_ABS_G2_S) + .Case("abs_g2_nc", AArch64MCExpr::VK_ABS_G2_NC) + .Case("abs_g1", AArch64MCExpr::VK_ABS_G1) + .Case("abs_g1_s", AArch64MCExpr::VK_ABS_G1_S) + .Case("abs_g1_nc", AArch64MCExpr::VK_ABS_G1_NC) + .Case("abs_g0", AArch64MCExpr::VK_ABS_G0) + .Case("abs_g0_s", AArch64MCExpr::VK_ABS_G0_S) + .Case("abs_g0_nc", AArch64MCExpr::VK_ABS_G0_NC) + .Case("dtprel_g2", AArch64MCExpr::VK_DTPREL_G2) + .Case("dtprel_g1", AArch64MCExpr::VK_DTPREL_G1) + .Case("dtprel_g1_nc", AArch64MCExpr::VK_DTPREL_G1_NC) + .Case("dtprel_g0", AArch64MCExpr::VK_DTPREL_G0) + .Case("dtprel_g0_nc", AArch64MCExpr::VK_DTPREL_G0_NC) + .Case("dtprel_hi12", AArch64MCExpr::VK_DTPREL_HI12) + .Case("dtprel_lo12", AArch64MCExpr::VK_DTPREL_LO12) + .Case("dtprel_lo12_nc", AArch64MCExpr::VK_DTPREL_LO12_NC) + .Case("tprel_g2", AArch64MCExpr::VK_TPREL_G2) + .Case("tprel_g1", AArch64MCExpr::VK_TPREL_G1) + .Case("tprel_g1_nc", AArch64MCExpr::VK_TPREL_G1_NC) + .Case("tprel_g0", AArch64MCExpr::VK_TPREL_G0) + .Case("tprel_g0_nc", AArch64MCExpr::VK_TPREL_G0_NC) + .Case("tprel_hi12", AArch64MCExpr::VK_TPREL_HI12) + .Case("tprel_lo12", AArch64MCExpr::VK_TPREL_LO12) + 
.Case("tprel_lo12_nc", AArch64MCExpr::VK_TPREL_LO12_NC) + .Case("tlsdesc_lo12", AArch64MCExpr::VK_TLSDESC_LO12) + .Case("got", AArch64MCExpr::VK_GOT_PAGE) + .Case("got_lo12", AArch64MCExpr::VK_GOT_LO12) + .Case("gottprel", AArch64MCExpr::VK_GOTTPREL_PAGE) + .Case("gottprel_lo12", AArch64MCExpr::VK_GOTTPREL_LO12_NC) + .Case("gottprel_g1", AArch64MCExpr::VK_GOTTPREL_G1) + .Case("gottprel_g0_nc", AArch64MCExpr::VK_GOTTPREL_G0_NC) + .Case("tlsdesc", AArch64MCExpr::VK_TLSDESC_PAGE) + .Default(AArch64MCExpr::VK_INVALID); + + if (RefKind == AArch64MCExpr::VK_INVALID) { + Error(Parser.getTok().getLoc(), + "expect relocation specifier in operand after ':'"); + return true; + } + + Parser.Lex(); // Eat identifier + + if (Parser.getTok().isNot(AsmToken::Colon)) { + Error(Parser.getTok().getLoc(), "expect ':' after relocation specifier"); + return true; + } + Parser.Lex(); // Eat ':' + } + + if (getParser().parseExpression(ImmVal)) + return true; + + if (HasELFModifier) + ImmVal = AArch64MCExpr::Create(ImmVal, RefKind, getContext()); + + return false; +} + +/// parseVectorList - Parse a vector list operand for AdvSIMD instructions. +bool AArch64AsmParser::parseVectorList(OperandVector &Operands) { + assert(Parser.getTok().is(AsmToken::LCurly) && "Token is not a Left Bracket"); + SMLoc S = getLoc(); + Parser.Lex(); // Eat left bracket token. + StringRef Kind; + int64_t FirstReg = tryMatchVectorRegister(Kind, true); + if (FirstReg == -1) + return true; + int64_t PrevReg = FirstReg; + unsigned Count = 1; + + if (Parser.getTok().is(AsmToken::Minus)) { + Parser.Lex(); // Eat the minus. + + SMLoc Loc = getLoc(); + StringRef NextKind; + int64_t Reg = tryMatchVectorRegister(NextKind, true); + if (Reg == -1) + return true; + // Any Kind suffices must match on all regs in the list. + if (Kind != NextKind) + return Error(Loc, "mismatched register size suffix"); + + unsigned Space = (PrevReg < Reg) ? (Reg - PrevReg) : (Reg + 32 - PrevReg); + + if (Space == 0 || Space > 3) { + return Error(Loc, "invalid number of vectors"); + } + + Count += Space; + } + else { + while (Parser.getTok().is(AsmToken::Comma)) { + Parser.Lex(); // Eat the comma token. + + SMLoc Loc = getLoc(); + StringRef NextKind; + int64_t Reg = tryMatchVectorRegister(NextKind, true); + if (Reg == -1) + return true; + // Any Kind suffices must match on all regs in the list. + if (Kind != NextKind) + return Error(Loc, "mismatched register size suffix"); + + // Registers must be incremental (with wraparound at 31) + if (getContext().getRegisterInfo()->getEncodingValue(Reg) != + (getContext().getRegisterInfo()->getEncodingValue(PrevReg) + 1) % 32) + return Error(Loc, "registers must be sequential"); + + PrevReg = Reg; + ++Count; + } + } + + if (Parser.getTok().isNot(AsmToken::RCurly)) + return Error(getLoc(), "'}' expected"); + Parser.Lex(); // Eat the '}' token. + + if (Count > 4) + return Error(S, "invalid number of vectors"); + + unsigned NumElements = 0; + char ElementKind = 0; + if (!Kind.empty()) + parseValidVectorKind(Kind, NumElements, ElementKind); + + Operands.push_back(AArch64Operand::CreateVectorList( + FirstReg, Count, NumElements, ElementKind, S, getLoc(), getContext())); + + // If there is an index specifier following the list, parse that too. + if (Parser.getTok().is(AsmToken::LBrac)) { + SMLoc SIdx = getLoc(); + Parser.Lex(); // Eat left bracket token. 
+ + const MCExpr *ImmVal; + if (getParser().parseExpression(ImmVal)) + return false; + const MCConstantExpr *MCE = dyn_cast(ImmVal); + if (!MCE) { + TokError("immediate value expected for vector index"); + return false; + } + + SMLoc E = getLoc(); + if (Parser.getTok().isNot(AsmToken::RBrac)) { + Error(E, "']' expected"); + return false; + } + + Parser.Lex(); // Eat right bracket token. + + Operands.push_back(AArch64Operand::CreateVectorIndex(MCE->getValue(), SIdx, + E, getContext())); + } + return false; +} + +AArch64AsmParser::OperandMatchResultTy +AArch64AsmParser::tryParseGPR64sp0Operand(OperandVector &Operands) { + const AsmToken &Tok = Parser.getTok(); + if (!Tok.is(AsmToken::Identifier)) + return MatchOperand_NoMatch; + + unsigned RegNum = MatchRegisterName(Tok.getString().lower()); + + MCContext &Ctx = getContext(); + const MCRegisterInfo *RI = Ctx.getRegisterInfo(); + if (!RI->getRegClass(AArch64::GPR64spRegClassID).contains(RegNum)) + return MatchOperand_NoMatch; + + SMLoc S = getLoc(); + Parser.Lex(); // Eat register + + if (Parser.getTok().isNot(AsmToken::Comma)) { + Operands.push_back( + AArch64Operand::CreateReg(RegNum, false, S, getLoc(), Ctx)); + return MatchOperand_Success; + } + Parser.Lex(); // Eat comma. + + if (Parser.getTok().is(AsmToken::Hash)) + Parser.Lex(); // Eat hash + + if (Parser.getTok().isNot(AsmToken::Integer)) { + Error(getLoc(), "index must be absent or #0"); + return MatchOperand_ParseFail; + } + + const MCExpr *ImmVal; + if (Parser.parseExpression(ImmVal) || !isa(ImmVal) || + cast(ImmVal)->getValue() != 0) { + Error(getLoc(), "index must be absent or #0"); + return MatchOperand_ParseFail; + } + + Operands.push_back( + AArch64Operand::CreateReg(RegNum, false, S, getLoc(), Ctx)); + return MatchOperand_Success; +} + +/// parseOperand - Parse a arm instruction operand. For now this parses the +/// operand regardless of the mnemonic. +bool AArch64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode, + bool invertCondCode) { + // Check if the current operand has a custom associated parser, if so, try to + // custom parse the operand, or fallback to the general approach. + OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic); + if (ResTy == MatchOperand_Success) + return false; + // If there wasn't a custom match, try the generic matcher below. Otherwise, + // there was a match, but an error occurred, in which case, just return that + // the operand parsing failed. + if (ResTy == MatchOperand_ParseFail) + return true; + + // Nothing custom, so do general case parsing. + SMLoc S, E; + switch (getLexer().getKind()) { + default: { + SMLoc S = getLoc(); + const MCExpr *Expr; + if (parseSymbolicImmVal(Expr)) + return Error(S, "invalid operand"); + + SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1); + Operands.push_back(AArch64Operand::CreateImm(Expr, S, E, getContext())); + return false; + } + case AsmToken::LBrac: { + SMLoc Loc = Parser.getTok().getLoc(); + Operands.push_back(AArch64Operand::CreateToken("[", false, Loc, + getContext())); + Parser.Lex(); // Eat '[' + + // There's no comma after a '[', so we can parse the next operand + // immediately. + return parseOperand(Operands, false, false); + } + case AsmToken::LCurly: + return parseVectorList(Operands); + case AsmToken::Identifier: { + // If we're expecting a Condition Code operand, then just parse that. + if (isCondCode) + return parseCondCode(Operands, invertCondCode); + + // If it's a register name, parse it. 
+ if (!parseRegister(Operands)) + return false; + + // This could be an optional "shift" or "extend" operand. + OperandMatchResultTy GotShift = tryParseOptionalShiftExtend(Operands); + // We can only continue if no tokens were eaten. + if (GotShift != MatchOperand_NoMatch) + return GotShift; + + // This was not a register so parse other operands that start with an + // identifier (like labels) as expressions and create them as immediates. + const MCExpr *IdVal; + S = getLoc(); + if (getParser().parseExpression(IdVal)) + return true; + + E = SMLoc::getFromPointer(getLoc().getPointer() - 1); + Operands.push_back(AArch64Operand::CreateImm(IdVal, S, E, getContext())); + return false; + } + case AsmToken::Integer: + case AsmToken::Real: + case AsmToken::Hash: { + // #42 -> immediate. + S = getLoc(); + if (getLexer().is(AsmToken::Hash)) + Parser.Lex(); + + // Parse a negative sign + bool isNegative = false; + if (Parser.getTok().is(AsmToken::Minus)) { + isNegative = true; + // We need to consume this token only when we have a Real, otherwise + // we let parseSymbolicImmVal take care of it + if (Parser.getLexer().peekTok().is(AsmToken::Real)) + Parser.Lex(); + } + + // The only Real that should come through here is a literal #0.0 for + // the fcmp[e] r, #0.0 instructions. They expect raw token operands, + // so convert the value. + const AsmToken &Tok = Parser.getTok(); + if (Tok.is(AsmToken::Real)) { + APFloat RealVal(APFloat::IEEEdouble, Tok.getString()); + uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue(); + if (Mnemonic != "fcmp" && Mnemonic != "fcmpe" && Mnemonic != "fcmeq" && + Mnemonic != "fcmge" && Mnemonic != "fcmgt" && Mnemonic != "fcmle" && + Mnemonic != "fcmlt") + return TokError("unexpected floating point literal"); + else if (IntVal != 0 || isNegative) + return TokError("expected floating-point constant #0.0"); + Parser.Lex(); // Eat the token. + + Operands.push_back( + AArch64Operand::CreateToken("#0", false, S, getContext())); + Operands.push_back( + AArch64Operand::CreateToken(".0", false, S, getContext())); + return false; + } + + const MCExpr *ImmVal; + if (parseSymbolicImmVal(ImmVal)) + return true; + + E = SMLoc::getFromPointer(getLoc().getPointer() - 1); + Operands.push_back(AArch64Operand::CreateImm(ImmVal, S, E, getContext())); + return false; + } + } +} + +/// ParseInstruction - Parse an AArch64 instruction mnemonic followed by its +/// operands. +bool AArch64AsmParser::ParseInstruction(ParseInstructionInfo &Info, + StringRef Name, SMLoc NameLoc, + OperandVector &Operands) { + Name = StringSwitch(Name.lower()) + .Case("beq", "b.eq") + .Case("bne", "b.ne") + .Case("bhs", "b.hs") + .Case("bcs", "b.cs") + .Case("blo", "b.lo") + .Case("bcc", "b.cc") + .Case("bmi", "b.mi") + .Case("bpl", "b.pl") + .Case("bvs", "b.vs") + .Case("bvc", "b.vc") + .Case("bhi", "b.hi") + .Case("bls", "b.ls") + .Case("bge", "b.ge") + .Case("blt", "b.lt") + .Case("bgt", "b.gt") + .Case("ble", "b.le") + .Case("bal", "b.al") + .Case("bnv", "b.nv") + .Default(Name); + + // Create the leading tokens for the mnemonic, split by '.' characters. + size_t Start = 0, Next = Name.find('.'); + StringRef Head = Name.slice(Start, Next); + + // IC, DC, AT, and TLBI instructions are aliases for the SYS instruction. 
+ if (Head == "ic" || Head == "dc" || Head == "at" || Head == "tlbi") { + bool IsError = parseSysAlias(Head, NameLoc, Operands); + if (IsError && getLexer().isNot(AsmToken::EndOfStatement)) + Parser.eatToEndOfStatement(); + return IsError; + } + + Operands.push_back( + AArch64Operand::CreateToken(Head, false, NameLoc, getContext())); + Mnemonic = Head; + + // Handle condition codes for a branch mnemonic + if (Head == "b" && Next != StringRef::npos) { + Start = Next; + Next = Name.find('.', Start + 1); + Head = Name.slice(Start + 1, Next); + + SMLoc SuffixLoc = SMLoc::getFromPointer(NameLoc.getPointer() + + (Head.data() - Name.data())); + AArch64CC::CondCode CC = parseCondCodeString(Head); + if (CC == AArch64CC::Invalid) + return Error(SuffixLoc, "invalid condition code"); + Operands.push_back( + AArch64Operand::CreateToken(".", true, SuffixLoc, getContext())); + Operands.push_back( + AArch64Operand::CreateCondCode(CC, NameLoc, NameLoc, getContext())); + } + + // Add the remaining tokens in the mnemonic. + while (Next != StringRef::npos) { + Start = Next; + Next = Name.find('.', Start + 1); + Head = Name.slice(Start, Next); + SMLoc SuffixLoc = SMLoc::getFromPointer(NameLoc.getPointer() + + (Head.data() - Name.data()) + 1); + Operands.push_back( + AArch64Operand::CreateToken(Head, true, SuffixLoc, getContext())); + } + + // Conditional compare instructions have a Condition Code operand, which needs + // to be parsed and an immediate operand created. + bool condCodeFourthOperand = + (Head == "ccmp" || Head == "ccmn" || Head == "fccmp" || + Head == "fccmpe" || Head == "fcsel" || Head == "csel" || + Head == "csinc" || Head == "csinv" || Head == "csneg"); + + // These instructions are aliases to some of the conditional select + // instructions. However, the condition code is inverted in the aliased + // instruction. + // + // FIXME: Is this the correct way to handle these? Or should the parser + // generate the aliased instructions directly? + bool condCodeSecondOperand = (Head == "cset" || Head == "csetm"); + bool condCodeThirdOperand = + (Head == "cinc" || Head == "cinv" || Head == "cneg"); + + // Read the remaining operands. + if (getLexer().isNot(AsmToken::EndOfStatement)) { + // Read the first operand. + if (parseOperand(Operands, false, false)) { + Parser.eatToEndOfStatement(); + return true; + } + + unsigned N = 2; + while (getLexer().is(AsmToken::Comma)) { + Parser.Lex(); // Eat the comma. + + // Parse and remember the operand. + if (parseOperand(Operands, (N == 4 && condCodeFourthOperand) || + (N == 3 && condCodeThirdOperand) || + (N == 2 && condCodeSecondOperand), + condCodeSecondOperand || condCodeThirdOperand)) { + Parser.eatToEndOfStatement(); + return true; + } + + // After successfully parsing some operands there are two special cases to + // consider (i.e. notional operands not separated by commas). Both are due + // to memory specifiers: + // + An RBrac will end an address for load/store/prefetch + // + An '!' will indicate a pre-indexed operation. + // + // It's someone else's responsibility to make sure these tokens are sane + // in the given context! 
+ if (Parser.getTok().is(AsmToken::RBrac)) { + SMLoc Loc = Parser.getTok().getLoc(); + Operands.push_back(AArch64Operand::CreateToken("]", false, Loc, + getContext())); + Parser.Lex(); + } + + if (Parser.getTok().is(AsmToken::Exclaim)) { + SMLoc Loc = Parser.getTok().getLoc(); + Operands.push_back(AArch64Operand::CreateToken("!", false, Loc, + getContext())); + Parser.Lex(); + } + + ++N; + } + } + + if (getLexer().isNot(AsmToken::EndOfStatement)) { + SMLoc Loc = Parser.getTok().getLoc(); + Parser.eatToEndOfStatement(); + return Error(Loc, "unexpected token in argument list"); + } + + Parser.Lex(); // Consume the EndOfStatement + return false; +} + +// FIXME: This entire function is a giant hack to provide us with decent +// operand range validation/diagnostics until TableGen/MC can be extended +// to support autogeneration of this kind of validation. +bool AArch64AsmParser::validateInstruction(MCInst &Inst, + SmallVectorImpl &Loc) { + const MCRegisterInfo *RI = getContext().getRegisterInfo(); + // Check for indexed addressing modes w/ the base register being the + // same as a destination/source register or pair load where + // the Rt == Rt2. All of those are undefined behaviour. + switch (Inst.getOpcode()) { + case AArch64::LDPSWpre: + case AArch64::LDPWpost: + case AArch64::LDPWpre: + case AArch64::LDPXpost: + case AArch64::LDPXpre: { + unsigned Rt = Inst.getOperand(1).getReg(); + unsigned Rt2 = Inst.getOperand(2).getReg(); + unsigned Rn = Inst.getOperand(3).getReg(); + if (RI->isSubRegisterEq(Rn, Rt)) + return Error(Loc[0], "unpredictable LDP instruction, writeback base " + "is also a destination"); + if (RI->isSubRegisterEq(Rn, Rt2)) + return Error(Loc[1], "unpredictable LDP instruction, writeback base " + "is also a destination"); + // FALLTHROUGH + } + case AArch64::LDPDi: + case AArch64::LDPQi: + case AArch64::LDPSi: + case AArch64::LDPSWi: + case AArch64::LDPWi: + case AArch64::LDPXi: { + unsigned Rt = Inst.getOperand(0).getReg(); + unsigned Rt2 = Inst.getOperand(1).getReg(); + if (Rt == Rt2) + return Error(Loc[1], "unpredictable LDP instruction, Rt2==Rt"); + break; + } + case AArch64::LDPDpost: + case AArch64::LDPDpre: + case AArch64::LDPQpost: + case AArch64::LDPQpre: + case AArch64::LDPSpost: + case AArch64::LDPSpre: + case AArch64::LDPSWpost: { + unsigned Rt = Inst.getOperand(1).getReg(); + unsigned Rt2 = Inst.getOperand(2).getReg(); + if (Rt == Rt2) + return Error(Loc[1], "unpredictable LDP instruction, Rt2==Rt"); + break; + } + case AArch64::STPDpost: + case AArch64::STPDpre: + case AArch64::STPQpost: + case AArch64::STPQpre: + case AArch64::STPSpost: + case AArch64::STPSpre: + case AArch64::STPWpost: + case AArch64::STPWpre: + case AArch64::STPXpost: + case AArch64::STPXpre: { + unsigned Rt = Inst.getOperand(1).getReg(); + unsigned Rt2 = Inst.getOperand(2).getReg(); + unsigned Rn = Inst.getOperand(3).getReg(); + if (RI->isSubRegisterEq(Rn, Rt)) + return Error(Loc[0], "unpredictable STP instruction, writeback base " + "is also a source"); + if (RI->isSubRegisterEq(Rn, Rt2)) + return Error(Loc[1], "unpredictable STP instruction, writeback base " + "is also a source"); + break; + } + case AArch64::LDRBBpre: + case AArch64::LDRBpre: + case AArch64::LDRHHpre: + case AArch64::LDRHpre: + case AArch64::LDRSBWpre: + case AArch64::LDRSBXpre: + case AArch64::LDRSHWpre: + case AArch64::LDRSHXpre: + case AArch64::LDRSWpre: + case AArch64::LDRWpre: + case AArch64::LDRXpre: + case AArch64::LDRBBpost: + case AArch64::LDRBpost: + case AArch64::LDRHHpost: + case AArch64::LDRHpost: + case 
AArch64::LDRSBWpost: + case AArch64::LDRSBXpost: + case AArch64::LDRSHWpost: + case AArch64::LDRSHXpost: + case AArch64::LDRSWpost: + case AArch64::LDRWpost: + case AArch64::LDRXpost: { + unsigned Rt = Inst.getOperand(1).getReg(); + unsigned Rn = Inst.getOperand(2).getReg(); + if (RI->isSubRegisterEq(Rn, Rt)) + return Error(Loc[0], "unpredictable LDR instruction, writeback base " + "is also a source"); + break; + } + case AArch64::STRBBpost: + case AArch64::STRBpost: + case AArch64::STRHHpost: + case AArch64::STRHpost: + case AArch64::STRWpost: + case AArch64::STRXpost: + case AArch64::STRBBpre: + case AArch64::STRBpre: + case AArch64::STRHHpre: + case AArch64::STRHpre: + case AArch64::STRWpre: + case AArch64::STRXpre: { + unsigned Rt = Inst.getOperand(1).getReg(); + unsigned Rn = Inst.getOperand(2).getReg(); + if (RI->isSubRegisterEq(Rn, Rt)) + return Error(Loc[0], "unpredictable STR instruction, writeback base " + "is also a source"); + break; + } + } + + // Now check immediate ranges. Separate from the above as there is overlap + // in the instructions being checked and this keeps the nested conditionals + // to a minimum. + switch (Inst.getOpcode()) { + case AArch64::ADDSWri: + case AArch64::ADDSXri: + case AArch64::ADDWri: + case AArch64::ADDXri: + case AArch64::SUBSWri: + case AArch64::SUBSXri: + case AArch64::SUBWri: + case AArch64::SUBXri: { + // Annoyingly we can't do this in the isAddSubImm predicate, so there is + // some slight duplication here. + if (Inst.getOperand(2).isExpr()) { + const MCExpr *Expr = Inst.getOperand(2).getExpr(); + AArch64MCExpr::VariantKind ELFRefKind; + MCSymbolRefExpr::VariantKind DarwinRefKind; + int64_t Addend; + if (!classifySymbolRef(Expr, ELFRefKind, DarwinRefKind, Addend)) { + return Error(Loc[2], "invalid immediate expression"); + } + + // Only allow these with ADDXri. 
+ if ((DarwinRefKind == MCSymbolRefExpr::VK_PAGEOFF || + DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGEOFF) && + Inst.getOpcode() == AArch64::ADDXri) + return false; + + // Only allow these with ADDXri/ADDWri + if ((ELFRefKind == AArch64MCExpr::VK_LO12 || + ELFRefKind == AArch64MCExpr::VK_DTPREL_HI12 || + ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12 || + ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12_NC || + ELFRefKind == AArch64MCExpr::VK_TPREL_HI12 || + ELFRefKind == AArch64MCExpr::VK_TPREL_LO12 || + ELFRefKind == AArch64MCExpr::VK_TPREL_LO12_NC || + ELFRefKind == AArch64MCExpr::VK_TLSDESC_LO12) && + (Inst.getOpcode() == AArch64::ADDXri || + Inst.getOpcode() == AArch64::ADDWri)) + return false; + + // Don't allow expressions in the immediate field otherwise + return Error(Loc[2], "invalid immediate expression"); + } + return false; + } + default: + return false; + } +} + +bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode) { + switch (ErrCode) { + case Match_MissingFeature: + return Error(Loc, + "instruction requires a CPU feature not currently enabled"); + case Match_InvalidOperand: + return Error(Loc, "invalid operand for instruction"); + case Match_InvalidSuffix: + return Error(Loc, "invalid type suffix for instruction"); + case Match_InvalidCondCode: + return Error(Loc, "expected AArch64 condition code"); + case Match_AddSubRegExtendSmall: + return Error(Loc, + "expected '[su]xt[bhw]' or 'lsl' with optional integer in range [0, 4]"); + case Match_AddSubRegExtendLarge: + return Error(Loc, + "expected 'sxtx' 'uxtx' or 'lsl' with optional integer in range [0, 4]"); + case Match_AddSubSecondSource: + return Error(Loc, + "expected compatible register, symbol or integer in range [0, 4095]"); + case Match_LogicalSecondSource: + return Error(Loc, "expected compatible register or logical immediate"); + case Match_InvalidMovImm32Shift: + return Error(Loc, "expected 'lsl' with optional integer 0 or 16"); + case Match_InvalidMovImm64Shift: + return Error(Loc, "expected 'lsl' with optional integer 0, 16, 32 or 48"); + case Match_AddSubRegShift32: + return Error(Loc, + "expected 'lsl', 'lsr' or 'asr' with optional integer in range [0, 31]"); + case Match_AddSubRegShift64: + return Error(Loc, + "expected 'lsl', 'lsr' or 'asr' with optional integer in range [0, 63]"); + case Match_InvalidFPImm: + return Error(Loc, + "expected compatible register or floating-point constant"); + case Match_InvalidMemoryIndexedSImm9: + return Error(Loc, "index must be an integer in range [-256, 255]."); + case Match_InvalidMemoryIndexed4SImm7: + return Error(Loc, "index must be a multiple of 4 in range [-256, 252]."); + case Match_InvalidMemoryIndexed8SImm7: + return Error(Loc, "index must be a multiple of 8 in range [-512, 504]."); + case Match_InvalidMemoryIndexed16SImm7: + return Error(Loc, "index must be a multiple of 16 in range [-1024, 1008]."); + case Match_InvalidMemoryWExtend8: + return Error(Loc, + "expected 'uxtw' or 'sxtw' with optional shift of #0"); + case Match_InvalidMemoryWExtend16: + return Error(Loc, + "expected 'uxtw' or 'sxtw' with optional shift of #0 or #1"); + case Match_InvalidMemoryWExtend32: + return Error(Loc, + "expected 'uxtw' or 'sxtw' with optional shift of #0 or #2"); + case Match_InvalidMemoryWExtend64: + return Error(Loc, + "expected 'uxtw' or 'sxtw' with optional shift of #0 or #3"); + case Match_InvalidMemoryWExtend128: + return Error(Loc, + "expected 'uxtw' or 'sxtw' with optional shift of #0 or #4"); + case Match_InvalidMemoryXExtend8: + return Error(Loc, + "expected 
'lsl' or 'sxtx' with optional shift of #0"); + case Match_InvalidMemoryXExtend16: + return Error(Loc, + "expected 'lsl' or 'sxtx' with optional shift of #0 or #1"); + case Match_InvalidMemoryXExtend32: + return Error(Loc, + "expected 'lsl' or 'sxtx' with optional shift of #0 or #2"); + case Match_InvalidMemoryXExtend64: + return Error(Loc, + "expected 'lsl' or 'sxtx' with optional shift of #0 or #3"); + case Match_InvalidMemoryXExtend128: + return Error(Loc, + "expected 'lsl' or 'sxtx' with optional shift of #0 or #4"); + case Match_InvalidMemoryIndexed1: + return Error(Loc, "index must be an integer in range [0, 4095]."); + case Match_InvalidMemoryIndexed2: + return Error(Loc, "index must be a multiple of 2 in range [0, 8190]."); + case Match_InvalidMemoryIndexed4: + return Error(Loc, "index must be a multiple of 4 in range [0, 16380]."); + case Match_InvalidMemoryIndexed8: + return Error(Loc, "index must be a multiple of 8 in range [0, 32760]."); + case Match_InvalidMemoryIndexed16: + return Error(Loc, "index must be a multiple of 16 in range [0, 65520]."); + case Match_InvalidImm0_7: + return Error(Loc, "immediate must be an integer in range [0, 7]."); + case Match_InvalidImm0_15: + return Error(Loc, "immediate must be an integer in range [0, 15]."); + case Match_InvalidImm0_31: + return Error(Loc, "immediate must be an integer in range [0, 31]."); + case Match_InvalidImm0_63: + return Error(Loc, "immediate must be an integer in range [0, 63]."); + case Match_InvalidImm0_127: + return Error(Loc, "immediate must be an integer in range [0, 127]."); + case Match_InvalidImm0_65535: + return Error(Loc, "immediate must be an integer in range [0, 65535]."); + case Match_InvalidImm1_8: + return Error(Loc, "immediate must be an integer in range [1, 8]."); + case Match_InvalidImm1_16: + return Error(Loc, "immediate must be an integer in range [1, 16]."); + case Match_InvalidImm1_32: + return Error(Loc, "immediate must be an integer in range [1, 32]."); + case Match_InvalidImm1_64: + return Error(Loc, "immediate must be an integer in range [1, 64]."); + case Match_InvalidIndex1: + return Error(Loc, "expected lane specifier '[1]'"); + case Match_InvalidIndexB: + return Error(Loc, "vector lane must be an integer in range [0, 15]."); + case Match_InvalidIndexH: + return Error(Loc, "vector lane must be an integer in range [0, 7]."); + case Match_InvalidIndexS: + return Error(Loc, "vector lane must be an integer in range [0, 3]."); + case Match_InvalidIndexD: + return Error(Loc, "vector lane must be an integer in range [0, 1]."); + case Match_InvalidLabel: + return Error(Loc, "expected label or encodable integer pc offset"); + case Match_MRS: + return Error(Loc, "expected readable system register"); + case Match_MSR: + return Error(Loc, "expected writable system register or pstate"); + case Match_MnemonicFail: + return Error(Loc, "unrecognized instruction mnemonic"); + default: + assert(0 && "unexpected error code!"); + return Error(Loc, "invalid instruction format"); + } +} + +static const char *getSubtargetFeatureName(unsigned Val); + +bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + OperandVector &Operands, + MCStreamer &Out, + unsigned &ErrorInfo, + bool MatchingInlineAsm) { + assert(!Operands.empty() && "Unexpect empty operand list!"); + AArch64Operand *Op = static_cast(Operands[0]); + assert(Op->isToken() && "Leading operand should always be a mnemonic!"); + + StringRef Tok = Op->getToken(); + unsigned NumOperands = Operands.size(); + + if (NumOperands == 4 && Tok 
== "lsl") {
+    AArch64Operand *Op2 = static_cast<AArch64Operand *>(Operands[2]);
+    AArch64Operand *Op3 = static_cast<AArch64Operand *>(Operands[3]);
+    if (Op2->isReg() && Op3->isImm()) {
+      const MCConstantExpr *Op3CE = dyn_cast<MCConstantExpr>(Op3->getImm());
+      if (Op3CE) {
+        uint64_t Op3Val = Op3CE->getValue();
+        uint64_t NewOp3Val = 0;
+        uint64_t NewOp4Val = 0;
+        if (AArch64MCRegisterClasses[AArch64::GPR32allRegClassID].contains(
+                Op2->getReg())) {
+          NewOp3Val = (32 - Op3Val) & 0x1f;
+          NewOp4Val = 31 - Op3Val;
+        } else {
+          NewOp3Val = (64 - Op3Val) & 0x3f;
+          NewOp4Val = 63 - Op3Val;
+        }
+
+        const MCExpr *NewOp3 = MCConstantExpr::Create(NewOp3Val, getContext());
+        const MCExpr *NewOp4 = MCConstantExpr::Create(NewOp4Val, getContext());
+
+        Operands[0] = AArch64Operand::CreateToken(
+            "ubfm", false, Op->getStartLoc(), getContext());
+        Operands[3] = AArch64Operand::CreateImm(NewOp3, Op3->getStartLoc(),
+                                                Op3->getEndLoc(), getContext());
+        Operands.push_back(AArch64Operand::CreateImm(
+            NewOp4, Op3->getStartLoc(), Op3->getEndLoc(), getContext()));
+        delete Op3;
+        delete Op;
+      }
+    }
+  } else if (NumOperands == 5) {
+    // FIXME: Horrible hack to handle the BFI -> BFM, SBFIZ->SBFM, and
+    // UBFIZ -> UBFM aliases.
+    if (Tok == "bfi" || Tok == "sbfiz" || Tok == "ubfiz") {
+      AArch64Operand *Op1 = static_cast<AArch64Operand *>(Operands[1]);
+      AArch64Operand *Op3 = static_cast<AArch64Operand *>(Operands[3]);
+      AArch64Operand *Op4 = static_cast<AArch64Operand *>(Operands[4]);
+
+      if (Op1->isReg() && Op3->isImm() && Op4->isImm()) {
+        const MCConstantExpr *Op3CE = dyn_cast<MCConstantExpr>(Op3->getImm());
+        const MCConstantExpr *Op4CE = dyn_cast<MCConstantExpr>(Op4->getImm());
+
+        if (Op3CE && Op4CE) {
+          uint64_t Op3Val = Op3CE->getValue();
+          uint64_t Op4Val = Op4CE->getValue();
+
+          uint64_t RegWidth = 0;
+          if (AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains(
+                  Op1->getReg()))
+            RegWidth = 64;
+          else
+            RegWidth = 32;
+
+          if (Op3Val >= RegWidth)
+            return Error(Op3->getStartLoc(),
+                         "expected integer in range [0, 31]");
+          if (Op4Val < 1 || Op4Val > RegWidth)
+            return Error(Op4->getStartLoc(),
+                         "expected integer in range [1, 32]");
+
+          uint64_t NewOp3Val = 0;
+          if (AArch64MCRegisterClasses[AArch64::GPR32allRegClassID].contains(
+                  Op1->getReg()))
+            NewOp3Val = (32 - Op3Val) & 0x1f;
+          else
+            NewOp3Val = (64 - Op3Val) & 0x3f;
+
+          uint64_t NewOp4Val = Op4Val - 1;
+
+          if (NewOp3Val != 0 && NewOp4Val >= NewOp3Val)
+            return Error(Op4->getStartLoc(),
+                         "requested insert overflows register");
+
+          const MCExpr *NewOp3 =
+              MCConstantExpr::Create(NewOp3Val, getContext());
+          const MCExpr *NewOp4 =
+              MCConstantExpr::Create(NewOp4Val, getContext());
+          Operands[3] = AArch64Operand::CreateImm(
+              NewOp3, Op3->getStartLoc(), Op3->getEndLoc(), getContext());
+          Operands[4] = AArch64Operand::CreateImm(
+              NewOp4, Op4->getStartLoc(), Op4->getEndLoc(), getContext());
+          if (Tok == "bfi")
+            Operands[0] = AArch64Operand::CreateToken(
+                "bfm", false, Op->getStartLoc(), getContext());
+          else if (Tok == "sbfiz")
+            Operands[0] = AArch64Operand::CreateToken(
+                "sbfm", false, Op->getStartLoc(), getContext());
+          else if (Tok == "ubfiz")
+            Operands[0] = AArch64Operand::CreateToken(
+                "ubfm", false, Op->getStartLoc(), getContext());
+          else
+            llvm_unreachable("No valid mnemonic for alias?");
+
+          delete Op;
+          delete Op3;
+          delete Op4;
+        }
+      }
+
+      // FIXME: Horrible hack to handle the BFXIL->BFM, SBFX->SBFM, and
+      // UBFX -> UBFM aliases.
+    } else if (NumOperands == 5 &&
+               (Tok == "bfxil" || Tok == "sbfx" || Tok == "ubfx")) {
+      AArch64Operand *Op1 = static_cast<AArch64Operand *>(Operands[1]);
+      AArch64Operand *Op3 = static_cast<AArch64Operand *>(Operands[3]);
+      AArch64Operand *Op4 = static_cast<AArch64Operand *>(Operands[4]);
+
+      if (Op1->isReg() && Op3->isImm() && Op4->isImm()) {
+        const MCConstantExpr *Op3CE = dyn_cast<MCConstantExpr>(Op3->getImm());
+        const MCConstantExpr *Op4CE = dyn_cast<MCConstantExpr>(Op4->getImm());
+
+        if (Op3CE && Op4CE) {
+          uint64_t Op3Val = Op3CE->getValue();
+          uint64_t Op4Val = Op4CE->getValue();
+
+          uint64_t RegWidth = 0;
+          if (AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains(
+                  Op1->getReg()))
+            RegWidth = 64;
+          else
+            RegWidth = 32;
+
+          if (Op3Val >= RegWidth)
+            return Error(Op3->getStartLoc(),
+                         "expected integer in range [0, 31]");
+          if (Op4Val < 1 || Op4Val > RegWidth)
+            return Error(Op4->getStartLoc(),
+                         "expected integer in range [1, 32]");
+
+          uint64_t NewOp4Val = Op3Val + Op4Val - 1;
+
+          if (NewOp4Val >= RegWidth || NewOp4Val < Op3Val)
+            return Error(Op4->getStartLoc(),
+                         "requested extract overflows register");
+
+          const MCExpr *NewOp4 =
+              MCConstantExpr::Create(NewOp4Val, getContext());
+          Operands[4] = AArch64Operand::CreateImm(
+              NewOp4, Op4->getStartLoc(), Op4->getEndLoc(), getContext());
+          if (Tok == "bfxil")
+            Operands[0] = AArch64Operand::CreateToken(
+                "bfm", false, Op->getStartLoc(), getContext());
+          else if (Tok == "sbfx")
+            Operands[0] = AArch64Operand::CreateToken(
+                "sbfm", false, Op->getStartLoc(), getContext());
+          else if (Tok == "ubfx")
+            Operands[0] = AArch64Operand::CreateToken(
+                "ubfm", false, Op->getStartLoc(), getContext());
+          else
+            llvm_unreachable("No valid mnemonic for alias?");
+
+          delete Op;
+          delete Op4;
+        }
+      }
+    }
+  }
+  // FIXME: Horrible hack for sxtw and uxtw with Wn src and Xd dst operands.
+  // InstAlias can't quite handle this since the reg classes aren't
+  // subclasses.
+  if (NumOperands == 3 && (Tok == "sxtw" || Tok == "uxtw")) {
+    // The source register can be Wn here, but the matcher expects a
+    // GPR64. Twiddle it here if necessary.
+    AArch64Operand *Op = static_cast<AArch64Operand *>(Operands[2]);
+    if (Op->isReg()) {
+      unsigned Reg = getXRegFromWReg(Op->getReg());
+      Operands[2] = AArch64Operand::CreateReg(Reg, false, Op->getStartLoc(),
+                                              Op->getEndLoc(), getContext());
+      delete Op;
+    }
+  }
+  // FIXME: Likewise for sxt[bh] with a Xd dst operand
+  else if (NumOperands == 3 && (Tok == "sxtb" || Tok == "sxth")) {
+    AArch64Operand *Op = static_cast<AArch64Operand *>(Operands[1]);
+    if (Op->isReg() &&
+        AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains(
+            Op->getReg())) {
+      // The source register can be Wn here, but the matcher expects a
+      // GPR64. Twiddle it here if necessary.
+      AArch64Operand *Op = static_cast<AArch64Operand *>(Operands[2]);
+      if (Op->isReg()) {
+        unsigned Reg = getXRegFromWReg(Op->getReg());
+        Operands[2] = AArch64Operand::CreateReg(Reg, false, Op->getStartLoc(),
+                                                Op->getEndLoc(), getContext());
+        delete Op;
+      }
+    }
+  }
+  // FIXME: Likewise for uxt[bh] with a Xd dst operand
+  else if (NumOperands == 3 && (Tok == "uxtb" || Tok == "uxth")) {
+    AArch64Operand *Op = static_cast<AArch64Operand *>(Operands[1]);
+    if (Op->isReg() &&
+        AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains(
+            Op->getReg())) {
+      // The source register can be Wn here, but the matcher expects a
+      // GPR32. Twiddle it here if necessary.
+ AArch64Operand *Op = static_cast(Operands[1]); + if (Op->isReg()) { + unsigned Reg = getWRegFromXReg(Op->getReg()); + Operands[1] = AArch64Operand::CreateReg(Reg, false, Op->getStartLoc(), + Op->getEndLoc(), getContext()); + delete Op; + } + } + } + + // Yet another horrible hack to handle FMOV Rd, #0.0 using [WX]ZR. + if (NumOperands == 3 && Tok == "fmov") { + AArch64Operand *RegOp = static_cast(Operands[1]); + AArch64Operand *ImmOp = static_cast(Operands[2]); + if (RegOp->isReg() && ImmOp->isFPImm() && + ImmOp->getFPImm() == (unsigned)-1) { + unsigned zreg = + AArch64MCRegisterClasses[AArch64::FPR32RegClassID].contains( + RegOp->getReg()) + ? AArch64::WZR + : AArch64::XZR; + Operands[2] = AArch64Operand::CreateReg(zreg, false, Op->getStartLoc(), + Op->getEndLoc(), getContext()); + delete ImmOp; + } + } + + MCInst Inst; + // First try to match against the secondary set of tables containing the + // short-form NEON instructions (e.g. "fadd.2s v0, v1, v2"). + unsigned MatchResult = + MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm, 1); + + // If that fails, try against the alternate table containing long-form NEON: + // "fadd v0.2s, v1.2s, v2.2s" + if (MatchResult != Match_Success) + MatchResult = + MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm, 0); + + switch (MatchResult) { + case Match_Success: { + // Perform range checking and other semantic validations + SmallVector OperandLocs; + NumOperands = Operands.size(); + for (unsigned i = 1; i < NumOperands; ++i) + OperandLocs.push_back(Operands[i]->getStartLoc()); + if (validateInstruction(Inst, OperandLocs)) + return true; + + Inst.setLoc(IDLoc); + Out.EmitInstruction(Inst, STI); + return false; + } + case Match_MissingFeature: { + assert(ErrorInfo && "Unknown missing feature!"); + // Special case the error message for the very common case where only + // a single subtarget feature is missing (neon, e.g.). + std::string Msg = "instruction requires:"; + unsigned Mask = 1; + for (unsigned i = 0; i < (sizeof(ErrorInfo)*8-1); ++i) { + if (ErrorInfo & Mask) { + Msg += " "; + Msg += getSubtargetFeatureName(ErrorInfo & Mask); + } + Mask <<= 1; + } + return Error(IDLoc, Msg); + } + case Match_MnemonicFail: + return showMatchError(IDLoc, MatchResult); + case Match_InvalidOperand: { + SMLoc ErrorLoc = IDLoc; + if (ErrorInfo != ~0U) { + if (ErrorInfo >= Operands.size()) + return Error(IDLoc, "too few operands for instruction"); + + ErrorLoc = ((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(); + if (ErrorLoc == SMLoc()) + ErrorLoc = IDLoc; + } + // If the match failed on a suffix token operand, tweak the diagnostic + // accordingly. 
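+ // (A "suffix" token is the arrangement suffix split off a short-form
+ // mnemonic, e.g. the ".2s" in "fadd.2s".)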
+ if (((AArch64Operand *)Operands[ErrorInfo])->isToken() && + ((AArch64Operand *)Operands[ErrorInfo])->isTokenSuffix()) + MatchResult = Match_InvalidSuffix; + + return showMatchError(ErrorLoc, MatchResult); + } + case Match_InvalidMemoryIndexed1: + case Match_InvalidMemoryIndexed2: + case Match_InvalidMemoryIndexed4: + case Match_InvalidMemoryIndexed8: + case Match_InvalidMemoryIndexed16: + case Match_InvalidCondCode: + case Match_AddSubRegExtendSmall: + case Match_AddSubRegExtendLarge: + case Match_AddSubSecondSource: + case Match_LogicalSecondSource: + case Match_AddSubRegShift32: + case Match_AddSubRegShift64: + case Match_InvalidMovImm32Shift: + case Match_InvalidMovImm64Shift: + case Match_InvalidFPImm: + case Match_InvalidMemoryWExtend8: + case Match_InvalidMemoryWExtend16: + case Match_InvalidMemoryWExtend32: + case Match_InvalidMemoryWExtend64: + case Match_InvalidMemoryWExtend128: + case Match_InvalidMemoryXExtend8: + case Match_InvalidMemoryXExtend16: + case Match_InvalidMemoryXExtend32: + case Match_InvalidMemoryXExtend64: + case Match_InvalidMemoryXExtend128: + case Match_InvalidMemoryIndexed4SImm7: + case Match_InvalidMemoryIndexed8SImm7: + case Match_InvalidMemoryIndexed16SImm7: + case Match_InvalidMemoryIndexedSImm9: + case Match_InvalidImm0_7: + case Match_InvalidImm0_15: + case Match_InvalidImm0_31: + case Match_InvalidImm0_63: + case Match_InvalidImm0_127: + case Match_InvalidImm0_65535: + case Match_InvalidImm1_8: + case Match_InvalidImm1_16: + case Match_InvalidImm1_32: + case Match_InvalidImm1_64: + case Match_InvalidIndex1: + case Match_InvalidIndexB: + case Match_InvalidIndexH: + case Match_InvalidIndexS: + case Match_InvalidIndexD: + case Match_InvalidLabel: + case Match_MSR: + case Match_MRS: { + // Any time we get here, there's nothing fancy to do. Just get the + // operand SMLoc and display the diagnostic. + SMLoc ErrorLoc = ((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(); + if (ErrorLoc == SMLoc()) + ErrorLoc = IDLoc; + return showMatchError(ErrorLoc, MatchResult); + } + } + + llvm_unreachable("Implement any new match types added!"); + return true; +} + +/// ParseDirective parses the arm specific directives +bool AArch64AsmParser::ParseDirective(AsmToken DirectiveID) { + StringRef IDVal = DirectiveID.getIdentifier(); + SMLoc Loc = DirectiveID.getLoc(); + if (IDVal == ".hword") + return parseDirectiveWord(2, Loc); + if (IDVal == ".word") + return parseDirectiveWord(4, Loc); + if (IDVal == ".xword") + return parseDirectiveWord(8, Loc); + if (IDVal == ".tlsdesccall") + return parseDirectiveTLSDescCall(Loc); + + return parseDirectiveLOH(IDVal, Loc); +} + +/// parseDirectiveWord +/// ::= .word [ expression (, expression)* ] +bool AArch64AsmParser::parseDirectiveWord(unsigned Size, SMLoc L) { + if (getLexer().isNot(AsmToken::EndOfStatement)) { + for (;;) { + const MCExpr *Value; + if (getParser().parseExpression(Value)) + return true; + + getParser().getStreamer().EmitValue(Value, Size); + + if (getLexer().is(AsmToken::EndOfStatement)) + break; + + // FIXME: Improve diagnostic. 
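+ // (The location used below is the start of the directive rather than the
+ // offending token.)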
+      if (getLexer().isNot(AsmToken::Comma))
+        return Error(L, "unexpected token in directive");
+      Parser.Lex();
+    }
+  }
+
+  Parser.Lex();
+  return false;
+}
+
+// parseDirectiveTLSDescCall:
+//   ::= .tlsdesccall symbol
+bool AArch64AsmParser::parseDirectiveTLSDescCall(SMLoc L) {
+  StringRef Name;
+  if (getParser().parseIdentifier(Name))
+    return Error(L, "expected symbol after directive");
+
+  MCSymbol *Sym = getContext().GetOrCreateSymbol(Name);
+  const MCExpr *Expr = MCSymbolRefExpr::Create(Sym, getContext());
+  Expr = AArch64MCExpr::Create(Expr, AArch64MCExpr::VK_TLSDESC, getContext());
+
+  MCInst Inst;
+  Inst.setOpcode(AArch64::TLSDESCCALL);
+  Inst.addOperand(MCOperand::CreateExpr(Expr));
+
+  getParser().getStreamer().EmitInstruction(Inst, STI);
+  return false;
+}
+
+/// ::= .loh label1, ..., labelN
+/// The number of arguments depends on the loh identifier.
+bool AArch64AsmParser::parseDirectiveLOH(StringRef IDVal, SMLoc Loc) {
+  if (IDVal != MCLOHDirectiveName())
+    return true;
+  MCLOHType Kind;
+  if (getParser().getTok().isNot(AsmToken::Identifier)) {
+    if (getParser().getTok().isNot(AsmToken::Integer))
+      return TokError("expected an identifier or a number in directive");
+    // We successfully get a numeric value for the identifier.
+    // Check if it is valid.
+    int64_t Id = getParser().getTok().getIntVal();
+    Kind = (MCLOHType)Id;
+    // Check that Id does not overflow MCLOHType.
+    if (!isValidMCLOHType(Kind) || Id != Kind)
+      return TokError("invalid numeric identifier in directive");
+  } else {
+    StringRef Name = getTok().getIdentifier();
+    // We successfully parse an identifier.
+    // Check if it is a recognized one.
+    int Id = MCLOHNameToId(Name);
+
+    if (Id == -1)
+      return TokError("invalid identifier in directive");
+    Kind = (MCLOHType)Id;
+  }
+  // Consume the identifier.
+  Lex();
+  // Get the number of arguments of this LOH.
+  int NbArgs = MCLOHIdToNbArgs(Kind);
+
+  assert(NbArgs != -1 && "Invalid number of arguments");
+
+  SmallVector<MCSymbol *, 3> Args;
+  for (int Idx = 0; Idx < NbArgs; ++Idx) {
+    StringRef Name;
+    if (getParser().parseIdentifier(Name))
+      return TokError("expected identifier in directive");
+    Args.push_back(getContext().GetOrCreateSymbol(Name));
+
+    if (Idx + 1 == NbArgs)
+      break;
+    if (getLexer().isNot(AsmToken::Comma))
+      return TokError("unexpected token in '" + Twine(IDVal) + "' directive");
+    Lex();
+  }
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in '" + Twine(IDVal) + "' directive");
+
+  getStreamer().EmitLOHDirective((MCLOHType)Kind, Args);
+  return false;
+}
+
+bool
+AArch64AsmParser::classifySymbolRef(const MCExpr *Expr,
+                                    AArch64MCExpr::VariantKind &ELFRefKind,
+                                    MCSymbolRefExpr::VariantKind &DarwinRefKind,
+                                    int64_t &Addend) {
+  ELFRefKind = AArch64MCExpr::VK_INVALID;
+  DarwinRefKind = MCSymbolRefExpr::VK_None;
+  Addend = 0;
+
+  if (const AArch64MCExpr *AE = dyn_cast<AArch64MCExpr>(Expr)) {
+    ELFRefKind = AE->getKind();
+    Expr = AE->getSubExpr();
+  }
+
+  const MCSymbolRefExpr *SE = dyn_cast<MCSymbolRefExpr>(Expr);
+  if (SE) {
+    // It's a simple symbol reference with no addend.
+    DarwinRefKind = SE->getKind();
+    return true;
+  }
+
+  const MCBinaryExpr *BE = dyn_cast<MCBinaryExpr>(Expr);
+  if (!BE)
+    return false;
+
+  SE = dyn_cast<MCSymbolRefExpr>(BE->getLHS());
+  if (!SE)
+    return false;
+  DarwinRefKind = SE->getKind();
+
+  if (BE->getOpcode() != MCBinaryExpr::Add &&
+      BE->getOpcode() != MCBinaryExpr::Sub)
+    return false;
+
+  // See if the addend is a constant, otherwise there's more going
+  // on here than we can deal with.
+  auto AddendExpr = dyn_cast<MCConstantExpr>(BE->getRHS());
+  if (!AddendExpr)
+    return false;
+
+  Addend = AddendExpr->getValue();
+  if (BE->getOpcode() == MCBinaryExpr::Sub)
+    Addend = -Addend;
+
+  // It's some symbol reference + a constant addend, but really
+  // shouldn't use both Darwin and ELF syntax.
+  return ELFRefKind == AArch64MCExpr::VK_INVALID ||
+         DarwinRefKind == MCSymbolRefExpr::VK_None;
+}
+
+/// Force static initialization.
+extern "C" void LLVMInitializeAArch64AsmParser() {
+  RegisterMCAsmParser<AArch64AsmParser> X(TheAArch64leTarget);
+  RegisterMCAsmParser<AArch64AsmParser> Y(TheAArch64beTarget);
+
+  RegisterMCAsmParser<AArch64AsmParser> Z(TheARM64leTarget);
+  RegisterMCAsmParser<AArch64AsmParser> W(TheARM64beTarget);
+}
+
+#define GET_REGISTER_MATCHER
+#define GET_SUBTARGET_FEATURE_NAME
+#define GET_MATCHER_IMPLEMENTATION
+#include "AArch64GenAsmMatcher.inc"
+
+// Define this matcher function after the auto-generated include so we
+// have the match class enum definitions.
+unsigned AArch64AsmParser::validateTargetOperandClass(MCParsedAsmOperand *AsmOp,
+                                                      unsigned Kind) {
+  AArch64Operand *Op = static_cast<AArch64Operand *>(AsmOp);
+  // If the kind is a token for a literal immediate, check if our asm
+  // operand matches. This is for InstAliases which have a fixed-value
+  // immediate in the syntax.
+  int64_t ExpectedVal;
+  switch (Kind) {
+  default:
+    return Match_InvalidOperand;
+  case MCK__35_0:
+    ExpectedVal = 0;
+    break;
+  case MCK__35_1:
+    ExpectedVal = 1;
+    break;
+  case MCK__35_12:
+    ExpectedVal = 12;
+    break;
+  case MCK__35_16:
+    ExpectedVal = 16;
+    break;
+  case MCK__35_2:
+    ExpectedVal = 2;
+    break;
+  case MCK__35_24:
+    ExpectedVal = 24;
+    break;
+  case MCK__35_3:
+    ExpectedVal = 3;
+    break;
+  case MCK__35_32:
+    ExpectedVal = 32;
+    break;
+  case MCK__35_4:
+    ExpectedVal = 4;
+    break;
+  case MCK__35_48:
+    ExpectedVal = 48;
+    break;
+  case MCK__35_6:
+    ExpectedVal = 6;
+    break;
+  case MCK__35_64:
+    ExpectedVal = 64;
+    break;
+  case MCK__35_8:
+    ExpectedVal = 8;
+    break;
+  }
+  if (!Op->isImm())
+    return Match_InvalidOperand;
+  const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Op->getImm());
+  if (!CE)
+    return Match_InvalidOperand;
+  if (CE->getValue() == ExpectedVal)
+    return Match_Success;
+  return Match_InvalidOperand;
+}
diff --git a/lib/Target/AArch64/AsmParser/CMakeLists.txt b/lib/Target/AArch64/AsmParser/CMakeLists.txt
new file mode 100644
index 00000000000..cc0a9d86a14
--- /dev/null
+++ b/lib/Target/AArch64/AsmParser/CMakeLists.txt
@@ -0,0 +1,6 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMAArch64AsmParser
+  AArch64AsmParser.cpp
+  )
+
diff --git a/lib/Target/AArch64/AsmParser/LLVMBuild.txt b/lib/Target/AArch64/AsmParser/LLVMBuild.txt
new file mode 100644
index 00000000000..11eb9d55f61
--- /dev/null
+++ b/lib/Target/AArch64/AsmParser/LLVMBuild.txt
@@ -0,0 +1,23 @@
+;===- ./lib/Target/AArch64/AsmParser/LLVMBuild.txt ---------------*- Conf -*--===;
+;
+; The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = AArch64AsmParser +parent = AArch64 +required_libraries = AArch64Desc AArch64Info AArch64Utils MC MCParser Support +add_to_library_groups = AArch64 diff --git a/lib/Target/AArch64/AsmParser/Makefile b/lib/Target/AArch64/AsmParser/Makefile new file mode 100644 index 00000000000..00268c76f8e --- /dev/null +++ b/lib/Target/AArch64/AsmParser/Makefile @@ -0,0 +1,15 @@ +##===- lib/Target/AArch64/AsmParser/Makefile ---------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## +LEVEL = ../../../.. +LIBRARYNAME = LLVMAArch64AsmParser + +# Hack: we need to include 'main' ARM target directory to grab private headers +CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. + +include $(LEVEL)/Makefile.common diff --git a/lib/Target/AArch64/CMakeLists.txt b/lib/Target/AArch64/CMakeLists.txt new file mode 100644 index 00000000000..789d549bb15 --- /dev/null +++ b/lib/Target/AArch64/CMakeLists.txt @@ -0,0 +1,51 @@ +set(LLVM_TARGET_DEFINITIONS AArch64.td) + +tablegen(LLVM AArch64GenRegisterInfo.inc -gen-register-info) +tablegen(LLVM AArch64GenInstrInfo.inc -gen-instr-info) +tablegen(LLVM AArch64GenMCCodeEmitter.inc -gen-emitter -mc-emitter) +tablegen(LLVM AArch64GenMCPseudoLowering.inc -gen-pseudo-lowering) +tablegen(LLVM AArch64GenAsmWriter.inc -gen-asm-writer) +tablegen(LLVM AArch64GenAsmWriter1.inc -gen-asm-writer -asmwriternum=1) +tablegen(LLVM AArch64GenAsmMatcher.inc -gen-asm-matcher) +tablegen(LLVM AArch64GenDAGISel.inc -gen-dag-isel) +tablegen(LLVM AArch64GenFastISel.inc -gen-fast-isel) +tablegen(LLVM AArch64GenCallingConv.inc -gen-callingconv) +tablegen(LLVM AArch64GenSubtargetInfo.inc -gen-subtarget) +tablegen(LLVM AArch64GenDisassemblerTables.inc -gen-disassembler) +add_public_tablegen_target(AArch64CommonTableGen) + +add_llvm_target(AArch64CodeGen + AArch64AddressTypePromotion.cpp + AArch64AdvSIMDScalarPass.cpp + AArch64AsmPrinter.cpp + AArch64BranchRelaxation.cpp + AArch64CleanupLocalDynamicTLSPass.cpp + AArch64CollectLOH.cpp + AArch64ConditionalCompares.cpp + AArch64DeadRegisterDefinitionsPass.cpp + AArch64ExpandPseudoInsts.cpp + AArch64FastISel.cpp + AArch64FrameLowering.cpp + AArch64ISelDAGToDAG.cpp + AArch64ISelLowering.cpp + AArch64InstrInfo.cpp + AArch64LoadStoreOptimizer.cpp + AArch64MCInstLower.cpp + AArch64PromoteConstant.cpp + AArch64RegisterInfo.cpp + AArch64SelectionDAGInfo.cpp + AArch64StorePairSuppress.cpp + AArch64Subtarget.cpp + AArch64TargetMachine.cpp + AArch64TargetObjectFile.cpp + AArch64TargetTransformInfo.cpp +) + +add_dependencies(LLVMAArch64CodeGen intrinsics_gen) + +add_subdirectory(TargetInfo) +add_subdirectory(AsmParser) +add_subdirectory(Disassembler) +add_subdirectory(InstPrinter) +add_subdirectory(MCTargetDesc) +add_subdirectory(Utils) diff --git a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp new file mode 100644 index 00000000000..6de27d6d51a --- /dev/null +++ b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp @@ -0,0 +1,1559 @@ +//===- AArch64Disassembler.cpp - Disassembler for AArch64 -------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure 
+// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +#include "AArch64Disassembler.h" +#include "AArch64ExternalSymbolizer.h" +#include "AArch64Subtarget.h" +#include "MCTargetDesc/AArch64AddressingModes.h" +#include "Utils/AArch64BaseInfo.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCFixedLenDisassembler.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/MemoryObject.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/ErrorHandling.h" + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-disassembler" + +// Pull DecodeStatus and its enum values into the global namespace. +typedef llvm::MCDisassembler::DecodeStatus DecodeStatus; + +// Forward declare these because the autogenerated code will reference them. +// Definitions are further down. +static DecodeStatus DecodeFPR128RegisterClass(llvm::MCInst &Inst, + unsigned RegNo, uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeFPR128_loRegisterClass(llvm::MCInst &Inst, + unsigned RegNo, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeFPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeFPR32RegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeFPR16RegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeFPR8RegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeGPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeGPR64spRegisterClass(llvm::MCInst &Inst, + unsigned RegNo, uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeGPR32RegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeGPR32spRegisterClass(llvm::MCInst &Inst, + unsigned RegNo, uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeQQRegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeQQQRegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeQQQQRegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeDDRegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeDDDRegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeDDDDRegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); + +static DecodeStatus DecodeFixedPointScaleImm32(llvm::MCInst &Inst, unsigned Imm, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeFixedPointScaleImm64(llvm::MCInst &Inst, unsigned Imm, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodePCRelLabel19(llvm::MCInst &Inst, unsigned Imm, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeMemExtend(llvm::MCInst &Inst, unsigned Imm, + uint64_t Address, const void *Decoder); +static DecodeStatus 
DecodeMRSSystemRegister(llvm::MCInst &Inst, unsigned Imm, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeMSRSystemRegister(llvm::MCInst &Inst, unsigned Imm, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeThreeAddrSRegInstruction(llvm::MCInst &Inst, + uint32_t insn, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeMoveImmInstruction(llvm::MCInst &Inst, uint32_t insn, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeUnsignedLdStInstruction(llvm::MCInst &Inst, + uint32_t insn, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeSignedLdStInstruction(llvm::MCInst &Inst, + uint32_t insn, uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeExclusiveLdStInstruction(llvm::MCInst &Inst, + uint32_t insn, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodePairLdStInstruction(llvm::MCInst &Inst, uint32_t insn, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeAddSubERegInstruction(llvm::MCInst &Inst, + uint32_t insn, uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeLogicalImmInstruction(llvm::MCInst &Inst, + uint32_t insn, uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeModImmInstruction(llvm::MCInst &Inst, uint32_t insn, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeModImmTiedInstruction(llvm::MCInst &Inst, + uint32_t insn, uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeAdrInstruction(llvm::MCInst &Inst, uint32_t insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeBaseAddSubImm(llvm::MCInst &Inst, uint32_t insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeUnconditionalBranch(llvm::MCInst &Inst, uint32_t insn, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeSystemPStateInstruction(llvm::MCInst &Inst, + uint32_t insn, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeTestAndBranch(llvm::MCInst &Inst, uint32_t insn, + uint64_t Address, const void *Decoder); + +static DecodeStatus DecodeFMOVLaneInstruction(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeVecShiftR64Imm(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder); +static DecodeStatus DecodeVecShiftR64ImmNarrow(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, + const void *Decoder); +static DecodeStatus DecodeVecShiftR32Imm(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder); +static DecodeStatus DecodeVecShiftR32ImmNarrow(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, + const void *Decoder); +static DecodeStatus DecodeVecShiftR16Imm(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder); +static DecodeStatus DecodeVecShiftR16ImmNarrow(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, + const void *Decoder); +static DecodeStatus DecodeVecShiftR8Imm(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder); +static DecodeStatus DecodeVecShiftL64Imm(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder); +static DecodeStatus DecodeVecShiftL32Imm(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder); +static DecodeStatus DecodeVecShiftL16Imm(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder); +static DecodeStatus DecodeVecShiftL8Imm(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder); + +static bool 
Check(DecodeStatus &Out, DecodeStatus In) { + switch (In) { + case MCDisassembler::Success: + // Out stays the same. + return true; + case MCDisassembler::SoftFail: + Out = In; + return true; + case MCDisassembler::Fail: + Out = In; + return false; + } + llvm_unreachable("Invalid DecodeStatus!"); +} + +#include "AArch64GenDisassemblerTables.inc" +#include "AArch64GenInstrInfo.inc" + +#define Success llvm::MCDisassembler::Success +#define Fail llvm::MCDisassembler::Fail +#define SoftFail llvm::MCDisassembler::SoftFail + +static MCDisassembler *createAArch64Disassembler(const Target &T, + const MCSubtargetInfo &STI, + MCContext &Ctx) { + return new AArch64Disassembler(STI, Ctx); +} + +DecodeStatus AArch64Disassembler::getInstruction(MCInst &MI, uint64_t &Size, + const MemoryObject &Region, + uint64_t Address, + raw_ostream &os, + raw_ostream &cs) const { + CommentStream = &cs; + + uint8_t bytes[4]; + + Size = 0; + // We want to read exactly 4 bytes of data. + if (Region.readBytes(Address, 4, (uint8_t *)bytes) == -1) + return Fail; + Size = 4; + + // Encoded as a small-endian 32-bit word in the stream. + uint32_t insn = + (bytes[3] << 24) | (bytes[2] << 16) | (bytes[1] << 8) | (bytes[0] << 0); + + // Calling the auto-generated decoder function. + return decodeInstruction(DecoderTable32, MI, insn, Address, this, STI); +} + +static MCSymbolizer * +createAArch64ExternalSymbolizer(StringRef TT, LLVMOpInfoCallback GetOpInfo, + LLVMSymbolLookupCallback SymbolLookUp, + void *DisInfo, MCContext *Ctx, + MCRelocationInfo *RelInfo) { + return new llvm::AArch64ExternalSymbolizer( + *Ctx, + std::unique_ptr(RelInfo), + GetOpInfo, SymbolLookUp, DisInfo); +} + +extern "C" void LLVMInitializeAArch64Disassembler() { + TargetRegistry::RegisterMCDisassembler(TheAArch64leTarget, + createAArch64Disassembler); + TargetRegistry::RegisterMCDisassembler(TheAArch64beTarget, + createAArch64Disassembler); + TargetRegistry::RegisterMCSymbolizer(TheAArch64leTarget, + createAArch64ExternalSymbolizer); + TargetRegistry::RegisterMCSymbolizer(TheAArch64beTarget, + createAArch64ExternalSymbolizer); + + TargetRegistry::RegisterMCDisassembler(TheARM64leTarget, + createAArch64Disassembler); + TargetRegistry::RegisterMCDisassembler(TheARM64beTarget, + createAArch64Disassembler); + TargetRegistry::RegisterMCSymbolizer(TheARM64leTarget, + createAArch64ExternalSymbolizer); + TargetRegistry::RegisterMCSymbolizer(TheARM64beTarget, + createAArch64ExternalSymbolizer); +} + +static const unsigned FPR128DecoderTable[] = { + AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3, AArch64::Q4, + AArch64::Q5, AArch64::Q6, AArch64::Q7, AArch64::Q8, AArch64::Q9, + AArch64::Q10, AArch64::Q11, AArch64::Q12, AArch64::Q13, AArch64::Q14, + AArch64::Q15, AArch64::Q16, AArch64::Q17, AArch64::Q18, AArch64::Q19, + AArch64::Q20, AArch64::Q21, AArch64::Q22, AArch64::Q23, AArch64::Q24, + AArch64::Q25, AArch64::Q26, AArch64::Q27, AArch64::Q28, AArch64::Q29, + AArch64::Q30, AArch64::Q31 +}; + +static DecodeStatus DecodeFPR128RegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, + const void *Decoder) { + if (RegNo > 31) + return Fail; + + unsigned Register = FPR128DecoderTable[RegNo]; + Inst.addOperand(MCOperand::CreateReg(Register)); + return Success; +} + +static DecodeStatus DecodeFPR128_loRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, + const void *Decoder) { + if (RegNo > 15) + return Fail; + return DecodeFPR128RegisterClass(Inst, RegNo, Addr, Decoder); +} + +static const unsigned FPR64DecoderTable[] = { + AArch64::D0, AArch64::D1, 
AArch64::D2, AArch64::D3, AArch64::D4, + AArch64::D5, AArch64::D6, AArch64::D7, AArch64::D8, AArch64::D9, + AArch64::D10, AArch64::D11, AArch64::D12, AArch64::D13, AArch64::D14, + AArch64::D15, AArch64::D16, AArch64::D17, AArch64::D18, AArch64::D19, + AArch64::D20, AArch64::D21, AArch64::D22, AArch64::D23, AArch64::D24, + AArch64::D25, AArch64::D26, AArch64::D27, AArch64::D28, AArch64::D29, + AArch64::D30, AArch64::D31 +}; + +static DecodeStatus DecodeFPR64RegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, + const void *Decoder) { + if (RegNo > 31) + return Fail; + + unsigned Register = FPR64DecoderTable[RegNo]; + Inst.addOperand(MCOperand::CreateReg(Register)); + return Success; +} + +static const unsigned FPR32DecoderTable[] = { + AArch64::S0, AArch64::S1, AArch64::S2, AArch64::S3, AArch64::S4, + AArch64::S5, AArch64::S6, AArch64::S7, AArch64::S8, AArch64::S9, + AArch64::S10, AArch64::S11, AArch64::S12, AArch64::S13, AArch64::S14, + AArch64::S15, AArch64::S16, AArch64::S17, AArch64::S18, AArch64::S19, + AArch64::S20, AArch64::S21, AArch64::S22, AArch64::S23, AArch64::S24, + AArch64::S25, AArch64::S26, AArch64::S27, AArch64::S28, AArch64::S29, + AArch64::S30, AArch64::S31 +}; + +static DecodeStatus DecodeFPR32RegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, + const void *Decoder) { + if (RegNo > 31) + return Fail; + + unsigned Register = FPR32DecoderTable[RegNo]; + Inst.addOperand(MCOperand::CreateReg(Register)); + return Success; +} + +static const unsigned FPR16DecoderTable[] = { + AArch64::H0, AArch64::H1, AArch64::H2, AArch64::H3, AArch64::H4, + AArch64::H5, AArch64::H6, AArch64::H7, AArch64::H8, AArch64::H9, + AArch64::H10, AArch64::H11, AArch64::H12, AArch64::H13, AArch64::H14, + AArch64::H15, AArch64::H16, AArch64::H17, AArch64::H18, AArch64::H19, + AArch64::H20, AArch64::H21, AArch64::H22, AArch64::H23, AArch64::H24, + AArch64::H25, AArch64::H26, AArch64::H27, AArch64::H28, AArch64::H29, + AArch64::H30, AArch64::H31 +}; + +static DecodeStatus DecodeFPR16RegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, + const void *Decoder) { + if (RegNo > 31) + return Fail; + + unsigned Register = FPR16DecoderTable[RegNo]; + Inst.addOperand(MCOperand::CreateReg(Register)); + return Success; +} + +static const unsigned FPR8DecoderTable[] = { + AArch64::B0, AArch64::B1, AArch64::B2, AArch64::B3, AArch64::B4, + AArch64::B5, AArch64::B6, AArch64::B7, AArch64::B8, AArch64::B9, + AArch64::B10, AArch64::B11, AArch64::B12, AArch64::B13, AArch64::B14, + AArch64::B15, AArch64::B16, AArch64::B17, AArch64::B18, AArch64::B19, + AArch64::B20, AArch64::B21, AArch64::B22, AArch64::B23, AArch64::B24, + AArch64::B25, AArch64::B26, AArch64::B27, AArch64::B28, AArch64::B29, + AArch64::B30, AArch64::B31 +}; + +static DecodeStatus DecodeFPR8RegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, + const void *Decoder) { + if (RegNo > 31) + return Fail; + + unsigned Register = FPR8DecoderTable[RegNo]; + Inst.addOperand(MCOperand::CreateReg(Register)); + return Success; +} + +static const unsigned GPR64DecoderTable[] = { + AArch64::X0, AArch64::X1, AArch64::X2, AArch64::X3, AArch64::X4, + AArch64::X5, AArch64::X6, AArch64::X7, AArch64::X8, AArch64::X9, + AArch64::X10, AArch64::X11, AArch64::X12, AArch64::X13, AArch64::X14, + AArch64::X15, AArch64::X16, AArch64::X17, AArch64::X18, AArch64::X19, + AArch64::X20, AArch64::X21, AArch64::X22, AArch64::X23, AArch64::X24, + AArch64::X25, AArch64::X26, AArch64::X27, AArch64::X28, AArch64::FP, + AArch64::LR, AArch64::XZR +}; + +static 
DecodeStatus DecodeGPR64RegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, + const void *Decoder) { + if (RegNo > 31) + return Fail; + + unsigned Register = GPR64DecoderTable[RegNo]; + Inst.addOperand(MCOperand::CreateReg(Register)); + return Success; +} + +static DecodeStatus DecodeGPR64spRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, + const void *Decoder) { + if (RegNo > 31) + return Fail; + unsigned Register = GPR64DecoderTable[RegNo]; + if (Register == AArch64::XZR) + Register = AArch64::SP; + Inst.addOperand(MCOperand::CreateReg(Register)); + return Success; +} + +static const unsigned GPR32DecoderTable[] = { + AArch64::W0, AArch64::W1, AArch64::W2, AArch64::W3, AArch64::W4, + AArch64::W5, AArch64::W6, AArch64::W7, AArch64::W8, AArch64::W9, + AArch64::W10, AArch64::W11, AArch64::W12, AArch64::W13, AArch64::W14, + AArch64::W15, AArch64::W16, AArch64::W17, AArch64::W18, AArch64::W19, + AArch64::W20, AArch64::W21, AArch64::W22, AArch64::W23, AArch64::W24, + AArch64::W25, AArch64::W26, AArch64::W27, AArch64::W28, AArch64::W29, + AArch64::W30, AArch64::WZR +}; + +static DecodeStatus DecodeGPR32RegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, + const void *Decoder) { + if (RegNo > 31) + return Fail; + + unsigned Register = GPR32DecoderTable[RegNo]; + Inst.addOperand(MCOperand::CreateReg(Register)); + return Success; +} + +static DecodeStatus DecodeGPR32spRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, + const void *Decoder) { + if (RegNo > 31) + return Fail; + + unsigned Register = GPR32DecoderTable[RegNo]; + if (Register == AArch64::WZR) + Register = AArch64::WSP; + Inst.addOperand(MCOperand::CreateReg(Register)); + return Success; +} + +static const unsigned VectorDecoderTable[] = { + AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3, AArch64::Q4, + AArch64::Q5, AArch64::Q6, AArch64::Q7, AArch64::Q8, AArch64::Q9, + AArch64::Q10, AArch64::Q11, AArch64::Q12, AArch64::Q13, AArch64::Q14, + AArch64::Q15, AArch64::Q16, AArch64::Q17, AArch64::Q18, AArch64::Q19, + AArch64::Q20, AArch64::Q21, AArch64::Q22, AArch64::Q23, AArch64::Q24, + AArch64::Q25, AArch64::Q26, AArch64::Q27, AArch64::Q28, AArch64::Q29, + AArch64::Q30, AArch64::Q31 +}; + +static DecodeStatus DecodeVectorRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, + const void *Decoder) { + if (RegNo > 31) + return Fail; + + unsigned Register = VectorDecoderTable[RegNo]; + Inst.addOperand(MCOperand::CreateReg(Register)); + return Success; +} + +static const unsigned QQDecoderTable[] = { + AArch64::Q0_Q1, AArch64::Q1_Q2, AArch64::Q2_Q3, AArch64::Q3_Q4, + AArch64::Q4_Q5, AArch64::Q5_Q6, AArch64::Q6_Q7, AArch64::Q7_Q8, + AArch64::Q8_Q9, AArch64::Q9_Q10, AArch64::Q10_Q11, AArch64::Q11_Q12, + AArch64::Q12_Q13, AArch64::Q13_Q14, AArch64::Q14_Q15, AArch64::Q15_Q16, + AArch64::Q16_Q17, AArch64::Q17_Q18, AArch64::Q18_Q19, AArch64::Q19_Q20, + AArch64::Q20_Q21, AArch64::Q21_Q22, AArch64::Q22_Q23, AArch64::Q23_Q24, + AArch64::Q24_Q25, AArch64::Q25_Q26, AArch64::Q26_Q27, AArch64::Q27_Q28, + AArch64::Q28_Q29, AArch64::Q29_Q30, AArch64::Q30_Q31, AArch64::Q31_Q0 +}; + +static DecodeStatus DecodeQQRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, const void *Decoder) { + if (RegNo > 31) + return Fail; + unsigned Register = QQDecoderTable[RegNo]; + Inst.addOperand(MCOperand::CreateReg(Register)); + return Success; +} + +static const unsigned QQQDecoderTable[] = { + AArch64::Q0_Q1_Q2, AArch64::Q1_Q2_Q3, AArch64::Q2_Q3_Q4, + AArch64::Q3_Q4_Q5, AArch64::Q4_Q5_Q6, AArch64::Q5_Q6_Q7, + 
AArch64::Q6_Q7_Q8, AArch64::Q7_Q8_Q9, AArch64::Q8_Q9_Q10, + AArch64::Q9_Q10_Q11, AArch64::Q10_Q11_Q12, AArch64::Q11_Q12_Q13, + AArch64::Q12_Q13_Q14, AArch64::Q13_Q14_Q15, AArch64::Q14_Q15_Q16, + AArch64::Q15_Q16_Q17, AArch64::Q16_Q17_Q18, AArch64::Q17_Q18_Q19, + AArch64::Q18_Q19_Q20, AArch64::Q19_Q20_Q21, AArch64::Q20_Q21_Q22, + AArch64::Q21_Q22_Q23, AArch64::Q22_Q23_Q24, AArch64::Q23_Q24_Q25, + AArch64::Q24_Q25_Q26, AArch64::Q25_Q26_Q27, AArch64::Q26_Q27_Q28, + AArch64::Q27_Q28_Q29, AArch64::Q28_Q29_Q30, AArch64::Q29_Q30_Q31, + AArch64::Q30_Q31_Q0, AArch64::Q31_Q0_Q1 +}; + +static DecodeStatus DecodeQQQRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, const void *Decoder) { + if (RegNo > 31) + return Fail; + unsigned Register = QQQDecoderTable[RegNo]; + Inst.addOperand(MCOperand::CreateReg(Register)); + return Success; +} + +static const unsigned QQQQDecoderTable[] = { + AArch64::Q0_Q1_Q2_Q3, AArch64::Q1_Q2_Q3_Q4, AArch64::Q2_Q3_Q4_Q5, + AArch64::Q3_Q4_Q5_Q6, AArch64::Q4_Q5_Q6_Q7, AArch64::Q5_Q6_Q7_Q8, + AArch64::Q6_Q7_Q8_Q9, AArch64::Q7_Q8_Q9_Q10, AArch64::Q8_Q9_Q10_Q11, + AArch64::Q9_Q10_Q11_Q12, AArch64::Q10_Q11_Q12_Q13, AArch64::Q11_Q12_Q13_Q14, + AArch64::Q12_Q13_Q14_Q15, AArch64::Q13_Q14_Q15_Q16, AArch64::Q14_Q15_Q16_Q17, + AArch64::Q15_Q16_Q17_Q18, AArch64::Q16_Q17_Q18_Q19, AArch64::Q17_Q18_Q19_Q20, + AArch64::Q18_Q19_Q20_Q21, AArch64::Q19_Q20_Q21_Q22, AArch64::Q20_Q21_Q22_Q23, + AArch64::Q21_Q22_Q23_Q24, AArch64::Q22_Q23_Q24_Q25, AArch64::Q23_Q24_Q25_Q26, + AArch64::Q24_Q25_Q26_Q27, AArch64::Q25_Q26_Q27_Q28, AArch64::Q26_Q27_Q28_Q29, + AArch64::Q27_Q28_Q29_Q30, AArch64::Q28_Q29_Q30_Q31, AArch64::Q29_Q30_Q31_Q0, + AArch64::Q30_Q31_Q0_Q1, AArch64::Q31_Q0_Q1_Q2 +}; + +static DecodeStatus DecodeQQQQRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, + const void *Decoder) { + if (RegNo > 31) + return Fail; + unsigned Register = QQQQDecoderTable[RegNo]; + Inst.addOperand(MCOperand::CreateReg(Register)); + return Success; +} + +static const unsigned DDDecoderTable[] = { + AArch64::D0_D1, AArch64::D1_D2, AArch64::D2_D3, AArch64::D3_D4, + AArch64::D4_D5, AArch64::D5_D6, AArch64::D6_D7, AArch64::D7_D8, + AArch64::D8_D9, AArch64::D9_D10, AArch64::D10_D11, AArch64::D11_D12, + AArch64::D12_D13, AArch64::D13_D14, AArch64::D14_D15, AArch64::D15_D16, + AArch64::D16_D17, AArch64::D17_D18, AArch64::D18_D19, AArch64::D19_D20, + AArch64::D20_D21, AArch64::D21_D22, AArch64::D22_D23, AArch64::D23_D24, + AArch64::D24_D25, AArch64::D25_D26, AArch64::D26_D27, AArch64::D27_D28, + AArch64::D28_D29, AArch64::D29_D30, AArch64::D30_D31, AArch64::D31_D0 +}; + +static DecodeStatus DecodeDDRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, const void *Decoder) { + if (RegNo > 31) + return Fail; + unsigned Register = DDDecoderTable[RegNo]; + Inst.addOperand(MCOperand::CreateReg(Register)); + return Success; +} + +static const unsigned DDDDecoderTable[] = { + AArch64::D0_D1_D2, AArch64::D1_D2_D3, AArch64::D2_D3_D4, + AArch64::D3_D4_D5, AArch64::D4_D5_D6, AArch64::D5_D6_D7, + AArch64::D6_D7_D8, AArch64::D7_D8_D9, AArch64::D8_D9_D10, + AArch64::D9_D10_D11, AArch64::D10_D11_D12, AArch64::D11_D12_D13, + AArch64::D12_D13_D14, AArch64::D13_D14_D15, AArch64::D14_D15_D16, + AArch64::D15_D16_D17, AArch64::D16_D17_D18, AArch64::D17_D18_D19, + AArch64::D18_D19_D20, AArch64::D19_D20_D21, AArch64::D20_D21_D22, + AArch64::D21_D22_D23, AArch64::D22_D23_D24, AArch64::D23_D24_D25, + AArch64::D24_D25_D26, AArch64::D25_D26_D27, AArch64::D26_D27_D28, + AArch64::D27_D28_D29, AArch64::D28_D29_D30, 
AArch64::D29_D30_D31, + AArch64::D30_D31_D0, AArch64::D31_D0_D1 +}; + +static DecodeStatus DecodeDDDRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, const void *Decoder) { + if (RegNo > 31) + return Fail; + unsigned Register = DDDDecoderTable[RegNo]; + Inst.addOperand(MCOperand::CreateReg(Register)); + return Success; +} + +static const unsigned DDDDDecoderTable[] = { + AArch64::D0_D1_D2_D3, AArch64::D1_D2_D3_D4, AArch64::D2_D3_D4_D5, + AArch64::D3_D4_D5_D6, AArch64::D4_D5_D6_D7, AArch64::D5_D6_D7_D8, + AArch64::D6_D7_D8_D9, AArch64::D7_D8_D9_D10, AArch64::D8_D9_D10_D11, + AArch64::D9_D10_D11_D12, AArch64::D10_D11_D12_D13, AArch64::D11_D12_D13_D14, + AArch64::D12_D13_D14_D15, AArch64::D13_D14_D15_D16, AArch64::D14_D15_D16_D17, + AArch64::D15_D16_D17_D18, AArch64::D16_D17_D18_D19, AArch64::D17_D18_D19_D20, + AArch64::D18_D19_D20_D21, AArch64::D19_D20_D21_D22, AArch64::D20_D21_D22_D23, + AArch64::D21_D22_D23_D24, AArch64::D22_D23_D24_D25, AArch64::D23_D24_D25_D26, + AArch64::D24_D25_D26_D27, AArch64::D25_D26_D27_D28, AArch64::D26_D27_D28_D29, + AArch64::D27_D28_D29_D30, AArch64::D28_D29_D30_D31, AArch64::D29_D30_D31_D0, + AArch64::D30_D31_D0_D1, AArch64::D31_D0_D1_D2 +}; + +static DecodeStatus DecodeDDDDRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, + const void *Decoder) { + if (RegNo > 31) + return Fail; + unsigned Register = DDDDDecoderTable[RegNo]; + Inst.addOperand(MCOperand::CreateReg(Register)); + return Success; +} + +static DecodeStatus DecodeFixedPointScaleImm32(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, + const void *Decoder) { + // scale{5} is asserted as 1 in tblgen. + Imm |= 0x20; + Inst.addOperand(MCOperand::CreateImm(64 - Imm)); + return Success; +} + +static DecodeStatus DecodeFixedPointScaleImm64(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, + const void *Decoder) { + Inst.addOperand(MCOperand::CreateImm(64 - Imm)); + return Success; +} + +static DecodeStatus DecodePCRelLabel19(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder) { + int64_t ImmVal = Imm; + const AArch64Disassembler *Dis = + static_cast(Decoder); + + // Sign-extend 19-bit immediate. + if (ImmVal & (1 << (19 - 1))) + ImmVal |= ~((1LL << 19) - 1); + + if (!Dis->tryAddingSymbolicOperand(Inst, ImmVal << 2, Addr, + Inst.getOpcode() != AArch64::LDRXl, 0, 4)) + Inst.addOperand(MCOperand::CreateImm(ImmVal)); + return Success; +} + +static DecodeStatus DecodeMemExtend(llvm::MCInst &Inst, unsigned Imm, + uint64_t Address, const void *Decoder) { + Inst.addOperand(MCOperand::CreateImm((Imm >> 1) & 1)); + Inst.addOperand(MCOperand::CreateImm(Imm & 1)); + return Success; +} + +static DecodeStatus DecodeMRSSystemRegister(llvm::MCInst &Inst, unsigned Imm, + uint64_t Address, + const void *Decoder) { + const AArch64Disassembler *Dis = + static_cast(Decoder); + const MCSubtargetInfo &STI = Dis->getSubtargetInfo(); + + Imm |= 0x8000; + Inst.addOperand(MCOperand::CreateImm(Imm)); + + bool ValidNamed; + (void)AArch64SysReg::MRSMapper(STI.getFeatureBits()) + .toString(Imm, ValidNamed); + + return ValidNamed ? Success : Fail; +} + +static DecodeStatus DecodeMSRSystemRegister(llvm::MCInst &Inst, unsigned Imm, + uint64_t Address, + const void *Decoder) { + const AArch64Disassembler *Dis = + static_cast(Decoder); + const MCSubtargetInfo &STI = Dis->getSubtargetInfo(); + + Imm |= 0x8000; + Inst.addOperand(MCOperand::CreateImm(Imm)); + + bool ValidNamed; + (void)AArch64SysReg::MSRMapper(STI.getFeatureBits()) + .toString(Imm, ValidNamed); + + return ValidNamed ? 
Success : Fail; +} + +static DecodeStatus DecodeFMOVLaneInstruction(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, + const void *Decoder) { + // This decoder exists to add the dummy Lane operand to the MCInst, which must + // be 1 in assembly but has no other real manifestation. + unsigned Rd = fieldFromInstruction(Insn, 0, 5); + unsigned Rn = fieldFromInstruction(Insn, 5, 5); + unsigned IsToVec = fieldFromInstruction(Insn, 16, 1); + + if (IsToVec) { + DecodeFPR128RegisterClass(Inst, Rd, Address, Decoder); + DecodeGPR64RegisterClass(Inst, Rn, Address, Decoder); + } else { + DecodeGPR64RegisterClass(Inst, Rd, Address, Decoder); + DecodeFPR128RegisterClass(Inst, Rn, Address, Decoder); + } + + // Add the lane + Inst.addOperand(MCOperand::CreateImm(1)); + + return Success; +} + +static DecodeStatus DecodeVecShiftRImm(llvm::MCInst &Inst, unsigned Imm, + unsigned Add) { + Inst.addOperand(MCOperand::CreateImm(Add - Imm)); + return Success; +} + +static DecodeStatus DecodeVecShiftLImm(llvm::MCInst &Inst, unsigned Imm, + unsigned Add) { + Inst.addOperand(MCOperand::CreateImm((Imm + Add) & (Add - 1))); + return Success; +} + +static DecodeStatus DecodeVecShiftR64Imm(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder) { + return DecodeVecShiftRImm(Inst, Imm, 64); +} + +static DecodeStatus DecodeVecShiftR64ImmNarrow(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, + const void *Decoder) { + return DecodeVecShiftRImm(Inst, Imm | 0x20, 64); +} + +static DecodeStatus DecodeVecShiftR32Imm(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder) { + return DecodeVecShiftRImm(Inst, Imm, 32); +} + +static DecodeStatus DecodeVecShiftR32ImmNarrow(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, + const void *Decoder) { + return DecodeVecShiftRImm(Inst, Imm | 0x10, 32); +} + +static DecodeStatus DecodeVecShiftR16Imm(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder) { + return DecodeVecShiftRImm(Inst, Imm, 16); +} + +static DecodeStatus DecodeVecShiftR16ImmNarrow(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, + const void *Decoder) { + return DecodeVecShiftRImm(Inst, Imm | 0x8, 16); +} + +static DecodeStatus DecodeVecShiftR8Imm(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder) { + return DecodeVecShiftRImm(Inst, Imm, 8); +} + +static DecodeStatus DecodeVecShiftL64Imm(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder) { + return DecodeVecShiftLImm(Inst, Imm, 64); +} + +static DecodeStatus DecodeVecShiftL32Imm(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder) { + return DecodeVecShiftLImm(Inst, Imm, 32); +} + +static DecodeStatus DecodeVecShiftL16Imm(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder) { + return DecodeVecShiftLImm(Inst, Imm, 16); +} + +static DecodeStatus DecodeVecShiftL8Imm(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder) { + return DecodeVecShiftLImm(Inst, Imm, 8); +} + +static DecodeStatus DecodeThreeAddrSRegInstruction(llvm::MCInst &Inst, + uint32_t insn, uint64_t Addr, + const void *Decoder) { + unsigned Rd = fieldFromInstruction(insn, 0, 5); + unsigned Rn = fieldFromInstruction(insn, 5, 5); + unsigned Rm = fieldFromInstruction(insn, 16, 5); + unsigned shiftHi = fieldFromInstruction(insn, 22, 2); + unsigned shiftLo = fieldFromInstruction(insn, 10, 6); + unsigned shift = (shiftHi << 6) | shiftLo; + switch (Inst.getOpcode()) { + default: + return Fail; + case AArch64::ADDWrs: + case AArch64::ADDSWrs: + case 
AArch64::SUBWrs: + case AArch64::SUBSWrs: + // if shift == '11' then ReservedValue() + if (shiftHi == 0x3) + return Fail; + // Deliberate fallthrough + case AArch64::ANDWrs: + case AArch64::ANDSWrs: + case AArch64::BICWrs: + case AArch64::BICSWrs: + case AArch64::ORRWrs: + case AArch64::ORNWrs: + case AArch64::EORWrs: + case AArch64::EONWrs: { + // if sf == '0' and imm6<5> == '1' then ReservedValue() + if (shiftLo >> 5 == 1) + return Fail; + DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder); + DecodeGPR32RegisterClass(Inst, Rn, Addr, Decoder); + DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder); + break; + } + case AArch64::ADDXrs: + case AArch64::ADDSXrs: + case AArch64::SUBXrs: + case AArch64::SUBSXrs: + // if shift == '11' then ReservedValue() + if (shiftHi == 0x3) + return Fail; + // Deliberate fallthrough + case AArch64::ANDXrs: + case AArch64::ANDSXrs: + case AArch64::BICXrs: + case AArch64::BICSXrs: + case AArch64::ORRXrs: + case AArch64::ORNXrs: + case AArch64::EORXrs: + case AArch64::EONXrs: + DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder); + DecodeGPR64RegisterClass(Inst, Rn, Addr, Decoder); + DecodeGPR64RegisterClass(Inst, Rm, Addr, Decoder); + break; + } + + Inst.addOperand(MCOperand::CreateImm(shift)); + return Success; +} + +static DecodeStatus DecodeMoveImmInstruction(llvm::MCInst &Inst, uint32_t insn, + uint64_t Addr, + const void *Decoder) { + unsigned Rd = fieldFromInstruction(insn, 0, 5); + unsigned imm = fieldFromInstruction(insn, 5, 16); + unsigned shift = fieldFromInstruction(insn, 21, 2); + shift <<= 4; + switch (Inst.getOpcode()) { + default: + return Fail; + case AArch64::MOVZWi: + case AArch64::MOVNWi: + case AArch64::MOVKWi: + if (shift & (1U << 5)) + return Fail; + DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder); + break; + case AArch64::MOVZXi: + case AArch64::MOVNXi: + case AArch64::MOVKXi: + DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder); + break; + } + + if (Inst.getOpcode() == AArch64::MOVKWi || + Inst.getOpcode() == AArch64::MOVKXi) + Inst.addOperand(Inst.getOperand(0)); + + Inst.addOperand(MCOperand::CreateImm(imm)); + Inst.addOperand(MCOperand::CreateImm(shift)); + return Success; +} + +static DecodeStatus DecodeUnsignedLdStInstruction(llvm::MCInst &Inst, + uint32_t insn, uint64_t Addr, + const void *Decoder) { + unsigned Rt = fieldFromInstruction(insn, 0, 5); + unsigned Rn = fieldFromInstruction(insn, 5, 5); + unsigned offset = fieldFromInstruction(insn, 10, 12); + const AArch64Disassembler *Dis = + static_cast(Decoder); + + switch (Inst.getOpcode()) { + default: + return Fail; + case AArch64::PRFMui: + // Rt is an immediate in prefetch. 
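+ // (For PRFM the Rt field selects the prefetch operation, e.g. PLDL1KEEP,
+ // so it is decoded as a plain immediate.)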
+ Inst.addOperand(MCOperand::CreateImm(Rt)); + break; + case AArch64::STRBBui: + case AArch64::LDRBBui: + case AArch64::LDRSBWui: + case AArch64::STRHHui: + case AArch64::LDRHHui: + case AArch64::LDRSHWui: + case AArch64::STRWui: + case AArch64::LDRWui: + DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder); + break; + case AArch64::LDRSBXui: + case AArch64::LDRSHXui: + case AArch64::LDRSWui: + case AArch64::STRXui: + case AArch64::LDRXui: + DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); + break; + case AArch64::LDRQui: + case AArch64::STRQui: + DecodeFPR128RegisterClass(Inst, Rt, Addr, Decoder); + break; + case AArch64::LDRDui: + case AArch64::STRDui: + DecodeFPR64RegisterClass(Inst, Rt, Addr, Decoder); + break; + case AArch64::LDRSui: + case AArch64::STRSui: + DecodeFPR32RegisterClass(Inst, Rt, Addr, Decoder); + break; + case AArch64::LDRHui: + case AArch64::STRHui: + DecodeFPR16RegisterClass(Inst, Rt, Addr, Decoder); + break; + case AArch64::LDRBui: + case AArch64::STRBui: + DecodeFPR8RegisterClass(Inst, Rt, Addr, Decoder); + break; + } + + DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); + if (!Dis->tryAddingSymbolicOperand(Inst, offset, Addr, Fail, 0, 4)) + Inst.addOperand(MCOperand::CreateImm(offset)); + return Success; +} + +static DecodeStatus DecodeSignedLdStInstruction(llvm::MCInst &Inst, + uint32_t insn, uint64_t Addr, + const void *Decoder) { + unsigned Rt = fieldFromInstruction(insn, 0, 5); + unsigned Rn = fieldFromInstruction(insn, 5, 5); + int64_t offset = fieldFromInstruction(insn, 12, 9); + + // offset is a 9-bit signed immediate, so sign extend it to + // fill the unsigned. + if (offset & (1 << (9 - 1))) + offset |= ~((1LL << 9) - 1); + + // First operand is always the writeback to the address register, if needed. + switch (Inst.getOpcode()) { + default: + break; + case AArch64::LDRSBWpre: + case AArch64::LDRSHWpre: + case AArch64::STRBBpre: + case AArch64::LDRBBpre: + case AArch64::STRHHpre: + case AArch64::LDRHHpre: + case AArch64::STRWpre: + case AArch64::LDRWpre: + case AArch64::LDRSBWpost: + case AArch64::LDRSHWpost: + case AArch64::STRBBpost: + case AArch64::LDRBBpost: + case AArch64::STRHHpost: + case AArch64::LDRHHpost: + case AArch64::STRWpost: + case AArch64::LDRWpost: + case AArch64::LDRSBXpre: + case AArch64::LDRSHXpre: + case AArch64::STRXpre: + case AArch64::LDRSWpre: + case AArch64::LDRXpre: + case AArch64::LDRSBXpost: + case AArch64::LDRSHXpost: + case AArch64::STRXpost: + case AArch64::LDRSWpost: + case AArch64::LDRXpost: + case AArch64::LDRQpre: + case AArch64::STRQpre: + case AArch64::LDRQpost: + case AArch64::STRQpost: + case AArch64::LDRDpre: + case AArch64::STRDpre: + case AArch64::LDRDpost: + case AArch64::STRDpost: + case AArch64::LDRSpre: + case AArch64::STRSpre: + case AArch64::LDRSpost: + case AArch64::STRSpost: + case AArch64::LDRHpre: + case AArch64::STRHpre: + case AArch64::LDRHpost: + case AArch64::STRHpost: + case AArch64::LDRBpre: + case AArch64::STRBpre: + case AArch64::LDRBpost: + case AArch64::STRBpost: + DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); + break; + } + + switch (Inst.getOpcode()) { + default: + return Fail; + case AArch64::PRFUMi: + // Rt is an immediate in prefetch. 
+ Inst.addOperand(MCOperand::CreateImm(Rt)); + break; + case AArch64::STURBBi: + case AArch64::LDURBBi: + case AArch64::LDURSBWi: + case AArch64::STURHHi: + case AArch64::LDURHHi: + case AArch64::LDURSHWi: + case AArch64::STURWi: + case AArch64::LDURWi: + case AArch64::LDTRSBWi: + case AArch64::LDTRSHWi: + case AArch64::STTRWi: + case AArch64::LDTRWi: + case AArch64::STTRHi: + case AArch64::LDTRHi: + case AArch64::LDTRBi: + case AArch64::STTRBi: + case AArch64::LDRSBWpre: + case AArch64::LDRSHWpre: + case AArch64::STRBBpre: + case AArch64::LDRBBpre: + case AArch64::STRHHpre: + case AArch64::LDRHHpre: + case AArch64::STRWpre: + case AArch64::LDRWpre: + case AArch64::LDRSBWpost: + case AArch64::LDRSHWpost: + case AArch64::STRBBpost: + case AArch64::LDRBBpost: + case AArch64::STRHHpost: + case AArch64::LDRHHpost: + case AArch64::STRWpost: + case AArch64::LDRWpost: + DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder); + break; + case AArch64::LDURSBXi: + case AArch64::LDURSHXi: + case AArch64::LDURSWi: + case AArch64::STURXi: + case AArch64::LDURXi: + case AArch64::LDTRSBXi: + case AArch64::LDTRSHXi: + case AArch64::LDTRSWi: + case AArch64::STTRXi: + case AArch64::LDTRXi: + case AArch64::LDRSBXpre: + case AArch64::LDRSHXpre: + case AArch64::STRXpre: + case AArch64::LDRSWpre: + case AArch64::LDRXpre: + case AArch64::LDRSBXpost: + case AArch64::LDRSHXpost: + case AArch64::STRXpost: + case AArch64::LDRSWpost: + case AArch64::LDRXpost: + DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); + break; + case AArch64::LDURQi: + case AArch64::STURQi: + case AArch64::LDRQpre: + case AArch64::STRQpre: + case AArch64::LDRQpost: + case AArch64::STRQpost: + DecodeFPR128RegisterClass(Inst, Rt, Addr, Decoder); + break; + case AArch64::LDURDi: + case AArch64::STURDi: + case AArch64::LDRDpre: + case AArch64::STRDpre: + case AArch64::LDRDpost: + case AArch64::STRDpost: + DecodeFPR64RegisterClass(Inst, Rt, Addr, Decoder); + break; + case AArch64::LDURSi: + case AArch64::STURSi: + case AArch64::LDRSpre: + case AArch64::STRSpre: + case AArch64::LDRSpost: + case AArch64::STRSpost: + DecodeFPR32RegisterClass(Inst, Rt, Addr, Decoder); + break; + case AArch64::LDURHi: + case AArch64::STURHi: + case AArch64::LDRHpre: + case AArch64::STRHpre: + case AArch64::LDRHpost: + case AArch64::STRHpost: + DecodeFPR16RegisterClass(Inst, Rt, Addr, Decoder); + break; + case AArch64::LDURBi: + case AArch64::STURBi: + case AArch64::LDRBpre: + case AArch64::STRBpre: + case AArch64::LDRBpost: + case AArch64::STRBpost: + DecodeFPR8RegisterClass(Inst, Rt, Addr, Decoder); + break; + } + + DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); + Inst.addOperand(MCOperand::CreateImm(offset)); + + bool IsLoad = fieldFromInstruction(insn, 22, 1); + bool IsIndexed = fieldFromInstruction(insn, 10, 2) != 0; + bool IsFP = fieldFromInstruction(insn, 26, 1); + + // Cannot write back to a transfer register (but xzr != sp). 
+ if (IsLoad && IsIndexed && !IsFP && Rn != 31 && Rt == Rn) + return SoftFail; + + return Success; +} + +static DecodeStatus DecodeExclusiveLdStInstruction(llvm::MCInst &Inst, + uint32_t insn, uint64_t Addr, + const void *Decoder) { + unsigned Rt = fieldFromInstruction(insn, 0, 5); + unsigned Rn = fieldFromInstruction(insn, 5, 5); + unsigned Rt2 = fieldFromInstruction(insn, 10, 5); + unsigned Rs = fieldFromInstruction(insn, 16, 5); + + unsigned Opcode = Inst.getOpcode(); + switch (Opcode) { + default: + return Fail; + case AArch64::STLXRW: + case AArch64::STLXRB: + case AArch64::STLXRH: + case AArch64::STXRW: + case AArch64::STXRB: + case AArch64::STXRH: + DecodeGPR32RegisterClass(Inst, Rs, Addr, Decoder); + // FALLTHROUGH + case AArch64::LDARW: + case AArch64::LDARB: + case AArch64::LDARH: + case AArch64::LDAXRW: + case AArch64::LDAXRB: + case AArch64::LDAXRH: + case AArch64::LDXRW: + case AArch64::LDXRB: + case AArch64::LDXRH: + case AArch64::STLRW: + case AArch64::STLRB: + case AArch64::STLRH: + DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder); + break; + case AArch64::STLXRX: + case AArch64::STXRX: + DecodeGPR32RegisterClass(Inst, Rs, Addr, Decoder); + // FALLTHROUGH + case AArch64::LDARX: + case AArch64::LDAXRX: + case AArch64::LDXRX: + case AArch64::STLRX: + DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); + break; + case AArch64::STLXPW: + case AArch64::STXPW: + DecodeGPR32RegisterClass(Inst, Rs, Addr, Decoder); + // FALLTHROUGH + case AArch64::LDAXPW: + case AArch64::LDXPW: + DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder); + DecodeGPR32RegisterClass(Inst, Rt2, Addr, Decoder); + break; + case AArch64::STLXPX: + case AArch64::STXPX: + DecodeGPR32RegisterClass(Inst, Rs, Addr, Decoder); + // FALLTHROUGH + case AArch64::LDAXPX: + case AArch64::LDXPX: + DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); + DecodeGPR64RegisterClass(Inst, Rt2, Addr, Decoder); + break; + } + + DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); + + // You shouldn't load to the same register twice in an instruction... + if ((Opcode == AArch64::LDAXPW || Opcode == AArch64::LDXPW || + Opcode == AArch64::LDAXPX || Opcode == AArch64::LDXPX) && + Rt == Rt2) + return SoftFail; + + return Success; +} + +static DecodeStatus DecodePairLdStInstruction(llvm::MCInst &Inst, uint32_t insn, + uint64_t Addr, + const void *Decoder) { + unsigned Rt = fieldFromInstruction(insn, 0, 5); + unsigned Rn = fieldFromInstruction(insn, 5, 5); + unsigned Rt2 = fieldFromInstruction(insn, 10, 5); + int64_t offset = fieldFromInstruction(insn, 15, 7); + bool IsLoad = fieldFromInstruction(insn, 22, 1); + + // offset is a 7-bit signed immediate, so sign extend it to + // fill the unsigned. + if (offset & (1 << (7 - 1))) + offset |= ~((1LL << 7) - 1); + + unsigned Opcode = Inst.getOpcode(); + bool NeedsDisjointWritebackTransfer = false; + + // First operand is always writeback of base register. 
+ switch (Opcode) { + default: + break; + case AArch64::LDPXpost: + case AArch64::STPXpost: + case AArch64::LDPSWpost: + case AArch64::LDPXpre: + case AArch64::STPXpre: + case AArch64::LDPSWpre: + case AArch64::LDPWpost: + case AArch64::STPWpost: + case AArch64::LDPWpre: + case AArch64::STPWpre: + case AArch64::LDPQpost: + case AArch64::STPQpost: + case AArch64::LDPQpre: + case AArch64::STPQpre: + case AArch64::LDPDpost: + case AArch64::STPDpost: + case AArch64::LDPDpre: + case AArch64::STPDpre: + case AArch64::LDPSpost: + case AArch64::STPSpost: + case AArch64::LDPSpre: + case AArch64::STPSpre: + DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); + break; + } + + switch (Opcode) { + default: + return Fail; + case AArch64::LDPXpost: + case AArch64::STPXpost: + case AArch64::LDPSWpost: + case AArch64::LDPXpre: + case AArch64::STPXpre: + case AArch64::LDPSWpre: + NeedsDisjointWritebackTransfer = true; + // Fallthrough + case AArch64::LDNPXi: + case AArch64::STNPXi: + case AArch64::LDPXi: + case AArch64::STPXi: + case AArch64::LDPSWi: + DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); + DecodeGPR64RegisterClass(Inst, Rt2, Addr, Decoder); + break; + case AArch64::LDPWpost: + case AArch64::STPWpost: + case AArch64::LDPWpre: + case AArch64::STPWpre: + NeedsDisjointWritebackTransfer = true; + // Fallthrough + case AArch64::LDNPWi: + case AArch64::STNPWi: + case AArch64::LDPWi: + case AArch64::STPWi: + DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder); + DecodeGPR32RegisterClass(Inst, Rt2, Addr, Decoder); + break; + case AArch64::LDNPQi: + case AArch64::STNPQi: + case AArch64::LDPQpost: + case AArch64::STPQpost: + case AArch64::LDPQi: + case AArch64::STPQi: + case AArch64::LDPQpre: + case AArch64::STPQpre: + DecodeFPR128RegisterClass(Inst, Rt, Addr, Decoder); + DecodeFPR128RegisterClass(Inst, Rt2, Addr, Decoder); + break; + case AArch64::LDNPDi: + case AArch64::STNPDi: + case AArch64::LDPDpost: + case AArch64::STPDpost: + case AArch64::LDPDi: + case AArch64::STPDi: + case AArch64::LDPDpre: + case AArch64::STPDpre: + DecodeFPR64RegisterClass(Inst, Rt, Addr, Decoder); + DecodeFPR64RegisterClass(Inst, Rt2, Addr, Decoder); + break; + case AArch64::LDNPSi: + case AArch64::STNPSi: + case AArch64::LDPSpost: + case AArch64::STPSpost: + case AArch64::LDPSi: + case AArch64::STPSi: + case AArch64::LDPSpre: + case AArch64::STPSpre: + DecodeFPR32RegisterClass(Inst, Rt, Addr, Decoder); + DecodeFPR32RegisterClass(Inst, Rt2, Addr, Decoder); + break; + } + + DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); + Inst.addOperand(MCOperand::CreateImm(offset)); + + // You shouldn't load to the same register twice in an instruction... + if (IsLoad && Rt == Rt2) + return SoftFail; + + // ... or do any operation that writes-back to a transfer register. But note + // that "stp xzr, xzr, [sp], #4" is fine because xzr and sp are different. 
+ if (NeedsDisjointWritebackTransfer && Rn != 31 && (Rt == Rn || Rt2 == Rn)) + return SoftFail; + + return Success; +} + +static DecodeStatus DecodeAddSubERegInstruction(llvm::MCInst &Inst, + uint32_t insn, uint64_t Addr, + const void *Decoder) { + unsigned Rd = fieldFromInstruction(insn, 0, 5); + unsigned Rn = fieldFromInstruction(insn, 5, 5); + unsigned Rm = fieldFromInstruction(insn, 16, 5); + unsigned extend = fieldFromInstruction(insn, 10, 6); + + unsigned shift = extend & 0x7; + if (shift > 4) + return Fail; + + switch (Inst.getOpcode()) { + default: + return Fail; + case AArch64::ADDWrx: + case AArch64::SUBWrx: + DecodeGPR32spRegisterClass(Inst, Rd, Addr, Decoder); + DecodeGPR32spRegisterClass(Inst, Rn, Addr, Decoder); + DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder); + break; + case AArch64::ADDSWrx: + case AArch64::SUBSWrx: + DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder); + DecodeGPR32spRegisterClass(Inst, Rn, Addr, Decoder); + DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder); + break; + case AArch64::ADDXrx: + case AArch64::SUBXrx: + DecodeGPR64spRegisterClass(Inst, Rd, Addr, Decoder); + DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); + DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder); + break; + case AArch64::ADDSXrx: + case AArch64::SUBSXrx: + DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder); + DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); + DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder); + break; + case AArch64::ADDXrx64: + case AArch64::SUBXrx64: + DecodeGPR64spRegisterClass(Inst, Rd, Addr, Decoder); + DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); + DecodeGPR64RegisterClass(Inst, Rm, Addr, Decoder); + break; + case AArch64::SUBSXrx64: + case AArch64::ADDSXrx64: + DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder); + DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); + DecodeGPR64RegisterClass(Inst, Rm, Addr, Decoder); + break; + } + + Inst.addOperand(MCOperand::CreateImm(extend)); + return Success; +} + +static DecodeStatus DecodeLogicalImmInstruction(llvm::MCInst &Inst, + uint32_t insn, uint64_t Addr, + const void *Decoder) { + unsigned Rd = fieldFromInstruction(insn, 0, 5); + unsigned Rn = fieldFromInstruction(insn, 5, 5); + unsigned Datasize = fieldFromInstruction(insn, 31, 1); + unsigned imm; + + if (Datasize) { + if (Inst.getOpcode() == AArch64::ANDSXri) + DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder); + else + DecodeGPR64spRegisterClass(Inst, Rd, Addr, Decoder); + DecodeGPR64RegisterClass(Inst, Rn, Addr, Decoder); + imm = fieldFromInstruction(insn, 10, 13); + if (!AArch64_AM::isValidDecodeLogicalImmediate(imm, 64)) + return Fail; + } else { + if (Inst.getOpcode() == AArch64::ANDSWri) + DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder); + else + DecodeGPR32spRegisterClass(Inst, Rd, Addr, Decoder); + DecodeGPR32RegisterClass(Inst, Rn, Addr, Decoder); + imm = fieldFromInstruction(insn, 10, 12); + if (!AArch64_AM::isValidDecodeLogicalImmediate(imm, 32)) + return Fail; + } + Inst.addOperand(MCOperand::CreateImm(imm)); + return Success; +} + +static DecodeStatus DecodeModImmInstruction(llvm::MCInst &Inst, uint32_t insn, + uint64_t Addr, + const void *Decoder) { + unsigned Rd = fieldFromInstruction(insn, 0, 5); + unsigned cmode = fieldFromInstruction(insn, 12, 4); + unsigned imm = fieldFromInstruction(insn, 16, 3) << 5; + imm |= fieldFromInstruction(insn, 5, 5); + + if (Inst.getOpcode() == AArch64::MOVID) + DecodeFPR64RegisterClass(Inst, Rd, Addr, Decoder); + else + DecodeVectorRegisterClass(Inst, Rd, Addr, Decoder); + + 
Inst.addOperand(MCOperand::CreateImm(imm)); + + switch (Inst.getOpcode()) { + default: + break; + case AArch64::MOVIv4i16: + case AArch64::MOVIv8i16: + case AArch64::MVNIv4i16: + case AArch64::MVNIv8i16: + case AArch64::MOVIv2i32: + case AArch64::MOVIv4i32: + case AArch64::MVNIv2i32: + case AArch64::MVNIv4i32: + Inst.addOperand(MCOperand::CreateImm((cmode & 6) << 2)); + break; + case AArch64::MOVIv2s_msl: + case AArch64::MOVIv4s_msl: + case AArch64::MVNIv2s_msl: + case AArch64::MVNIv4s_msl: + Inst.addOperand(MCOperand::CreateImm(cmode & 1 ? 0x110 : 0x108)); + break; + } + + return Success; +} + +static DecodeStatus DecodeModImmTiedInstruction(llvm::MCInst &Inst, + uint32_t insn, uint64_t Addr, + const void *Decoder) { + unsigned Rd = fieldFromInstruction(insn, 0, 5); + unsigned cmode = fieldFromInstruction(insn, 12, 4); + unsigned imm = fieldFromInstruction(insn, 16, 3) << 5; + imm |= fieldFromInstruction(insn, 5, 5); + + // Tied operands added twice. + DecodeVectorRegisterClass(Inst, Rd, Addr, Decoder); + DecodeVectorRegisterClass(Inst, Rd, Addr, Decoder); + + Inst.addOperand(MCOperand::CreateImm(imm)); + Inst.addOperand(MCOperand::CreateImm((cmode & 6) << 2)); + + return Success; +} + +static DecodeStatus DecodeAdrInstruction(llvm::MCInst &Inst, uint32_t insn, + uint64_t Addr, const void *Decoder) { + unsigned Rd = fieldFromInstruction(insn, 0, 5); + int64_t imm = fieldFromInstruction(insn, 5, 19) << 2; + imm |= fieldFromInstruction(insn, 29, 2); + const AArch64Disassembler *Dis = + static_cast<const AArch64Disassembler *>(Decoder); + + // Sign-extend the 21-bit immediate. + if (imm & (1 << (21 - 1))) + imm |= ~((1LL << 21) - 1); + + DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder); + if (!Dis->tryAddingSymbolicOperand(Inst, imm, Addr, Fail, 0, 4)) + Inst.addOperand(MCOperand::CreateImm(imm)); + + return Success; +} + +static DecodeStatus DecodeBaseAddSubImm(llvm::MCInst &Inst, uint32_t insn, + uint64_t Addr, const void *Decoder) { + unsigned Rd = fieldFromInstruction(insn, 0, 5); + unsigned Rn = fieldFromInstruction(insn, 5, 5); + unsigned Imm = fieldFromInstruction(insn, 10, 14); + unsigned S = fieldFromInstruction(insn, 29, 1); + unsigned Datasize = fieldFromInstruction(insn, 31, 1); + + unsigned ShifterVal = (Imm >> 12) & 3; + unsigned ImmVal = Imm & 0xFFF; + const AArch64Disassembler *Dis = + static_cast<const AArch64Disassembler *>(Decoder); + + if (ShifterVal != 0 && ShifterVal != 1) + return Fail; + + if (Datasize) { + if (Rd == 31 && !S) + DecodeGPR64spRegisterClass(Inst, Rd, Addr, Decoder); + else + DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder); + DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); + } else { + if (Rd == 31 && !S) + DecodeGPR32spRegisterClass(Inst, Rd, Addr, Decoder); + else + DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder); + DecodeGPR32spRegisterClass(Inst, Rn, Addr, Decoder); + } + + if (!Dis->tryAddingSymbolicOperand(Inst, Imm, Addr, Fail, 0, 4)) + Inst.addOperand(MCOperand::CreateImm(ImmVal)); + Inst.addOperand(MCOperand::CreateImm(12 * ShifterVal)); + return Success; +} + +static DecodeStatus DecodeUnconditionalBranch(llvm::MCInst &Inst, uint32_t insn, + uint64_t Addr, + const void *Decoder) { + int64_t imm = fieldFromInstruction(insn, 0, 26); + const AArch64Disassembler *Dis = + static_cast<const AArch64Disassembler *>(Decoder); + + // Sign-extend the 26-bit immediate.
+ if (imm & (1 << (26 - 1))) + imm |= ~((1LL << 26) - 1); + + if (!Dis->tryAddingSymbolicOperand(Inst, imm << 2, Addr, true, 0, 4)) + Inst.addOperand(MCOperand::CreateImm(imm)); + + return Success; +} + +static DecodeStatus DecodeSystemPStateInstruction(llvm::MCInst &Inst, + uint32_t insn, uint64_t Addr, + const void *Decoder) { + uint64_t op1 = fieldFromInstruction(insn, 16, 3); + uint64_t op2 = fieldFromInstruction(insn, 5, 3); + uint64_t crm = fieldFromInstruction(insn, 8, 4); + + uint64_t pstate_field = (op1 << 3) | op2; + + Inst.addOperand(MCOperand::CreateImm(pstate_field)); + Inst.addOperand(MCOperand::CreateImm(crm)); + + bool ValidNamed; + (void)AArch64PState::PStateMapper().toString(pstate_field, ValidNamed); + + return ValidNamed ? Success : Fail; +} + +static DecodeStatus DecodeTestAndBranch(llvm::MCInst &Inst, uint32_t insn, + uint64_t Addr, const void *Decoder) { + uint64_t Rt = fieldFromInstruction(insn, 0, 5); + uint64_t bit = fieldFromInstruction(insn, 31, 1) << 5; + bit |= fieldFromInstruction(insn, 19, 5); + int64_t dst = fieldFromInstruction(insn, 5, 14); + const AArch64Disassembler *Dis = + static_cast<const AArch64Disassembler *>(Decoder); + + // Sign-extend 14-bit immediate. + if (dst & (1 << (14 - 1))) + dst |= ~((1LL << 14) - 1); + + if (fieldFromInstruction(insn, 31, 1) == 0) + DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder); + else + DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); + Inst.addOperand(MCOperand::CreateImm(bit)); + if (!Dis->tryAddingSymbolicOperand(Inst, dst << 2, Addr, true, 0, 4)) + Inst.addOperand(MCOperand::CreateImm(dst)); + + return Success; +} diff --git a/lib/Target/AArch64/Disassembler/AArch64Disassembler.h b/lib/Target/AArch64/Disassembler/AArch64Disassembler.h new file mode 100644 index 00000000000..68d4867977b --- /dev/null +++ b/lib/Target/AArch64/Disassembler/AArch64Disassembler.h @@ -0,0 +1,40 @@ +//===- AArch64Disassembler.h - Disassembler for AArch64 ---------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +#ifndef AArch64DISASSEMBLER_H +#define AArch64DISASSEMBLER_H + +#include "llvm/MC/MCDisassembler.h" + +namespace llvm { + +class MCInst; +class MemoryObject; +class raw_ostream; + +class AArch64Disassembler : public MCDisassembler { +public: + AArch64Disassembler(const MCSubtargetInfo &STI, MCContext &Ctx) + : MCDisassembler(STI, Ctx) {} + + ~AArch64Disassembler() {} + + /// getInstruction - See MCDisassembler. + MCDisassembler::DecodeStatus + getInstruction(MCInst &instr, uint64_t &size, const MemoryObject &region, + uint64_t address, raw_ostream &vStream, + raw_ostream &cStream) const override; +}; + +} // namespace llvm + +#endif diff --git a/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp b/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp new file mode 100644 index 00000000000..24663684a3f --- /dev/null +++ b/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp @@ -0,0 +1,221 @@ +//===- AArch64ExternalSymbolizer.cpp - Symbolizer for AArch64 ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details.
+// +//===----------------------------------------------------------------------===// + +#include "AArch64ExternalSymbolizer.h" +#include "AArch64Subtarget.h" +#include "MCTargetDesc/AArch64AddressingModes.h" +#include "Utils/AArch64BaseInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-disassembler" + +static MCSymbolRefExpr::VariantKind +getVariant(uint64_t LLVMDisassembler_VariantKind) { + switch (LLVMDisassembler_VariantKind) { + case LLVMDisassembler_VariantKind_None: + return MCSymbolRefExpr::VK_None; + case LLVMDisassembler_VariantKind_ARM64_PAGE: + return MCSymbolRefExpr::VK_PAGE; + case LLVMDisassembler_VariantKind_ARM64_PAGEOFF: + return MCSymbolRefExpr::VK_PAGEOFF; + case LLVMDisassembler_VariantKind_ARM64_GOTPAGE: + return MCSymbolRefExpr::VK_GOTPAGE; + case LLVMDisassembler_VariantKind_ARM64_GOTPAGEOFF: + return MCSymbolRefExpr::VK_GOTPAGEOFF; + case LLVMDisassembler_VariantKind_ARM64_TLVP: + case LLVMDisassembler_VariantKind_ARM64_TLVOFF: + default: + assert(0 && "bad LLVMDisassembler_VariantKind"); + return MCSymbolRefExpr::VK_None; + } +} + +/// tryAddingSymbolicOperand - tryAddingSymbolicOperand tries to add a symbolic +/// operand in place of the immediate Value in the MCInst. The immediate +/// Value has not had any PC adjustment made by the caller. If the instruction +/// is a branch that adds the PC to the immediate Value then isBranch is +/// true, else false. If GetOpInfo is non-null, then it is called to get any +/// symbolic information at the Address for this instruction. If that returns +/// non-zero then the symbolic information it returns is used to create an +/// MCExpr and that is added as an operand to the MCInst. If GetOpInfo() +/// returns zero and isBranch is true then a symbol look up for +/// Address + Value is done and if a symbol is found an MCExpr is created with +/// that, else an MCExpr with Address + Value is created. If GetOpInfo() +/// returns zero and isBranch is false then the Opcode of the MCInst is +/// tested, and for ADRP and other instructions that help to load pointers +/// a symbol look up is done to see if it returns a specific reference type +/// to add to the comment stream. This function returns true if it adds +/// an operand to the MCInst and false otherwise. +bool AArch64ExternalSymbolizer::tryAddingSymbolicOperand( + MCInst &MI, raw_ostream &CommentStream, int64_t Value, uint64_t Address, + bool IsBranch, uint64_t Offset, uint64_t InstSize) { + // FIXME: This method shares a lot of code with + // MCExternalSymbolizer::tryAddingSymbolicOperand. It may be possible to + // refactor the MCExternalSymbolizer interface to allow more of this + // implementation to be shared.
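The comment above spells out the contract the decoders in this patch rely on: branch decoders such as DecodeUnconditionalBranch and DecodeTestAndBranch sign-extend the raw immediate field, scale it to a byte offset, and fall back to a plain immediate operand only when no symbolic operand can be added. A rough standalone sketch of just the sign-extend-and-scale step in plain C++ (illustrative only, not part of the patch; the helper name extendBranchImm26 is invented):

#include <cassert>
#include <cstdint>

// Sketch: recover the byte offset encoded in a 26-bit branch immediate,
// mirroring the "if (imm & (1 << (26 - 1))) imm |= ~((1LL << 26) - 1);"
// pattern used by the decoders above.
static int64_t extendBranchImm26(uint32_t insn) {
  int64_t imm = insn & ((1u << 26) - 1); // like fieldFromInstruction(insn, 0, 26)
  if (imm & (1 << (26 - 1)))             // sign bit of the 26-bit field set?
    imm |= ~((1LL << 26) - 1);           // fill the upper bits with ones
  return imm * 4;                        // same value as the "imm << 2" that is
                                         // passed to tryAddingSymbolicOperand
}

int main() {
  // 0x03FFFFFF is -1 in 26-bit two's complement: a branch back by one
  // instruction, i.e. -4 bytes.
  assert(extendBranchImm26(0x03FFFFFFu) == -4);
  return 0;
}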
+ // + struct LLVMOpInfo1 SymbolicOp; + memset(&SymbolicOp, '\0', sizeof(struct LLVMOpInfo1)); + SymbolicOp.Value = Value; + uint64_t ReferenceType; + const char *ReferenceName; + if (!GetOpInfo || + !GetOpInfo(DisInfo, Address, 0 /* Offset */, InstSize, 1, &SymbolicOp)) { + if (IsBranch) { + ReferenceType = LLVMDisassembler_ReferenceType_In_Branch; + const char *Name = SymbolLookUp(DisInfo, Address + Value, &ReferenceType, + Address, &ReferenceName); + if (Name) { + SymbolicOp.AddSymbol.Name = Name; + SymbolicOp.AddSymbol.Present = true; + SymbolicOp.Value = 0; + } else { + SymbolicOp.Value = Address + Value; + } + if (ReferenceType == LLVMDisassembler_ReferenceType_Out_SymbolStub) + CommentStream << "symbol stub for: " << ReferenceName; + else if (ReferenceType == + LLVMDisassembler_ReferenceType_Out_Objc_Message) + CommentStream << "Objc message: " << ReferenceName; + } else if (MI.getOpcode() == AArch64::ADRP) { + ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_ADRP; + // otool expects the fully encoded ADRP instruction to be passed in as + // the value here, so reconstruct it: + const MCRegisterInfo &MCRI = *Ctx.getRegisterInfo(); + uint32_t EncodedInst = 0x90000000; + EncodedInst |= (Value & 0x3) << 29; // immlo + EncodedInst |= ((Value >> 2) & 0x7FFFF) << 5; // immhi + EncodedInst |= MCRI.getEncodingValue(MI.getOperand(0).getReg()); // reg + SymbolLookUp(DisInfo, EncodedInst, &ReferenceType, Address, + &ReferenceName); + CommentStream << format("0x%llx", + 0xfffffffffffff000LL & (Address + Value)); + } else if (MI.getOpcode() == AArch64::ADDXri || + MI.getOpcode() == AArch64::LDRXui || + MI.getOpcode() == AArch64::LDRXl || + MI.getOpcode() == AArch64::ADR) { + if (MI.getOpcode() == AArch64::ADDXri) + ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_ADDXri; + else if (MI.getOpcode() == AArch64::LDRXui) + ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_LDRXui; + if (MI.getOpcode() == AArch64::LDRXl) { + ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_LDRXl; + SymbolLookUp(DisInfo, Address + Value, &ReferenceType, Address, + &ReferenceName); + } else if (MI.getOpcode() == AArch64::ADR) { + ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_ADR; + SymbolLookUp(DisInfo, Address + Value, &ReferenceType, Address, + &ReferenceName); + } else { + const MCRegisterInfo &MCRI = *Ctx.getRegisterInfo(); + // otool expects the fully encoded ADD/LDR instruction to be passed in + // as the value here, so reconstruct it: + unsigned EncodedInst = + MI.getOpcode() == AArch64::ADDXri ? 
0x91000000: 0xF9400000; + EncodedInst |= Value << 10; // imm12 [+ shift:2 for ADD] + EncodedInst |= + MCRI.getEncodingValue(MI.getOperand(1).getReg()) << 5; // Rn + EncodedInst |= MCRI.getEncodingValue(MI.getOperand(0).getReg()); // Rd + + SymbolLookUp(DisInfo, EncodedInst, &ReferenceType, Address, + &ReferenceName); + } + if (ReferenceType == LLVMDisassembler_ReferenceType_Out_LitPool_SymAddr) + CommentStream << "literal pool symbol address: " << ReferenceName; + else if (ReferenceType == + LLVMDisassembler_ReferenceType_Out_LitPool_CstrAddr) + CommentStream << "literal pool for: \"" << ReferenceName << "\""; + else if (ReferenceType == + LLVMDisassembler_ReferenceType_Out_Objc_CFString_Ref) + CommentStream << "Objc cfstring ref: @\"" << ReferenceName << "\""; + else if (ReferenceType == + LLVMDisassembler_ReferenceType_Out_Objc_Message) + CommentStream << "Objc message: " << ReferenceName; + else if (ReferenceType == + LLVMDisassembler_ReferenceType_Out_Objc_Message_Ref) + CommentStream << "Objc message ref: " << ReferenceName; + else if (ReferenceType == + LLVMDisassembler_ReferenceType_Out_Objc_Selector_Ref) + CommentStream << "Objc selector ref: " << ReferenceName; + else if (ReferenceType == + LLVMDisassembler_ReferenceType_Out_Objc_Class_Ref) + CommentStream << "Objc class ref: " << ReferenceName; + // For these instructions, the SymbolLookUp() above is just to get the + // ReferenceType and ReferenceName. We want to make sure not to + // fall through so we don't build an MCExpr to leave the disassembly + // of the immediate values of these instructions to the InstPrinter. + return false; + } else { + return false; + } + } + + const MCExpr *Add = nullptr; + if (SymbolicOp.AddSymbol.Present) { + if (SymbolicOp.AddSymbol.Name) { + StringRef Name(SymbolicOp.AddSymbol.Name); + MCSymbol *Sym = Ctx.GetOrCreateSymbol(Name); + MCSymbolRefExpr::VariantKind Variant = getVariant(SymbolicOp.VariantKind); + if (Variant != MCSymbolRefExpr::VK_None) + Add = MCSymbolRefExpr::Create(Sym, Variant, Ctx); + else + Add = MCSymbolRefExpr::Create(Sym, Ctx); + } else { + Add = MCConstantExpr::Create(SymbolicOp.AddSymbol.Value, Ctx); + } + } + + const MCExpr *Sub = nullptr; + if (SymbolicOp.SubtractSymbol.Present) { + if (SymbolicOp.SubtractSymbol.Name) { + StringRef Name(SymbolicOp.SubtractSymbol.Name); + MCSymbol *Sym = Ctx.GetOrCreateSymbol(Name); + Sub = MCSymbolRefExpr::Create(Sym, Ctx); + } else { + Sub = MCConstantExpr::Create(SymbolicOp.SubtractSymbol.Value, Ctx); + } + } + + const MCExpr *Off = nullptr; + if (SymbolicOp.Value != 0) + Off = MCConstantExpr::Create(SymbolicOp.Value, Ctx); + + const MCExpr *Expr; + if (Sub) { + const MCExpr *LHS; + if (Add) + LHS = MCBinaryExpr::CreateSub(Add, Sub, Ctx); + else + LHS = MCUnaryExpr::CreateMinus(Sub, Ctx); + if (Off) + Expr = MCBinaryExpr::CreateAdd(LHS, Off, Ctx); + else + Expr = LHS; + } else if (Add) { + if (Off) + Expr = MCBinaryExpr::CreateAdd(Add, Off, Ctx); + else + Expr = Add; + } else { + if (Off) + Expr = Off; + else + Expr = MCConstantExpr::Create(0, Ctx); + } + + MI.addOperand(MCOperand::CreateExpr(Expr)); + + return true; +} diff --git a/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h b/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h new file mode 100644 index 00000000000..171d31c48cd --- /dev/null +++ b/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h @@ -0,0 +1,38 @@ +//===- AArch64ExternalSymbolizer.h - Symbolizer for AArch64 -----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file 
is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Symbolize AArch64 assembly code during disassembly using callbacks. +// +//===----------------------------------------------------------------------===// + +#ifndef AArch64EXTERNALSYMBOLIZER_H +#define AArch64EXTERNALSYMBOLIZER_H + +#include "llvm/MC/MCExternalSymbolizer.h" + +namespace llvm { + +class AArch64ExternalSymbolizer : public MCExternalSymbolizer { +public: + AArch64ExternalSymbolizer(MCContext &Ctx, + std::unique_ptr<MCRelocationInfo> RelInfo, + LLVMOpInfoCallback GetOpInfo, + LLVMSymbolLookupCallback SymbolLookUp, + void *DisInfo) + : MCExternalSymbolizer(Ctx, std::move(RelInfo), GetOpInfo, SymbolLookUp, + DisInfo) {} + + bool tryAddingSymbolicOperand(MCInst &MI, raw_ostream &CommentStream, + int64_t Value, uint64_t Address, bool IsBranch, + uint64_t Offset, uint64_t InstSize) override; +}; + +} // namespace llvm + +#endif diff --git a/lib/Target/AArch64/Disassembler/CMakeLists.txt b/lib/Target/AArch64/Disassembler/CMakeLists.txt new file mode 100644 index 00000000000..be4ccad6d1b --- /dev/null +++ b/lib/Target/AArch64/Disassembler/CMakeLists.txt @@ -0,0 +1,14 @@ +include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. ) + +add_llvm_library(LLVMAArch64Disassembler + AArch64Disassembler.cpp + AArch64ExternalSymbolizer.cpp + ) +# workaround for hanging compilation on MSVC8, 9 and 10 +#if( MSVC_VERSION EQUAL 1400 OR MSVC_VERSION EQUAL 1500 OR MSVC_VERSION EQUAL 1600 ) +#set_property( +# SOURCE ARMDisassembler.cpp +# PROPERTY COMPILE_FLAGS "/Od" +# ) +#endif() +add_dependencies(LLVMAArch64Disassembler AArch64CommonTableGen) diff --git a/lib/Target/AArch64/Disassembler/LLVMBuild.txt b/lib/Target/AArch64/Disassembler/LLVMBuild.txt new file mode 100644 index 00000000000..a4224f4a2f5 --- /dev/null +++ b/lib/Target/AArch64/Disassembler/LLVMBuild.txt @@ -0,0 +1,23 @@ +;===- ./lib/Target/AArch64/Disassembler/LLVMBuild.txt ------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = AArch64Disassembler +parent = AArch64 +required_libraries = AArch64Info AArch64Utils MC Support +add_to_library_groups = AArch64 diff --git a/lib/Target/AArch64/Disassembler/Makefile b/lib/Target/AArch64/Disassembler/Makefile new file mode 100644 index 00000000000..741bb817a63 --- /dev/null +++ b/lib/Target/AArch64/Disassembler/Makefile @@ -0,0 +1,16 @@ +##===- lib/Target/AArch64/Disassembler/Makefile ------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## + +LEVEL = ../../../.. +LIBRARYNAME = LLVMAArch64Disassembler + +# Hack: we need to include 'main' arm target directory to grab private headers +CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+ +include $(LEVEL)/Makefile.common diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp new file mode 100644 index 00000000000..f484a5b1bdc --- /dev/null +++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp @@ -0,0 +1,1316 @@ +//==-- AArch64InstPrinter.cpp - Convert AArch64 MCInst to assembly syntax --==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This class prints an AArch64 MCInst to a .s file. +// +//===----------------------------------------------------------------------===// + +#include "AArch64InstPrinter.h" +#include "MCTargetDesc/AArch64AddressingModes.h" +#include "Utils/AArch64BaseInfo.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +#define DEBUG_TYPE "asm-printer" + +#define GET_INSTRUCTION_NAME +#define PRINT_ALIAS_INSTR +#include "AArch64GenAsmWriter.inc" +#define GET_INSTRUCTION_NAME +#define PRINT_ALIAS_INSTR +#include "AArch64GenAsmWriter1.inc" + +AArch64InstPrinter::AArch64InstPrinter(const MCAsmInfo &MAI, + const MCInstrInfo &MII, + const MCRegisterInfo &MRI, + const MCSubtargetInfo &STI) + : MCInstPrinter(MAI, MII, MRI) { + // Initialize the set of available features. + setAvailableFeatures(STI.getFeatureBits()); +} + +AArch64AppleInstPrinter::AArch64AppleInstPrinter(const MCAsmInfo &MAI, + const MCInstrInfo &MII, + const MCRegisterInfo &MRI, + const MCSubtargetInfo &STI) + : AArch64InstPrinter(MAI, MII, MRI, STI) {} + +void AArch64InstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { + // This is for .cfi directives. + OS << getRegisterName(RegNo); +} + +void AArch64InstPrinter::printInst(const MCInst *MI, raw_ostream &O, + StringRef Annot) { + // Check for special encodings and print the canonical alias instead. + + unsigned Opcode = MI->getOpcode(); + + if (Opcode == AArch64::SYSxt) + if (printSysAlias(MI, O)) { + printAnnotation(O, Annot); + return; + } + + // SBFM/UBFM should print to a nicer aliased form if possible. + if (Opcode == AArch64::SBFMXri || Opcode == AArch64::SBFMWri || + Opcode == AArch64::UBFMXri || Opcode == AArch64::UBFMWri) { + const MCOperand &Op0 = MI->getOperand(0); + const MCOperand &Op1 = MI->getOperand(1); + const MCOperand &Op2 = MI->getOperand(2); + const MCOperand &Op3 = MI->getOperand(3); + + bool IsSigned = (Opcode == AArch64::SBFMXri || Opcode == AArch64::SBFMWri); + bool Is64Bit = (Opcode == AArch64::SBFMXri || Opcode == AArch64::UBFMXri); + if (Op2.isImm() && Op2.getImm() == 0 && Op3.isImm()) { + const char *AsmMnemonic = nullptr; + + switch (Op3.getImm()) { + default: + break; + case 7: + if (IsSigned) + AsmMnemonic = "sxtb"; + else if (!Is64Bit) + AsmMnemonic = "uxtb"; + break; + case 15: + if (IsSigned) + AsmMnemonic = "sxth"; + else if (!Is64Bit) + AsmMnemonic = "uxth"; + break; + case 31: + // *xtw is only valid for signed 64-bit operations. 
+ if (Is64Bit && IsSigned) + AsmMnemonic = "sxtw"; + break; + } + + if (AsmMnemonic) { + O << '\t' << AsmMnemonic << '\t' << getRegisterName(Op0.getReg()) + << ", " << getRegisterName(getWRegFromXReg(Op1.getReg())); + printAnnotation(O, Annot); + return; + } + } + + // All immediate shifts are aliases, implemented using the Bitfield + // instruction. In all cases the immediate shift amount shift must be in + // the range 0 to (reg.size -1). + if (Op2.isImm() && Op3.isImm()) { + const char *AsmMnemonic = nullptr; + int shift = 0; + int64_t immr = Op2.getImm(); + int64_t imms = Op3.getImm(); + if (Opcode == AArch64::UBFMWri && imms != 0x1F && ((imms + 1) == immr)) { + AsmMnemonic = "lsl"; + shift = 31 - imms; + } else if (Opcode == AArch64::UBFMXri && imms != 0x3f && + ((imms + 1 == immr))) { + AsmMnemonic = "lsl"; + shift = 63 - imms; + } else if (Opcode == AArch64::UBFMWri && imms == 0x1f) { + AsmMnemonic = "lsr"; + shift = immr; + } else if (Opcode == AArch64::UBFMXri && imms == 0x3f) { + AsmMnemonic = "lsr"; + shift = immr; + } else if (Opcode == AArch64::SBFMWri && imms == 0x1f) { + AsmMnemonic = "asr"; + shift = immr; + } else if (Opcode == AArch64::SBFMXri && imms == 0x3f) { + AsmMnemonic = "asr"; + shift = immr; + } + if (AsmMnemonic) { + O << '\t' << AsmMnemonic << '\t' << getRegisterName(Op0.getReg()) + << ", " << getRegisterName(Op1.getReg()) << ", #" << shift; + printAnnotation(O, Annot); + return; + } + } + + // SBFIZ/UBFIZ aliases + if (Op2.getImm() > Op3.getImm()) { + O << '\t' << (IsSigned ? "sbfiz" : "ubfiz") << '\t' + << getRegisterName(Op0.getReg()) << ", " << getRegisterName(Op1.getReg()) + << ", #" << (Is64Bit ? 64 : 32) - Op2.getImm() << ", #" << Op3.getImm() + 1; + printAnnotation(O, Annot); + return; + } + + // Otherwise SBFX/UBFX is the preferred form + O << '\t' << (IsSigned ? "sbfx" : "ubfx") << '\t' + << getRegisterName(Op0.getReg()) << ", " << getRegisterName(Op1.getReg()) + << ", #" << Op2.getImm() << ", #" << Op3.getImm() - Op2.getImm() + 1; + printAnnotation(O, Annot); + return; + } + + if (Opcode == AArch64::BFMXri || Opcode == AArch64::BFMWri) { + const MCOperand &Op0 = MI->getOperand(0); // Op1 == Op0 + const MCOperand &Op2 = MI->getOperand(2); + int ImmR = MI->getOperand(3).getImm(); + int ImmS = MI->getOperand(4).getImm(); + + // BFI alias + if (ImmS < ImmR) { + int BitWidth = Opcode == AArch64::BFMXri ? 64 : 32; + int LSB = (BitWidth - ImmR) % BitWidth; + int Width = ImmS + 1; + O << "\tbfi\t" << getRegisterName(Op0.getReg()) << ", " + << getRegisterName(Op2.getReg()) << ", #" << LSB << ", #" << Width; + printAnnotation(O, Annot); + return; + } + + int LSB = ImmR; + int Width = ImmS - ImmR + 1; + // Otherwise BFXIL the preferred form + O << "\tbfxil\t" + << getRegisterName(Op0.getReg()) << ", " << getRegisterName(Op2.getReg()) + << ", #" << LSB << ", #" << Width; + printAnnotation(O, Annot); + return; + } + + // Symbolic operands for MOVZ, MOVN and MOVK already imply a shift + // (e.g. :gottprel_g1: is always going to be "lsl #16") so it should not be + // printed. 
+ if ((Opcode == AArch64::MOVZXi || Opcode == AArch64::MOVZWi || + Opcode == AArch64::MOVNXi || Opcode == AArch64::MOVNWi) && + MI->getOperand(1).isExpr()) { + if (Opcode == AArch64::MOVZXi || Opcode == AArch64::MOVZWi) + O << "\tmovz\t"; + else + O << "\tmovn\t"; + + O << getRegisterName(MI->getOperand(0).getReg()) << ", #" + << *MI->getOperand(1).getExpr(); + return; + } + + if ((Opcode == AArch64::MOVKXi || Opcode == AArch64::MOVKWi) && + MI->getOperand(2).isExpr()) { + O << "\tmovk\t" << getRegisterName(MI->getOperand(0).getReg()) << ", #" + << *MI->getOperand(2).getExpr(); + return; + } + + if (!printAliasInstr(MI, O)) + printInstruction(MI, O); + + printAnnotation(O, Annot); +} + +static bool isTblTbxInstruction(unsigned Opcode, StringRef &Layout, + bool &IsTbx) { + switch (Opcode) { + case AArch64::TBXv8i8One: + case AArch64::TBXv8i8Two: + case AArch64::TBXv8i8Three: + case AArch64::TBXv8i8Four: + IsTbx = true; + Layout = ".8b"; + return true; + case AArch64::TBLv8i8One: + case AArch64::TBLv8i8Two: + case AArch64::TBLv8i8Three: + case AArch64::TBLv8i8Four: + IsTbx = false; + Layout = ".8b"; + return true; + case AArch64::TBXv16i8One: + case AArch64::TBXv16i8Two: + case AArch64::TBXv16i8Three: + case AArch64::TBXv16i8Four: + IsTbx = true; + Layout = ".16b"; + return true; + case AArch64::TBLv16i8One: + case AArch64::TBLv16i8Two: + case AArch64::TBLv16i8Three: + case AArch64::TBLv16i8Four: + IsTbx = false; + Layout = ".16b"; + return true; + default: + return false; + } +} + +struct LdStNInstrDesc { + unsigned Opcode; + const char *Mnemonic; + const char *Layout; + int ListOperand; + bool HasLane; + int NaturalOffset; +}; + +static LdStNInstrDesc LdStNInstInfo[] = { + { AArch64::LD1i8, "ld1", ".b", 1, true, 0 }, + { AArch64::LD1i16, "ld1", ".h", 1, true, 0 }, + { AArch64::LD1i32, "ld1", ".s", 1, true, 0 }, + { AArch64::LD1i64, "ld1", ".d", 1, true, 0 }, + { AArch64::LD1i8_POST, "ld1", ".b", 2, true, 1 }, + { AArch64::LD1i16_POST, "ld1", ".h", 2, true, 2 }, + { AArch64::LD1i32_POST, "ld1", ".s", 2, true, 4 }, + { AArch64::LD1i64_POST, "ld1", ".d", 2, true, 8 }, + { AArch64::LD1Rv16b, "ld1r", ".16b", 0, false, 0 }, + { AArch64::LD1Rv8h, "ld1r", ".8h", 0, false, 0 }, + { AArch64::LD1Rv4s, "ld1r", ".4s", 0, false, 0 }, + { AArch64::LD1Rv2d, "ld1r", ".2d", 0, false, 0 }, + { AArch64::LD1Rv8b, "ld1r", ".8b", 0, false, 0 }, + { AArch64::LD1Rv4h, "ld1r", ".4h", 0, false, 0 }, + { AArch64::LD1Rv2s, "ld1r", ".2s", 0, false, 0 }, + { AArch64::LD1Rv1d, "ld1r", ".1d", 0, false, 0 }, + { AArch64::LD1Rv16b_POST, "ld1r", ".16b", 1, false, 1 }, + { AArch64::LD1Rv8h_POST, "ld1r", ".8h", 1, false, 2 }, + { AArch64::LD1Rv4s_POST, "ld1r", ".4s", 1, false, 4 }, + { AArch64::LD1Rv2d_POST, "ld1r", ".2d", 1, false, 8 }, + { AArch64::LD1Rv8b_POST, "ld1r", ".8b", 1, false, 1 }, + { AArch64::LD1Rv4h_POST, "ld1r", ".4h", 1, false, 2 }, + { AArch64::LD1Rv2s_POST, "ld1r", ".2s", 1, false, 4 }, + { AArch64::LD1Rv1d_POST, "ld1r", ".1d", 1, false, 8 }, + { AArch64::LD1Onev16b, "ld1", ".16b", 0, false, 0 }, + { AArch64::LD1Onev8h, "ld1", ".8h", 0, false, 0 }, + { AArch64::LD1Onev4s, "ld1", ".4s", 0, false, 0 }, + { AArch64::LD1Onev2d, "ld1", ".2d", 0, false, 0 }, + { AArch64::LD1Onev8b, "ld1", ".8b", 0, false, 0 }, + { AArch64::LD1Onev4h, "ld1", ".4h", 0, false, 0 }, + { AArch64::LD1Onev2s, "ld1", ".2s", 0, false, 0 }, + { AArch64::LD1Onev1d, "ld1", ".1d", 0, false, 0 }, + { AArch64::LD1Onev16b_POST, "ld1", ".16b", 1, false, 16 }, + { AArch64::LD1Onev8h_POST, "ld1", ".8h", 1, false, 16 }, + { AArch64::LD1Onev4s_POST, 
"ld1", ".4s", 1, false, 16 }, + { AArch64::LD1Onev2d_POST, "ld1", ".2d", 1, false, 16 }, + { AArch64::LD1Onev8b_POST, "ld1", ".8b", 1, false, 8 }, + { AArch64::LD1Onev4h_POST, "ld1", ".4h", 1, false, 8 }, + { AArch64::LD1Onev2s_POST, "ld1", ".2s", 1, false, 8 }, + { AArch64::LD1Onev1d_POST, "ld1", ".1d", 1, false, 8 }, + { AArch64::LD1Twov16b, "ld1", ".16b", 0, false, 0 }, + { AArch64::LD1Twov8h, "ld1", ".8h", 0, false, 0 }, + { AArch64::LD1Twov4s, "ld1", ".4s", 0, false, 0 }, + { AArch64::LD1Twov2d, "ld1", ".2d", 0, false, 0 }, + { AArch64::LD1Twov8b, "ld1", ".8b", 0, false, 0 }, + { AArch64::LD1Twov4h, "ld1", ".4h", 0, false, 0 }, + { AArch64::LD1Twov2s, "ld1", ".2s", 0, false, 0 }, + { AArch64::LD1Twov1d, "ld1", ".1d", 0, false, 0 }, + { AArch64::LD1Twov16b_POST, "ld1", ".16b", 1, false, 32 }, + { AArch64::LD1Twov8h_POST, "ld1", ".8h", 1, false, 32 }, + { AArch64::LD1Twov4s_POST, "ld1", ".4s", 1, false, 32 }, + { AArch64::LD1Twov2d_POST, "ld1", ".2d", 1, false, 32 }, + { AArch64::LD1Twov8b_POST, "ld1", ".8b", 1, false, 16 }, + { AArch64::LD1Twov4h_POST, "ld1", ".4h", 1, false, 16 }, + { AArch64::LD1Twov2s_POST, "ld1", ".2s", 1, false, 16 }, + { AArch64::LD1Twov1d_POST, "ld1", ".1d", 1, false, 16 }, + { AArch64::LD1Threev16b, "ld1", ".16b", 0, false, 0 }, + { AArch64::LD1Threev8h, "ld1", ".8h", 0, false, 0 }, + { AArch64::LD1Threev4s, "ld1", ".4s", 0, false, 0 }, + { AArch64::LD1Threev2d, "ld1", ".2d", 0, false, 0 }, + { AArch64::LD1Threev8b, "ld1", ".8b", 0, false, 0 }, + { AArch64::LD1Threev4h, "ld1", ".4h", 0, false, 0 }, + { AArch64::LD1Threev2s, "ld1", ".2s", 0, false, 0 }, + { AArch64::LD1Threev1d, "ld1", ".1d", 0, false, 0 }, + { AArch64::LD1Threev16b_POST, "ld1", ".16b", 1, false, 48 }, + { AArch64::LD1Threev8h_POST, "ld1", ".8h", 1, false, 48 }, + { AArch64::LD1Threev4s_POST, "ld1", ".4s", 1, false, 48 }, + { AArch64::LD1Threev2d_POST, "ld1", ".2d", 1, false, 48 }, + { AArch64::LD1Threev8b_POST, "ld1", ".8b", 1, false, 24 }, + { AArch64::LD1Threev4h_POST, "ld1", ".4h", 1, false, 24 }, + { AArch64::LD1Threev2s_POST, "ld1", ".2s", 1, false, 24 }, + { AArch64::LD1Threev1d_POST, "ld1", ".1d", 1, false, 24 }, + { AArch64::LD1Fourv16b, "ld1", ".16b", 0, false, 0 }, + { AArch64::LD1Fourv8h, "ld1", ".8h", 0, false, 0 }, + { AArch64::LD1Fourv4s, "ld1", ".4s", 0, false, 0 }, + { AArch64::LD1Fourv2d, "ld1", ".2d", 0, false, 0 }, + { AArch64::LD1Fourv8b, "ld1", ".8b", 0, false, 0 }, + { AArch64::LD1Fourv4h, "ld1", ".4h", 0, false, 0 }, + { AArch64::LD1Fourv2s, "ld1", ".2s", 0, false, 0 }, + { AArch64::LD1Fourv1d, "ld1", ".1d", 0, false, 0 }, + { AArch64::LD1Fourv16b_POST, "ld1", ".16b", 1, false, 64 }, + { AArch64::LD1Fourv8h_POST, "ld1", ".8h", 1, false, 64 }, + { AArch64::LD1Fourv4s_POST, "ld1", ".4s", 1, false, 64 }, + { AArch64::LD1Fourv2d_POST, "ld1", ".2d", 1, false, 64 }, + { AArch64::LD1Fourv8b_POST, "ld1", ".8b", 1, false, 32 }, + { AArch64::LD1Fourv4h_POST, "ld1", ".4h", 1, false, 32 }, + { AArch64::LD1Fourv2s_POST, "ld1", ".2s", 1, false, 32 }, + { AArch64::LD1Fourv1d_POST, "ld1", ".1d", 1, false, 32 }, + { AArch64::LD2i8, "ld2", ".b", 1, true, 0 }, + { AArch64::LD2i16, "ld2", ".h", 1, true, 0 }, + { AArch64::LD2i32, "ld2", ".s", 1, true, 0 }, + { AArch64::LD2i64, "ld2", ".d", 1, true, 0 }, + { AArch64::LD2i8_POST, "ld2", ".b", 2, true, 2 }, + { AArch64::LD2i16_POST, "ld2", ".h", 2, true, 4 }, + { AArch64::LD2i32_POST, "ld2", ".s", 2, true, 8 }, + { AArch64::LD2i64_POST, "ld2", ".d", 2, true, 16 }, + { AArch64::LD2Rv16b, "ld2r", ".16b", 0, false, 0 }, + { AArch64::LD2Rv8h, 
"ld2r", ".8h", 0, false, 0 }, + { AArch64::LD2Rv4s, "ld2r", ".4s", 0, false, 0 }, + { AArch64::LD2Rv2d, "ld2r", ".2d", 0, false, 0 }, + { AArch64::LD2Rv8b, "ld2r", ".8b", 0, false, 0 }, + { AArch64::LD2Rv4h, "ld2r", ".4h", 0, false, 0 }, + { AArch64::LD2Rv2s, "ld2r", ".2s", 0, false, 0 }, + { AArch64::LD2Rv1d, "ld2r", ".1d", 0, false, 0 }, + { AArch64::LD2Rv16b_POST, "ld2r", ".16b", 1, false, 2 }, + { AArch64::LD2Rv8h_POST, "ld2r", ".8h", 1, false, 4 }, + { AArch64::LD2Rv4s_POST, "ld2r", ".4s", 1, false, 8 }, + { AArch64::LD2Rv2d_POST, "ld2r", ".2d", 1, false, 16 }, + { AArch64::LD2Rv8b_POST, "ld2r", ".8b", 1, false, 2 }, + { AArch64::LD2Rv4h_POST, "ld2r", ".4h", 1, false, 4 }, + { AArch64::LD2Rv2s_POST, "ld2r", ".2s", 1, false, 8 }, + { AArch64::LD2Rv1d_POST, "ld2r", ".1d", 1, false, 16 }, + { AArch64::LD2Twov16b, "ld2", ".16b", 0, false, 0 }, + { AArch64::LD2Twov8h, "ld2", ".8h", 0, false, 0 }, + { AArch64::LD2Twov4s, "ld2", ".4s", 0, false, 0 }, + { AArch64::LD2Twov2d, "ld2", ".2d", 0, false, 0 }, + { AArch64::LD2Twov8b, "ld2", ".8b", 0, false, 0 }, + { AArch64::LD2Twov4h, "ld2", ".4h", 0, false, 0 }, + { AArch64::LD2Twov2s, "ld2", ".2s", 0, false, 0 }, + { AArch64::LD2Twov16b_POST, "ld2", ".16b", 1, false, 32 }, + { AArch64::LD2Twov8h_POST, "ld2", ".8h", 1, false, 32 }, + { AArch64::LD2Twov4s_POST, "ld2", ".4s", 1, false, 32 }, + { AArch64::LD2Twov2d_POST, "ld2", ".2d", 1, false, 32 }, + { AArch64::LD2Twov8b_POST, "ld2", ".8b", 1, false, 16 }, + { AArch64::LD2Twov4h_POST, "ld2", ".4h", 1, false, 16 }, + { AArch64::LD2Twov2s_POST, "ld2", ".2s", 1, false, 16 }, + { AArch64::LD3i8, "ld3", ".b", 1, true, 0 }, + { AArch64::LD3i16, "ld3", ".h", 1, true, 0 }, + { AArch64::LD3i32, "ld3", ".s", 1, true, 0 }, + { AArch64::LD3i64, "ld3", ".d", 1, true, 0 }, + { AArch64::LD3i8_POST, "ld3", ".b", 2, true, 3 }, + { AArch64::LD3i16_POST, "ld3", ".h", 2, true, 6 }, + { AArch64::LD3i32_POST, "ld3", ".s", 2, true, 12 }, + { AArch64::LD3i64_POST, "ld3", ".d", 2, true, 24 }, + { AArch64::LD3Rv16b, "ld3r", ".16b", 0, false, 0 }, + { AArch64::LD3Rv8h, "ld3r", ".8h", 0, false, 0 }, + { AArch64::LD3Rv4s, "ld3r", ".4s", 0, false, 0 }, + { AArch64::LD3Rv2d, "ld3r", ".2d", 0, false, 0 }, + { AArch64::LD3Rv8b, "ld3r", ".8b", 0, false, 0 }, + { AArch64::LD3Rv4h, "ld3r", ".4h", 0, false, 0 }, + { AArch64::LD3Rv2s, "ld3r", ".2s", 0, false, 0 }, + { AArch64::LD3Rv1d, "ld3r", ".1d", 0, false, 0 }, + { AArch64::LD3Rv16b_POST, "ld3r", ".16b", 1, false, 3 }, + { AArch64::LD3Rv8h_POST, "ld3r", ".8h", 1, false, 6 }, + { AArch64::LD3Rv4s_POST, "ld3r", ".4s", 1, false, 12 }, + { AArch64::LD3Rv2d_POST, "ld3r", ".2d", 1, false, 24 }, + { AArch64::LD3Rv8b_POST, "ld3r", ".8b", 1, false, 3 }, + { AArch64::LD3Rv4h_POST, "ld3r", ".4h", 1, false, 6 }, + { AArch64::LD3Rv2s_POST, "ld3r", ".2s", 1, false, 12 }, + { AArch64::LD3Rv1d_POST, "ld3r", ".1d", 1, false, 24 }, + { AArch64::LD3Threev16b, "ld3", ".16b", 0, false, 0 }, + { AArch64::LD3Threev8h, "ld3", ".8h", 0, false, 0 }, + { AArch64::LD3Threev4s, "ld3", ".4s", 0, false, 0 }, + { AArch64::LD3Threev2d, "ld3", ".2d", 0, false, 0 }, + { AArch64::LD3Threev8b, "ld3", ".8b", 0, false, 0 }, + { AArch64::LD3Threev4h, "ld3", ".4h", 0, false, 0 }, + { AArch64::LD3Threev2s, "ld3", ".2s", 0, false, 0 }, + { AArch64::LD3Threev16b_POST, "ld3", ".16b", 1, false, 48 }, + { AArch64::LD3Threev8h_POST, "ld3", ".8h", 1, false, 48 }, + { AArch64::LD3Threev4s_POST, "ld3", ".4s", 1, false, 48 }, + { AArch64::LD3Threev2d_POST, "ld3", ".2d", 1, false, 48 }, + { AArch64::LD3Threev8b_POST, "ld3", ".8b", 1, 
false, 24 }, + { AArch64::LD3Threev4h_POST, "ld3", ".4h", 1, false, 24 }, + { AArch64::LD3Threev2s_POST, "ld3", ".2s", 1, false, 24 }, + { AArch64::LD4i8, "ld4", ".b", 1, true, 0 }, + { AArch64::LD4i16, "ld4", ".h", 1, true, 0 }, + { AArch64::LD4i32, "ld4", ".s", 1, true, 0 }, + { AArch64::LD4i64, "ld4", ".d", 1, true, 0 }, + { AArch64::LD4i8_POST, "ld4", ".b", 2, true, 4 }, + { AArch64::LD4i16_POST, "ld4", ".h", 2, true, 8 }, + { AArch64::LD4i32_POST, "ld4", ".s", 2, true, 16 }, + { AArch64::LD4i64_POST, "ld4", ".d", 2, true, 32 }, + { AArch64::LD4Rv16b, "ld4r", ".16b", 0, false, 0 }, + { AArch64::LD4Rv8h, "ld4r", ".8h", 0, false, 0 }, + { AArch64::LD4Rv4s, "ld4r", ".4s", 0, false, 0 }, + { AArch64::LD4Rv2d, "ld4r", ".2d", 0, false, 0 }, + { AArch64::LD4Rv8b, "ld4r", ".8b", 0, false, 0 }, + { AArch64::LD4Rv4h, "ld4r", ".4h", 0, false, 0 }, + { AArch64::LD4Rv2s, "ld4r", ".2s", 0, false, 0 }, + { AArch64::LD4Rv1d, "ld4r", ".1d", 0, false, 0 }, + { AArch64::LD4Rv16b_POST, "ld4r", ".16b", 1, false, 4 }, + { AArch64::LD4Rv8h_POST, "ld4r", ".8h", 1, false, 8 }, + { AArch64::LD4Rv4s_POST, "ld4r", ".4s", 1, false, 16 }, + { AArch64::LD4Rv2d_POST, "ld4r", ".2d", 1, false, 32 }, + { AArch64::LD4Rv8b_POST, "ld4r", ".8b", 1, false, 4 }, + { AArch64::LD4Rv4h_POST, "ld4r", ".4h", 1, false, 8 }, + { AArch64::LD4Rv2s_POST, "ld4r", ".2s", 1, false, 16 }, + { AArch64::LD4Rv1d_POST, "ld4r", ".1d", 1, false, 32 }, + { AArch64::LD4Fourv16b, "ld4", ".16b", 0, false, 0 }, + { AArch64::LD4Fourv8h, "ld4", ".8h", 0, false, 0 }, + { AArch64::LD4Fourv4s, "ld4", ".4s", 0, false, 0 }, + { AArch64::LD4Fourv2d, "ld4", ".2d", 0, false, 0 }, + { AArch64::LD4Fourv8b, "ld4", ".8b", 0, false, 0 }, + { AArch64::LD4Fourv4h, "ld4", ".4h", 0, false, 0 }, + { AArch64::LD4Fourv2s, "ld4", ".2s", 0, false, 0 }, + { AArch64::LD4Fourv16b_POST, "ld4", ".16b", 1, false, 64 }, + { AArch64::LD4Fourv8h_POST, "ld4", ".8h", 1, false, 64 }, + { AArch64::LD4Fourv4s_POST, "ld4", ".4s", 1, false, 64 }, + { AArch64::LD4Fourv2d_POST, "ld4", ".2d", 1, false, 64 }, + { AArch64::LD4Fourv8b_POST, "ld4", ".8b", 1, false, 32 }, + { AArch64::LD4Fourv4h_POST, "ld4", ".4h", 1, false, 32 }, + { AArch64::LD4Fourv2s_POST, "ld4", ".2s", 1, false, 32 }, + { AArch64::ST1i8, "st1", ".b", 0, true, 0 }, + { AArch64::ST1i16, "st1", ".h", 0, true, 0 }, + { AArch64::ST1i32, "st1", ".s", 0, true, 0 }, + { AArch64::ST1i64, "st1", ".d", 0, true, 0 }, + { AArch64::ST1i8_POST, "st1", ".b", 1, true, 1 }, + { AArch64::ST1i16_POST, "st1", ".h", 1, true, 2 }, + { AArch64::ST1i32_POST, "st1", ".s", 1, true, 4 }, + { AArch64::ST1i64_POST, "st1", ".d", 1, true, 8 }, + { AArch64::ST1Onev16b, "st1", ".16b", 0, false, 0 }, + { AArch64::ST1Onev8h, "st1", ".8h", 0, false, 0 }, + { AArch64::ST1Onev4s, "st1", ".4s", 0, false, 0 }, + { AArch64::ST1Onev2d, "st1", ".2d", 0, false, 0 }, + { AArch64::ST1Onev8b, "st1", ".8b", 0, false, 0 }, + { AArch64::ST1Onev4h, "st1", ".4h", 0, false, 0 }, + { AArch64::ST1Onev2s, "st1", ".2s", 0, false, 0 }, + { AArch64::ST1Onev1d, "st1", ".1d", 0, false, 0 }, + { AArch64::ST1Onev16b_POST, "st1", ".16b", 1, false, 16 }, + { AArch64::ST1Onev8h_POST, "st1", ".8h", 1, false, 16 }, + { AArch64::ST1Onev4s_POST, "st1", ".4s", 1, false, 16 }, + { AArch64::ST1Onev2d_POST, "st1", ".2d", 1, false, 16 }, + { AArch64::ST1Onev8b_POST, "st1", ".8b", 1, false, 8 }, + { AArch64::ST1Onev4h_POST, "st1", ".4h", 1, false, 8 }, + { AArch64::ST1Onev2s_POST, "st1", ".2s", 1, false, 8 }, + { AArch64::ST1Onev1d_POST, "st1", ".1d", 1, false, 8 }, + { AArch64::ST1Twov16b, "st1", 
".16b", 0, false, 0 }, + { AArch64::ST1Twov8h, "st1", ".8h", 0, false, 0 }, + { AArch64::ST1Twov4s, "st1", ".4s", 0, false, 0 }, + { AArch64::ST1Twov2d, "st1", ".2d", 0, false, 0 }, + { AArch64::ST1Twov8b, "st1", ".8b", 0, false, 0 }, + { AArch64::ST1Twov4h, "st1", ".4h", 0, false, 0 }, + { AArch64::ST1Twov2s, "st1", ".2s", 0, false, 0 }, + { AArch64::ST1Twov1d, "st1", ".1d", 0, false, 0 }, + { AArch64::ST1Twov16b_POST, "st1", ".16b", 1, false, 32 }, + { AArch64::ST1Twov8h_POST, "st1", ".8h", 1, false, 32 }, + { AArch64::ST1Twov4s_POST, "st1", ".4s", 1, false, 32 }, + { AArch64::ST1Twov2d_POST, "st1", ".2d", 1, false, 32 }, + { AArch64::ST1Twov8b_POST, "st1", ".8b", 1, false, 16 }, + { AArch64::ST1Twov4h_POST, "st1", ".4h", 1, false, 16 }, + { AArch64::ST1Twov2s_POST, "st1", ".2s", 1, false, 16 }, + { AArch64::ST1Twov1d_POST, "st1", ".1d", 1, false, 16 }, + { AArch64::ST1Threev16b, "st1", ".16b", 0, false, 0 }, + { AArch64::ST1Threev8h, "st1", ".8h", 0, false, 0 }, + { AArch64::ST1Threev4s, "st1", ".4s", 0, false, 0 }, + { AArch64::ST1Threev2d, "st1", ".2d", 0, false, 0 }, + { AArch64::ST1Threev8b, "st1", ".8b", 0, false, 0 }, + { AArch64::ST1Threev4h, "st1", ".4h", 0, false, 0 }, + { AArch64::ST1Threev2s, "st1", ".2s", 0, false, 0 }, + { AArch64::ST1Threev1d, "st1", ".1d", 0, false, 0 }, + { AArch64::ST1Threev16b_POST, "st1", ".16b", 1, false, 48 }, + { AArch64::ST1Threev8h_POST, "st1", ".8h", 1, false, 48 }, + { AArch64::ST1Threev4s_POST, "st1", ".4s", 1, false, 48 }, + { AArch64::ST1Threev2d_POST, "st1", ".2d", 1, false, 48 }, + { AArch64::ST1Threev8b_POST, "st1", ".8b", 1, false, 24 }, + { AArch64::ST1Threev4h_POST, "st1", ".4h", 1, false, 24 }, + { AArch64::ST1Threev2s_POST, "st1", ".2s", 1, false, 24 }, + { AArch64::ST1Threev1d_POST, "st1", ".1d", 1, false, 24 }, + { AArch64::ST1Fourv16b, "st1", ".16b", 0, false, 0 }, + { AArch64::ST1Fourv8h, "st1", ".8h", 0, false, 0 }, + { AArch64::ST1Fourv4s, "st1", ".4s", 0, false, 0 }, + { AArch64::ST1Fourv2d, "st1", ".2d", 0, false, 0 }, + { AArch64::ST1Fourv8b, "st1", ".8b", 0, false, 0 }, + { AArch64::ST1Fourv4h, "st1", ".4h", 0, false, 0 }, + { AArch64::ST1Fourv2s, "st1", ".2s", 0, false, 0 }, + { AArch64::ST1Fourv1d, "st1", ".1d", 0, false, 0 }, + { AArch64::ST1Fourv16b_POST, "st1", ".16b", 1, false, 64 }, + { AArch64::ST1Fourv8h_POST, "st1", ".8h", 1, false, 64 }, + { AArch64::ST1Fourv4s_POST, "st1", ".4s", 1, false, 64 }, + { AArch64::ST1Fourv2d_POST, "st1", ".2d", 1, false, 64 }, + { AArch64::ST1Fourv8b_POST, "st1", ".8b", 1, false, 32 }, + { AArch64::ST1Fourv4h_POST, "st1", ".4h", 1, false, 32 }, + { AArch64::ST1Fourv2s_POST, "st1", ".2s", 1, false, 32 }, + { AArch64::ST1Fourv1d_POST, "st1", ".1d", 1, false, 32 }, + { AArch64::ST2i8, "st2", ".b", 0, true, 0 }, + { AArch64::ST2i16, "st2", ".h", 0, true, 0 }, + { AArch64::ST2i32, "st2", ".s", 0, true, 0 }, + { AArch64::ST2i64, "st2", ".d", 0, true, 0 }, + { AArch64::ST2i8_POST, "st2", ".b", 1, true, 2 }, + { AArch64::ST2i16_POST, "st2", ".h", 1, true, 4 }, + { AArch64::ST2i32_POST, "st2", ".s", 1, true, 8 }, + { AArch64::ST2i64_POST, "st2", ".d", 1, true, 16 }, + { AArch64::ST2Twov16b, "st2", ".16b", 0, false, 0 }, + { AArch64::ST2Twov8h, "st2", ".8h", 0, false, 0 }, + { AArch64::ST2Twov4s, "st2", ".4s", 0, false, 0 }, + { AArch64::ST2Twov2d, "st2", ".2d", 0, false, 0 }, + { AArch64::ST2Twov8b, "st2", ".8b", 0, false, 0 }, + { AArch64::ST2Twov4h, "st2", ".4h", 0, false, 0 }, + { AArch64::ST2Twov2s, "st2", ".2s", 0, false, 0 }, + { AArch64::ST2Twov16b_POST, "st2", ".16b", 1, false, 32 }, + 
{ AArch64::ST2Twov8h_POST, "st2", ".8h", 1, false, 32 }, + { AArch64::ST2Twov4s_POST, "st2", ".4s", 1, false, 32 }, + { AArch64::ST2Twov2d_POST, "st2", ".2d", 1, false, 32 }, + { AArch64::ST2Twov8b_POST, "st2", ".8b", 1, false, 16 }, + { AArch64::ST2Twov4h_POST, "st2", ".4h", 1, false, 16 }, + { AArch64::ST2Twov2s_POST, "st2", ".2s", 1, false, 16 }, + { AArch64::ST3i8, "st3", ".b", 0, true, 0 }, + { AArch64::ST3i16, "st3", ".h", 0, true, 0 }, + { AArch64::ST3i32, "st3", ".s", 0, true, 0 }, + { AArch64::ST3i64, "st3", ".d", 0, true, 0 }, + { AArch64::ST3i8_POST, "st3", ".b", 1, true, 3 }, + { AArch64::ST3i16_POST, "st3", ".h", 1, true, 6 }, + { AArch64::ST3i32_POST, "st3", ".s", 1, true, 12 }, + { AArch64::ST3i64_POST, "st3", ".d", 1, true, 24 }, + { AArch64::ST3Threev16b, "st3", ".16b", 0, false, 0 }, + { AArch64::ST3Threev8h, "st3", ".8h", 0, false, 0 }, + { AArch64::ST3Threev4s, "st3", ".4s", 0, false, 0 }, + { AArch64::ST3Threev2d, "st3", ".2d", 0, false, 0 }, + { AArch64::ST3Threev8b, "st3", ".8b", 0, false, 0 }, + { AArch64::ST3Threev4h, "st3", ".4h", 0, false, 0 }, + { AArch64::ST3Threev2s, "st3", ".2s", 0, false, 0 }, + { AArch64::ST3Threev16b_POST, "st3", ".16b", 1, false, 48 }, + { AArch64::ST3Threev8h_POST, "st3", ".8h", 1, false, 48 }, + { AArch64::ST3Threev4s_POST, "st3", ".4s", 1, false, 48 }, + { AArch64::ST3Threev2d_POST, "st3", ".2d", 1, false, 48 }, + { AArch64::ST3Threev8b_POST, "st3", ".8b", 1, false, 24 }, + { AArch64::ST3Threev4h_POST, "st3", ".4h", 1, false, 24 }, + { AArch64::ST3Threev2s_POST, "st3", ".2s", 1, false, 24 }, + { AArch64::ST4i8, "st4", ".b", 0, true, 0 }, + { AArch64::ST4i16, "st4", ".h", 0, true, 0 }, + { AArch64::ST4i32, "st4", ".s", 0, true, 0 }, + { AArch64::ST4i64, "st4", ".d", 0, true, 0 }, + { AArch64::ST4i8_POST, "st4", ".b", 1, true, 4 }, + { AArch64::ST4i16_POST, "st4", ".h", 1, true, 8 }, + { AArch64::ST4i32_POST, "st4", ".s", 1, true, 16 }, + { AArch64::ST4i64_POST, "st4", ".d", 1, true, 32 }, + { AArch64::ST4Fourv16b, "st4", ".16b", 0, false, 0 }, + { AArch64::ST4Fourv8h, "st4", ".8h", 0, false, 0 }, + { AArch64::ST4Fourv4s, "st4", ".4s", 0, false, 0 }, + { AArch64::ST4Fourv2d, "st4", ".2d", 0, false, 0 }, + { AArch64::ST4Fourv8b, "st4", ".8b", 0, false, 0 }, + { AArch64::ST4Fourv4h, "st4", ".4h", 0, false, 0 }, + { AArch64::ST4Fourv2s, "st4", ".2s", 0, false, 0 }, + { AArch64::ST4Fourv16b_POST, "st4", ".16b", 1, false, 64 }, + { AArch64::ST4Fourv8h_POST, "st4", ".8h", 1, false, 64 }, + { AArch64::ST4Fourv4s_POST, "st4", ".4s", 1, false, 64 }, + { AArch64::ST4Fourv2d_POST, "st4", ".2d", 1, false, 64 }, + { AArch64::ST4Fourv8b_POST, "st4", ".8b", 1, false, 32 }, + { AArch64::ST4Fourv4h_POST, "st4", ".4h", 1, false, 32 }, + { AArch64::ST4Fourv2s_POST, "st4", ".2s", 1, false, 32 }, +}; + +static LdStNInstrDesc *getLdStNInstrDesc(unsigned Opcode) { + unsigned Idx; + for (Idx = 0; Idx != array_lengthof(LdStNInstInfo); ++Idx) + if (LdStNInstInfo[Idx].Opcode == Opcode) + return &LdStNInstInfo[Idx]; + + return nullptr; +} + +void AArch64AppleInstPrinter::printInst(const MCInst *MI, raw_ostream &O, + StringRef Annot) { + unsigned Opcode = MI->getOpcode(); + StringRef Layout, Mnemonic; + + bool IsTbx; + if (isTblTbxInstruction(MI->getOpcode(), Layout, IsTbx)) { + O << "\t" << (IsTbx ? "tbx" : "tbl") << Layout << '\t' + << getRegisterName(MI->getOperand(0).getReg(), AArch64::vreg) << ", "; + + unsigned ListOpNum = IsTbx ? 
2 : 1; + printVectorList(MI, ListOpNum, O, ""); + + O << ", " + << getRegisterName(MI->getOperand(ListOpNum + 1).getReg(), AArch64::vreg); + printAnnotation(O, Annot); + return; + } + + if (LdStNInstrDesc *LdStDesc = getLdStNInstrDesc(Opcode)) { + O << "\t" << LdStDesc->Mnemonic << LdStDesc->Layout << '\t'; + + // Now onto the operands: first a vector list with possible lane + // specifier. E.g. { v0 }[2] + int OpNum = LdStDesc->ListOperand; + printVectorList(MI, OpNum++, O, ""); + + if (LdStDesc->HasLane) + O << '[' << MI->getOperand(OpNum++).getImm() << ']'; + + // Next the address: [xN] + unsigned AddrReg = MI->getOperand(OpNum++).getReg(); + O << ", [" << getRegisterName(AddrReg) << ']'; + + // Finally, there might be a post-indexed offset. + if (LdStDesc->NaturalOffset != 0) { + unsigned Reg = MI->getOperand(OpNum++).getReg(); + if (Reg != AArch64::XZR) + O << ", " << getRegisterName(Reg); + else { + assert(LdStDesc->NaturalOffset && "no offset on post-inc instruction?"); + O << ", #" << LdStDesc->NaturalOffset; + } + } + + printAnnotation(O, Annot); + return; + } + + AArch64InstPrinter::printInst(MI, O, Annot); +} + +bool AArch64InstPrinter::printSysAlias(const MCInst *MI, raw_ostream &O) { +#ifndef NDEBUG + unsigned Opcode = MI->getOpcode(); + assert(Opcode == AArch64::SYSxt && "Invalid opcode for SYS alias!"); +#endif + + const char *Asm = nullptr; + const MCOperand &Op1 = MI->getOperand(0); + const MCOperand &Cn = MI->getOperand(1); + const MCOperand &Cm = MI->getOperand(2); + const MCOperand &Op2 = MI->getOperand(3); + + unsigned Op1Val = Op1.getImm(); + unsigned CnVal = Cn.getImm(); + unsigned CmVal = Cm.getImm(); + unsigned Op2Val = Op2.getImm(); + + if (CnVal == 7) { + switch (CmVal) { + default: + break; + + // IC aliases + case 1: + if (Op1Val == 0 && Op2Val == 0) + Asm = "ic\tialluis"; + break; + case 5: + if (Op1Val == 0 && Op2Val == 0) + Asm = "ic\tiallu"; + else if (Op1Val == 3 && Op2Val == 1) + Asm = "ic\tivau"; + break; + + // DC aliases + case 4: + if (Op1Val == 3 && Op2Val == 1) + Asm = "dc\tzva"; + break; + case 6: + if (Op1Val == 0 && Op2Val == 1) + Asm = "dc\tivac"; + if (Op1Val == 0 && Op2Val == 2) + Asm = "dc\tisw"; + break; + case 10: + if (Op1Val == 3 && Op2Val == 1) + Asm = "dc\tcvac"; + else if (Op1Val == 0 && Op2Val == 2) + Asm = "dc\tcsw"; + break; + case 11: + if (Op1Val == 3 && Op2Val == 1) + Asm = "dc\tcvau"; + break; + case 14: + if (Op1Val == 3 && Op2Val == 1) + Asm = "dc\tcivac"; + else if (Op1Val == 0 && Op2Val == 2) + Asm = "dc\tcisw"; + break; + + // AT aliases + case 8: + switch (Op1Val) { + default: + break; + case 0: + switch (Op2Val) { + default: + break; + case 0: Asm = "at\ts1e1r"; break; + case 1: Asm = "at\ts1e1w"; break; + case 2: Asm = "at\ts1e0r"; break; + case 3: Asm = "at\ts1e0w"; break; + } + break; + case 4: + switch (Op2Val) { + default: + break; + case 0: Asm = "at\ts1e2r"; break; + case 1: Asm = "at\ts1e2w"; break; + case 4: Asm = "at\ts12e1r"; break; + case 5: Asm = "at\ts12e1w"; break; + case 6: Asm = "at\ts12e0r"; break; + case 7: Asm = "at\ts12e0w"; break; + } + break; + case 6: + switch (Op2Val) { + default: + break; + case 0: Asm = "at\ts1e3r"; break; + case 1: Asm = "at\ts1e3w"; break; + } + break; + } + break; + } + } else if (CnVal == 8) { + // TLBI aliases + switch (CmVal) { + default: + break; + case 3: + switch (Op1Val) { + default: + break; + case 0: + switch (Op2Val) { + default: + break; + case 0: Asm = "tlbi\tvmalle1is"; break; + case 1: Asm = "tlbi\tvae1is"; break; + case 2: Asm = "tlbi\taside1is"; break; + case 
3: Asm = "tlbi\tvaae1is"; break; + case 5: Asm = "tlbi\tvale1is"; break; + case 7: Asm = "tlbi\tvaale1is"; break; + } + break; + case 4: + switch (Op2Val) { + default: + break; + case 0: Asm = "tlbi\talle2is"; break; + case 1: Asm = "tlbi\tvae2is"; break; + case 4: Asm = "tlbi\talle1is"; break; + case 5: Asm = "tlbi\tvale2is"; break; + case 6: Asm = "tlbi\tvmalls12e1is"; break; + } + break; + case 6: + switch (Op2Val) { + default: + break; + case 0: Asm = "tlbi\talle3is"; break; + case 1: Asm = "tlbi\tvae3is"; break; + case 5: Asm = "tlbi\tvale3is"; break; + } + break; + } + break; + case 0: + switch (Op1Val) { + default: + break; + case 4: + switch (Op2Val) { + default: + break; + case 1: Asm = "tlbi\tipas2e1is"; break; + case 5: Asm = "tlbi\tipas2le1is"; break; + } + break; + } + break; + case 4: + switch (Op1Val) { + default: + break; + case 4: + switch (Op2Val) { + default: + break; + case 1: Asm = "tlbi\tipas2e1"; break; + case 5: Asm = "tlbi\tipas2le1"; break; + } + break; + } + break; + case 7: + switch (Op1Val) { + default: + break; + case 0: + switch (Op2Val) { + default: + break; + case 0: Asm = "tlbi\tvmalle1"; break; + case 1: Asm = "tlbi\tvae1"; break; + case 2: Asm = "tlbi\taside1"; break; + case 3: Asm = "tlbi\tvaae1"; break; + case 5: Asm = "tlbi\tvale1"; break; + case 7: Asm = "tlbi\tvaale1"; break; + } + break; + case 4: + switch (Op2Val) { + default: + break; + case 0: Asm = "tlbi\talle2"; break; + case 1: Asm = "tlbi\tvae2"; break; + case 4: Asm = "tlbi\talle1"; break; + case 5: Asm = "tlbi\tvale2"; break; + case 6: Asm = "tlbi\tvmalls12e1"; break; + } + break; + case 6: + switch (Op2Val) { + default: + break; + case 0: Asm = "tlbi\talle3"; break; + case 1: Asm = "tlbi\tvae3"; break; + case 5: Asm = "tlbi\tvale3"; break; + } + break; + } + break; + } + } + + if (Asm) { + unsigned Reg = MI->getOperand(4).getReg(); + + O << '\t' << Asm; + if (StringRef(Asm).lower().find("all") == StringRef::npos) + O << ", " << getRegisterName(Reg); + } + + return Asm != nullptr; +} + +void AArch64InstPrinter::printOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.isReg()) { + unsigned Reg = Op.getReg(); + O << getRegisterName(Reg); + } else if (Op.isImm()) { + O << '#' << Op.getImm(); + } else { + assert(Op.isExpr() && "unknown operand kind in printOperand"); + O << *Op.getExpr(); + } +} + +void AArch64InstPrinter::printHexImm(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + O << format("#%#llx", Op.getImm()); +} + +void AArch64InstPrinter::printPostIncOperand(const MCInst *MI, unsigned OpNo, + unsigned Imm, raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.isReg()) { + unsigned Reg = Op.getReg(); + if (Reg == AArch64::XZR) + O << "#" << Imm; + else + O << getRegisterName(Reg); + } else + assert(0 && "unknown operand kind in printPostIncOperand64"); +} + +void AArch64InstPrinter::printVRegOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + assert(Op.isReg() && "Non-register vreg operand!"); + unsigned Reg = Op.getReg(); + O << getRegisterName(Reg, AArch64::vreg); +} + +void AArch64InstPrinter::printSysCROperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + assert(Op.isImm() && "System instruction C[nm] operands must be immediates!"); + O << "c" << Op.getImm(); +} + +void AArch64InstPrinter::printAddSubImm(const MCInst *MI, unsigned OpNum, 
+ raw_ostream &O) { + const MCOperand &MO = MI->getOperand(OpNum); + if (MO.isImm()) { + unsigned Val = (MO.getImm() & 0xfff); + assert(Val == MO.getImm() && "Add/sub immediate out of range!"); + unsigned Shift = + AArch64_AM::getShiftValue(MI->getOperand(OpNum + 1).getImm()); + O << '#' << Val; + if (Shift != 0) + printShifter(MI, OpNum + 1, O); + + if (CommentStream) + *CommentStream << '=' << (Val << Shift) << '\n'; + } else { + assert(MO.isExpr() && "Unexpected operand type!"); + O << *MO.getExpr(); + printShifter(MI, OpNum + 1, O); + } +} + +void AArch64InstPrinter::printLogicalImm32(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + uint64_t Val = MI->getOperand(OpNum).getImm(); + O << "#0x"; + O.write_hex(AArch64_AM::decodeLogicalImmediate(Val, 32)); +} + +void AArch64InstPrinter::printLogicalImm64(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + uint64_t Val = MI->getOperand(OpNum).getImm(); + O << "#0x"; + O.write_hex(AArch64_AM::decodeLogicalImmediate(Val, 64)); +} + +void AArch64InstPrinter::printShifter(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + unsigned Val = MI->getOperand(OpNum).getImm(); + // LSL #0 should not be printed. + if (AArch64_AM::getShiftType(Val) == AArch64_AM::LSL && + AArch64_AM::getShiftValue(Val) == 0) + return; + O << ", " << AArch64_AM::getShiftExtendName(AArch64_AM::getShiftType(Val)) + << " #" << AArch64_AM::getShiftValue(Val); +} + +void AArch64InstPrinter::printShiftedRegister(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + O << getRegisterName(MI->getOperand(OpNum).getReg()); + printShifter(MI, OpNum + 1, O); +} + +void AArch64InstPrinter::printExtendedRegister(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + O << getRegisterName(MI->getOperand(OpNum).getReg()); + printArithExtend(MI, OpNum + 1, O); +} + +void AArch64InstPrinter::printArithExtend(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + unsigned Val = MI->getOperand(OpNum).getImm(); + AArch64_AM::ShiftExtendType ExtType = AArch64_AM::getArithExtendType(Val); + unsigned ShiftVal = AArch64_AM::getArithShiftValue(Val); + + // If the destination or first source register operand is [W]SP, print + // UXTW/UXTX as LSL, and if the shift amount is also zero, print nothing at + // all. + if (ExtType == AArch64_AM::UXTW || ExtType == AArch64_AM::UXTX) { + unsigned Dest = MI->getOperand(0).getReg(); + unsigned Src1 = MI->getOperand(1).getReg(); + if ( ((Dest == AArch64::SP || Src1 == AArch64::SP) && + ExtType == AArch64_AM::UXTX) || + ((Dest == AArch64::WSP || Src1 == AArch64::WSP) && + ExtType == AArch64_AM::UXTW) ) { + if (ShiftVal != 0) + O << ", lsl #" << ShiftVal; + return; + } + } + O << ", " << AArch64_AM::getShiftExtendName(ExtType); + if (ShiftVal != 0) + O << " #" << ShiftVal; +} + +void AArch64InstPrinter::printMemExtend(const MCInst *MI, unsigned OpNum, + raw_ostream &O, char SrcRegKind, + unsigned Width) { + unsigned SignExtend = MI->getOperand(OpNum).getImm(); + unsigned DoShift = MI->getOperand(OpNum + 1).getImm(); + + // sxtw, sxtx, uxtw or lsl (== uxtx) + bool IsLSL = !SignExtend && SrcRegKind == 'x'; + if (IsLSL) + O << "lsl"; + else + O << (SignExtend ? 
's' : 'u') << "xt" << SrcRegKind; + + if (DoShift || IsLSL) + O << " #" << Log2_32(Width / 8); +} + +void AArch64InstPrinter::printCondCode(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + AArch64CC::CondCode CC = (AArch64CC::CondCode)MI->getOperand(OpNum).getImm(); + O << AArch64CC::getCondCodeName(CC); +} + +void AArch64InstPrinter::printInverseCondCode(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + AArch64CC::CondCode CC = (AArch64CC::CondCode)MI->getOperand(OpNum).getImm(); + O << AArch64CC::getCondCodeName(AArch64CC::getInvertedCondCode(CC)); +} + +void AArch64InstPrinter::printAMNoIndex(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + O << '[' << getRegisterName(MI->getOperand(OpNum).getReg()) << ']'; +} + +template +void AArch64InstPrinter::printImmScale(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + O << '#' << Scale * MI->getOperand(OpNum).getImm(); +} + +void AArch64InstPrinter::printUImm12Offset(const MCInst *MI, unsigned OpNum, + unsigned Scale, raw_ostream &O) { + const MCOperand MO = MI->getOperand(OpNum); + if (MO.isImm()) { + O << "#" << (MO.getImm() * Scale); + } else { + assert(MO.isExpr() && "Unexpected operand type!"); + O << *MO.getExpr(); + } +} + +void AArch64InstPrinter::printAMIndexedWB(const MCInst *MI, unsigned OpNum, + unsigned Scale, raw_ostream &O) { + const MCOperand MO1 = MI->getOperand(OpNum + 1); + O << '[' << getRegisterName(MI->getOperand(OpNum).getReg()); + if (MO1.isImm()) { + O << ", #" << (MO1.getImm() * Scale); + } else { + assert(MO1.isExpr() && "Unexpected operand type!"); + O << ", " << *MO1.getExpr(); + } + O << ']'; +} + +void AArch64InstPrinter::printPrefetchOp(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + unsigned prfop = MI->getOperand(OpNum).getImm(); + bool Valid; + StringRef Name = AArch64PRFM::PRFMMapper().toString(prfop, Valid); + if (Valid) + O << Name; + else + O << '#' << prfop; +} + +void AArch64InstPrinter::printFPImmOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + const MCOperand &MO = MI->getOperand(OpNum); + float FPImm = + MO.isFPImm() ? MO.getFPImm() : AArch64_AM::getFPImmFloat(MO.getImm()); + + // 8 decimal places are enough to perfectly represent permitted floats. 
+ O << format("#%.8f", FPImm); +} + +static unsigned getNextVectorRegister(unsigned Reg, unsigned Stride = 1) { + while (Stride--) { + switch (Reg) { + default: + assert(0 && "Vector register expected!"); + case AArch64::Q0: Reg = AArch64::Q1; break; + case AArch64::Q1: Reg = AArch64::Q2; break; + case AArch64::Q2: Reg = AArch64::Q3; break; + case AArch64::Q3: Reg = AArch64::Q4; break; + case AArch64::Q4: Reg = AArch64::Q5; break; + case AArch64::Q5: Reg = AArch64::Q6; break; + case AArch64::Q6: Reg = AArch64::Q7; break; + case AArch64::Q7: Reg = AArch64::Q8; break; + case AArch64::Q8: Reg = AArch64::Q9; break; + case AArch64::Q9: Reg = AArch64::Q10; break; + case AArch64::Q10: Reg = AArch64::Q11; break; + case AArch64::Q11: Reg = AArch64::Q12; break; + case AArch64::Q12: Reg = AArch64::Q13; break; + case AArch64::Q13: Reg = AArch64::Q14; break; + case AArch64::Q14: Reg = AArch64::Q15; break; + case AArch64::Q15: Reg = AArch64::Q16; break; + case AArch64::Q16: Reg = AArch64::Q17; break; + case AArch64::Q17: Reg = AArch64::Q18; break; + case AArch64::Q18: Reg = AArch64::Q19; break; + case AArch64::Q19: Reg = AArch64::Q20; break; + case AArch64::Q20: Reg = AArch64::Q21; break; + case AArch64::Q21: Reg = AArch64::Q22; break; + case AArch64::Q22: Reg = AArch64::Q23; break; + case AArch64::Q23: Reg = AArch64::Q24; break; + case AArch64::Q24: Reg = AArch64::Q25; break; + case AArch64::Q25: Reg = AArch64::Q26; break; + case AArch64::Q26: Reg = AArch64::Q27; break; + case AArch64::Q27: Reg = AArch64::Q28; break; + case AArch64::Q28: Reg = AArch64::Q29; break; + case AArch64::Q29: Reg = AArch64::Q30; break; + case AArch64::Q30: Reg = AArch64::Q31; break; + // Vector lists can wrap around. + case AArch64::Q31: + Reg = AArch64::Q0; + break; + } + } + return Reg; +} + +void AArch64InstPrinter::printVectorList(const MCInst *MI, unsigned OpNum, + raw_ostream &O, + StringRef LayoutSuffix) { + unsigned Reg = MI->getOperand(OpNum).getReg(); + + O << "{ "; + + // Work out how many registers there are in the list (if there is an actual + // list). + unsigned NumRegs = 1; + if (MRI.getRegClass(AArch64::DDRegClassID).contains(Reg) || + MRI.getRegClass(AArch64::QQRegClassID).contains(Reg)) + NumRegs = 2; + else if (MRI.getRegClass(AArch64::DDDRegClassID).contains(Reg) || + MRI.getRegClass(AArch64::QQQRegClassID).contains(Reg)) + NumRegs = 3; + else if (MRI.getRegClass(AArch64::DDDDRegClassID).contains(Reg) || + MRI.getRegClass(AArch64::QQQQRegClassID).contains(Reg)) + NumRegs = 4; + + // Now forget about the list and find out what the first register is. + if (unsigned FirstReg = MRI.getSubReg(Reg, AArch64::dsub0)) + Reg = FirstReg; + else if (unsigned FirstReg = MRI.getSubReg(Reg, AArch64::qsub0)) + Reg = FirstReg; + + // If it's a D-reg, we need to promote it to the equivalent Q-reg before + // printing (otherwise getRegisterName fails). 
+  if (MRI.getRegClass(AArch64::FPR64RegClassID).contains(Reg)) {
+    const MCRegisterClass &FPR128RC =
+        MRI.getRegClass(AArch64::FPR128RegClassID);
+    Reg = MRI.getMatchingSuperReg(Reg, AArch64::dsub, &FPR128RC);
+  }
+
+  for (unsigned i = 0; i < NumRegs; ++i, Reg = getNextVectorRegister(Reg)) {
+    O << getRegisterName(Reg, AArch64::vreg) << LayoutSuffix;
+    if (i + 1 != NumRegs)
+      O << ", ";
+  }
+
+  O << " }";
+}
+
+void AArch64InstPrinter::printImplicitlyTypedVectorList(const MCInst *MI,
+                                                        unsigned OpNum,
+                                                        raw_ostream &O) {
+  printVectorList(MI, OpNum, O, "");
+}
+
+template <unsigned NumLanes, char LaneKind>
+void AArch64InstPrinter::printTypedVectorList(const MCInst *MI, unsigned OpNum,
+                                              raw_ostream &O) {
+  std::string Suffix(".");
+  if (NumLanes)
+    Suffix += itostr(NumLanes) + LaneKind;
+  else
+    Suffix += LaneKind;
+
+  printVectorList(MI, OpNum, O, Suffix);
+}
+
+void AArch64InstPrinter::printVectorIndex(const MCInst *MI, unsigned OpNum,
+                                          raw_ostream &O) {
+  O << "[" << MI->getOperand(OpNum).getImm() << "]";
+}
+
+void AArch64InstPrinter::printAlignedLabel(const MCInst *MI, unsigned OpNum,
+                                           raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNum);
+
+  // If the label has already been resolved to an immediate offset (say, when
+  // we're running the disassembler), just print the immediate.
+  if (Op.isImm()) {
+    O << "#" << (Op.getImm() << 2);
+    return;
+  }
+
+  // If the branch target is simply an address then print it in hex.
+  const MCConstantExpr *BranchTarget =
+      dyn_cast<MCConstantExpr>(MI->getOperand(OpNum).getExpr());
+  int64_t Address;
+  if (BranchTarget && BranchTarget->EvaluateAsAbsolute(Address)) {
+    O << "0x";
+    O.write_hex(Address);
+  } else {
+    // Otherwise, just print the expression.
+    O << *MI->getOperand(OpNum).getExpr();
+  }
+}
+
+void AArch64InstPrinter::printAdrpLabel(const MCInst *MI, unsigned OpNum,
+                                        raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNum);
+
+  // If the label has already been resolved to an immediate offset (say, when
+  // we're running the disassembler), just print the immediate.
+  if (Op.isImm()) {
+    O << "#" << (Op.getImm() << 12);
+    return;
+  }
+
+  // Otherwise, just print the expression.
+ O << *MI->getOperand(OpNum).getExpr(); +} + +void AArch64InstPrinter::printBarrierOption(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned Val = MI->getOperand(OpNo).getImm(); + unsigned Opcode = MI->getOpcode(); + + bool Valid; + StringRef Name; + if (Opcode == AArch64::ISB) + Name = AArch64ISB::ISBMapper().toString(Val, Valid); + else + Name = AArch64DB::DBarrierMapper().toString(Val, Valid); + if (Valid) + O << Name; + else + O << "#" << Val; +} + +void AArch64InstPrinter::printMRSSystemRegister(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned Val = MI->getOperand(OpNo).getImm(); + + bool Valid; + auto Mapper = AArch64SysReg::MRSMapper(getAvailableFeatures()); + std::string Name = Mapper.toString(Val, Valid); + + if (Valid) + O << StringRef(Name).upper(); +} + +void AArch64InstPrinter::printMSRSystemRegister(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned Val = MI->getOperand(OpNo).getImm(); + + bool Valid; + auto Mapper = AArch64SysReg::MSRMapper(getAvailableFeatures()); + std::string Name = Mapper.toString(Val, Valid); + + if (Valid) + O << StringRef(Name).upper(); +} + +void AArch64InstPrinter::printSystemPStateField(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned Val = MI->getOperand(OpNo).getImm(); + + bool Valid; + StringRef Name = AArch64PState::PStateMapper().toString(Val, Valid); + if (Valid) + O << StringRef(Name.str()).upper(); + else + O << "#" << Val; +} + +void AArch64InstPrinter::printSIMDType10Operand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned RawVal = MI->getOperand(OpNo).getImm(); + uint64_t Val = AArch64_AM::decodeAdvSIMDModImmType10(RawVal); + O << format("#%#016llx", Val); +} diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h new file mode 100644 index 00000000000..fe7666e5cad --- /dev/null +++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h @@ -0,0 +1,140 @@ +//===-- AArch64InstPrinter.h - Convert AArch64 MCInst to assembly syntax --===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This class prints an AArch64 MCInst to a .s file. +// +//===----------------------------------------------------------------------===// + +#ifndef AArch64INSTPRINTER_H +#define AArch64INSTPRINTER_H + +#include "MCTargetDesc/AArch64MCTargetDesc.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/MC/MCInstPrinter.h" +#include "llvm/MC/MCSubtargetInfo.h" + +namespace llvm { + +class MCOperand; + +class AArch64InstPrinter : public MCInstPrinter { +public: + AArch64InstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, + const MCRegisterInfo &MRI, const MCSubtargetInfo &STI); + + void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot) override; + void printRegName(raw_ostream &OS, unsigned RegNo) const override; + + // Autogenerated by tblgen. 
+  virtual void printInstruction(const MCInst *MI, raw_ostream &O);
+  virtual bool printAliasInstr(const MCInst *MI, raw_ostream &O);
+  virtual void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
+                                       unsigned PrintMethodIdx, raw_ostream &O);
+  virtual StringRef getRegName(unsigned RegNo) const {
+    return getRegisterName(RegNo);
+  }
+  static const char *getRegisterName(unsigned RegNo,
+                                     unsigned AltIdx = AArch64::NoRegAltName);
+
+protected:
+  bool printSysAlias(const MCInst *MI, raw_ostream &O);
+  // Operand printers
+  void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printHexImm(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printPostIncOperand(const MCInst *MI, unsigned OpNo, unsigned Imm,
+                           raw_ostream &O);
+  template <int Amount>
+  void printPostIncOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printPostIncOperand(MI, OpNo, Amount, O);
+  }
+
+  void printVRegOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printSysCROperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printAddSubImm(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printLogicalImm32(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printLogicalImm64(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printShifter(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printShiftedRegister(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printExtendedRegister(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printArithExtend(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+
+  void printMemExtend(const MCInst *MI, unsigned OpNum, raw_ostream &O,
+                      char SrcRegKind, unsigned Width);
+  template <char SrcRegKind, unsigned Width>
+  void printMemExtend(const MCInst *MI, unsigned OpNum, raw_ostream &O) {
+    printMemExtend(MI, OpNum, O, SrcRegKind, Width);
+  }
+
+  void printCondCode(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printInverseCondCode(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printAlignedLabel(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printUImm12Offset(const MCInst *MI, unsigned OpNum, unsigned Scale,
+                         raw_ostream &O);
+  void printAMIndexedWB(const MCInst *MI, unsigned OpNum, unsigned Scale,
+                        raw_ostream &O);
+
+  template <int Scale>
+  void printUImm12Offset(const MCInst *MI, unsigned OpNum, raw_ostream &O) {
+    printUImm12Offset(MI, OpNum, Scale, O);
+  }
+
+  template <int BitWidth>
+  void printAMIndexedWB(const MCInst *MI, unsigned OpNum, raw_ostream &O) {
+    printAMIndexedWB(MI, OpNum, BitWidth / 8, O);
+  }
+
+  void printAMNoIndex(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+
+  template <int Scale>
+  void printImmScale(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+
+  void printPrefetchOp(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+
+  void printFPImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+
+  void printVectorList(const MCInst *MI, unsigned OpNum, raw_ostream &O,
+                       StringRef LayoutSuffix);
+
+  /// Print a list of vector registers where the type suffix is implicit
+  /// (i.e. attached to the instruction rather than the registers).
+ void printImplicitlyTypedVectorList(const MCInst *MI, unsigned OpNum, + raw_ostream &O); + + template + void printTypedVectorList(const MCInst *MI, unsigned OpNum, raw_ostream &O); + + void printVectorIndex(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printAdrpLabel(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printBarrierOption(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printMSRSystemRegister(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printMRSSystemRegister(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printSystemPStateField(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printSIMDType10Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O); +}; + +class AArch64AppleInstPrinter : public AArch64InstPrinter { +public: + AArch64AppleInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, + const MCRegisterInfo &MRI, const MCSubtargetInfo &STI); + + void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot) override; + + void printInstruction(const MCInst *MI, raw_ostream &O) override; + bool printAliasInstr(const MCInst *MI, raw_ostream &O) override; + virtual void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx, + unsigned PrintMethodIdx, raw_ostream &O); + StringRef getRegName(unsigned RegNo) const override { + return getRegisterName(RegNo); + } + static const char *getRegisterName(unsigned RegNo, + unsigned AltIdx = AArch64::NoRegAltName); +}; +} + +#endif diff --git a/lib/Target/AArch64/InstPrinter/CMakeLists.txt b/lib/Target/AArch64/InstPrinter/CMakeLists.txt new file mode 100644 index 00000000000..363f50258d7 --- /dev/null +++ b/lib/Target/AArch64/InstPrinter/CMakeLists.txt @@ -0,0 +1,7 @@ +include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. ) + +add_llvm_library(LLVMAArch64AsmPrinter + AArch64InstPrinter.cpp + ) + +add_dependencies(LLVMAArch64AsmPrinter AArch64CommonTableGen) diff --git a/lib/Target/AArch64/InstPrinter/LLVMBuild.txt b/lib/Target/AArch64/InstPrinter/LLVMBuild.txt new file mode 100644 index 00000000000..a13e842cdd3 --- /dev/null +++ b/lib/Target/AArch64/InstPrinter/LLVMBuild.txt @@ -0,0 +1,24 @@ +;===- ./lib/Target/AArch64/InstPrinter/LLVMBuild.txt -------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = AArch64AsmPrinter +parent = AArch64 +required_libraries = AArch64Utils MC Support +add_to_library_groups = AArch64 + diff --git a/lib/Target/AArch64/InstPrinter/Makefile b/lib/Target/AArch64/InstPrinter/Makefile new file mode 100644 index 00000000000..b17e8d08011 --- /dev/null +++ b/lib/Target/AArch64/InstPrinter/Makefile @@ -0,0 +1,15 @@ +##===- lib/Target/AArch64/AsmPrinter/Makefile --------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## +LEVEL = ../../../.. 
+LIBRARYNAME = LLVMAArch64AsmPrinter + +# Hack: we need to include 'main' arm target directory to grab private headers +CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. + +include $(LEVEL)/Makefile.common diff --git a/lib/Target/AArch64/LLVMBuild.txt b/lib/Target/AArch64/LLVMBuild.txt new file mode 100644 index 00000000000..642c18394a6 --- /dev/null +++ b/lib/Target/AArch64/LLVMBuild.txt @@ -0,0 +1,35 @@ +;===- ./lib/Target/AArch64/LLVMBuild.txt -------------------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[common] +subdirectories = AsmParser Disassembler InstPrinter MCTargetDesc TargetInfo Utils + +[component_0] +type = TargetGroup +name = AArch64 +parent = Target +has_asmparser = 1 +has_asmprinter = 1 +has_disassembler = 1 +has_jit = 1 + +[component_1] +type = Library +name = AArch64CodeGen +parent = AArch64 +required_libraries = AArch64AsmPrinter AArch64Desc AArch64Info AArch64Utils Analysis AsmPrinter CodeGen Core MC Scalar SelectionDAG Support Target +add_to_library_groups = AArch64 diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h b/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h new file mode 100644 index 00000000000..8b1e44e26e9 --- /dev/null +++ b/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h @@ -0,0 +1,738 @@ +//===- AArch64AddressingModes.h - AArch64 Addressing Modes ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the AArch64 addressing mode implementation stuff. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TARGET_AArch64_AArch64ADDRESSINGMODES_H +#define LLVM_TARGET_AArch64_AArch64ADDRESSINGMODES_H + +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/APInt.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" +#include + +namespace llvm { + +/// AArch64_AM - AArch64 Addressing Mode Stuff +namespace AArch64_AM { + +//===----------------------------------------------------------------------===// +// Shifts +// + +enum ShiftExtendType { + InvalidShiftExtend = -1, + LSL = 0, + LSR, + ASR, + ROR, + MSL, + + UXTB, + UXTH, + UXTW, + UXTX, + + SXTB, + SXTH, + SXTW, + SXTX, +}; + +/// getShiftName - Get the string encoding for the shift type. 
+static inline const char *getShiftExtendName(AArch64_AM::ShiftExtendType ST) { + switch (ST) { + default: assert(false && "unhandled shift type!"); + case AArch64_AM::LSL: return "lsl"; + case AArch64_AM::LSR: return "lsr"; + case AArch64_AM::ASR: return "asr"; + case AArch64_AM::ROR: return "ror"; + case AArch64_AM::MSL: return "msl"; + case AArch64_AM::UXTB: return "uxtb"; + case AArch64_AM::UXTH: return "uxth"; + case AArch64_AM::UXTW: return "uxtw"; + case AArch64_AM::UXTX: return "uxtx"; + case AArch64_AM::SXTB: return "sxtb"; + case AArch64_AM::SXTH: return "sxth"; + case AArch64_AM::SXTW: return "sxtw"; + case AArch64_AM::SXTX: return "sxtx"; + } + return nullptr; +} + +/// getShiftType - Extract the shift type. +static inline AArch64_AM::ShiftExtendType getShiftType(unsigned Imm) { + switch ((Imm >> 6) & 0x7) { + default: return AArch64_AM::InvalidShiftExtend; + case 0: return AArch64_AM::LSL; + case 1: return AArch64_AM::LSR; + case 2: return AArch64_AM::ASR; + case 3: return AArch64_AM::ROR; + case 4: return AArch64_AM::MSL; + } +} + +/// getShiftValue - Extract the shift value. +static inline unsigned getShiftValue(unsigned Imm) { + return Imm & 0x3f; +} + +/// getShifterImm - Encode the shift type and amount: +/// imm: 6-bit shift amount +/// shifter: 000 ==> lsl +/// 001 ==> lsr +/// 010 ==> asr +/// 011 ==> ror +/// 100 ==> msl +/// {8-6} = shifter +/// {5-0} = imm +static inline unsigned getShifterImm(AArch64_AM::ShiftExtendType ST, + unsigned Imm) { + assert((Imm & 0x3f) == Imm && "Illegal shifted immedate value!"); + unsigned STEnc = 0; + switch (ST) { + default: llvm_unreachable("Invalid shift requested"); + case AArch64_AM::LSL: STEnc = 0; break; + case AArch64_AM::LSR: STEnc = 1; break; + case AArch64_AM::ASR: STEnc = 2; break; + case AArch64_AM::ROR: STEnc = 3; break; + case AArch64_AM::MSL: STEnc = 4; break; + } + return (STEnc << 6) | (Imm & 0x3f); +} + +//===----------------------------------------------------------------------===// +// Extends +// + +/// getArithShiftValue - get the arithmetic shift value. +static inline unsigned getArithShiftValue(unsigned Imm) { + return Imm & 0x7; +} + +/// getExtendType - Extract the extend type for operands of arithmetic ops. 
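// --- Illustrative aside, not part of this patch -----------------------------
// A quick sanity check of the shift helpers defined above (hypothetical test
// code, assuming only the functions in this header):
//
//   unsigned Enc = AArch64_AM::getShifterImm(AArch64_AM::ROR, 3);
//   // Enc == (3 << 6) | 3 == 0xC3
//   assert(AArch64_AM::getShiftType(Enc) == AArch64_AM::ROR);
//   assert(AArch64_AM::getShiftValue(Enc) == 3);
// ----------------------------------------------------------------------------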
+static inline AArch64_AM::ShiftExtendType getExtendType(unsigned Imm) { + assert((Imm & 0x7) == Imm && "invalid immediate!"); + switch (Imm) { + default: llvm_unreachable("Compiler bug!"); + case 0: return AArch64_AM::UXTB; + case 1: return AArch64_AM::UXTH; + case 2: return AArch64_AM::UXTW; + case 3: return AArch64_AM::UXTX; + case 4: return AArch64_AM::SXTB; + case 5: return AArch64_AM::SXTH; + case 6: return AArch64_AM::SXTW; + case 7: return AArch64_AM::SXTX; + } +} + +static inline AArch64_AM::ShiftExtendType getArithExtendType(unsigned Imm) { + return getExtendType((Imm >> 3) & 0x7); +} + +/// Mapping from extend bits to required operation: +/// shifter: 000 ==> uxtb +/// 001 ==> uxth +/// 010 ==> uxtw +/// 011 ==> uxtx +/// 100 ==> sxtb +/// 101 ==> sxth +/// 110 ==> sxtw +/// 111 ==> sxtx +inline unsigned getExtendEncoding(AArch64_AM::ShiftExtendType ET) { + switch (ET) { + default: llvm_unreachable("Invalid extend type requested"); + case AArch64_AM::UXTB: return 0; break; + case AArch64_AM::UXTH: return 1; break; + case AArch64_AM::UXTW: return 2; break; + case AArch64_AM::UXTX: return 3; break; + case AArch64_AM::SXTB: return 4; break; + case AArch64_AM::SXTH: return 5; break; + case AArch64_AM::SXTW: return 6; break; + case AArch64_AM::SXTX: return 7; break; + } +} + +/// getArithExtendImm - Encode the extend type and shift amount for an +/// arithmetic instruction: +/// imm: 3-bit extend amount +/// {5-3} = shifter +/// {2-0} = imm3 +static inline unsigned getArithExtendImm(AArch64_AM::ShiftExtendType ET, + unsigned Imm) { + assert((Imm & 0x7) == Imm && "Illegal shifted immedate value!"); + return (getExtendEncoding(ET) << 3) | (Imm & 0x7); +} + +/// getMemDoShift - Extract the "do shift" flag value for load/store +/// instructions. +static inline bool getMemDoShift(unsigned Imm) { + return (Imm & 0x1) != 0; +} + +/// getExtendType - Extract the extend type for the offset operand of +/// loads/stores. +static inline AArch64_AM::ShiftExtendType getMemExtendType(unsigned Imm) { + return getExtendType((Imm >> 1) & 0x7); +} + +/// getExtendImm - Encode the extend type and amount for a load/store inst: +/// doshift: should the offset be scaled by the access size +/// shifter: 000 ==> uxtb +/// 001 ==> uxth +/// 010 ==> uxtw +/// 011 ==> uxtx +/// 100 ==> sxtb +/// 101 ==> sxth +/// 110 ==> sxtw +/// 111 ==> sxtx +/// {3-1} = shifter +/// {0} = doshift +static inline unsigned getMemExtendImm(AArch64_AM::ShiftExtendType ET, + bool DoShift) { + return (getExtendEncoding(ET) << 1) | unsigned(DoShift); +} + +static inline uint64_t ror(uint64_t elt, unsigned size) { + return ((elt & 1) << (size-1)) | (elt >> 1); +} + +/// processLogicalImmediate - Determine if an immediate value can be encoded +/// as the immediate operand of a logical instruction for the given register +/// size. If so, return true with "encoding" set to the encoded value in +/// the form N:immr:imms. +static inline bool processLogicalImmediate(uint64_t imm, unsigned regSize, + uint64_t &encoding) { + if (imm == 0ULL || imm == ~0ULL || + (regSize != 64 && (imm >> regSize != 0 || imm == ~0U))) + return false; + + unsigned size = 2; + uint64_t eltVal = imm; + + // First, determine the element size. 
+ while (size < regSize) { + unsigned numElts = regSize / size; + unsigned mask = (1ULL << size) - 1; + uint64_t lowestEltVal = imm & mask; + + bool allMatched = true; + for (unsigned i = 1; i < numElts; ++i) { + uint64_t currEltVal = (imm >> (i*size)) & mask; + if (currEltVal != lowestEltVal) { + allMatched = false; + break; + } + } + + if (allMatched) { + eltVal = lowestEltVal; + break; + } + + size *= 2; + } + + // Second, determine the rotation to make the element be: 0^m 1^n. + for (unsigned i = 0; i < size; ++i) { + eltVal = ror(eltVal, size); + uint32_t clz = countLeadingZeros(eltVal) - (64 - size); + uint32_t cto = CountTrailingOnes_64(eltVal); + + if (clz + cto == size) { + // Encode in immr the number of RORs it would take to get *from* this + // element value to our target value, where i+1 is the number of RORs + // to go the opposite direction. + unsigned immr = size - (i + 1); + + // If size has a 1 in the n'th bit, create a value that has zeroes in + // bits [0, n] and ones above that. + uint64_t nimms = ~(size-1) << 1; + + // Or the CTO value into the low bits, which must be below the Nth bit + // bit mentioned above. + nimms |= (cto-1); + + // Extract the seventh bit and toggle it to create the N field. + unsigned N = ((nimms >> 6) & 1) ^ 1; + + encoding = (N << 12) | (immr << 6) | (nimms & 0x3f); + return true; + } + } + + return false; +} + +/// isLogicalImmediate - Return true if the immediate is valid for a logical +/// immediate instruction of the given register size. Return false otherwise. +static inline bool isLogicalImmediate(uint64_t imm, unsigned regSize) { + uint64_t encoding; + return processLogicalImmediate(imm, regSize, encoding); +} + +/// encodeLogicalImmediate - Return the encoded immediate value for a logical +/// immediate instruction of the given register size. +static inline uint64_t encodeLogicalImmediate(uint64_t imm, unsigned regSize) { + uint64_t encoding = 0; + bool res = processLogicalImmediate(imm, regSize, encoding); + assert(res && "invalid logical immediate"); + (void)res; + return encoding; +} + +/// decodeLogicalImmediate - Decode a logical immediate value in the form +/// "N:immr:imms" (where the immr and imms fields are each 6 bits) into the +/// integer value it represents with regSize bits. +static inline uint64_t decodeLogicalImmediate(uint64_t val, unsigned regSize) { + // Extract the N, imms, and immr fields. + unsigned N = (val >> 12) & 1; + unsigned immr = (val >> 6) & 0x3f; + unsigned imms = val & 0x3f; + + assert((regSize == 64 || N == 0) && "undefined logical immediate encoding"); + int len = 31 - countLeadingZeros((N << 6) | (~imms & 0x3f)); + assert(len >= 0 && "undefined logical immediate encoding"); + unsigned size = (1 << len); + unsigned R = immr & (size - 1); + unsigned S = imms & (size - 1); + assert(S != size - 1 && "undefined logical immediate encoding"); + uint64_t pattern = (1ULL << (S + 1)) - 1; + for (unsigned i = 0; i < R; ++i) + pattern = ror(pattern, size); + + // Replicate the pattern to fill the regSize. + while (size != regSize) { + pattern |= (pattern << size); + size *= 2; + } + return pattern; +} + +/// isValidDecodeLogicalImmediate - Check to see if the logical immediate value +/// in the form "N:immr:imms" (where the immr and imms fields are each 6 bits) +/// is a valid encoding for an integer value with regSize bits. +static inline bool isValidDecodeLogicalImmediate(uint64_t val, + unsigned regSize) { + // Extract the N and imms fields needed for checking. 
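// --- Illustrative aside, not part of this patch -----------------------------
// Worked example for the logical-immediate helpers above: the repeating
// 16-bit pattern 0x00FF00FF00FF00FF (eight ones per element, no rotation)
// encodes as N:immr:imms = 0:000000:100111:
//
//   uint64_t Enc =
//       AArch64_AM::encodeLogicalImmediate(0x00FF00FF00FF00FFULL, 64);
//   // Enc == 0x027
//   assert(AArch64_AM::decodeLogicalImmediate(Enc, 64) ==
//          0x00FF00FF00FF00FFULL);
// ----------------------------------------------------------------------------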
+ unsigned N = (val >> 12) & 1; + unsigned imms = val & 0x3f; + + if (regSize == 32 && N != 0) // undefined logical immediate encoding + return false; + int len = 31 - countLeadingZeros((N << 6) | (~imms & 0x3f)); + if (len < 0) // undefined logical immediate encoding + return false; + unsigned size = (1 << len); + unsigned S = imms & (size - 1); + if (S == size - 1) // undefined logical immediate encoding + return false; + + return true; +} + +//===----------------------------------------------------------------------===// +// Floating-point Immediates +// +static inline float getFPImmFloat(unsigned Imm) { + // We expect an 8-bit binary encoding of a floating-point number here. + union { + uint32_t I; + float F; + } FPUnion; + + uint8_t Sign = (Imm >> 7) & 0x1; + uint8_t Exp = (Imm >> 4) & 0x7; + uint8_t Mantissa = Imm & 0xf; + + // 8-bit FP iEEEE Float Encoding + // abcd efgh aBbbbbbc defgh000 00000000 00000000 + // + // where B = NOT(b); + + FPUnion.I = 0; + FPUnion.I |= Sign << 31; + FPUnion.I |= ((Exp & 0x4) != 0 ? 0 : 1) << 30; + FPUnion.I |= ((Exp & 0x4) != 0 ? 0x1f : 0) << 25; + FPUnion.I |= (Exp & 0x3) << 23; + FPUnion.I |= Mantissa << 19; + return FPUnion.F; +} + +/// getFP32Imm - Return an 8-bit floating-point version of the 32-bit +/// floating-point value. If the value cannot be represented as an 8-bit +/// floating-point value, then return -1. +static inline int getFP32Imm(const APInt &Imm) { + uint32_t Sign = Imm.lshr(31).getZExtValue() & 1; + int32_t Exp = (Imm.lshr(23).getSExtValue() & 0xff) - 127; // -126 to 127 + int64_t Mantissa = Imm.getZExtValue() & 0x7fffff; // 23 bits + + // We can handle 4 bits of mantissa. + // mantissa = (16+UInt(e:f:g:h))/16. + if (Mantissa & 0x7ffff) + return -1; + Mantissa >>= 19; + if ((Mantissa & 0xf) != Mantissa) + return -1; + + // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3 + if (Exp < -3 || Exp > 4) + return -1; + Exp = ((Exp+3) & 0x7) ^ 4; + + return ((int)Sign << 7) | (Exp << 4) | Mantissa; +} + +static inline int getFP32Imm(const APFloat &FPImm) { + return getFP32Imm(FPImm.bitcastToAPInt()); +} + +/// getFP64Imm - Return an 8-bit floating-point version of the 64-bit +/// floating-point value. If the value cannot be represented as an 8-bit +/// floating-point value, then return -1. +static inline int getFP64Imm(const APInt &Imm) { + uint64_t Sign = Imm.lshr(63).getZExtValue() & 1; + int64_t Exp = (Imm.lshr(52).getSExtValue() & 0x7ff) - 1023; // -1022 to 1023 + uint64_t Mantissa = Imm.getZExtValue() & 0xfffffffffffffULL; + + // We can handle 4 bits of mantissa. + // mantissa = (16+UInt(e:f:g:h))/16. 
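// --- Illustrative aside, not part of this patch -----------------------------
// Same scheme as the 32-bit case above: for instance, 2.0f has sign 0,
// exponent +1 and an all-zero mantissa, so getFP32Imm() returns 0x00, while
// -1.0f maps to 0xF0; getFPImmFloat() reverses both mappings.
// ----------------------------------------------------------------------------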
+ if (Mantissa & 0xffffffffffffULL) + return -1; + Mantissa >>= 48; + if ((Mantissa & 0xf) != Mantissa) + return -1; + + // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3 + if (Exp < -3 || Exp > 4) + return -1; + Exp = ((Exp+3) & 0x7) ^ 4; + + return ((int)Sign << 7) | (Exp << 4) | Mantissa; +} + +static inline int getFP64Imm(const APFloat &FPImm) { + return getFP64Imm(FPImm.bitcastToAPInt()); +} + +//===--------------------------------------------------------------------===// +// AdvSIMD Modified Immediates +//===--------------------------------------------------------------------===// + +// 0x00 0x00 0x00 abcdefgh 0x00 0x00 0x00 abcdefgh +static inline bool isAdvSIMDModImmType1(uint64_t Imm) { + return ((Imm >> 32) == (Imm & 0xffffffffULL)) && + ((Imm & 0xffffff00ffffff00ULL) == 0); +} + +static inline uint8_t encodeAdvSIMDModImmType1(uint64_t Imm) { + return (Imm & 0xffULL); +} + +static inline uint64_t decodeAdvSIMDModImmType1(uint8_t Imm) { + uint64_t EncVal = Imm; + return (EncVal << 32) | EncVal; +} + +// 0x00 0x00 abcdefgh 0x00 0x00 0x00 abcdefgh 0x00 +static inline bool isAdvSIMDModImmType2(uint64_t Imm) { + return ((Imm >> 32) == (Imm & 0xffffffffULL)) && + ((Imm & 0xffff00ffffff00ffULL) == 0); +} + +static inline uint8_t encodeAdvSIMDModImmType2(uint64_t Imm) { + return (Imm & 0xff00ULL) >> 8; +} + +static inline uint64_t decodeAdvSIMDModImmType2(uint8_t Imm) { + uint64_t EncVal = Imm; + return (EncVal << 40) | (EncVal << 8); +} + +// 0x00 abcdefgh 0x00 0x00 0x00 abcdefgh 0x00 0x00 +static inline bool isAdvSIMDModImmType3(uint64_t Imm) { + return ((Imm >> 32) == (Imm & 0xffffffffULL)) && + ((Imm & 0xff00ffffff00ffffULL) == 0); +} + +static inline uint8_t encodeAdvSIMDModImmType3(uint64_t Imm) { + return (Imm & 0xff0000ULL) >> 16; +} + +static inline uint64_t decodeAdvSIMDModImmType3(uint8_t Imm) { + uint64_t EncVal = Imm; + return (EncVal << 48) | (EncVal << 16); +} + +// abcdefgh 0x00 0x00 0x00 abcdefgh 0x00 0x00 0x00 +static inline bool isAdvSIMDModImmType4(uint64_t Imm) { + return ((Imm >> 32) == (Imm & 0xffffffffULL)) && + ((Imm & 0x00ffffff00ffffffULL) == 0); +} + +static inline uint8_t encodeAdvSIMDModImmType4(uint64_t Imm) { + return (Imm & 0xff000000ULL) >> 24; +} + +static inline uint64_t decodeAdvSIMDModImmType4(uint8_t Imm) { + uint64_t EncVal = Imm; + return (EncVal << 56) | (EncVal << 24); +} + +// 0x00 abcdefgh 0x00 abcdefgh 0x00 abcdefgh 0x00 abcdefgh +static inline bool isAdvSIMDModImmType5(uint64_t Imm) { + return ((Imm >> 32) == (Imm & 0xffffffffULL)) && + (((Imm & 0x00ff0000ULL) >> 16) == (Imm & 0x000000ffULL)) && + ((Imm & 0xff00ff00ff00ff00ULL) == 0); +} + +static inline uint8_t encodeAdvSIMDModImmType5(uint64_t Imm) { + return (Imm & 0xffULL); +} + +static inline uint64_t decodeAdvSIMDModImmType5(uint8_t Imm) { + uint64_t EncVal = Imm; + return (EncVal << 48) | (EncVal << 32) | (EncVal << 16) | EncVal; +} + +// abcdefgh 0x00 abcdefgh 0x00 abcdefgh 0x00 abcdefgh 0x00 +static inline bool isAdvSIMDModImmType6(uint64_t Imm) { + return ((Imm >> 32) == (Imm & 0xffffffffULL)) && + (((Imm & 0xff000000ULL) >> 16) == (Imm & 0x0000ff00ULL)) && + ((Imm & 0x00ff00ff00ff00ffULL) == 0); +} + +static inline uint8_t encodeAdvSIMDModImmType6(uint64_t Imm) { + return (Imm & 0xff00ULL) >> 8; +} + +static inline uint64_t decodeAdvSIMDModImmType6(uint8_t Imm) { + uint64_t EncVal = Imm; + return (EncVal << 56) | (EncVal << 40) | (EncVal << 24) | (EncVal << 8); +} + +// 0x00 0x00 abcdefgh 0xFF 0x00 0x00 abcdefgh 0xFF +static inline bool isAdvSIMDModImmType7(uint64_t 
Imm) { + return ((Imm >> 32) == (Imm & 0xffffffffULL)) && + ((Imm & 0xffff00ffffff00ffULL) == 0x000000ff000000ffULL); +} + +static inline uint8_t encodeAdvSIMDModImmType7(uint64_t Imm) { + return (Imm & 0xff00ULL) >> 8; +} + +static inline uint64_t decodeAdvSIMDModImmType7(uint8_t Imm) { + uint64_t EncVal = Imm; + return (EncVal << 40) | (EncVal << 8) | 0x000000ff000000ffULL; +} + +// 0x00 abcdefgh 0xFF 0xFF 0x00 abcdefgh 0xFF 0xFF +static inline bool isAdvSIMDModImmType8(uint64_t Imm) { + return ((Imm >> 32) == (Imm & 0xffffffffULL)) && + ((Imm & 0xff00ffffff00ffffULL) == 0x0000ffff0000ffffULL); +} + +static inline uint64_t decodeAdvSIMDModImmType8(uint8_t Imm) { + uint64_t EncVal = Imm; + return (EncVal << 48) | (EncVal << 16) | 0x0000ffff0000ffffULL; +} + +static inline uint8_t encodeAdvSIMDModImmType8(uint64_t Imm) { + return (Imm & 0x00ff0000ULL) >> 16; +} + +// abcdefgh abcdefgh abcdefgh abcdefgh abcdefgh abcdefgh abcdefgh abcdefgh +static inline bool isAdvSIMDModImmType9(uint64_t Imm) { + return ((Imm >> 32) == (Imm & 0xffffffffULL)) && + ((Imm >> 48) == (Imm & 0x0000ffffULL)) && + ((Imm >> 56) == (Imm & 0x000000ffULL)); +} + +static inline uint8_t encodeAdvSIMDModImmType9(uint64_t Imm) { + return (Imm & 0xffULL); +} + +static inline uint64_t decodeAdvSIMDModImmType9(uint8_t Imm) { + uint64_t EncVal = Imm; + EncVal |= (EncVal << 8); + EncVal |= (EncVal << 16); + EncVal |= (EncVal << 32); + return EncVal; +} + +// aaaaaaaa bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh +// cmode: 1110, op: 1 +static inline bool isAdvSIMDModImmType10(uint64_t Imm) { + uint64_t ByteA = Imm & 0xff00000000000000ULL; + uint64_t ByteB = Imm & 0x00ff000000000000ULL; + uint64_t ByteC = Imm & 0x0000ff0000000000ULL; + uint64_t ByteD = Imm & 0x000000ff00000000ULL; + uint64_t ByteE = Imm & 0x00000000ff000000ULL; + uint64_t ByteF = Imm & 0x0000000000ff0000ULL; + uint64_t ByteG = Imm & 0x000000000000ff00ULL; + uint64_t ByteH = Imm & 0x00000000000000ffULL; + + return (ByteA == 0ULL || ByteA == 0xff00000000000000ULL) && + (ByteB == 0ULL || ByteB == 0x00ff000000000000ULL) && + (ByteC == 0ULL || ByteC == 0x0000ff0000000000ULL) && + (ByteD == 0ULL || ByteD == 0x000000ff00000000ULL) && + (ByteE == 0ULL || ByteE == 0x00000000ff000000ULL) && + (ByteF == 0ULL || ByteF == 0x0000000000ff0000ULL) && + (ByteG == 0ULL || ByteG == 0x000000000000ff00ULL) && + (ByteH == 0ULL || ByteH == 0x00000000000000ffULL); +} + +static inline uint8_t encodeAdvSIMDModImmType10(uint64_t Imm) { + uint8_t BitA = (Imm & 0xff00000000000000ULL) != 0; + uint8_t BitB = (Imm & 0x00ff000000000000ULL) != 0; + uint8_t BitC = (Imm & 0x0000ff0000000000ULL) != 0; + uint8_t BitD = (Imm & 0x000000ff00000000ULL) != 0; + uint8_t BitE = (Imm & 0x00000000ff000000ULL) != 0; + uint8_t BitF = (Imm & 0x0000000000ff0000ULL) != 0; + uint8_t BitG = (Imm & 0x000000000000ff00ULL) != 0; + uint8_t BitH = (Imm & 0x00000000000000ffULL) != 0; + + uint8_t EncVal = BitA; + EncVal <<= 1; + EncVal |= BitB; + EncVal <<= 1; + EncVal |= BitC; + EncVal <<= 1; + EncVal |= BitD; + EncVal <<= 1; + EncVal |= BitE; + EncVal <<= 1; + EncVal |= BitF; + EncVal <<= 1; + EncVal |= BitG; + EncVal <<= 1; + EncVal |= BitH; + return EncVal; +} + +static inline uint64_t decodeAdvSIMDModImmType10(uint8_t Imm) { + uint64_t EncVal = 0; + if (Imm & 0x80) EncVal |= 0xff00000000000000ULL; + if (Imm & 0x40) EncVal |= 0x00ff000000000000ULL; + if (Imm & 0x20) EncVal |= 0x0000ff0000000000ULL; + if (Imm & 0x10) EncVal |= 0x000000ff00000000ULL; + if (Imm & 0x08) EncVal |= 0x00000000ff000000ULL; + 
if (Imm & 0x04) EncVal |= 0x0000000000ff0000ULL; + if (Imm & 0x02) EncVal |= 0x000000000000ff00ULL; + if (Imm & 0x01) EncVal |= 0x00000000000000ffULL; + return EncVal; +} + +// aBbbbbbc defgh000 0x00 0x00 aBbbbbbc defgh000 0x00 0x00 +static inline bool isAdvSIMDModImmType11(uint64_t Imm) { + uint64_t BString = (Imm & 0x7E000000ULL) >> 25; + return ((Imm >> 32) == (Imm & 0xffffffffULL)) && + (BString == 0x1f || BString == 0x20) && + ((Imm & 0x0007ffff0007ffffULL) == 0); +} + +static inline uint8_t encodeAdvSIMDModImmType11(uint64_t Imm) { + uint8_t BitA = (Imm & 0x80000000ULL) != 0; + uint8_t BitB = (Imm & 0x20000000ULL) != 0; + uint8_t BitC = (Imm & 0x01000000ULL) != 0; + uint8_t BitD = (Imm & 0x00800000ULL) != 0; + uint8_t BitE = (Imm & 0x00400000ULL) != 0; + uint8_t BitF = (Imm & 0x00200000ULL) != 0; + uint8_t BitG = (Imm & 0x00100000ULL) != 0; + uint8_t BitH = (Imm & 0x00080000ULL) != 0; + + uint8_t EncVal = BitA; + EncVal <<= 1; + EncVal |= BitB; + EncVal <<= 1; + EncVal |= BitC; + EncVal <<= 1; + EncVal |= BitD; + EncVal <<= 1; + EncVal |= BitE; + EncVal <<= 1; + EncVal |= BitF; + EncVal <<= 1; + EncVal |= BitG; + EncVal <<= 1; + EncVal |= BitH; + return EncVal; +} + +static inline uint64_t decodeAdvSIMDModImmType11(uint8_t Imm) { + uint64_t EncVal = 0; + if (Imm & 0x80) EncVal |= 0x80000000ULL; + if (Imm & 0x40) EncVal |= 0x3e000000ULL; + else EncVal |= 0x40000000ULL; + if (Imm & 0x20) EncVal |= 0x01000000ULL; + if (Imm & 0x10) EncVal |= 0x00800000ULL; + if (Imm & 0x08) EncVal |= 0x00400000ULL; + if (Imm & 0x04) EncVal |= 0x00200000ULL; + if (Imm & 0x02) EncVal |= 0x00100000ULL; + if (Imm & 0x01) EncVal |= 0x00080000ULL; + return (EncVal << 32) | EncVal; +} + +// aBbbbbbb bbcdefgh 0x00 0x00 0x00 0x00 0x00 0x00 +static inline bool isAdvSIMDModImmType12(uint64_t Imm) { + uint64_t BString = (Imm & 0x7fc0000000000000ULL) >> 54; + return ((BString == 0xff || BString == 0x100) && + ((Imm & 0x0000ffffffffffffULL) == 0)); +} + +static inline uint8_t encodeAdvSIMDModImmType12(uint64_t Imm) { + uint8_t BitA = (Imm & 0x8000000000000000ULL) != 0; + uint8_t BitB = (Imm & 0x0040000000000000ULL) != 0; + uint8_t BitC = (Imm & 0x0020000000000000ULL) != 0; + uint8_t BitD = (Imm & 0x0010000000000000ULL) != 0; + uint8_t BitE = (Imm & 0x0008000000000000ULL) != 0; + uint8_t BitF = (Imm & 0x0004000000000000ULL) != 0; + uint8_t BitG = (Imm & 0x0002000000000000ULL) != 0; + uint8_t BitH = (Imm & 0x0001000000000000ULL) != 0; + + uint8_t EncVal = BitA; + EncVal <<= 1; + EncVal |= BitB; + EncVal <<= 1; + EncVal |= BitC; + EncVal <<= 1; + EncVal |= BitD; + EncVal <<= 1; + EncVal |= BitE; + EncVal <<= 1; + EncVal |= BitF; + EncVal <<= 1; + EncVal |= BitG; + EncVal <<= 1; + EncVal |= BitH; + return EncVal; +} + +static inline uint64_t decodeAdvSIMDModImmType12(uint8_t Imm) { + uint64_t EncVal = 0; + if (Imm & 0x80) EncVal |= 0x8000000000000000ULL; + if (Imm & 0x40) EncVal |= 0x3fc0000000000000ULL; + else EncVal |= 0x4000000000000000ULL; + if (Imm & 0x20) EncVal |= 0x0020000000000000ULL; + if (Imm & 0x10) EncVal |= 0x0010000000000000ULL; + if (Imm & 0x08) EncVal |= 0x0008000000000000ULL; + if (Imm & 0x04) EncVal |= 0x0004000000000000ULL; + if (Imm & 0x02) EncVal |= 0x0002000000000000ULL; + if (Imm & 0x01) EncVal |= 0x0001000000000000ULL; + return (EncVal << 32) | EncVal; +} + +} // end namespace AArch64_AM + +} // end namespace llvm + +#endif diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp new file mode 100644 index 00000000000..d8900d4fceb 
--- /dev/null +++ b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp @@ -0,0 +1,566 @@ +//===-- AArch64AsmBackend.cpp - AArch64 Assembler Backend -----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "AArch64.h" +#include "AArch64RegisterInfo.h" +#include "MCTargetDesc/AArch64FixupKinds.h" +#include "llvm/ADT/Triple.h" +#include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCDirectives.h" +#include "llvm/MC/MCFixupKindInfo.h" +#include "llvm/MC/MCObjectWriter.h" +#include "llvm/MC/MCSectionMachO.h" +#include "llvm/MC/MCSectionELF.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MachO.h" +using namespace llvm; + +namespace { + +class AArch64AsmBackend : public MCAsmBackend { + static const unsigned PCRelFlagVal = + MCFixupKindInfo::FKF_IsAlignedDownTo32Bits | MCFixupKindInfo::FKF_IsPCRel; + +public: + AArch64AsmBackend(const Target &T) : MCAsmBackend() {} + + unsigned getNumFixupKinds() const override { + return AArch64::NumTargetFixupKinds; + } + + const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override { + const static MCFixupKindInfo Infos[AArch64::NumTargetFixupKinds] = { + // This table *must* be in the order that the fixup_* kinds are defined in + // AArch64FixupKinds.h. + // + // Name Offset (bits) Size (bits) Flags + { "fixup_aarch64_pcrel_adr_imm21", 0, 32, PCRelFlagVal }, + { "fixup_aarch64_pcrel_adrp_imm21", 0, 32, PCRelFlagVal }, + { "fixup_aarch64_add_imm12", 10, 12, 0 }, + { "fixup_aarch64_ldst_imm12_scale1", 10, 12, 0 }, + { "fixup_aarch64_ldst_imm12_scale2", 10, 12, 0 }, + { "fixup_aarch64_ldst_imm12_scale4", 10, 12, 0 }, + { "fixup_aarch64_ldst_imm12_scale8", 10, 12, 0 }, + { "fixup_aarch64_ldst_imm12_scale16", 10, 12, 0 }, + { "fixup_aarch64_ldr_pcrel_imm19", 5, 19, PCRelFlagVal }, + { "fixup_aarch64_movw", 5, 16, 0 }, + { "fixup_aarch64_pcrel_branch14", 5, 14, PCRelFlagVal }, + { "fixup_aarch64_pcrel_branch19", 5, 19, PCRelFlagVal }, + { "fixup_aarch64_pcrel_branch26", 0, 26, PCRelFlagVal }, + { "fixup_aarch64_pcrel_call26", 0, 26, PCRelFlagVal }, + { "fixup_aarch64_tlsdesc_call", 0, 0, 0 } + }; + + if (Kind < FirstTargetFixupKind) + return MCAsmBackend::getFixupKindInfo(Kind); + + assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() && + "Invalid kind!"); + return Infos[Kind - FirstTargetFixupKind]; + } + + void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, + uint64_t Value, bool IsPCRel) const override; + + bool mayNeedRelaxation(const MCInst &Inst) const override; + bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, + const MCRelaxableFragment *DF, + const MCAsmLayout &Layout) const override; + void relaxInstruction(const MCInst &Inst, MCInst &Res) const override; + bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override; + + void HandleAssemblerFlag(MCAssemblerFlag Flag) {} + + unsigned getPointerSize() const { return 8; } +}; + +} // end anonymous namespace + +/// \brief The number of bytes the fixup may change. 
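// --- Illustrative aside, not part of this patch -----------------------------
// How the fixup table above is consumed: for fixup_aarch64_add_imm12 the kind
// info records offset 10 and width 12, so applyFixup() takes the adjusted
// 12-bit value, shifts it left by 10, and ORs it byte-by-byte (little-endian)
// into the instruction word, i.e. the immediate lands in bits [21:10] of the
// 32-bit ADD encoding.
// ----------------------------------------------------------------------------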
+static unsigned getFixupKindNumBytes(unsigned Kind) {
+  switch (Kind) {
+  default:
+    assert(0 && "Unknown fixup kind!");
+
+  case AArch64::fixup_aarch64_tlsdesc_call:
+    return 0;
+
+  case FK_Data_1:
+    return 1;
+
+  case FK_Data_2:
+  case AArch64::fixup_aarch64_movw:
+    return 2;
+
+  case AArch64::fixup_aarch64_pcrel_branch14:
+  case AArch64::fixup_aarch64_add_imm12:
+  case AArch64::fixup_aarch64_ldst_imm12_scale1:
+  case AArch64::fixup_aarch64_ldst_imm12_scale2:
+  case AArch64::fixup_aarch64_ldst_imm12_scale4:
+  case AArch64::fixup_aarch64_ldst_imm12_scale8:
+  case AArch64::fixup_aarch64_ldst_imm12_scale16:
+  case AArch64::fixup_aarch64_ldr_pcrel_imm19:
+  case AArch64::fixup_aarch64_pcrel_branch19:
+    return 3;
+
+  case AArch64::fixup_aarch64_pcrel_adr_imm21:
+  case AArch64::fixup_aarch64_pcrel_adrp_imm21:
+  case AArch64::fixup_aarch64_pcrel_branch26:
+  case AArch64::fixup_aarch64_pcrel_call26:
+  case FK_Data_4:
+    return 4;
+
+  case FK_Data_8:
+    return 8;
+  }
+}
+
+static unsigned AdrImmBits(unsigned Value) {
+  unsigned lo2 = Value & 0x3;
+  unsigned hi19 = (Value & 0x1ffffc) >> 2;
+  return (hi19 << 5) | (lo2 << 29);
+}
+
+static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value) {
+  int64_t SignedValue = static_cast<int64_t>(Value);
+  switch (Kind) {
+  default:
+    assert(false && "Unknown fixup kind!");
+  case AArch64::fixup_aarch64_pcrel_adr_imm21:
+    if (SignedValue > 2097151 || SignedValue < -2097152)
+      report_fatal_error("fixup value out of range");
+    return AdrImmBits(Value & 0x1fffffULL);
+  case AArch64::fixup_aarch64_pcrel_adrp_imm21:
+    return AdrImmBits((Value & 0x1fffff000ULL) >> 12);
+  case AArch64::fixup_aarch64_ldr_pcrel_imm19:
+  case AArch64::fixup_aarch64_pcrel_branch19:
+    // Signed 21-bit immediate
+    if (SignedValue > 2097151 || SignedValue < -2097152)
+      report_fatal_error("fixup value out of range");
+    // Low two bits are not encoded.
+    return (Value >> 2) & 0x7ffff;
+  case AArch64::fixup_aarch64_add_imm12:
+  case AArch64::fixup_aarch64_ldst_imm12_scale1:
+    // Unsigned 12-bit immediate
+    if (Value >= 0x1000)
+      report_fatal_error("invalid imm12 fixup value");
+    return Value;
+  case AArch64::fixup_aarch64_ldst_imm12_scale2:
+    // Unsigned 12-bit immediate which gets multiplied by 2
+    if (Value & 1 || Value >= 0x2000)
+      report_fatal_error("invalid imm12 fixup value");
+    return Value >> 1;
+  case AArch64::fixup_aarch64_ldst_imm12_scale4:
+    // Unsigned 12-bit immediate which gets multiplied by 4
+    if (Value & 3 || Value >= 0x4000)
+      report_fatal_error("invalid imm12 fixup value");
+    return Value >> 2;
+  case AArch64::fixup_aarch64_ldst_imm12_scale8:
+    // Unsigned 12-bit immediate which gets multiplied by 8
+    if (Value & 7 || Value >= 0x8000)
+      report_fatal_error("invalid imm12 fixup value");
+    return Value >> 3;
+  case AArch64::fixup_aarch64_ldst_imm12_scale16:
+    // Unsigned 12-bit immediate which gets multiplied by 16
+    if (Value & 15 || Value >= 0x10000)
+      report_fatal_error("invalid imm12 fixup value");
+    return Value >> 4;
+  case AArch64::fixup_aarch64_movw:
+    report_fatal_error("no resolvable MOVZ/MOVK fixups supported yet");
+    return Value;
+  case AArch64::fixup_aarch64_pcrel_branch14:
+    // Signed 16-bit immediate
+    if (SignedValue > 32767 || SignedValue < -32768)
+      report_fatal_error("fixup value out of range");
+    // Low two bits are not encoded (4-byte alignment assumed).
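// Illustrative example (not from the original commit): a TBZ/TBNZ whose
// target is 0x1000 bytes ahead reaches this point with Value == 0x1000; the
// checks below accept it and the encoded field is
//
//   adjustFixupValue(AArch64::fixup_aarch64_pcrel_branch14, 0x1000) == 0x400
//
// i.e. a 14-bit branch immediate of 0x400.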
+ if (Value & 0x3) + report_fatal_error("fixup not sufficiently aligned"); + return (Value >> 2) & 0x3fff; + case AArch64::fixup_aarch64_pcrel_branch26: + case AArch64::fixup_aarch64_pcrel_call26: + // Signed 28-bit immediate + if (SignedValue > 134217727 || SignedValue < -134217728) + report_fatal_error("fixup value out of range"); + // Low two bits are not encoded (4-byte alignment assumed). + if (Value & 0x3) + report_fatal_error("fixup not sufficiently aligned"); + return (Value >> 2) & 0x3ffffff; + case FK_Data_1: + case FK_Data_2: + case FK_Data_4: + case FK_Data_8: + return Value; + } +} + +void AArch64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data, + unsigned DataSize, uint64_t Value, + bool IsPCRel) const { + unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind()); + if (!Value) + return; // Doesn't change encoding. + MCFixupKindInfo Info = getFixupKindInfo(Fixup.getKind()); + // Apply any target-specific value adjustments. + Value = adjustFixupValue(Fixup.getKind(), Value); + + // Shift the value into position. + Value <<= Info.TargetOffset; + + unsigned Offset = Fixup.getOffset(); + assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!"); + + // For each byte of the fragment that the fixup touches, mask in the + // bits from the fixup value. + for (unsigned i = 0; i != NumBytes; ++i) + Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff); +} + +bool AArch64AsmBackend::mayNeedRelaxation(const MCInst &Inst) const { + return false; +} + +bool AArch64AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, + uint64_t Value, + const MCRelaxableFragment *DF, + const MCAsmLayout &Layout) const { + // FIXME: This isn't correct for AArch64. Just moving the "generic" logic + // into the targets for now. + // + // Relax if the value is too big for a (signed) i8. + return int64_t(Value) != int64_t(int8_t(Value)); +} + +void AArch64AsmBackend::relaxInstruction(const MCInst &Inst, + MCInst &Res) const { + assert(false && "AArch64AsmBackend::relaxInstruction() unimplemented"); +} + +bool AArch64AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { + // If the count is not 4-byte aligned, we must be writing data into the text + // section (otherwise we have unaligned instructions, and thus have far + // bigger problems), so just write zeros instead. + if ((Count & 3) != 0) { + for (uint64_t i = 0, e = (Count & 3); i != e; ++i) + OW->Write8(0); + } + + // We are properly aligned, so write NOPs as requested. + Count /= 4; + for (uint64_t i = 0; i != Count; ++i) + OW->Write32(0xd503201f); + return true; +} + +namespace { + +namespace CU { + +/// \brief Compact unwind encoding values. +enum CompactUnwindEncodings { + /// \brief A "frameless" leaf function, where no non-volatile registers are + /// saved. The return remains in LR throughout the function. + UNWIND_AArch64_MODE_FRAMELESS = 0x02000000, + + /// \brief No compact unwind encoding available. Instead the low 23-bits of + /// the compact unwind encoding is the offset of the DWARF FDE in the + /// __eh_frame section. This mode is never used in object files. It is only + /// generated by the linker in final linked images, which have only DWARF info + /// for a function. + UNWIND_AArch64_MODE_DWARF = 0x03000000, + + /// \brief This is a standard arm64 prologue where FP/LR are immediately + /// pushed on the stack, then SP is copied to FP. If there are any + /// non-volatile register saved, they are copied into the stack fame in pairs + /// in a contiguous ranger right below the saved FP/LR pair. 
Any subset of the + /// five X pairs and four D pairs can be saved, but the memory layout must be + /// in register number order. + UNWIND_AArch64_MODE_FRAME = 0x04000000, + + /// \brief Frame register pair encodings. + UNWIND_AArch64_FRAME_X19_X20_PAIR = 0x00000001, + UNWIND_AArch64_FRAME_X21_X22_PAIR = 0x00000002, + UNWIND_AArch64_FRAME_X23_X24_PAIR = 0x00000004, + UNWIND_AArch64_FRAME_X25_X26_PAIR = 0x00000008, + UNWIND_AArch64_FRAME_X27_X28_PAIR = 0x00000010, + UNWIND_AArch64_FRAME_D8_D9_PAIR = 0x00000100, + UNWIND_AArch64_FRAME_D10_D11_PAIR = 0x00000200, + UNWIND_AArch64_FRAME_D12_D13_PAIR = 0x00000400, + UNWIND_AArch64_FRAME_D14_D15_PAIR = 0x00000800 +}; + +} // end CU namespace + +// FIXME: This should be in a separate file. +class DarwinAArch64AsmBackend : public AArch64AsmBackend { + const MCRegisterInfo &MRI; + + /// \brief Encode compact unwind stack adjustment for frameless functions. + /// See UNWIND_AArch64_FRAMELESS_STACK_SIZE_MASK in compact_unwind_encoding.h. + /// The stack size always needs to be 16 byte aligned. + uint32_t encodeStackAdjustment(uint32_t StackSize) const { + return (StackSize / 16) << 12; + } + +public: + DarwinAArch64AsmBackend(const Target &T, const MCRegisterInfo &MRI) + : AArch64AsmBackend(T), MRI(MRI) {} + + MCObjectWriter *createObjectWriter(raw_ostream &OS) const override { + return createAArch64MachObjectWriter(OS, MachO::CPU_TYPE_ARM64, + MachO::CPU_SUBTYPE_ARM64_ALL); + } + + bool doesSectionRequireSymbols(const MCSection &Section) const override { + // Any section for which the linker breaks things into atoms needs to + // preserve symbols, including assembler local symbols, to identify + // those atoms. These sections are: + // Sections of type: + // + // S_CSTRING_LITERALS (e.g. __cstring) + // S_LITERAL_POINTERS (e.g. objc selector pointers) + // S_16BYTE_LITERALS, S_8BYTE_LITERALS, S_4BYTE_LITERALS + // + // Sections named: + // + // __TEXT,__eh_frame + // __TEXT,__ustring + // __DATA,__cfstring + // __DATA,__objc_classrefs + // __DATA,__objc_catlist + // + // FIXME: It would be better if the compiler used actual linker local + // symbols for each of these sections rather than preserving what + // are ostensibly assembler local symbols. + const MCSectionMachO &SMO = static_cast(Section); + return (SMO.getType() == MachO::S_CSTRING_LITERALS || + SMO.getType() == MachO::S_4BYTE_LITERALS || + SMO.getType() == MachO::S_8BYTE_LITERALS || + SMO.getType() == MachO::S_16BYTE_LITERALS || + SMO.getType() == MachO::S_LITERAL_POINTERS || + (SMO.getSegmentName() == "__TEXT" && + (SMO.getSectionName() == "__eh_frame" || + SMO.getSectionName() == "__ustring")) || + (SMO.getSegmentName() == "__DATA" && + (SMO.getSectionName() == "__cfstring" || + SMO.getSectionName() == "__objc_classrefs" || + SMO.getSectionName() == "__objc_catlist"))); + } + + /// \brief Generate the compact unwind encoding from the CFI directives. + uint32_t generateCompactUnwindEncoding( + ArrayRef Instrs) const override { + if (Instrs.empty()) + return CU::UNWIND_AArch64_MODE_FRAMELESS; + + bool HasFP = false; + unsigned StackSize = 0; + + uint32_t CompactUnwindEncoding = 0; + for (size_t i = 0, e = Instrs.size(); i != e; ++i) { + const MCCFIInstruction &Inst = Instrs[i]; + + switch (Inst.getOperation()) { + default: + // Cannot handle this directive: bail out. + return CU::UNWIND_AArch64_MODE_DWARF; + case MCCFIInstruction::OpDefCfa: { + // Defines a frame pointer. 
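// Illustrative example (not from the original commit): the prologue this
// case expects is the usual AArch64 frame setup, whose CFI stream looks like
//
//   .cfi_def_cfa w29, 16       // handled here
//   .cfi_offset w30, -8        // LRPush, consumed below
//   .cfi_offset w29, -16       // FPPush, consumed below
//
// so the two .cfi_offset directives for LR and FP must immediately follow,
// which is what the asserts below check before UNWIND_AArch64_MODE_FRAME is
// set.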
+ assert(getXRegFromWReg(MRI.getLLVMRegNum(Inst.getRegister(), true)) == + AArch64::FP && + "Invalid frame pointer!"); + assert(i + 2 < e && "Insufficient CFI instructions to define a frame!"); + + const MCCFIInstruction &LRPush = Instrs[++i]; + assert(LRPush.getOperation() == MCCFIInstruction::OpOffset && + "Link register not pushed!"); + const MCCFIInstruction &FPPush = Instrs[++i]; + assert(FPPush.getOperation() == MCCFIInstruction::OpOffset && + "Frame pointer not pushed!"); + + unsigned LRReg = MRI.getLLVMRegNum(LRPush.getRegister(), true); + unsigned FPReg = MRI.getLLVMRegNum(FPPush.getRegister(), true); + + LRReg = getXRegFromWReg(LRReg); + FPReg = getXRegFromWReg(FPReg); + + assert(LRReg == AArch64::LR && FPReg == AArch64::FP && + "Pushing invalid registers for frame!"); + + // Indicate that the function has a frame. + CompactUnwindEncoding |= CU::UNWIND_AArch64_MODE_FRAME; + HasFP = true; + break; + } + case MCCFIInstruction::OpDefCfaOffset: { + assert(StackSize == 0 && "We already have the CFA offset!"); + StackSize = std::abs(Inst.getOffset()); + break; + } + case MCCFIInstruction::OpOffset: { + // Registers are saved in pairs. We expect there to be two consecutive + // `.cfi_offset' instructions with the appropriate registers specified. + unsigned Reg1 = MRI.getLLVMRegNum(Inst.getRegister(), true); + if (i + 1 == e) + return CU::UNWIND_AArch64_MODE_DWARF; + + const MCCFIInstruction &Inst2 = Instrs[++i]; + if (Inst2.getOperation() != MCCFIInstruction::OpOffset) + return CU::UNWIND_AArch64_MODE_DWARF; + unsigned Reg2 = MRI.getLLVMRegNum(Inst2.getRegister(), true); + + // N.B. The encodings must be in register number order, and the X + // registers before the D registers. + + // X19/X20 pair = 0x00000001, + // X21/X22 pair = 0x00000002, + // X23/X24 pair = 0x00000004, + // X25/X26 pair = 0x00000008, + // X27/X28 pair = 0x00000010 + Reg1 = getXRegFromWReg(Reg1); + Reg2 = getXRegFromWReg(Reg2); + + if (Reg1 == AArch64::X19 && Reg2 == AArch64::X20 && + (CompactUnwindEncoding & 0xF1E) == 0) + CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_X19_X20_PAIR; + else if (Reg1 == AArch64::X21 && Reg2 == AArch64::X22 && + (CompactUnwindEncoding & 0xF1C) == 0) + CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_X21_X22_PAIR; + else if (Reg1 == AArch64::X23 && Reg2 == AArch64::X24 && + (CompactUnwindEncoding & 0xF18) == 0) + CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_X23_X24_PAIR; + else if (Reg1 == AArch64::X25 && Reg2 == AArch64::X26 && + (CompactUnwindEncoding & 0xF10) == 0) + CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_X25_X26_PAIR; + else if (Reg1 == AArch64::X27 && Reg2 == AArch64::X28 && + (CompactUnwindEncoding & 0xF00) == 0) + CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_X27_X28_PAIR; + else { + Reg1 = getDRegFromBReg(Reg1); + Reg2 = getDRegFromBReg(Reg2); + + // D8/D9 pair = 0x00000100, + // D10/D11 pair = 0x00000200, + // D12/D13 pair = 0x00000400, + // D14/D15 pair = 0x00000800 + if (Reg1 == AArch64::D8 && Reg2 == AArch64::D9 && + (CompactUnwindEncoding & 0xE00) == 0) + CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_D8_D9_PAIR; + else if (Reg1 == AArch64::D10 && Reg2 == AArch64::D11 && + (CompactUnwindEncoding & 0xC00) == 0) + CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_D10_D11_PAIR; + else if (Reg1 == AArch64::D12 && Reg2 == AArch64::D13 && + (CompactUnwindEncoding & 0x800) == 0) + CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_D12_D13_PAIR; + else if (Reg1 == AArch64::D14 && Reg2 == AArch64::D15) + CompactUnwindEncoding |= 
CU::UNWIND_AArch64_FRAME_D14_D15_PAIR; + else + // A pair was pushed which we cannot handle. + return CU::UNWIND_AArch64_MODE_DWARF; + } + + break; + } + } + } + + if (!HasFP) { + // With compact unwind info we can only represent stack adjustments of up + // to 65520 bytes. + if (StackSize > 65520) + return CU::UNWIND_AArch64_MODE_DWARF; + + CompactUnwindEncoding |= CU::UNWIND_AArch64_MODE_FRAMELESS; + CompactUnwindEncoding |= encodeStackAdjustment(StackSize); + } + + return CompactUnwindEncoding; + } +}; + +} // end anonymous namespace + +namespace { + +class ELFAArch64AsmBackend : public AArch64AsmBackend { +public: + uint8_t OSABI; + bool IsLittleEndian; + + ELFAArch64AsmBackend(const Target &T, uint8_t OSABI, bool IsLittleEndian) + : AArch64AsmBackend(T), OSABI(OSABI), IsLittleEndian(IsLittleEndian) {} + + MCObjectWriter *createObjectWriter(raw_ostream &OS) const override { + return createAArch64ELFObjectWriter(OS, OSABI, IsLittleEndian); + } + + void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout, + const MCFixup &Fixup, const MCFragment *DF, + const MCValue &Target, uint64_t &Value, + bool &IsResolved) override; + + void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, + uint64_t Value, bool IsPCRel) const override; +}; + +void ELFAArch64AsmBackend::processFixupValue( + const MCAssembler &Asm, const MCAsmLayout &Layout, const MCFixup &Fixup, + const MCFragment *DF, const MCValue &Target, uint64_t &Value, + bool &IsResolved) { + // The ADRP instruction adds some multiple of 0x1000 to the current PC & + // ~0xfff. This means that the required offset to reach a symbol can vary by + // up to one step depending on where the ADRP is in memory. For example: + // + // ADRP x0, there + // there: + // + // If the ADRP occurs at address 0xffc then "there" will be at 0x1000 and + // we'll need that as an offset. At any other address "there" will be in the + // same page as the ADRP and the instruction should encode 0x0. Assuming the + // section isn't 0x1000-aligned, we therefore need to delegate this decision + // to the linker -- a relocation! 
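// Worked example (not from the original commit), with a hypothetical label:
//
//   0x0ffc:  adrp x0, there
//   0x1000: there:
//
// page(0x1000) - page(0x0ffc) is one 4KiB page, but had the ADRP sat at
// 0x1000 the delta would be zero. Only the linker knows the final layout, so
// the check below keeps the fixup unresolved and an R_AARCH64_ADR_PREL_PG_HI21
// (or GOT/TLS) relocation is emitted instead.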
+ if ((uint32_t)Fixup.getKind() == AArch64::fixup_aarch64_pcrel_adrp_imm21) + IsResolved = false; +} + +void ELFAArch64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data, + unsigned DataSize, uint64_t Value, + bool IsPCRel) const { + // store fixups in .eh_frame section in big endian order + if (!IsLittleEndian && Fixup.getKind() == FK_Data_4) { + const MCSection *Sec = Fixup.getValue()->FindAssociatedSection(); + const MCSectionELF *SecELF = static_cast(Sec); + if (SecELF->getSectionName() == ".eh_frame") + Value = ByteSwap_32(unsigned(Value)); + } + AArch64AsmBackend::applyFixup (Fixup, Data, DataSize, Value, IsPCRel); +} +} + +MCAsmBackend *llvm::createAArch64leAsmBackend(const Target &T, + const MCRegisterInfo &MRI, + StringRef TT, StringRef CPU) { + Triple TheTriple(TT); + + if (TheTriple.isOSDarwin()) + return new DarwinAArch64AsmBackend(T, MRI); + + assert(TheTriple.isOSBinFormatELF() && "Expect either MachO or ELF target"); + return new ELFAArch64AsmBackend(T, TheTriple.getOS(), /*IsLittleEndian=*/true); +} + +MCAsmBackend *llvm::createAArch64beAsmBackend(const Target &T, + const MCRegisterInfo &MRI, + StringRef TT, StringRef CPU) { + Triple TheTriple(TT); + + assert(TheTriple.isOSBinFormatELF() && + "Big endian is only supported for ELF targets!"); + return new ELFAArch64AsmBackend(T, TheTriple.getOS(), + /*IsLittleEndian=*/false); +} diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp new file mode 100644 index 00000000000..e05191eaf3e --- /dev/null +++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp @@ -0,0 +1,257 @@ +//===-- AArch64ELFObjectWriter.cpp - AArch64 ELF Writer -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file handles ELF-specific object emission, converting LLVM's internal +// fixups into the appropriate relocations. 
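// Illustrative example (not from the original commit): for a GOT-indirect
// access such as
//
//   adrp x0, :got:var
//   ldr  x0, [x0, :got_lo12:var]
//
// GetRelocType() below maps the fixup_aarch64_pcrel_adrp_imm21 fixup carrying
// a GOT modifier to R_AARCH64_ADR_GOT_PAGE, and the scale-8 load/store fixup
// to R_AARCH64_LD64_GOT_LO12_NC.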
+// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/AArch64FixupKinds.h" +#include "MCTargetDesc/AArch64MCExpr.h" +#include "MCTargetDesc/AArch64MCTargetDesc.h" +#include "llvm/MC/MCELFObjectWriter.h" +#include "llvm/MC/MCValue.h" +#include "llvm/Support/ErrorHandling.h" + +using namespace llvm; + +namespace { +class AArch64ELFObjectWriter : public MCELFObjectTargetWriter { +public: + AArch64ELFObjectWriter(uint8_t OSABI, bool IsLittleEndian); + + virtual ~AArch64ELFObjectWriter(); + +protected: + unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup, + bool IsPCRel) const override; + +private: +}; +} + +AArch64ELFObjectWriter::AArch64ELFObjectWriter(uint8_t OSABI, + bool IsLittleEndian) + : MCELFObjectTargetWriter(/*Is64Bit*/ true, OSABI, ELF::EM_AARCH64, + /*HasRelocationAddend*/ true) {} + +AArch64ELFObjectWriter::~AArch64ELFObjectWriter() {} + +unsigned AArch64ELFObjectWriter::GetRelocType(const MCValue &Target, + const MCFixup &Fixup, + bool IsPCRel) const { + AArch64MCExpr::VariantKind RefKind = + static_cast(Target.getRefKind()); + AArch64MCExpr::VariantKind SymLoc = AArch64MCExpr::getSymbolLoc(RefKind); + bool IsNC = AArch64MCExpr::isNotChecked(RefKind); + + assert((!Target.getSymA() || + Target.getSymA()->getKind() == MCSymbolRefExpr::VK_None) && + "Should only be expression-level modifiers here"); + + assert((!Target.getSymB() || + Target.getSymB()->getKind() == MCSymbolRefExpr::VK_None) && + "Should only be expression-level modifiers here"); + + if (IsPCRel) { + switch ((unsigned)Fixup.getKind()) { + case FK_Data_2: + return ELF::R_AARCH64_PREL16; + case FK_Data_4: + return ELF::R_AARCH64_PREL32; + case FK_Data_8: + return ELF::R_AARCH64_PREL64; + case AArch64::fixup_aarch64_pcrel_adr_imm21: + assert(SymLoc == AArch64MCExpr::VK_NONE && "unexpected ADR relocation"); + return ELF::R_AARCH64_ADR_PREL_LO21; + case AArch64::fixup_aarch64_pcrel_adrp_imm21: + if (SymLoc == AArch64MCExpr::VK_ABS && !IsNC) + return ELF::R_AARCH64_ADR_PREL_PG_HI21; + if (SymLoc == AArch64MCExpr::VK_GOT && !IsNC) + return ELF::R_AARCH64_ADR_GOT_PAGE; + if (SymLoc == AArch64MCExpr::VK_GOTTPREL && !IsNC) + return ELF::R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21; + if (SymLoc == AArch64MCExpr::VK_TLSDESC && !IsNC) + return ELF::R_AARCH64_TLSDESC_ADR_PAGE; + llvm_unreachable("invalid symbol kind for ADRP relocation"); + case AArch64::fixup_aarch64_pcrel_branch26: + return ELF::R_AARCH64_JUMP26; + case AArch64::fixup_aarch64_pcrel_call26: + return ELF::R_AARCH64_CALL26; + case AArch64::fixup_aarch64_ldr_pcrel_imm19: + if (SymLoc == AArch64MCExpr::VK_GOTTPREL) + return ELF::R_AARCH64_TLSIE_LD_GOTTPREL_PREL19; + return ELF::R_AARCH64_LD_PREL_LO19; + case AArch64::fixup_aarch64_pcrel_branch14: + return ELF::R_AARCH64_TSTBR14; + case AArch64::fixup_aarch64_pcrel_branch19: + return ELF::R_AARCH64_CONDBR19; + default: + llvm_unreachable("Unsupported pc-relative fixup kind"); + } + } else { + switch ((unsigned)Fixup.getKind()) { + case FK_Data_2: + return ELF::R_AARCH64_ABS16; + case FK_Data_4: + return ELF::R_AARCH64_ABS32; + case FK_Data_8: + return ELF::R_AARCH64_ABS64; + case AArch64::fixup_aarch64_add_imm12: + if (RefKind == AArch64MCExpr::VK_DTPREL_HI12) + return ELF::R_AARCH64_TLSLD_ADD_DTPREL_HI12; + if (RefKind == AArch64MCExpr::VK_TPREL_HI12) + return ELF::R_AARCH64_TLSLE_ADD_TPREL_HI12; + if (RefKind == AArch64MCExpr::VK_DTPREL_LO12_NC) + return ELF::R_AARCH64_TLSLD_ADD_DTPREL_LO12_NC; + if (RefKind == AArch64MCExpr::VK_DTPREL_LO12) + return 
ELF::R_AARCH64_TLSLD_ADD_DTPREL_LO12; + if (RefKind == AArch64MCExpr::VK_TPREL_LO12_NC) + return ELF::R_AARCH64_TLSLE_ADD_TPREL_LO12_NC; + if (RefKind == AArch64MCExpr::VK_TPREL_LO12) + return ELF::R_AARCH64_TLSLE_ADD_TPREL_LO12; + if (RefKind == AArch64MCExpr::VK_TLSDESC_LO12) + return ELF::R_AARCH64_TLSDESC_ADD_LO12_NC; + if (SymLoc == AArch64MCExpr::VK_ABS && IsNC) + return ELF::R_AARCH64_ADD_ABS_LO12_NC; + + report_fatal_error("invalid fixup for add (uimm12) instruction"); + return 0; + case AArch64::fixup_aarch64_ldst_imm12_scale1: + if (SymLoc == AArch64MCExpr::VK_ABS && IsNC) + return ELF::R_AARCH64_LDST8_ABS_LO12_NC; + if (SymLoc == AArch64MCExpr::VK_DTPREL && !IsNC) + return ELF::R_AARCH64_TLSLD_LDST8_DTPREL_LO12; + if (SymLoc == AArch64MCExpr::VK_DTPREL && IsNC) + return ELF::R_AARCH64_TLSLD_LDST8_DTPREL_LO12_NC; + if (SymLoc == AArch64MCExpr::VK_TPREL && !IsNC) + return ELF::R_AARCH64_TLSLE_LDST8_TPREL_LO12; + if (SymLoc == AArch64MCExpr::VK_TPREL && IsNC) + return ELF::R_AARCH64_TLSLE_LDST8_TPREL_LO12_NC; + + report_fatal_error("invalid fixup for 8-bit load/store instruction"); + return 0; + case AArch64::fixup_aarch64_ldst_imm12_scale2: + if (SymLoc == AArch64MCExpr::VK_ABS && IsNC) + return ELF::R_AARCH64_LDST16_ABS_LO12_NC; + if (SymLoc == AArch64MCExpr::VK_DTPREL && !IsNC) + return ELF::R_AARCH64_TLSLD_LDST16_DTPREL_LO12; + if (SymLoc == AArch64MCExpr::VK_DTPREL && IsNC) + return ELF::R_AARCH64_TLSLD_LDST16_DTPREL_LO12_NC; + if (SymLoc == AArch64MCExpr::VK_TPREL && !IsNC) + return ELF::R_AARCH64_TLSLE_LDST16_TPREL_LO12; + if (SymLoc == AArch64MCExpr::VK_TPREL && IsNC) + return ELF::R_AARCH64_TLSLE_LDST16_TPREL_LO12_NC; + + report_fatal_error("invalid fixup for 16-bit load/store instruction"); + return 0; + case AArch64::fixup_aarch64_ldst_imm12_scale4: + if (SymLoc == AArch64MCExpr::VK_ABS && IsNC) + return ELF::R_AARCH64_LDST32_ABS_LO12_NC; + if (SymLoc == AArch64MCExpr::VK_DTPREL && !IsNC) + return ELF::R_AARCH64_TLSLD_LDST32_DTPREL_LO12; + if (SymLoc == AArch64MCExpr::VK_DTPREL && IsNC) + return ELF::R_AARCH64_TLSLD_LDST32_DTPREL_LO12_NC; + if (SymLoc == AArch64MCExpr::VK_TPREL && !IsNC) + return ELF::R_AARCH64_TLSLE_LDST32_TPREL_LO12; + if (SymLoc == AArch64MCExpr::VK_TPREL && IsNC) + return ELF::R_AARCH64_TLSLE_LDST32_TPREL_LO12_NC; + + report_fatal_error("invalid fixup for 32-bit load/store instruction"); + return 0; + case AArch64::fixup_aarch64_ldst_imm12_scale8: + if (SymLoc == AArch64MCExpr::VK_ABS && IsNC) + return ELF::R_AARCH64_LDST64_ABS_LO12_NC; + if (SymLoc == AArch64MCExpr::VK_GOT && IsNC) + return ELF::R_AARCH64_LD64_GOT_LO12_NC; + if (SymLoc == AArch64MCExpr::VK_DTPREL && !IsNC) + return ELF::R_AARCH64_TLSLD_LDST64_DTPREL_LO12; + if (SymLoc == AArch64MCExpr::VK_DTPREL && IsNC) + return ELF::R_AARCH64_TLSLD_LDST64_DTPREL_LO12_NC; + if (SymLoc == AArch64MCExpr::VK_TPREL && !IsNC) + return ELF::R_AARCH64_TLSLE_LDST64_TPREL_LO12; + if (SymLoc == AArch64MCExpr::VK_TPREL && IsNC) + return ELF::R_AARCH64_TLSLE_LDST64_TPREL_LO12_NC; + if (SymLoc == AArch64MCExpr::VK_GOTTPREL && IsNC) + return ELF::R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC; + if (SymLoc == AArch64MCExpr::VK_TLSDESC && IsNC) + return ELF::R_AARCH64_TLSDESC_LD64_LO12_NC; + + report_fatal_error("invalid fixup for 64-bit load/store instruction"); + return 0; + case AArch64::fixup_aarch64_ldst_imm12_scale16: + if (SymLoc == AArch64MCExpr::VK_ABS && IsNC) + return ELF::R_AARCH64_LDST128_ABS_LO12_NC; + + report_fatal_error("invalid fixup for 128-bit load/store instruction"); + return 0; + case 
AArch64::fixup_aarch64_movw: + if (RefKind == AArch64MCExpr::VK_ABS_G3) + return ELF::R_AARCH64_MOVW_UABS_G3; + if (RefKind == AArch64MCExpr::VK_ABS_G2) + return ELF::R_AARCH64_MOVW_UABS_G2; + if (RefKind == AArch64MCExpr::VK_ABS_G2_S) + return ELF::R_AARCH64_MOVW_SABS_G2; + if (RefKind == AArch64MCExpr::VK_ABS_G2_NC) + return ELF::R_AARCH64_MOVW_UABS_G2_NC; + if (RefKind == AArch64MCExpr::VK_ABS_G1) + return ELF::R_AARCH64_MOVW_UABS_G1; + if (RefKind == AArch64MCExpr::VK_ABS_G1_S) + return ELF::R_AARCH64_MOVW_SABS_G1; + if (RefKind == AArch64MCExpr::VK_ABS_G1_NC) + return ELF::R_AARCH64_MOVW_UABS_G1_NC; + if (RefKind == AArch64MCExpr::VK_ABS_G0) + return ELF::R_AARCH64_MOVW_UABS_G0; + if (RefKind == AArch64MCExpr::VK_ABS_G0_S) + return ELF::R_AARCH64_MOVW_SABS_G0; + if (RefKind == AArch64MCExpr::VK_ABS_G0_NC) + return ELF::R_AARCH64_MOVW_UABS_G0_NC; + if (RefKind == AArch64MCExpr::VK_DTPREL_G2) + return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G2; + if (RefKind == AArch64MCExpr::VK_DTPREL_G1) + return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G1; + if (RefKind == AArch64MCExpr::VK_DTPREL_G1_NC) + return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G1_NC; + if (RefKind == AArch64MCExpr::VK_DTPREL_G0) + return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G0; + if (RefKind == AArch64MCExpr::VK_DTPREL_G0_NC) + return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G0_NC; + if (RefKind == AArch64MCExpr::VK_TPREL_G2) + return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G2; + if (RefKind == AArch64MCExpr::VK_TPREL_G1) + return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G1; + if (RefKind == AArch64MCExpr::VK_TPREL_G1_NC) + return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G1_NC; + if (RefKind == AArch64MCExpr::VK_TPREL_G0) + return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G0; + if (RefKind == AArch64MCExpr::VK_TPREL_G0_NC) + return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G0_NC; + if (RefKind == AArch64MCExpr::VK_GOTTPREL_G1) + return ELF::R_AARCH64_TLSIE_MOVW_GOTTPREL_G1; + if (RefKind == AArch64MCExpr::VK_GOTTPREL_G0_NC) + return ELF::R_AARCH64_TLSIE_MOVW_GOTTPREL_G0_NC; + report_fatal_error("invalid fixup for movz/movk instruction"); + return 0; + case AArch64::fixup_aarch64_tlsdesc_call: + return ELF::R_AARCH64_TLSDESC_CALL; + default: + llvm_unreachable("Unknown ELF relocation type"); + } + } + + llvm_unreachable("Unimplemented fixup -> relocation"); +} + +MCObjectWriter *llvm::createAArch64ELFObjectWriter(raw_ostream &OS, + uint8_t OSABI, + bool IsLittleEndian) { + MCELFObjectTargetWriter *MOTW = + new AArch64ELFObjectWriter(OSABI, IsLittleEndian); + return createELFObjectWriter(MOTW, OS, IsLittleEndian); +} diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp new file mode 100644 index 00000000000..a79406d9d1f --- /dev/null +++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp @@ -0,0 +1,160 @@ +//===- lib/MC/AArch64ELFStreamer.cpp - ELF Object Output for AArch64 ------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file assembles .s files and emits AArch64 ELF .o object files. Different +// from generic ELF streamer in emitting mapping symbols ($x and $d) to delimit +// regions of data and code. 
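// Illustrative example (not from the original commit): assembling
//
//   foo:
//     add   x0, x0, #1
//     .word 0x1234
//
// emits a $x mapping symbol at foo (start of A64 code) and a $d mapping
// symbol in front of the .word, so tools can tell the inline data apart from
// instructions.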
+// +//===----------------------------------------------------------------------===// + +#include "llvm/MC/MCELFStreamer.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/Twine.h" +#include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCCodeEmitter.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCELF.h" +#include "llvm/MC/MCELFStreamer.h" +#include "llvm/MC/MCELFSymbolFlags.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCObjectStreamer.h" +#include "llvm/MC/MCSection.h" +#include "llvm/MC/MCSectionELF.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/MC/MCValue.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ELF.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +namespace { + +/// Extend the generic ELFStreamer class so that it can emit mapping symbols at +/// the appropriate points in the object files. These symbols are defined in the +/// AArch64 ELF ABI: +/// infocenter.arm.com/help/topic/com.arm.doc.ihi0056a/IHI0056A_aaelf64.pdf +/// +/// In brief: $x or $d should be emitted at the start of each contiguous region +/// of A64 code or data in a section. In practice, this emission does not rely +/// on explicit assembler directives but on inherent properties of the +/// directives doing the emission (e.g. ".byte" is data, "add x0, x0, x0" an +/// instruction). +/// +/// As a result this system is orthogonal to the DataRegion infrastructure used +/// by MachO. Beware! +class AArch64ELFStreamer : public MCELFStreamer { +public: + AArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB, raw_ostream &OS, + MCCodeEmitter *Emitter) + : MCELFStreamer(Context, TAB, OS, Emitter), MappingSymbolCounter(0), + LastEMS(EMS_None) {} + + ~AArch64ELFStreamer() {} + + void ChangeSection(const MCSection *Section, + const MCExpr *Subsection) override { + // We have to keep track of the mapping symbol state of any sections we + // use. Each one should start off as EMS_None, which is provided as the + // default constructor by DenseMap::lookup. + LastMappingSymbols[getPreviousSection().first] = LastEMS; + LastEMS = LastMappingSymbols.lookup(Section); + + MCELFStreamer::ChangeSection(Section, Subsection); + } + + /// This function is the one used to emit instruction data into the ELF + /// streamer. We override it to add the appropriate mapping symbol if + /// necessary. + void EmitInstruction(const MCInst &Inst, + const MCSubtargetInfo &STI) override { + EmitA64MappingSymbol(); + MCELFStreamer::EmitInstruction(Inst, STI); + } + + /// This is one of the functions used to emit data into an ELF section, so the + /// AArch64 streamer overrides it to add the appropriate mapping symbol ($d) + /// if necessary. + void EmitBytes(StringRef Data) override { + EmitDataMappingSymbol(); + MCELFStreamer::EmitBytes(Data); + } + + /// This is one of the functions used to emit data into an ELF section, so the + /// AArch64 streamer overrides it to add the appropriate mapping symbol ($d) + /// if necessary. 
+ void EmitValueImpl(const MCExpr *Value, unsigned Size, + const SMLoc &Loc) override { + EmitDataMappingSymbol(); + MCELFStreamer::EmitValueImpl(Value, Size); + } + +private: + enum ElfMappingSymbol { + EMS_None, + EMS_A64, + EMS_Data + }; + + void EmitDataMappingSymbol() { + if (LastEMS == EMS_Data) + return; + EmitMappingSymbol("$d"); + LastEMS = EMS_Data; + } + + void EmitA64MappingSymbol() { + if (LastEMS == EMS_A64) + return; + EmitMappingSymbol("$x"); + LastEMS = EMS_A64; + } + + void EmitMappingSymbol(StringRef Name) { + MCSymbol *Start = getContext().CreateTempSymbol(); + EmitLabel(Start); + + MCSymbol *Symbol = getContext().GetOrCreateSymbol( + Name + "." + Twine(MappingSymbolCounter++)); + + MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*Symbol); + MCELF::SetType(SD, ELF::STT_NOTYPE); + MCELF::SetBinding(SD, ELF::STB_LOCAL); + SD.setExternal(false); + Symbol->setSection(*getCurrentSection().first); + + const MCExpr *Value = MCSymbolRefExpr::Create(Start, getContext()); + Symbol->setVariableValue(Value); + } + + int64_t MappingSymbolCounter; + + DenseMap LastMappingSymbols; + ElfMappingSymbol LastEMS; + + /// @} +}; +} + +namespace llvm { +MCELFStreamer *createAArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB, + raw_ostream &OS, MCCodeEmitter *Emitter, + bool RelaxAll, bool NoExecStack) { + AArch64ELFStreamer *S = new AArch64ELFStreamer(Context, TAB, OS, Emitter); + if (RelaxAll) + S->getAssembler().setRelaxAll(true); + if (NoExecStack) + S->getAssembler().setNoExecStack(true); + return S; +} +} diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h new file mode 100644 index 00000000000..bc6973bd5f8 --- /dev/null +++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h @@ -0,0 +1,26 @@ +//===-- AArch64ELFStreamer.h - ELF Streamer for AArch64 ---------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements ELF streamer information for the AArch64 backend. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_AARCH64_ELF_STREAMER_H +#define LLVM_AARCH64_ELF_STREAMER_H + +#include "llvm/MC/MCELFStreamer.h" + +namespace llvm { + +MCELFStreamer *createAArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB, + raw_ostream &OS, MCCodeEmitter *Emitter, + bool RelaxAll, bool NoExecStack); +} + +#endif // AArch64_ELF_STREAMER_H diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h b/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h new file mode 100644 index 00000000000..bf405fbac77 --- /dev/null +++ b/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h @@ -0,0 +1,76 @@ +//===-- AArch64FixupKinds.h - AArch64 Specific Fixup Entries ----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_AArch64FIXUPKINDS_H +#define LLVM_AArch64FIXUPKINDS_H + +#include "llvm/MC/MCFixup.h" + +namespace llvm { +namespace AArch64 { + +enum Fixups { + // fixup_aarch64_pcrel_adr_imm21 - A 21-bit pc-relative immediate inserted into + // an ADR instruction. 
+ fixup_aarch64_pcrel_adr_imm21 = FirstTargetFixupKind, + + // fixup_aarch64_pcrel_adrp_imm21 - A 21-bit pc-relative immediate inserted into + // an ADRP instruction. + fixup_aarch64_pcrel_adrp_imm21, + + // fixup_aarch64_imm12 - 12-bit fixup for add/sub instructions. + // No alignment adjustment. All value bits are encoded. + fixup_aarch64_add_imm12, + + // fixup_aarch64_ldst_imm12_* - unsigned 12-bit fixups for load and + // store instructions. + fixup_aarch64_ldst_imm12_scale1, + fixup_aarch64_ldst_imm12_scale2, + fixup_aarch64_ldst_imm12_scale4, + fixup_aarch64_ldst_imm12_scale8, + fixup_aarch64_ldst_imm12_scale16, + + // fixup_aarch64_ldr_pcrel_imm19 - The high 19 bits of a 21-bit pc-relative + // immediate. Same encoding as fixup_aarch64_pcrel_adrhi, except this is used by + // pc-relative loads and generates relocations directly when necessary. + fixup_aarch64_ldr_pcrel_imm19, + + // FIXME: comment + fixup_aarch64_movw, + + // fixup_aarch64_pcrel_imm14 - The high 14 bits of a 21-bit pc-relative + // immediate. + fixup_aarch64_pcrel_branch14, + + // fixup_aarch64_pcrel_branch19 - The high 19 bits of a 21-bit pc-relative + // immediate. Same encoding as fixup_aarch64_pcrel_adrhi, except this is use by + // b.cc and generates relocations directly when necessary. + fixup_aarch64_pcrel_branch19, + + // fixup_aarch64_pcrel_branch26 - The high 26 bits of a 28-bit pc-relative + // immediate. + fixup_aarch64_pcrel_branch26, + + // fixup_aarch64_pcrel_call26 - The high 26 bits of a 28-bit pc-relative + // immediate. Distinguished from branch26 only on ELF. + fixup_aarch64_pcrel_call26, + + // fixup_aarch64_tlsdesc_call - zero-space placeholder for the ELF + // R_AARCH64_TLSDESC_CALL relocation. + fixup_aarch64_tlsdesc_call, + + // Marker + LastTargetFixupKind, + NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind +}; + +} // end namespace AArch64 +} // end namespace llvm + +#endif diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp new file mode 100644 index 00000000000..dc4a8bf6c9a --- /dev/null +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp @@ -0,0 +1,99 @@ +//===-- AArch64MCAsmInfo.cpp - AArch64 asm properties ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the declarations of the AArch64MCAsmInfo properties. +// +//===----------------------------------------------------------------------===// + +#include "AArch64MCAsmInfo.h" +#include "llvm/ADT/Triple.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/Support/CommandLine.h" +using namespace llvm; + +enum AsmWriterVariantTy { + Default = -1, + Generic = 0, + Apple = 1 +}; + +static cl::opt AsmWriterVariant( + "aarch64-neon-syntax", cl::init(Default), + cl::desc("Choose style of NEON code to emit from AArch64 backend:"), + cl::values(clEnumValN(Generic, "generic", "Emit generic NEON assembly"), + clEnumValN(Apple, "apple", "Emit Apple-style NEON assembly"), + clEnumValEnd)); + +AArch64MCAsmInfoDarwin::AArch64MCAsmInfoDarwin() { + // We prefer NEON instructions to be printed in the short form. + AssemblerDialect = AsmWriterVariant == Default ? 
1 : AsmWriterVariant; + + PrivateGlobalPrefix = "L"; + SeparatorString = "%%"; + CommentString = ";"; + PointerSize = CalleeSaveStackSlotSize = 8; + + AlignmentIsInBytes = false; + UsesELFSectionDirectiveForBSS = true; + SupportsDebugInformation = true; + UseDataRegionDirectives = true; + + ExceptionsType = ExceptionHandling::DwarfCFI; +} + +const MCExpr *AArch64MCAsmInfoDarwin::getExprForPersonalitySymbol( + const MCSymbol *Sym, unsigned Encoding, MCStreamer &Streamer) const { + // On Darwin, we can reference dwarf symbols with foo@GOT-., which + // is an indirect pc-relative reference. The default implementation + // won't reference using the GOT, so we need this target-specific + // version. + MCContext &Context = Streamer.getContext(); + const MCExpr *Res = + MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOT, Context); + MCSymbol *PCSym = Context.CreateTempSymbol(); + Streamer.EmitLabel(PCSym); + const MCExpr *PC = MCSymbolRefExpr::Create(PCSym, Context); + return MCBinaryExpr::CreateSub(Res, PC, Context); +} + +AArch64MCAsmInfoELF::AArch64MCAsmInfoELF(StringRef TT) { + Triple T(TT); + if (T.getArch() == Triple::arm64_be || T.getArch() == Triple::aarch64_be) + IsLittleEndian = false; + + // We prefer NEON instructions to be printed in the short form. + AssemblerDialect = AsmWriterVariant == Default ? 0 : AsmWriterVariant; + + PointerSize = 8; + + // ".comm align is in bytes but .align is pow-2." + AlignmentIsInBytes = false; + + CommentString = "//"; + PrivateGlobalPrefix = ".L"; + Code32Directive = ".code\t32"; + + Data16bitsDirective = "\t.hword\t"; + Data32bitsDirective = "\t.word\t"; + Data64bitsDirective = "\t.xword\t"; + + UseDataRegionDirectives = false; + + WeakRefDirective = "\t.weak\t"; + + HasLEB128 = true; + SupportsDebugInformation = true; + + // Exceptions handling + ExceptionsType = ExceptionHandling::DwarfCFI; + + UseIntegratedAssembler = true; +} diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h new file mode 100644 index 00000000000..42a031d7c2c --- /dev/null +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h @@ -0,0 +1,36 @@ +//=====-- AArch64MCAsmInfo.h - AArch64 asm properties ---------*- C++ -*--====// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the declaration of the AArch64MCAsmInfo class. 
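// Illustrative example (not from the original commit): the Darwin and ELF
// flavours above mainly differ in dialect and directives; with
// -aarch64-neon-syntax=apple the printer is expected to produce
//
//   fadd.4s v0, v1, v2          // Apple style (dialect 1)
//
// rather than the generic
//
//   fadd v0.4s, v1.4s, v2.4s    // generic style (dialect 0)
//
// while the ELF variant also switches the comment string to "//" and the
// data directives to .hword/.word/.xword.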
+// +//===----------------------------------------------------------------------===// + +#ifndef AArch64TARGETASMINFO_H +#define AArch64TARGETASMINFO_H + +#include "llvm/MC/MCAsmInfoDarwin.h" + +namespace llvm { +class Target; +class StringRef; +class MCStreamer; +struct AArch64MCAsmInfoDarwin : public MCAsmInfoDarwin { + explicit AArch64MCAsmInfoDarwin(); + const MCExpr * + getExprForPersonalitySymbol(const MCSymbol *Sym, unsigned Encoding, + MCStreamer &Streamer) const override; +}; + +struct AArch64MCAsmInfoELF : public MCAsmInfo { + explicit AArch64MCAsmInfoELF(StringRef TT); +}; + +} // namespace llvm + +#endif diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp new file mode 100644 index 00000000000..464a18cdbc0 --- /dev/null +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp @@ -0,0 +1,654 @@ +//=- AArch64/AArch64MCCodeEmitter.cpp - Convert AArch64 code to machine code-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the AArch64MCCodeEmitter class. +// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/AArch64AddressingModes.h" +#include "MCTargetDesc/AArch64FixupKinds.h" +#include "MCTargetDesc/AArch64MCExpr.h" +#include "Utils/AArch64BaseInfo.h" +#include "llvm/MC/MCCodeEmitter.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +#define DEBUG_TYPE "mccodeemitter" + +STATISTIC(MCNumEmitted, "Number of MC instructions emitted."); +STATISTIC(MCNumFixups, "Number of MC fixups created."); + +namespace { + +class AArch64MCCodeEmitter : public MCCodeEmitter { + MCContext &Ctx; + + AArch64MCCodeEmitter(const AArch64MCCodeEmitter &); // DO NOT IMPLEMENT + void operator=(const AArch64MCCodeEmitter &); // DO NOT IMPLEMENT +public: + AArch64MCCodeEmitter(const MCInstrInfo &mcii, const MCSubtargetInfo &sti, + MCContext &ctx) + : Ctx(ctx) {} + + ~AArch64MCCodeEmitter() {} + + // getBinaryCodeForInstr - TableGen'erated function for getting the + // binary encoding for an instruction. + uint64_t getBinaryCodeForInstr(const MCInst &MI, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + + /// getMachineOpValue - Return binary encoding of operand. If the machine + /// operand requires relocation, record the relocation and return zero. + unsigned getMachineOpValue(const MCInst &MI, const MCOperand &MO, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + + /// getLdStUImm12OpValue - Return encoding info for 12-bit unsigned immediate + /// attached to a load, store or prfm instruction. If operand requires a + /// relocation, record it and return zero in that part of the encoding. + template + uint32_t getLdStUImm12OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + + /// getAdrLabelOpValue - Return encoding info for 21-bit immediate ADR label + /// target. 
+ uint32_t getAdrLabelOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + + /// getAddSubImmOpValue - Return encoding for the 12-bit immediate value and + /// the 2-bit shift field. + uint32_t getAddSubImmOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + + /// getCondBranchTargetOpValue - Return the encoded value for a conditional + /// branch target. + uint32_t getCondBranchTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + + /// getLoadLiteralOpValue - Return the encoded value for a load-literal + /// pc-relative address. + uint32_t getLoadLiteralOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + + /// getMemExtendOpValue - Return the encoded value for a reg-extend load/store + /// instruction: bit 0 is whether a shift is present, bit 1 is whether the + /// operation is a sign extend (as opposed to a zero extend). + uint32_t getMemExtendOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + + /// getTestBranchTargetOpValue - Return the encoded value for a test-bit-and- + /// branch target. + uint32_t getTestBranchTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + + /// getBranchTargetOpValue - Return the encoded value for an unconditional + /// branch target. + uint32_t getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + + /// getMoveWideImmOpValue - Return the encoded value for the immediate operand + /// of a MOVZ or MOVK instruction. + uint32_t getMoveWideImmOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + + /// getVecShifterOpValue - Return the encoded value for the vector shifter. + uint32_t getVecShifterOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + + /// getMoveVecShifterOpValue - Return the encoded value for the vector move + /// shifter (MSL). + uint32_t getMoveVecShifterOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + + /// getFixedPointScaleOpValue - Return the encoded value for the + // FP-to-fixed-point scale factor. 
+ uint32_t getFixedPointScaleOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + + uint32_t getVecShiftR64OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + uint32_t getVecShiftR32OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + uint32_t getVecShiftR16OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + uint32_t getVecShiftR8OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + uint32_t getVecShiftL64OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + uint32_t getVecShiftL32OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + uint32_t getVecShiftL16OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + uint32_t getVecShiftL8OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + + /// getSIMDShift64OpValue - Return the encoded value for the + // shift-by-immediate AdvSIMD instructions. + uint32_t getSIMDShift64OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + + uint32_t getSIMDShift64_32OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + + uint32_t getSIMDShift32OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + + uint32_t getSIMDShift16OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + + unsigned fixMOVZ(const MCInst &MI, unsigned EncodedValue, + const MCSubtargetInfo &STI) const; + + void EmitByte(unsigned char C, raw_ostream &OS) const { OS << (char)C; } + + void EmitConstant(uint64_t Val, unsigned Size, raw_ostream &OS) const { + // Output the constant in little endian byte order. + for (unsigned i = 0; i != Size; ++i) { + EmitByte(Val & 255, OS); + Val >>= 8; + } + } + + void EncodeInstruction(const MCInst &MI, raw_ostream &OS, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const override; + + unsigned fixMulHigh(const MCInst &MI, unsigned EncodedValue, + const MCSubtargetInfo &STI) const; + + template unsigned + fixLoadStoreExclusive(const MCInst &MI, unsigned EncodedValue, + const MCSubtargetInfo &STI) const; + + unsigned fixOneOperandFPComparison(const MCInst &MI, unsigned EncodedValue, + const MCSubtargetInfo &STI) const; +}; + +} // end anonymous namespace + +MCCodeEmitter *llvm::createAArch64MCCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, + const MCSubtargetInfo &STI, + MCContext &Ctx) { + return new AArch64MCCodeEmitter(MCII, STI, Ctx); +} + +/// getMachineOpValue - Return binary encoding of operand. If the machine +/// operand requires relocation, record the relocation and return zero. 
+unsigned
+AArch64MCCodeEmitter::getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+                                        SmallVectorImpl<MCFixup> &Fixups,
+                                        const MCSubtargetInfo &STI) const {
+  if (MO.isReg())
+    return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg());
+  else {
+    assert(MO.isImm() && "did not expect relocated expression");
+    return static_cast<unsigned>(MO.getImm());
+  }
+
+  assert(0 && "Unable to encode MCOperand!");
+  return 0;
+}
+
+template<unsigned FixupKind> uint32_t
+AArch64MCCodeEmitter::getLdStUImm12OpValue(const MCInst &MI, unsigned OpIdx,
+                                           SmallVectorImpl<MCFixup> &Fixups,
+                                           const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  uint32_t ImmVal = 0;
+
+  if (MO.isImm())
+    ImmVal = static_cast<uint32_t>(MO.getImm());
+  else {
+    assert(MO.isExpr() && "unable to encode load/store imm operand");
+    MCFixupKind Kind = MCFixupKind(FixupKind);
+    Fixups.push_back(MCFixup::Create(0, MO.getExpr(), Kind, MI.getLoc()));
+    ++MCNumFixups;
+  }
+
+  return ImmVal;
+}
+
+/// getAdrLabelOpValue - Return encoding info for 21-bit immediate ADR label
+/// target.
+uint32_t
+AArch64MCCodeEmitter::getAdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
+                                         SmallVectorImpl<MCFixup> &Fixups,
+                                         const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+
+  // If the destination is an immediate, we have nothing to do.
+  if (MO.isImm())
+    return MO.getImm();
+  assert(MO.isExpr() && "Unexpected target type!");
+  const MCExpr *Expr = MO.getExpr();
+
+  MCFixupKind Kind = MI.getOpcode() == AArch64::ADR
+                         ? MCFixupKind(AArch64::fixup_aarch64_pcrel_adr_imm21)
+                         : MCFixupKind(AArch64::fixup_aarch64_pcrel_adrp_imm21);
+  Fixups.push_back(MCFixup::Create(0, Expr, Kind, MI.getLoc()));
+
+  MCNumFixups += 1;
+
+  // All of the information is in the fixup.
+  return 0;
+}
+
+/// getAddSubImmOpValue - Return encoding for the 12-bit immediate value and
+/// the 2-bit shift field. The shift field is stored in bits 13-14 of the
+/// return value.
+uint32_t
+AArch64MCCodeEmitter::getAddSubImmOpValue(const MCInst &MI, unsigned OpIdx,
+                                          SmallVectorImpl<MCFixup> &Fixups,
+                                          const MCSubtargetInfo &STI) const {
+  // Suboperands are [imm, shifter].
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  const MCOperand &MO1 = MI.getOperand(OpIdx + 1);
+  assert(AArch64_AM::getShiftType(MO1.getImm()) == AArch64_AM::LSL &&
+         "unexpected shift type for add/sub immediate");
+  unsigned ShiftVal = AArch64_AM::getShiftValue(MO1.getImm());
+  assert((ShiftVal == 0 || ShiftVal == 12) &&
+         "unexpected shift value for add/sub immediate");
+  if (MO.isImm())
+    return MO.getImm() | (ShiftVal == 0 ? 0 : (1 << 12));
+  assert(MO.isExpr() && "Unable to encode MCOperand!");
+  const MCExpr *Expr = MO.getExpr();
+
+  // Encode the 12 bits of the fixup.
+  MCFixupKind Kind = MCFixupKind(AArch64::fixup_aarch64_add_imm12);
+  Fixups.push_back(MCFixup::Create(0, Expr, Kind, MI.getLoc()));
+
+  ++MCNumFixups;
+
+  return 0;
+}
+
+/// getCondBranchTargetOpValue - Return the encoded value for a conditional
+/// branch target.
+uint32_t AArch64MCCodeEmitter::getCondBranchTargetOpValue(
+    const MCInst &MI, unsigned OpIdx, SmallVectorImpl<MCFixup> &Fixups,
+    const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+
+  // If the destination is an immediate, we have nothing to do.
+  if (MO.isImm())
+    return MO.getImm();
+  assert(MO.isExpr() && "Unexpected target type!");
+
+  MCFixupKind Kind = MCFixupKind(AArch64::fixup_aarch64_pcrel_branch19);
+  Fixups.push_back(MCFixup::Create(0, MO.getExpr(), Kind, MI.getLoc()));
+
+  ++MCNumFixups;
+
+  // All of the information is in the fixup.
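// Illustrative example (not from the original commit): for "b.ne .Lskip"
// with .Lskip not yet laid out, the operand is an MCExpr, so the code above
// records a fixup_aarch64_pcrel_branch19 against it and returns 0; the 19-bit
// field is filled in later by AArch64AsmBackend::applyFixup(), or by an
// R_AARCH64_CONDBR19 relocation if the target lands in another section.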
+ return 0; +} + +/// getLoadLiteralOpValue - Return the encoded value for a load-literal +/// pc-relative address. +uint32_t +AArch64MCCodeEmitter::getLoadLiteralOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + + // If the destination is an immediate, we have nothing to do. + if (MO.isImm()) + return MO.getImm(); + assert(MO.isExpr() && "Unexpected target type!"); + + MCFixupKind Kind = MCFixupKind(AArch64::fixup_aarch64_ldr_pcrel_imm19); + Fixups.push_back(MCFixup::Create(0, MO.getExpr(), Kind, MI.getLoc())); + + ++MCNumFixups; + + // All of the information is in the fixup. + return 0; +} + +uint32_t +AArch64MCCodeEmitter::getMemExtendOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + unsigned SignExtend = MI.getOperand(OpIdx).getImm(); + unsigned DoShift = MI.getOperand(OpIdx + 1).getImm(); + return (SignExtend << 1) | DoShift; +} + +uint32_t +AArch64MCCodeEmitter::getMoveWideImmOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + + if (MO.isImm()) + return MO.getImm(); + assert(MO.isExpr() && "Unexpected movz/movk immediate"); + + Fixups.push_back(MCFixup::Create( + 0, MO.getExpr(), MCFixupKind(AArch64::fixup_aarch64_movw), MI.getLoc())); + + ++MCNumFixups; + + return 0; +} + +/// getTestBranchTargetOpValue - Return the encoded value for a test-bit-and- +/// branch target. +uint32_t AArch64MCCodeEmitter::getTestBranchTargetOpValue( + const MCInst &MI, unsigned OpIdx, SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + + // If the destination is an immediate, we have nothing to do. + if (MO.isImm()) + return MO.getImm(); + assert(MO.isExpr() && "Unexpected ADR target type!"); + + MCFixupKind Kind = MCFixupKind(AArch64::fixup_aarch64_pcrel_branch14); + Fixups.push_back(MCFixup::Create(0, MO.getExpr(), Kind, MI.getLoc())); + + ++MCNumFixups; + + // All of the information is in the fixup. + return 0; +} + +/// getBranchTargetOpValue - Return the encoded value for an unconditional +/// branch target. +uint32_t +AArch64MCCodeEmitter::getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + + // If the destination is an immediate, we have nothing to do. + if (MO.isImm()) + return MO.getImm(); + assert(MO.isExpr() && "Unexpected ADR target type!"); + + MCFixupKind Kind = MI.getOpcode() == AArch64::BL + ? MCFixupKind(AArch64::fixup_aarch64_pcrel_call26) + : MCFixupKind(AArch64::fixup_aarch64_pcrel_branch26); + Fixups.push_back(MCFixup::Create(0, MO.getExpr(), Kind, MI.getLoc())); + + ++MCNumFixups; + + // All of the information is in the fixup. 
+ return 0; +} + +/// getVecShifterOpValue - Return the encoded value for the vector shifter: +/// +/// 00 -> 0 +/// 01 -> 8 +/// 10 -> 16 +/// 11 -> 24 +uint32_t +AArch64MCCodeEmitter::getVecShifterOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Expected an immediate value for the shift amount!"); + + switch (MO.getImm()) { + default: + break; + case 0: + return 0; + case 8: + return 1; + case 16: + return 2; + case 24: + return 3; + } + + assert(false && "Invalid value for vector shift amount!"); + return 0; +} + +uint32_t +AArch64MCCodeEmitter::getSIMDShift64OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Expected an immediate value for the shift amount!"); + return 64 - (MO.getImm()); +} + +uint32_t AArch64MCCodeEmitter::getSIMDShift64_32OpValue( + const MCInst &MI, unsigned OpIdx, SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Expected an immediate value for the shift amount!"); + return 64 - (MO.getImm() | 32); +} + +uint32_t +AArch64MCCodeEmitter::getSIMDShift32OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Expected an immediate value for the shift amount!"); + return 32 - (MO.getImm() | 16); +} + +uint32_t +AArch64MCCodeEmitter::getSIMDShift16OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Expected an immediate value for the shift amount!"); + return 16 - (MO.getImm() | 8); +} + +/// getFixedPointScaleOpValue - Return the encoded value for the +// FP-to-fixed-point scale factor. 
+uint32_t AArch64MCCodeEmitter::getFixedPointScaleOpValue( + const MCInst &MI, unsigned OpIdx, SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Expected an immediate value for the scale amount!"); + return 64 - MO.getImm(); +} + +uint32_t +AArch64MCCodeEmitter::getVecShiftR64OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Expected an immediate value for the scale amount!"); + return 64 - MO.getImm(); +} + +uint32_t +AArch64MCCodeEmitter::getVecShiftR32OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Expected an immediate value for the scale amount!"); + return 32 - MO.getImm(); +} + +uint32_t +AArch64MCCodeEmitter::getVecShiftR16OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Expected an immediate value for the scale amount!"); + return 16 - MO.getImm(); +} + +uint32_t +AArch64MCCodeEmitter::getVecShiftR8OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Expected an immediate value for the scale amount!"); + return 8 - MO.getImm(); +} + +uint32_t +AArch64MCCodeEmitter::getVecShiftL64OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Expected an immediate value for the scale amount!"); + return MO.getImm() - 64; +} + +uint32_t +AArch64MCCodeEmitter::getVecShiftL32OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Expected an immediate value for the scale amount!"); + return MO.getImm() - 32; +} + +uint32_t +AArch64MCCodeEmitter::getVecShiftL16OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Expected an immediate value for the scale amount!"); + return MO.getImm() - 16; +} + +uint32_t +AArch64MCCodeEmitter::getVecShiftL8OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Expected an immediate value for the scale amount!"); + return MO.getImm() - 8; +} + +/// getMoveVecShifterOpValue - Return the encoded value for the vector move +/// shifter (MSL). +uint32_t AArch64MCCodeEmitter::getMoveVecShifterOpValue( + const MCInst &MI, unsigned OpIdx, SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && + "Expected an immediate value for the move shift amount!"); + unsigned ShiftVal = AArch64_AM::getShiftValue(MO.getImm()); + assert((ShiftVal == 8 || ShiftVal == 16) && "Invalid shift amount!"); + return ShiftVal == 8 ?
0 : 1; +} + +unsigned AArch64MCCodeEmitter::fixMOVZ(const MCInst &MI, unsigned EncodedValue, + const MCSubtargetInfo &STI) const { + // If one of the signed fixup kinds is applied to a MOVZ instruction, the + // eventual result could be either a MOVZ or a MOVN. It's the MCCodeEmitter's + // job to ensure that any bits possibly affected by this are 0. This means we + // must zero out bit 30 (essentially emitting a MOVN). + MCOperand UImm16MO = MI.getOperand(1); + + // Nothing to do if there's no fixup. + if (UImm16MO.isImm()) + return EncodedValue; + + const AArch64MCExpr *A64E = cast(UImm16MO.getExpr()); + switch (A64E->getKind()) { + case AArch64MCExpr::VK_DTPREL_G2: + case AArch64MCExpr::VK_DTPREL_G1: + case AArch64MCExpr::VK_DTPREL_G0: + case AArch64MCExpr::VK_GOTTPREL_G1: + case AArch64MCExpr::VK_TPREL_G2: + case AArch64MCExpr::VK_TPREL_G1: + case AArch64MCExpr::VK_TPREL_G0: + return EncodedValue & ~(1u << 30); + default: + // Nothing to do for an unsigned fixup. + return EncodedValue; + } + + + return EncodedValue & ~(1u << 30); +} + +void AArch64MCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + if (MI.getOpcode() == AArch64::TLSDESCCALL) { + // This is a directive which applies an R_AARCH64_TLSDESC_CALL to the + // following (BLR) instruction. It doesn't emit any code itself so it + // doesn't go through the normal TableGenerated channels. + MCFixupKind Fixup = MCFixupKind(AArch64::fixup_aarch64_tlsdesc_call); + Fixups.push_back(MCFixup::Create(0, MI.getOperand(0).getExpr(), Fixup)); + return; + } + + uint64_t Binary = getBinaryCodeForInstr(MI, Fixups, STI); + EmitConstant(Binary, 4, OS); + ++MCNumEmitted; // Keep track of the # of mi's emitted. +} + +unsigned +AArch64MCCodeEmitter::fixMulHigh(const MCInst &MI, + unsigned EncodedValue, + const MCSubtargetInfo &STI) const { + // The Ra field of SMULH and UMULH is unused: it should be assembled as 31 + // (i.e. all bits 1) but is ignored by the processor. + EncodedValue |= 0x1f << 10; + return EncodedValue; +} + +template unsigned +AArch64MCCodeEmitter::fixLoadStoreExclusive(const MCInst &MI, + unsigned EncodedValue, + const MCSubtargetInfo &STI) const { + if (!hasRs) EncodedValue |= 0x001F0000; + if (!hasRt2) EncodedValue |= 0x00007C00; + + return EncodedValue; +} + +unsigned AArch64MCCodeEmitter::fixOneOperandFPComparison( + const MCInst &MI, unsigned EncodedValue, const MCSubtargetInfo &STI) const { + // The Rm field of FCMP and friends is unused - it should be assembled + // as 0, but is ignored by the processor. + EncodedValue &= ~(0x1f << 16); + return EncodedValue; +} + +#include "AArch64GenMCCodeEmitter.inc" diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp new file mode 100644 index 00000000000..85c3ec7a55f --- /dev/null +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp @@ -0,0 +1,174 @@ +//===-- AArch64MCExpr.cpp - AArch64 specific MC expression classes --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the implementation of the assembly expression modifiers +// accepted by the AArch64 architecture (e.g. ":lo12:", ":gottprel_g1:", ...). 
+// +//===----------------------------------------------------------------------===// + +#include "AArch64MCExpr.h" +#include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCELF.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/MC/MCValue.h" +#include "llvm/Object/ELF.h" +#include "llvm/Support/ErrorHandling.h" + +using namespace llvm; + +#define DEBUG_TYPE "aarch64symbolrefexpr" + +const AArch64MCExpr *AArch64MCExpr::Create(const MCExpr *Expr, VariantKind Kind, + MCContext &Ctx) { + return new (Ctx) AArch64MCExpr(Expr, Kind); +} + +StringRef AArch64MCExpr::getVariantKindName() const { + switch (static_cast(getKind())) { + case VK_CALL: return ""; + case VK_LO12: return ":lo12:"; + case VK_ABS_G3: return ":abs_g3:"; + case VK_ABS_G2: return ":abs_g2:"; + case VK_ABS_G2_S: return ":abs_g2_s:"; + case VK_ABS_G2_NC: return ":abs_g2_nc:"; + case VK_ABS_G1: return ":abs_g1:"; + case VK_ABS_G1_S: return ":abs_g1_s:"; + case VK_ABS_G1_NC: return ":abs_g1_nc:"; + case VK_ABS_G0: return ":abs_g0:"; + case VK_ABS_G0_S: return ":abs_g0_s:"; + case VK_ABS_G0_NC: return ":abs_g0_nc:"; + case VK_DTPREL_G2: return ":dtprel_g2:"; + case VK_DTPREL_G1: return ":dtprel_g1:"; + case VK_DTPREL_G1_NC: return ":dtprel_g1_nc:"; + case VK_DTPREL_G0: return ":dtprel_g0:"; + case VK_DTPREL_G0_NC: return ":dtprel_g0_nc:"; + case VK_DTPREL_HI12: return ":dtprel_hi12:"; + case VK_DTPREL_LO12: return ":dtprel_lo12:"; + case VK_DTPREL_LO12_NC: return ":dtprel_lo12_nc:"; + case VK_TPREL_G2: return ":tprel_g2:"; + case VK_TPREL_G1: return ":tprel_g1:"; + case VK_TPREL_G1_NC: return ":tprel_g1_nc:"; + case VK_TPREL_G0: return ":tprel_g0:"; + case VK_TPREL_G0_NC: return ":tprel_g0_nc:"; + case VK_TPREL_HI12: return ":tprel_hi12:"; + case VK_TPREL_LO12: return ":tprel_lo12:"; + case VK_TPREL_LO12_NC: return ":tprel_lo12_nc:"; + case VK_TLSDESC_LO12: return ":tlsdesc_lo12:"; + case VK_ABS_PAGE: return ""; + case VK_GOT_PAGE: return ":got:"; + case VK_GOT_LO12: return ":got_lo12:"; + case VK_GOTTPREL_PAGE: return ":gottprel:"; + case VK_GOTTPREL_LO12_NC: return ":gottprel_lo12:"; + case VK_GOTTPREL_G1: return ":gottprel_g1:"; + case VK_GOTTPREL_G0_NC: return ":gottprel_g0_nc:"; + case VK_TLSDESC: return ""; + case VK_TLSDESC_PAGE: return ":tlsdesc:"; + default: + llvm_unreachable("Invalid ELF symbol kind"); + } +} + +void AArch64MCExpr::PrintImpl(raw_ostream &OS) const { + if (getKind() != VK_NONE) + OS << getVariantKindName(); + OS << *Expr; +} + +// FIXME: This basically copies MCObjectStreamer::AddValueSymbols. Perhaps +// that method should be made public? +// FIXME: really do above: now that two backends are using it. 
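// Usage sketch (hypothetical symbol "var"; assumes an MCContext Ctx and the
// usual MC headers are in scope):
//   const MCExpr *Ref =
//       MCSymbolRefExpr::Create(Ctx.GetOrCreateSymbol("var"), Ctx);
//   const AArch64MCExpr *Lo12 =
//       AArch64MCExpr::Create(Ref, AArch64MCExpr::VK_LO12, Ctx);
// Printing Lo12 yields ":lo12:var": getVariantKindName() above supplies the
// ":lo12:" prefix and PrintImpl() then prints the wrapped expression.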
+static void AddValueSymbolsImpl(const MCExpr *Value, MCAssembler *Asm) { + switch (Value->getKind()) { + case MCExpr::Target: + llvm_unreachable("Can't handle nested target expr!"); + break; + + case MCExpr::Constant: + break; + + case MCExpr::Binary: { + const MCBinaryExpr *BE = cast(Value); + AddValueSymbolsImpl(BE->getLHS(), Asm); + AddValueSymbolsImpl(BE->getRHS(), Asm); + break; + } + + case MCExpr::SymbolRef: + Asm->getOrCreateSymbolData(cast(Value)->getSymbol()); + break; + + case MCExpr::Unary: + AddValueSymbolsImpl(cast(Value)->getSubExpr(), Asm); + break; + } +} + +void AArch64MCExpr::AddValueSymbols(MCAssembler *Asm) const { + AddValueSymbolsImpl(getSubExpr(), Asm); +} + +const MCSection *AArch64MCExpr::FindAssociatedSection() const { + llvm_unreachable("FIXME: what goes here?"); +} + +bool AArch64MCExpr::EvaluateAsRelocatableImpl(MCValue &Res, + const MCAsmLayout *Layout) const { + if (!getSubExpr()->EvaluateAsRelocatable(Res, Layout)) + return false; + + Res = + MCValue::get(Res.getSymA(), Res.getSymB(), Res.getConstant(), getKind()); + + return true; +} + +static void fixELFSymbolsInTLSFixupsImpl(const MCExpr *Expr, MCAssembler &Asm) { + switch (Expr->getKind()) { + case MCExpr::Target: + llvm_unreachable("Can't handle nested target expression"); + break; + case MCExpr::Constant: + break; + + case MCExpr::Binary: { + const MCBinaryExpr *BE = cast(Expr); + fixELFSymbolsInTLSFixupsImpl(BE->getLHS(), Asm); + fixELFSymbolsInTLSFixupsImpl(BE->getRHS(), Asm); + break; + } + + case MCExpr::SymbolRef: { + // We're known to be under a TLS fixup, so any symbol should be + // modified. There should be only one. + const MCSymbolRefExpr &SymRef = *cast(Expr); + MCSymbolData &SD = Asm.getOrCreateSymbolData(SymRef.getSymbol()); + MCELF::SetType(SD, ELF::STT_TLS); + break; + } + + case MCExpr::Unary: + fixELFSymbolsInTLSFixupsImpl(cast(Expr)->getSubExpr(), Asm); + break; + } +} + +void AArch64MCExpr::fixELFSymbolsInTLSFixups(MCAssembler &Asm) const { + switch (getSymbolLoc(Kind)) { + default: + return; + case VK_DTPREL: + case VK_GOTTPREL: + case VK_TPREL: + case VK_TLSDESC: + break; + } + + fixELFSymbolsInTLSFixupsImpl(getSubExpr(), Asm); +} diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h new file mode 100644 index 00000000000..e869ed0a26a --- /dev/null +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h @@ -0,0 +1,168 @@ +//=--- AArch64MCExpr.h - AArch64 specific MC expression classes ---*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes AArch64-specific MCExprs, used for modifiers like +// ":lo12:" or ":gottprel_g1:". +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_AArch64MCEXPR_H +#define LLVM_AArch64MCEXPR_H + +#include "llvm/MC/MCExpr.h" +#include "llvm/Support/ErrorHandling.h" + +namespace llvm { + +class AArch64MCExpr : public MCTargetExpr { +public: + enum VariantKind { + VK_NONE = 0x000, + + // Symbol locations specifying (roughly speaking) what calculation should be + // performed to construct the final address for the relocated + // symbol. E.g. direct, via the GOT, ... 
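    // Worked example of how the three fields defined in this enum combine:
    //   VK_DTPREL_G1_NC = VK_DTPREL | VK_G1 | VK_NC
    //                   = 0x004 | 0x050 | 0x100 = 0x154
    // and the extractors declared further down recover each piece:
    //   getSymbolLoc(VK_DTPREL_G1_NC)   == VK_DTPREL  (mask 0x00f)
    //   getAddressFrag(VK_DTPREL_G1_NC) == VK_G1      (mask 0x0f0)
    //   isNotChecked(VK_DTPREL_G1_NC)   == true       (bit  0x100)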
+ VK_ABS = 0x001, + VK_SABS = 0x002, + VK_GOT = 0x003, + VK_DTPREL = 0x004, + VK_GOTTPREL = 0x005, + VK_TPREL = 0x006, + VK_TLSDESC = 0x007, + VK_SymLocBits = 0x00f, + + // Variants specifying which part of the final address calculation is + // used. E.g. the low 12 bits for an ADD/LDR, the middle 16 bits for a + // MOVZ/MOVK. + VK_PAGE = 0x010, + VK_PAGEOFF = 0x020, + VK_HI12 = 0x030, + VK_G0 = 0x040, + VK_G1 = 0x050, + VK_G2 = 0x060, + VK_G3 = 0x070, + VK_AddressFragBits = 0x0f0, + + // Whether the final relocation is a checked one (where a linker should + // perform a range-check on the final address) or not. Note that this field + // is unfortunately sometimes omitted from the assembly syntax. E.g. :lo12: + // on its own is a non-checked relocation. We side with ELF on being + // explicit about this! + VK_NC = 0x100, + + // Convenience definitions for referring to specific textual representations + // of relocation specifiers. Note that this means the "_NC" is sometimes + // omitted in line with assembly syntax here (VK_LO12 rather than VK_LO12_NC + // since a user would write ":lo12:"). + VK_CALL = VK_ABS, + VK_ABS_PAGE = VK_ABS | VK_PAGE, + VK_ABS_G3 = VK_ABS | VK_G3, + VK_ABS_G2 = VK_ABS | VK_G2, + VK_ABS_G2_S = VK_SABS | VK_G2, + VK_ABS_G2_NC = VK_ABS | VK_G2 | VK_NC, + VK_ABS_G1 = VK_ABS | VK_G1, + VK_ABS_G1_S = VK_SABS | VK_G1, + VK_ABS_G1_NC = VK_ABS | VK_G1 | VK_NC, + VK_ABS_G0 = VK_ABS | VK_G0, + VK_ABS_G0_S = VK_SABS | VK_G0, + VK_ABS_G0_NC = VK_ABS | VK_G0 | VK_NC, + VK_LO12 = VK_ABS | VK_PAGEOFF | VK_NC, + VK_GOT_LO12 = VK_GOT | VK_PAGEOFF | VK_NC, + VK_GOT_PAGE = VK_GOT | VK_PAGE, + VK_DTPREL_G2 = VK_DTPREL | VK_G2, + VK_DTPREL_G1 = VK_DTPREL | VK_G1, + VK_DTPREL_G1_NC = VK_DTPREL | VK_G1 | VK_NC, + VK_DTPREL_G0 = VK_DTPREL | VK_G0, + VK_DTPREL_G0_NC = VK_DTPREL | VK_G0 | VK_NC, + VK_DTPREL_HI12 = VK_DTPREL | VK_HI12, + VK_DTPREL_LO12 = VK_DTPREL | VK_PAGEOFF, + VK_DTPREL_LO12_NC = VK_DTPREL | VK_PAGEOFF | VK_NC, + VK_GOTTPREL_PAGE = VK_GOTTPREL | VK_PAGE, + VK_GOTTPREL_LO12_NC = VK_GOTTPREL | VK_PAGEOFF | VK_NC, + VK_GOTTPREL_G1 = VK_GOTTPREL | VK_G1, + VK_GOTTPREL_G0_NC = VK_GOTTPREL | VK_G0 | VK_NC, + VK_TPREL_G2 = VK_TPREL | VK_G2, + VK_TPREL_G1 = VK_TPREL | VK_G1, + VK_TPREL_G1_NC = VK_TPREL | VK_G1 | VK_NC, + VK_TPREL_G0 = VK_TPREL | VK_G0, + VK_TPREL_G0_NC = VK_TPREL | VK_G0 | VK_NC, + VK_TPREL_HI12 = VK_TPREL | VK_HI12, + VK_TPREL_LO12 = VK_TPREL | VK_PAGEOFF, + VK_TPREL_LO12_NC = VK_TPREL | VK_PAGEOFF | VK_NC, + VK_TLSDESC_LO12 = VK_TLSDESC | VK_PAGEOFF | VK_NC, + VK_TLSDESC_PAGE = VK_TLSDESC | VK_PAGE, + + VK_INVALID = 0xfff + }; + +private: + const MCExpr *Expr; + const VariantKind Kind; + + explicit AArch64MCExpr(const MCExpr *Expr, VariantKind Kind) + : Expr(Expr), Kind(Kind) {} + +public: + /// @name Construction + /// @{ + + static const AArch64MCExpr *Create(const MCExpr *Expr, VariantKind Kind, + MCContext &Ctx); + + /// @} + /// @name Accessors + /// @{ + + /// Get the kind of this expression. + VariantKind getKind() const { return static_cast(Kind); } + + /// Get the expression this modifier applies to. + const MCExpr *getSubExpr() const { return Expr; } + + /// @} + /// @name VariantKind information extractors. 
+ /// @{ + + static VariantKind getSymbolLoc(VariantKind Kind) { + return static_cast(Kind & VK_SymLocBits); + } + + static VariantKind getAddressFrag(VariantKind Kind) { + return static_cast(Kind & VK_AddressFragBits); + } + + static bool isNotChecked(VariantKind Kind) { return Kind & VK_NC; } + + /// @} + + /// Convert the variant kind into an ELF-appropriate modifier + /// (e.g. ":got:", ":lo12:"). + StringRef getVariantKindName() const; + + void PrintImpl(raw_ostream &OS) const override; + + void AddValueSymbols(MCAssembler *) const override; + + const MCSection *FindAssociatedSection() const override; + + bool EvaluateAsRelocatableImpl(MCValue &Res, + const MCAsmLayout *Layout) const override; + + void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override; + + static bool classof(const MCExpr *E) { + return E->getKind() == MCExpr::Target; + } + + static bool classof(const AArch64MCExpr *) { return true; } + +}; +} // end namespace llvm + +#endif diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp new file mode 100644 index 00000000000..ae698c59f6c --- /dev/null +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp @@ -0,0 +1,225 @@ +//===-- AArch64MCTargetDesc.cpp - AArch64 Target Descriptions ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file provides AArch64 specific target descriptions. +// +//===----------------------------------------------------------------------===// + +#include "AArch64MCTargetDesc.h" +#include "AArch64ELFStreamer.h" +#include "AArch64MCAsmInfo.h" +#include "InstPrinter/AArch64InstPrinter.h" +#include "llvm/MC/MCCodeGenInfo.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/TargetRegistry.h" + +using namespace llvm; + +#define GET_INSTRINFO_MC_DESC +#include "AArch64GenInstrInfo.inc" + +#define GET_SUBTARGETINFO_MC_DESC +#include "AArch64GenSubtargetInfo.inc" + +#define GET_REGINFO_MC_DESC +#include "AArch64GenRegisterInfo.inc" + +static MCInstrInfo *createAArch64MCInstrInfo() { + MCInstrInfo *X = new MCInstrInfo(); + InitAArch64MCInstrInfo(X); + return X; +} + +static MCSubtargetInfo * +createAArch64MCSubtargetInfo(StringRef TT, StringRef CPU, StringRef FS) { + MCSubtargetInfo *X = new MCSubtargetInfo(); + + if (CPU.empty()) + CPU = "generic"; + + InitAArch64MCSubtargetInfo(X, TT, CPU, FS); + return X; +} + +static MCRegisterInfo *createAArch64MCRegisterInfo(StringRef Triple) { + MCRegisterInfo *X = new MCRegisterInfo(); + InitAArch64MCRegisterInfo(X, AArch64::LR); + return X; +} + +static MCAsmInfo *createAArch64MCAsmInfo(const MCRegisterInfo &MRI, + StringRef TT) { + Triple TheTriple(TT); + + MCAsmInfo *MAI; + if (TheTriple.isOSDarwin()) + MAI = new AArch64MCAsmInfoDarwin(); + else { + assert(TheTriple.isOSBinFormatELF() && "Only expect Darwin or ELF"); + MAI = new AArch64MCAsmInfoELF(TT); + } + + // Initial state of the frame pointer is SP. 
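  // (In CFI terms this seeds the rule "CFA = SP + 0"; prologue CFI emitted
  // during codegen overrides it once a frame is set up. AArch64::SP is mapped
  // to its DWARF register number below so the rule survives into
  // .eh_frame / .debug_frame.)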
+ unsigned Reg = MRI.getDwarfRegNum(AArch64::SP, true); + MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(nullptr, Reg, 0); + MAI->addInitialFrameState(Inst); + + return MAI; +} + +static MCCodeGenInfo *createAArch64MCCodeGenInfo(StringRef TT, Reloc::Model RM, + CodeModel::Model CM, + CodeGenOpt::Level OL) { + Triple TheTriple(TT); + assert((TheTriple.isOSBinFormatELF() || TheTriple.isOSBinFormatMachO()) && + "Only expect Darwin and ELF targets"); + + if (CM == CodeModel::Default) + CM = CodeModel::Small; + // The default MCJIT memory managers make no guarantees about where they can + // find an executable page; JITed code needs to be able to refer to globals + // no matter how far away they are. + else if (CM == CodeModel::JITDefault) + CM = CodeModel::Large; + else if (CM != CodeModel::Small && CM != CodeModel::Large) + report_fatal_error( + "Only small and large code models are allowed on AArch64"); + + // AArch64 Darwin is always PIC. + if (TheTriple.isOSDarwin()) + RM = Reloc::PIC_; + // On ELF platforms the default static relocation model has a smart enough + // linker to cope with referencing external symbols defined in a shared + // library. Hence DynamicNoPIC doesn't need to be promoted to PIC. + else if (RM == Reloc::Default || RM == Reloc::DynamicNoPIC) + RM = Reloc::Static; + + MCCodeGenInfo *X = new MCCodeGenInfo(); + X->InitMCCodeGenInfo(RM, CM, OL); + return X; +} + +static MCInstPrinter *createAArch64MCInstPrinter(const Target &T, + unsigned SyntaxVariant, + const MCAsmInfo &MAI, + const MCInstrInfo &MII, + const MCRegisterInfo &MRI, + const MCSubtargetInfo &STI) { + if (SyntaxVariant == 0) + return new AArch64InstPrinter(MAI, MII, MRI, STI); + if (SyntaxVariant == 1) + return new AArch64AppleInstPrinter(MAI, MII, MRI, STI); + + return nullptr; +} + +static MCStreamer *createMCStreamer(const Target &T, StringRef TT, + MCContext &Ctx, MCAsmBackend &TAB, + raw_ostream &OS, MCCodeEmitter *Emitter, + const MCSubtargetInfo &STI, bool RelaxAll, + bool NoExecStack) { + Triple TheTriple(TT); + + if (TheTriple.isOSDarwin()) + return createMachOStreamer(Ctx, TAB, OS, Emitter, RelaxAll, + /*LabelSections*/ true); + + return createAArch64ELFStreamer(Ctx, TAB, OS, Emitter, RelaxAll, NoExecStack); +} + +// Force static initialization. +extern "C" void LLVMInitializeAArch64TargetMC() { + // Register the MC asm info. + RegisterMCAsmInfoFn X(TheAArch64leTarget, createAArch64MCAsmInfo); + RegisterMCAsmInfoFn Y(TheAArch64beTarget, createAArch64MCAsmInfo); + RegisterMCAsmInfoFn Z(TheARM64leTarget, createAArch64MCAsmInfo); + RegisterMCAsmInfoFn W(TheARM64beTarget, createAArch64MCAsmInfo); + + // Register the MC codegen info. + TargetRegistry::RegisterMCCodeGenInfo(TheAArch64leTarget, + createAArch64MCCodeGenInfo); + TargetRegistry::RegisterMCCodeGenInfo(TheAArch64beTarget, + createAArch64MCCodeGenInfo); + TargetRegistry::RegisterMCCodeGenInfo(TheARM64leTarget, + createAArch64MCCodeGenInfo); + TargetRegistry::RegisterMCCodeGenInfo(TheARM64beTarget, + createAArch64MCCodeGenInfo); + + // Register the MC instruction info. + TargetRegistry::RegisterMCInstrInfo(TheAArch64leTarget, + createAArch64MCInstrInfo); + TargetRegistry::RegisterMCInstrInfo(TheAArch64beTarget, + createAArch64MCInstrInfo); + TargetRegistry::RegisterMCInstrInfo(TheARM64leTarget, + createAArch64MCInstrInfo); + TargetRegistry::RegisterMCInstrInfo(TheARM64beTarget, + createAArch64MCInstrInfo); + + // Register the MC register info. 
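  // Consumer-side sketch (standalone MC client; error handling elided; the
  // triple string is illustrative): once these registrations have run, a tool
  // can do
  //   std::string Err;
  //   const Target *T = TargetRegistry::lookupTarget("aarch64-linux-gnu", Err);
  //   MCRegisterInfo *MRI = T->createMCRegInfo("aarch64-linux-gnu");
  //   MCAsmInfo *MAI = T->createMCAsmInfo(*MRI, "aarch64-linux-gnu");
  // and both the "aarch64"/"aarch64_be" and the legacy "arm64"/"arm64_be"
  // target entries resolve to these same AArch64 MC components.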
+ TargetRegistry::RegisterMCRegInfo(TheAArch64leTarget, + createAArch64MCRegisterInfo); + TargetRegistry::RegisterMCRegInfo(TheAArch64beTarget, + createAArch64MCRegisterInfo); + TargetRegistry::RegisterMCRegInfo(TheARM64leTarget, + createAArch64MCRegisterInfo); + TargetRegistry::RegisterMCRegInfo(TheARM64beTarget, + createAArch64MCRegisterInfo); + + // Register the MC subtarget info. + TargetRegistry::RegisterMCSubtargetInfo(TheAArch64leTarget, + createAArch64MCSubtargetInfo); + TargetRegistry::RegisterMCSubtargetInfo(TheAArch64beTarget, + createAArch64MCSubtargetInfo); + TargetRegistry::RegisterMCSubtargetInfo(TheARM64leTarget, + createAArch64MCSubtargetInfo); + TargetRegistry::RegisterMCSubtargetInfo(TheARM64beTarget, + createAArch64MCSubtargetInfo); + + // Register the asm backend. + TargetRegistry::RegisterMCAsmBackend(TheAArch64leTarget, + createAArch64leAsmBackend); + TargetRegistry::RegisterMCAsmBackend(TheAArch64beTarget, + createAArch64beAsmBackend); + TargetRegistry::RegisterMCAsmBackend(TheARM64leTarget, + createAArch64leAsmBackend); + TargetRegistry::RegisterMCAsmBackend(TheARM64beTarget, + createAArch64beAsmBackend); + + // Register the MC Code Emitter + TargetRegistry::RegisterMCCodeEmitter(TheAArch64leTarget, + createAArch64MCCodeEmitter); + TargetRegistry::RegisterMCCodeEmitter(TheAArch64beTarget, + createAArch64MCCodeEmitter); + TargetRegistry::RegisterMCCodeEmitter(TheARM64leTarget, + createAArch64MCCodeEmitter); + TargetRegistry::RegisterMCCodeEmitter(TheARM64beTarget, + createAArch64MCCodeEmitter); + + // Register the object streamer. + TargetRegistry::RegisterMCObjectStreamer(TheAArch64leTarget, + createMCStreamer); + TargetRegistry::RegisterMCObjectStreamer(TheAArch64beTarget, + createMCStreamer); + TargetRegistry::RegisterMCObjectStreamer(TheARM64leTarget, createMCStreamer); + TargetRegistry::RegisterMCObjectStreamer(TheARM64beTarget, createMCStreamer); + + // Register the MCInstPrinter. + TargetRegistry::RegisterMCInstPrinter(TheAArch64leTarget, + createAArch64MCInstPrinter); + TargetRegistry::RegisterMCInstPrinter(TheAArch64beTarget, + createAArch64MCInstPrinter); + TargetRegistry::RegisterMCInstPrinter(TheARM64leTarget, + createAArch64MCInstPrinter); + TargetRegistry::RegisterMCInstPrinter(TheARM64beTarget, + createAArch64MCInstPrinter); +} diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h new file mode 100644 index 00000000000..d886ea23c13 --- /dev/null +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h @@ -0,0 +1,70 @@ +//===-- AArch64MCTargetDesc.h - AArch64 Target Descriptions -----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file provides AArch64 specific target descriptions. 
+// +//===----------------------------------------------------------------------===// + +#ifndef AArch64MCTARGETDESC_H +#define AArch64MCTARGETDESC_H + +#include "llvm/Support/DataTypes.h" +#include + +namespace llvm { +class MCAsmBackend; +class MCCodeEmitter; +class MCContext; +class MCInstrInfo; +class MCRegisterInfo; +class MCObjectWriter; +class MCSubtargetInfo; +class StringRef; +class Target; +class raw_ostream; + +extern Target TheAArch64leTarget; +extern Target TheAArch64beTarget; +extern Target TheARM64leTarget; +extern Target TheARM64beTarget; + +MCCodeEmitter *createAArch64MCCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, + const MCSubtargetInfo &STI, + MCContext &Ctx); +MCAsmBackend *createAArch64leAsmBackend(const Target &T, + const MCRegisterInfo &MRI, StringRef TT, + StringRef CPU); +MCAsmBackend *createAArch64beAsmBackend(const Target &T, + const MCRegisterInfo &MRI, StringRef TT, + StringRef CPU); + +MCObjectWriter *createAArch64ELFObjectWriter(raw_ostream &OS, uint8_t OSABI, + bool IsLittleEndian); + +MCObjectWriter *createAArch64MachObjectWriter(raw_ostream &OS, uint32_t CPUType, + uint32_t CPUSubtype); + +} // End llvm namespace + +// Defines symbolic names for AArch64 registers. This defines a mapping from +// register name to register number. +// +#define GET_REGINFO_ENUM +#include "AArch64GenRegisterInfo.inc" + +// Defines symbolic names for the AArch64 instructions. +// +#define GET_INSTRINFO_ENUM +#include "AArch64GenInstrInfo.inc" + +#define GET_SUBTARGETINFO_ENUM +#include "AArch64GenSubtargetInfo.inc" + +#endif diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp new file mode 100644 index 00000000000..5c86189a6ef --- /dev/null +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp @@ -0,0 +1,396 @@ +//===-- AArch64MachObjectWriter.cpp - ARM Mach Object Writer --------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/AArch64FixupKinds.h" +#include "MCTargetDesc/AArch64MCTargetDesc.h" +#include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCAsmLayout.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCFixup.h" +#include "llvm/MC/MCMachObjectWriter.h" +#include "llvm/MC/MCSectionMachO.h" +#include "llvm/MC/MCValue.h" +#include "llvm/ADT/Twine.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MachO.h" +using namespace llvm; + +namespace { +class AArch64MachObjectWriter : public MCMachObjectTargetWriter { + bool getAArch64FixupKindMachOInfo(const MCFixup &Fixup, unsigned &RelocType, + const MCSymbolRefExpr *Sym, + unsigned &Log2Size, const MCAssembler &Asm); + +public: + AArch64MachObjectWriter(uint32_t CPUType, uint32_t CPUSubtype) + : MCMachObjectTargetWriter(true /* is64Bit */, CPUType, CPUSubtype, + /*UseAggressiveSymbolFolding=*/true) {} + + void RecordRelocation(MachObjectWriter *Writer, const MCAssembler &Asm, + const MCAsmLayout &Layout, const MCFragment *Fragment, + const MCFixup &Fixup, MCValue Target, + uint64_t &FixedValue) override; +}; +} + +bool AArch64MachObjectWriter::getAArch64FixupKindMachOInfo( + const MCFixup &Fixup, unsigned &RelocType, const MCSymbolRefExpr *Sym, + unsigned &Log2Size, const MCAssembler &Asm) { + RelocType = unsigned(MachO::ARM64_RELOC_UNSIGNED); + Log2Size = ~0U; + + switch ((unsigned)Fixup.getKind()) { + default: + return false; + + case FK_Data_1: + Log2Size = llvm::Log2_32(1); + return true; + case FK_Data_2: + Log2Size = llvm::Log2_32(2); + return true; + case FK_Data_4: + Log2Size = llvm::Log2_32(4); + if (Sym->getKind() == MCSymbolRefExpr::VK_GOT) + RelocType = unsigned(MachO::ARM64_RELOC_POINTER_TO_GOT); + return true; + case FK_Data_8: + Log2Size = llvm::Log2_32(8); + if (Sym->getKind() == MCSymbolRefExpr::VK_GOT) + RelocType = unsigned(MachO::ARM64_RELOC_POINTER_TO_GOT); + return true; + case AArch64::fixup_aarch64_add_imm12: + case AArch64::fixup_aarch64_ldst_imm12_scale1: + case AArch64::fixup_aarch64_ldst_imm12_scale2: + case AArch64::fixup_aarch64_ldst_imm12_scale4: + case AArch64::fixup_aarch64_ldst_imm12_scale8: + case AArch64::fixup_aarch64_ldst_imm12_scale16: + Log2Size = llvm::Log2_32(4); + switch (Sym->getKind()) { + default: + assert(0 && "Unexpected symbol reference variant kind!"); + case MCSymbolRefExpr::VK_PAGEOFF: + RelocType = unsigned(MachO::ARM64_RELOC_PAGEOFF12); + return true; + case MCSymbolRefExpr::VK_GOTPAGEOFF: + RelocType = unsigned(MachO::ARM64_RELOC_GOT_LOAD_PAGEOFF12); + return true; + case MCSymbolRefExpr::VK_TLVPPAGEOFF: + RelocType = unsigned(MachO::ARM64_RELOC_TLVP_LOAD_PAGEOFF12); + return true; + } + case AArch64::fixup_aarch64_pcrel_adrp_imm21: + Log2Size = llvm::Log2_32(4); + // This encompasses the relocation for the whole 21-bit value. 
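    // For example (Darwin assembly sketch; "_var" is a hypothetical symbol),
    // the three modifiers handled in the switch below map as:
    //   adrp x8, _var@PAGE      -> ARM64_RELOC_PAGE21
    //   adrp x8, _var@GOTPAGE   -> ARM64_RELOC_GOT_LOAD_PAGE21
    //   adrp x8, _var@TLVPPAGE  -> ARM64_RELOC_TLVP_LOAD_PAGE21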
+ switch (Sym->getKind()) { + default: + Asm.getContext().FatalError(Fixup.getLoc(), + "ADR/ADRP relocations must be GOT relative"); + case MCSymbolRefExpr::VK_PAGE: + RelocType = unsigned(MachO::ARM64_RELOC_PAGE21); + return true; + case MCSymbolRefExpr::VK_GOTPAGE: + RelocType = unsigned(MachO::ARM64_RELOC_GOT_LOAD_PAGE21); + return true; + case MCSymbolRefExpr::VK_TLVPPAGE: + RelocType = unsigned(MachO::ARM64_RELOC_TLVP_LOAD_PAGE21); + return true; + } + return true; + case AArch64::fixup_aarch64_pcrel_branch26: + case AArch64::fixup_aarch64_pcrel_call26: + Log2Size = llvm::Log2_32(4); + RelocType = unsigned(MachO::ARM64_RELOC_BRANCH26); + return true; + } +} + +void AArch64MachObjectWriter::RecordRelocation( + MachObjectWriter *Writer, const MCAssembler &Asm, const MCAsmLayout &Layout, + const MCFragment *Fragment, const MCFixup &Fixup, MCValue Target, + uint64_t &FixedValue) { + unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind()); + + // See . + uint32_t FixupOffset = Layout.getFragmentOffset(Fragment); + unsigned Log2Size = 0; + int64_t Value = 0; + unsigned Index = 0; + unsigned IsExtern = 0; + unsigned Type = 0; + unsigned Kind = Fixup.getKind(); + + FixupOffset += Fixup.getOffset(); + + // AArch64 pcrel relocation addends do not include the section offset. + if (IsPCRel) + FixedValue += FixupOffset; + + // ADRP fixups use relocations for the whole symbol value and only + // put the addend in the instruction itself. Clear out any value the + // generic code figured out from the sybmol definition. + if (Kind == AArch64::fixup_aarch64_pcrel_adrp_imm21) + FixedValue = 0; + + // imm19 relocations are for conditional branches, which require + // assembler local symbols. If we got here, that's not what we have, + // so complain loudly. + if (Kind == AArch64::fixup_aarch64_pcrel_branch19) { + Asm.getContext().FatalError(Fixup.getLoc(), + "conditional branch requires assembler-local" + " label. '" + + Target.getSymA()->getSymbol().getName() + + "' is external."); + return; + } + + // 14-bit branch relocations should only target internal labels, and so + // should never get here. + if (Kind == AArch64::fixup_aarch64_pcrel_branch14) { + Asm.getContext().FatalError(Fixup.getLoc(), + "Invalid relocation on conditional branch!"); + return; + } + + if (!getAArch64FixupKindMachOInfo(Fixup, Type, Target.getSymA(), Log2Size, + Asm)) { + Asm.getContext().FatalError(Fixup.getLoc(), "unknown AArch64 fixup kind!"); + return; + } + + Value = Target.getConstant(); + + if (Target.isAbsolute()) { // constant + // FIXME: Should this always be extern? + // SymbolNum of 0 indicates the absolute section. + Type = MachO::ARM64_RELOC_UNSIGNED; + Index = 0; + + if (IsPCRel) { + IsExtern = 1; + Asm.getContext().FatalError(Fixup.getLoc(), + "PC relative absolute relocation!"); + + // FIXME: x86_64 sets the type to a branch reloc here. Should we do + // something similar? + } + } else if (Target.getSymB()) { // A - B + constant + const MCSymbol *A = &Target.getSymA()->getSymbol(); + const MCSymbolData &A_SD = Asm.getSymbolData(*A); + const MCSymbolData *A_Base = Asm.getAtom(&A_SD); + + const MCSymbol *B = &Target.getSymB()->getSymbol(); + const MCSymbolData &B_SD = Asm.getSymbolData(*B); + const MCSymbolData *B_Base = Asm.getAtom(&B_SD); + + // Check for "_foo@got - .", which comes through here as: + // Ltmp0: + // ... 
_foo@got - Ltmp0 + if (Target.getSymA()->getKind() == MCSymbolRefExpr::VK_GOT && + Target.getSymB()->getKind() == MCSymbolRefExpr::VK_None && + Layout.getSymbolOffset(&B_SD) == + Layout.getFragmentOffset(Fragment) + Fixup.getOffset()) { + // SymB is the PC, so use a PC-rel pointer-to-GOT relocation. + Index = A_Base->getIndex(); + IsExtern = 1; + Type = MachO::ARM64_RELOC_POINTER_TO_GOT; + IsPCRel = 1; + MachO::any_relocation_info MRE; + MRE.r_word0 = FixupOffset; + MRE.r_word1 = ((Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | + (IsExtern << 27) | (Type << 28)); + Writer->addRelocation(Fragment->getParent(), MRE); + return; + } else if (Target.getSymA()->getKind() != MCSymbolRefExpr::VK_None || + Target.getSymB()->getKind() != MCSymbolRefExpr::VK_None) + // Otherwise, neither symbol can be modified. + Asm.getContext().FatalError(Fixup.getLoc(), + "unsupported relocation of modified symbol"); + + // We don't support PCrel relocations of differences. + if (IsPCRel) + Asm.getContext().FatalError(Fixup.getLoc(), + "unsupported pc-relative relocation of " + "difference"); + + // AArch64 always uses external relocations. If there is no symbol to use as + // a base address (a local symbol with no preceding non-local symbol), + // error out. + // + // FIXME: We should probably just synthesize an external symbol and use + // that. + if (!A_Base) + Asm.getContext().FatalError( + Fixup.getLoc(), + "unsupported relocation of local symbol '" + A->getName() + + "'. Must have non-local symbol earlier in section."); + if (!B_Base) + Asm.getContext().FatalError( + Fixup.getLoc(), + "unsupported relocation of local symbol '" + B->getName() + + "'. Must have non-local symbol earlier in section."); + + if (A_Base == B_Base && A_Base) + Asm.getContext().FatalError(Fixup.getLoc(), + "unsupported relocation with identical base"); + + Value += (!A_SD.getFragment() ? 0 + : Writer->getSymbolAddress(&A_SD, Layout)) - + (!A_Base || !A_Base->getFragment() + ? 0 + : Writer->getSymbolAddress(A_Base, Layout)); + Value -= (!B_SD.getFragment() ? 0 + : Writer->getSymbolAddress(&B_SD, Layout)) - + (!B_Base || !B_Base->getFragment() + ? 0 + : Writer->getSymbolAddress(B_Base, Layout)); + + Index = A_Base->getIndex(); + IsExtern = 1; + Type = MachO::ARM64_RELOC_UNSIGNED; + + MachO::any_relocation_info MRE; + MRE.r_word0 = FixupOffset; + MRE.r_word1 = ((Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | + (IsExtern << 27) | (Type << 28)); + Writer->addRelocation(Fragment->getParent(), MRE); + + Index = B_Base->getIndex(); + IsExtern = 1; + Type = MachO::ARM64_RELOC_SUBTRACTOR; + } else { // A + constant + const MCSymbol *Symbol = &Target.getSymA()->getSymbol(); + const MCSymbolData &SD = Asm.getSymbolData(*Symbol); + const MCSymbolData *Base = Asm.getAtom(&SD); + const MCSectionMachO &Section = static_cast( + Fragment->getParent()->getSection()); + + // If the symbol is a variable and we weren't able to get a Base for it + // (i.e., it's not in the symbol table associated with a section) resolve + // the relocation based its expansion instead. + if (Symbol->isVariable() && !Base) { + // If the evaluation is an absolute value, just use that directly + // to keep things easy. + int64_t Res; + if (SD.getSymbol().getVariableValue()->EvaluateAsAbsolute( + Res, Layout, Writer->getSectionAddressMap())) { + FixedValue = Res; + return; + } + + // FIXME: Will the Target we already have ever have any data in it + // we need to preserve and merge with the new Target? How about + // the FixedValue? 
+ if (!Symbol->getVariableValue()->EvaluateAsRelocatable(Target, &Layout)) + Asm.getContext().FatalError(Fixup.getLoc(), + "unable to resolve variable '" + + Symbol->getName() + "'"); + return RecordRelocation(Writer, Asm, Layout, Fragment, Fixup, Target, + FixedValue); + } + + // Relocations inside debug sections always use local relocations when + // possible. This seems to be done because the debugger doesn't fully + // understand relocation entries and expects to find values that + // have already been fixed up. + if (Symbol->isInSection()) { + if (Section.hasAttribute(MachO::S_ATTR_DEBUG)) + Base = nullptr; + } + + // AArch64 uses external relocations as much as possible. For debug + // sections, and for pointer-sized relocations (.quad), we allow section + // relocations. It's code sections that run into trouble. + if (Base) { + Index = Base->getIndex(); + IsExtern = 1; + + // Add the local offset, if needed. + if (Base != &SD) + Value += Layout.getSymbolOffset(&SD) - Layout.getSymbolOffset(Base); + } else if (Symbol->isInSection()) { + // Pointer-sized relocations can use a local relocation. Otherwise, + // we have to be in a debug info section. + if (!Section.hasAttribute(MachO::S_ATTR_DEBUG) && Log2Size != 3) + Asm.getContext().FatalError( + Fixup.getLoc(), + "unsupported relocation of local symbol '" + Symbol->getName() + + "'. Must have non-local symbol earlier in section."); + // Adjust the relocation to be section-relative. + // The index is the section ordinal (1-based). + const MCSectionData &SymSD = + Asm.getSectionData(SD.getSymbol().getSection()); + Index = SymSD.getOrdinal() + 1; + IsExtern = 0; + Value += Writer->getSymbolAddress(&SD, Layout); + + if (IsPCRel) + Value -= Writer->getFragmentAddress(Fragment, Layout) + + Fixup.getOffset() + (1ULL << Log2Size); + } else { + // Resolve constant variables. + if (SD.getSymbol().isVariable()) { + int64_t Res; + if (SD.getSymbol().getVariableValue()->EvaluateAsAbsolute( + Res, Layout, Writer->getSectionAddressMap())) { + FixedValue = Res; + return; + } + } + Asm.getContext().FatalError(Fixup.getLoc(), + "unsupported relocation of variable '" + + Symbol->getName() + "'"); + } + } + + // If the relocation kind is Branch26, Page21, or Pageoff12, any addend + // is represented via an Addend relocation, not encoded directly into + // the instruction. + if ((Type == MachO::ARM64_RELOC_BRANCH26 || + Type == MachO::ARM64_RELOC_PAGE21 || + Type == MachO::ARM64_RELOC_PAGEOFF12) && + Value) { + assert((Value & 0xff000000) == 0 && "Added relocation out of range!"); + + MachO::any_relocation_info MRE; + MRE.r_word0 = FixupOffset; + MRE.r_word1 = ((Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | + (IsExtern << 27) | (Type << 28)); + Writer->addRelocation(Fragment->getParent(), MRE); + + // Now set up the Addend relocation. + Type = MachO::ARM64_RELOC_ADDEND; + Index = Value; + IsPCRel = 0; + Log2Size = 2; + IsExtern = 0; + + // Put zero into the instruction itself. The addend is in the relocation. + Value = 0; + } + + // If there's any addend left to handle, encode it in the instruction. 
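  // (Worked sketch for a hypothetical "bl _foo+20": the +20 cannot be encoded
  // in the BRANCH26 instruction word, so the block above has already emitted
  // the ARM64_RELOC_BRANCH26 entry and re-purposed Type/Index so that the
  // relocation written just below becomes an ARM64_RELOC_ADDEND carrying 20 in
  // its symbol-index field; Value was cleared, so zero is what ends up in the
  // instruction itself.)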
+ FixedValue = Value; + + // struct relocation_info (8 bytes) + MachO::any_relocation_info MRE; + MRE.r_word0 = FixupOffset; + MRE.r_word1 = ((Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | + (IsExtern << 27) | (Type << 28)); + Writer->addRelocation(Fragment->getParent(), MRE); +} + +MCObjectWriter *llvm::createAArch64MachObjectWriter(raw_ostream &OS, + uint32_t CPUType, + uint32_t CPUSubtype) { + return createMachObjectWriter( + new AArch64MachObjectWriter(CPUType, CPUSubtype), OS, + /*IsLittleEndian=*/true); +} diff --git a/lib/Target/AArch64/MCTargetDesc/CMakeLists.txt b/lib/Target/AArch64/MCTargetDesc/CMakeLists.txt new file mode 100644 index 00000000000..7d5bced17a6 --- /dev/null +++ b/lib/Target/AArch64/MCTargetDesc/CMakeLists.txt @@ -0,0 +1,14 @@ +add_llvm_library(LLVMAArch64Desc + AArch64AsmBackend.cpp + AArch64ELFObjectWriter.cpp + AArch64ELFStreamer.cpp + AArch64MCAsmInfo.cpp + AArch64MCCodeEmitter.cpp + AArch64MCExpr.cpp + AArch64MCTargetDesc.cpp + AArch64MachObjectWriter.cpp +) +add_dependencies(LLVMAArch64Desc AArch64CommonTableGen) + +# Hack: we need to include 'main' target directory to grab private headers +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/.. ${CMAKE_CURRENT_BINARY_DIR}/..) diff --git a/lib/Target/AArch64/MCTargetDesc/LLVMBuild.txt b/lib/Target/AArch64/MCTargetDesc/LLVMBuild.txt new file mode 100644 index 00000000000..70cff0b704f --- /dev/null +++ b/lib/Target/AArch64/MCTargetDesc/LLVMBuild.txt @@ -0,0 +1,24 @@ +;===- ./lib/Target/AArch64/MCTargetDesc/LLVMBuild.txt ------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = AArch64Desc +parent = AArch64 +required_libraries = AArch64AsmPrinter AArch64Info MC Support +add_to_library_groups = AArch64 + diff --git a/lib/Target/AArch64/MCTargetDesc/Makefile b/lib/Target/AArch64/MCTargetDesc/Makefile new file mode 100644 index 00000000000..5779ac5ac60 --- /dev/null +++ b/lib/Target/AArch64/MCTargetDesc/Makefile @@ -0,0 +1,16 @@ +##===- lib/Target/AArch64/TargetDesc/Makefile --------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## + +LEVEL = ../../../.. +LIBRARYNAME = LLVMAArch64Desc + +# Hack: we need to include 'main' target directory to grab private headers +CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. + +include $(LEVEL)/Makefile.common diff --git a/lib/Target/AArch64/Makefile b/lib/Target/AArch64/Makefile new file mode 100644 index 00000000000..f356c585041 --- /dev/null +++ b/lib/Target/AArch64/Makefile @@ -0,0 +1,25 @@ +##===- lib/Target/AArch64/Makefile -------------------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## + +LEVEL = ../../.. 
+LIBRARYNAME = LLVMAArch64CodeGen +TARGET = AArch64 + +# Make sure that tblgen is run, first thing. +BUILT_SOURCES = AArch64GenRegisterInfo.inc AArch64GenInstrInfo.inc \ + AArch64GenAsmWriter.inc AArch64GenAsmWriter1.inc \ + AArch64GenDAGISel.inc \ + AArch64GenCallingConv.inc AArch64GenAsmMatcher.inc \ + AArch64GenSubtargetInfo.inc AArch64GenMCCodeEmitter.inc \ + AArch64GenFastISel.inc AArch64GenDisassemblerTables.inc \ + AArch64GenMCPseudoLowering.inc + +DIRS = TargetInfo InstPrinter AsmParser Disassembler MCTargetDesc Utils + +include $(LEVEL)/Makefile.common diff --git a/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp b/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp new file mode 100644 index 00000000000..3a382c165e7 --- /dev/null +++ b/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp @@ -0,0 +1,31 @@ +//===-- AArch64TargetInfo.cpp - AArch64 Target Implementation -----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/Triple.h" +#include "llvm/Support/TargetRegistry.h" +using namespace llvm; + +namespace llvm { +Target TheAArch64leTarget; +Target TheAArch64beTarget; +Target TheARM64leTarget; +Target TheARM64beTarget; +} // end namespace llvm + +extern "C" void LLVMInitializeAArch64TargetInfo() { + RegisterTarget X(TheARM64leTarget, "arm64", + "AArch64 (little endian)"); + RegisterTarget Y(TheARM64beTarget, "arm64_be", + "AArch64 (big endian)"); + + RegisterTarget Z( + TheAArch64leTarget, "aarch64", "AArch64 (little endian)"); + RegisterTarget W( + TheAArch64beTarget, "aarch64_be", "AArch64 (big endian)"); +} diff --git a/lib/Target/AArch64/TargetInfo/CMakeLists.txt b/lib/Target/AArch64/TargetInfo/CMakeLists.txt new file mode 100644 index 00000000000..e236eed00be --- /dev/null +++ b/lib/Target/AArch64/TargetInfo/CMakeLists.txt @@ -0,0 +1,7 @@ +include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. ) + +add_llvm_library(LLVMAArch64Info + AArch64TargetInfo.cpp + ) + +add_dependencies(LLVMAArch64Info AArch64CommonTableGen) diff --git a/lib/Target/AArch64/TargetInfo/LLVMBuild.txt b/lib/Target/AArch64/TargetInfo/LLVMBuild.txt new file mode 100644 index 00000000000..93c5407bb1f --- /dev/null +++ b/lib/Target/AArch64/TargetInfo/LLVMBuild.txt @@ -0,0 +1,23 @@ +;===- ./lib/Target/AArch64/TargetInfo/LLVMBuild.txt --------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. 
+; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = AArch64Info +parent = AArch64 +required_libraries = Support +add_to_library_groups = AArch64 diff --git a/lib/Target/AArch64/TargetInfo/Makefile b/lib/Target/AArch64/TargetInfo/Makefile new file mode 100644 index 00000000000..9dc9aa4bccf --- /dev/null +++ b/lib/Target/AArch64/TargetInfo/Makefile @@ -0,0 +1,15 @@ +##===- lib/Target/AArch64/TargetInfo/Makefile --------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## +LEVEL = ../../../.. +LIBRARYNAME = LLVMAArch64Info + +# Hack: we need to include 'main' target directory to grab private headers +CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. + +include $(LEVEL)/Makefile.common diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp new file mode 100644 index 00000000000..3c24bb30a26 --- /dev/null +++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp @@ -0,0 +1,901 @@ +//===-- AArch64BaseInfo.cpp - AArch64 Base encoding information------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file provides basic encoding and assembly information for AArch64. +// +//===----------------------------------------------------------------------===// +#include "AArch64BaseInfo.h" +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Support/Regex.h" + +using namespace llvm; + +StringRef AArch64NamedImmMapper::toString(uint32_t Value, bool &Valid) const { + for (unsigned i = 0; i < NumPairs; ++i) { + if (Pairs[i].Value == Value) { + Valid = true; + return Pairs[i].Name; + } + } + + Valid = false; + return StringRef(); +} + +uint32_t AArch64NamedImmMapper::fromString(StringRef Name, bool &Valid) const { + std::string LowerCaseName = Name.lower(); + for (unsigned i = 0; i < NumPairs; ++i) { + if (Pairs[i].Name == LowerCaseName) { + Valid = true; + return Pairs[i].Value; + } + } + + Valid = false; + return -1; +} + +bool AArch64NamedImmMapper::validImm(uint32_t Value) const { + return Value < TooBigImm; +} + +const AArch64NamedImmMapper::Mapping AArch64AT::ATMapper::ATPairs[] = { + {"s1e1r", S1E1R}, + {"s1e2r", S1E2R}, + {"s1e3r", S1E3R}, + {"s1e1w", S1E1W}, + {"s1e2w", S1E2W}, + {"s1e3w", S1E3W}, + {"s1e0r", S1E0R}, + {"s1e0w", S1E0W}, + {"s12e1r", S12E1R}, + {"s12e1w", S12E1W}, + {"s12e0r", S12E0R}, + {"s12e0w", S12E0W}, +}; + +AArch64AT::ATMapper::ATMapper() + : AArch64NamedImmMapper(ATPairs, 0) {} + +const AArch64NamedImmMapper::Mapping AArch64DB::DBarrierMapper::DBarrierPairs[] = { + {"oshld", OSHLD}, + {"oshst", OSHST}, + {"osh", OSH}, + {"nshld", NSHLD}, + {"nshst", NSHST}, + {"nsh", NSH}, + {"ishld", ISHLD}, + {"ishst", ISHST}, + {"ish", ISH}, + {"ld", LD}, + {"st", ST}, + {"sy", SY} +}; + +AArch64DB::DBarrierMapper::DBarrierMapper() + : AArch64NamedImmMapper(DBarrierPairs, 16u) {} + +const AArch64NamedImmMapper::Mapping AArch64DC::DCMapper::DCPairs[] = { + {"zva", ZVA}, + 
{"ivac", IVAC}, + {"isw", ISW}, + {"cvac", CVAC}, + {"csw", CSW}, + {"cvau", CVAU}, + {"civac", CIVAC}, + {"cisw", CISW} +}; + +AArch64DC::DCMapper::DCMapper() + : AArch64NamedImmMapper(DCPairs, 0) {} + +const AArch64NamedImmMapper::Mapping AArch64IC::ICMapper::ICPairs[] = { + {"ialluis", IALLUIS}, + {"iallu", IALLU}, + {"ivau", IVAU} +}; + +AArch64IC::ICMapper::ICMapper() + : AArch64NamedImmMapper(ICPairs, 0) {} + +const AArch64NamedImmMapper::Mapping AArch64ISB::ISBMapper::ISBPairs[] = { + {"sy", SY}, +}; + +AArch64ISB::ISBMapper::ISBMapper() + : AArch64NamedImmMapper(ISBPairs, 16) {} + +const AArch64NamedImmMapper::Mapping AArch64PRFM::PRFMMapper::PRFMPairs[] = { + {"pldl1keep", PLDL1KEEP}, + {"pldl1strm", PLDL1STRM}, + {"pldl2keep", PLDL2KEEP}, + {"pldl2strm", PLDL2STRM}, + {"pldl3keep", PLDL3KEEP}, + {"pldl3strm", PLDL3STRM}, + {"plil1keep", PLIL1KEEP}, + {"plil1strm", PLIL1STRM}, + {"plil2keep", PLIL2KEEP}, + {"plil2strm", PLIL2STRM}, + {"plil3keep", PLIL3KEEP}, + {"plil3strm", PLIL3STRM}, + {"pstl1keep", PSTL1KEEP}, + {"pstl1strm", PSTL1STRM}, + {"pstl2keep", PSTL2KEEP}, + {"pstl2strm", PSTL2STRM}, + {"pstl3keep", PSTL3KEEP}, + {"pstl3strm", PSTL3STRM} +}; + +AArch64PRFM::PRFMMapper::PRFMMapper() + : AArch64NamedImmMapper(PRFMPairs, 32) {} + +const AArch64NamedImmMapper::Mapping AArch64PState::PStateMapper::PStatePairs[] = { + {"spsel", SPSel}, + {"daifset", DAIFSet}, + {"daifclr", DAIFClr} +}; + +AArch64PState::PStateMapper::PStateMapper() + : AArch64NamedImmMapper(PStatePairs, 0) {} + +const AArch64NamedImmMapper::Mapping AArch64SysReg::MRSMapper::MRSPairs[] = { + {"mdccsr_el0", MDCCSR_EL0}, + {"dbgdtrrx_el0", DBGDTRRX_EL0}, + {"mdrar_el1", MDRAR_EL1}, + {"oslsr_el1", OSLSR_EL1}, + {"dbgauthstatus_el1", DBGAUTHSTATUS_EL1}, + {"pmceid0_el0", PMCEID0_EL0}, + {"pmceid1_el0", PMCEID1_EL0}, + {"midr_el1", MIDR_EL1}, + {"ccsidr_el1", CCSIDR_EL1}, + {"clidr_el1", CLIDR_EL1}, + {"ctr_el0", CTR_EL0}, + {"mpidr_el1", MPIDR_EL1}, + {"revidr_el1", REVIDR_EL1}, + {"aidr_el1", AIDR_EL1}, + {"dczid_el0", DCZID_EL0}, + {"id_pfr0_el1", ID_PFR0_EL1}, + {"id_pfr1_el1", ID_PFR1_EL1}, + {"id_dfr0_el1", ID_DFR0_EL1}, + {"id_afr0_el1", ID_AFR0_EL1}, + {"id_mmfr0_el1", ID_MMFR0_EL1}, + {"id_mmfr1_el1", ID_MMFR1_EL1}, + {"id_mmfr2_el1", ID_MMFR2_EL1}, + {"id_mmfr3_el1", ID_MMFR3_EL1}, + {"id_isar0_el1", ID_ISAR0_EL1}, + {"id_isar1_el1", ID_ISAR1_EL1}, + {"id_isar2_el1", ID_ISAR2_EL1}, + {"id_isar3_el1", ID_ISAR3_EL1}, + {"id_isar4_el1", ID_ISAR4_EL1}, + {"id_isar5_el1", ID_ISAR5_EL1}, + {"id_aa64pfr0_el1", ID_A64PFR0_EL1}, + {"id_aa64pfr1_el1", ID_A64PFR1_EL1}, + {"id_aa64dfr0_el1", ID_A64DFR0_EL1}, + {"id_aa64dfr1_el1", ID_A64DFR1_EL1}, + {"id_aa64afr0_el1", ID_A64AFR0_EL1}, + {"id_aa64afr1_el1", ID_A64AFR1_EL1}, + {"id_aa64isar0_el1", ID_A64ISAR0_EL1}, + {"id_aa64isar1_el1", ID_A64ISAR1_EL1}, + {"id_aa64mmfr0_el1", ID_A64MMFR0_EL1}, + {"id_aa64mmfr1_el1", ID_A64MMFR1_EL1}, + {"mvfr0_el1", MVFR0_EL1}, + {"mvfr1_el1", MVFR1_EL1}, + {"mvfr2_el1", MVFR2_EL1}, + {"rvbar_el1", RVBAR_EL1}, + {"rvbar_el2", RVBAR_EL2}, + {"rvbar_el3", RVBAR_EL3}, + {"isr_el1", ISR_EL1}, + {"cntpct_el0", CNTPCT_EL0}, + {"cntvct_el0", CNTVCT_EL0}, + + // Trace registers + {"trcstatr", TRCSTATR}, + {"trcidr8", TRCIDR8}, + {"trcidr9", TRCIDR9}, + {"trcidr10", TRCIDR10}, + {"trcidr11", TRCIDR11}, + {"trcidr12", TRCIDR12}, + {"trcidr13", TRCIDR13}, + {"trcidr0", TRCIDR0}, + {"trcidr1", TRCIDR1}, + {"trcidr2", TRCIDR2}, + {"trcidr3", TRCIDR3}, + {"trcidr4", TRCIDR4}, + {"trcidr5", TRCIDR5}, + {"trcidr6", TRCIDR6}, + {"trcidr7", 
TRCIDR7}, + {"trcoslsr", TRCOSLSR}, + {"trcpdsr", TRCPDSR}, + {"trcdevaff0", TRCDEVAFF0}, + {"trcdevaff1", TRCDEVAFF1}, + {"trclsr", TRCLSR}, + {"trcauthstatus", TRCAUTHSTATUS}, + {"trcdevarch", TRCDEVARCH}, + {"trcdevid", TRCDEVID}, + {"trcdevtype", TRCDEVTYPE}, + {"trcpidr4", TRCPIDR4}, + {"trcpidr5", TRCPIDR5}, + {"trcpidr6", TRCPIDR6}, + {"trcpidr7", TRCPIDR7}, + {"trcpidr0", TRCPIDR0}, + {"trcpidr1", TRCPIDR1}, + {"trcpidr2", TRCPIDR2}, + {"trcpidr3", TRCPIDR3}, + {"trccidr0", TRCCIDR0}, + {"trccidr1", TRCCIDR1}, + {"trccidr2", TRCCIDR2}, + {"trccidr3", TRCCIDR3}, + + // GICv3 registers + {"icc_iar1_el1", ICC_IAR1_EL1}, + {"icc_iar0_el1", ICC_IAR0_EL1}, + {"icc_hppir1_el1", ICC_HPPIR1_EL1}, + {"icc_hppir0_el1", ICC_HPPIR0_EL1}, + {"icc_rpr_el1", ICC_RPR_EL1}, + {"ich_vtr_el2", ICH_VTR_EL2}, + {"ich_eisr_el2", ICH_EISR_EL2}, + {"ich_elsr_el2", ICH_ELSR_EL2} +}; + +AArch64SysReg::MRSMapper::MRSMapper(uint64_t FeatureBits) + : SysRegMapper(FeatureBits) { + InstPairs = &MRSPairs[0]; + NumInstPairs = llvm::array_lengthof(MRSPairs); +} + +const AArch64NamedImmMapper::Mapping AArch64SysReg::MSRMapper::MSRPairs[] = { + {"dbgdtrtx_el0", DBGDTRTX_EL0}, + {"oslar_el1", OSLAR_EL1}, + {"pmswinc_el0", PMSWINC_EL0}, + + // Trace registers + {"trcoslar", TRCOSLAR}, + {"trclar", TRCLAR}, + + // GICv3 registers + {"icc_eoir1_el1", ICC_EOIR1_EL1}, + {"icc_eoir0_el1", ICC_EOIR0_EL1}, + {"icc_dir_el1", ICC_DIR_EL1}, + {"icc_sgi1r_el1", ICC_SGI1R_EL1}, + {"icc_asgi1r_el1", ICC_ASGI1R_EL1}, + {"icc_sgi0r_el1", ICC_SGI0R_EL1} +}; + +AArch64SysReg::MSRMapper::MSRMapper(uint64_t FeatureBits) + : SysRegMapper(FeatureBits) { + InstPairs = &MSRPairs[0]; + NumInstPairs = llvm::array_lengthof(MSRPairs); +} + + +const AArch64NamedImmMapper::Mapping AArch64SysReg::SysRegMapper::SysRegPairs[] = { + {"osdtrrx_el1", OSDTRRX_EL1}, + {"osdtrtx_el1", OSDTRTX_EL1}, + {"teecr32_el1", TEECR32_EL1}, + {"mdccint_el1", MDCCINT_EL1}, + {"mdscr_el1", MDSCR_EL1}, + {"dbgdtr_el0", DBGDTR_EL0}, + {"oseccr_el1", OSECCR_EL1}, + {"dbgvcr32_el2", DBGVCR32_EL2}, + {"dbgbvr0_el1", DBGBVR0_EL1}, + {"dbgbvr1_el1", DBGBVR1_EL1}, + {"dbgbvr2_el1", DBGBVR2_EL1}, + {"dbgbvr3_el1", DBGBVR3_EL1}, + {"dbgbvr4_el1", DBGBVR4_EL1}, + {"dbgbvr5_el1", DBGBVR5_EL1}, + {"dbgbvr6_el1", DBGBVR6_EL1}, + {"dbgbvr7_el1", DBGBVR7_EL1}, + {"dbgbvr8_el1", DBGBVR8_EL1}, + {"dbgbvr9_el1", DBGBVR9_EL1}, + {"dbgbvr10_el1", DBGBVR10_EL1}, + {"dbgbvr11_el1", DBGBVR11_EL1}, + {"dbgbvr12_el1", DBGBVR12_EL1}, + {"dbgbvr13_el1", DBGBVR13_EL1}, + {"dbgbvr14_el1", DBGBVR14_EL1}, + {"dbgbvr15_el1", DBGBVR15_EL1}, + {"dbgbcr0_el1", DBGBCR0_EL1}, + {"dbgbcr1_el1", DBGBCR1_EL1}, + {"dbgbcr2_el1", DBGBCR2_EL1}, + {"dbgbcr3_el1", DBGBCR3_EL1}, + {"dbgbcr4_el1", DBGBCR4_EL1}, + {"dbgbcr5_el1", DBGBCR5_EL1}, + {"dbgbcr6_el1", DBGBCR6_EL1}, + {"dbgbcr7_el1", DBGBCR7_EL1}, + {"dbgbcr8_el1", DBGBCR8_EL1}, + {"dbgbcr9_el1", DBGBCR9_EL1}, + {"dbgbcr10_el1", DBGBCR10_EL1}, + {"dbgbcr11_el1", DBGBCR11_EL1}, + {"dbgbcr12_el1", DBGBCR12_EL1}, + {"dbgbcr13_el1", DBGBCR13_EL1}, + {"dbgbcr14_el1", DBGBCR14_EL1}, + {"dbgbcr15_el1", DBGBCR15_EL1}, + {"dbgwvr0_el1", DBGWVR0_EL1}, + {"dbgwvr1_el1", DBGWVR1_EL1}, + {"dbgwvr2_el1", DBGWVR2_EL1}, + {"dbgwvr3_el1", DBGWVR3_EL1}, + {"dbgwvr4_el1", DBGWVR4_EL1}, + {"dbgwvr5_el1", DBGWVR5_EL1}, + {"dbgwvr6_el1", DBGWVR6_EL1}, + {"dbgwvr7_el1", DBGWVR7_EL1}, + {"dbgwvr8_el1", DBGWVR8_EL1}, + {"dbgwvr9_el1", DBGWVR9_EL1}, + {"dbgwvr10_el1", DBGWVR10_EL1}, + {"dbgwvr11_el1", DBGWVR11_EL1}, + {"dbgwvr12_el1", DBGWVR12_EL1}, + {"dbgwvr13_el1", DBGWVR13_EL1}, + 
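  // (Lookup sketch: MRS/MSR operands reach this shared table through
  //  AArch64SysReg::SysRegMapper::fromString(), which, like the
  //  AArch64NamedImmMapper::fromString() above, lower-cases the spelling and
  //  does a linear scan, so an operand written as DBGWVR13_EL1 or
  //  dbgwvr13_el1 resolves to the DBGWVR13_EL1 encoding listed here.)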
{"dbgwvr14_el1", DBGWVR14_EL1}, + {"dbgwvr15_el1", DBGWVR15_EL1}, + {"dbgwcr0_el1", DBGWCR0_EL1}, + {"dbgwcr1_el1", DBGWCR1_EL1}, + {"dbgwcr2_el1", DBGWCR2_EL1}, + {"dbgwcr3_el1", DBGWCR3_EL1}, + {"dbgwcr4_el1", DBGWCR4_EL1}, + {"dbgwcr5_el1", DBGWCR5_EL1}, + {"dbgwcr6_el1", DBGWCR6_EL1}, + {"dbgwcr7_el1", DBGWCR7_EL1}, + {"dbgwcr8_el1", DBGWCR8_EL1}, + {"dbgwcr9_el1", DBGWCR9_EL1}, + {"dbgwcr10_el1", DBGWCR10_EL1}, + {"dbgwcr11_el1", DBGWCR11_EL1}, + {"dbgwcr12_el1", DBGWCR12_EL1}, + {"dbgwcr13_el1", DBGWCR13_EL1}, + {"dbgwcr14_el1", DBGWCR14_EL1}, + {"dbgwcr15_el1", DBGWCR15_EL1}, + {"teehbr32_el1", TEEHBR32_EL1}, + {"osdlr_el1", OSDLR_EL1}, + {"dbgprcr_el1", DBGPRCR_EL1}, + {"dbgclaimset_el1", DBGCLAIMSET_EL1}, + {"dbgclaimclr_el1", DBGCLAIMCLR_EL1}, + {"csselr_el1", CSSELR_EL1}, + {"vpidr_el2", VPIDR_EL2}, + {"vmpidr_el2", VMPIDR_EL2}, + {"sctlr_el1", SCTLR_EL1}, + {"sctlr_el2", SCTLR_EL2}, + {"sctlr_el3", SCTLR_EL3}, + {"actlr_el1", ACTLR_EL1}, + {"actlr_el2", ACTLR_EL2}, + {"actlr_el3", ACTLR_EL3}, + {"cpacr_el1", CPACR_EL1}, + {"hcr_el2", HCR_EL2}, + {"scr_el3", SCR_EL3}, + {"mdcr_el2", MDCR_EL2}, + {"sder32_el3", SDER32_EL3}, + {"cptr_el2", CPTR_EL2}, + {"cptr_el3", CPTR_EL3}, + {"hstr_el2", HSTR_EL2}, + {"hacr_el2", HACR_EL2}, + {"mdcr_el3", MDCR_EL3}, + {"ttbr0_el1", TTBR0_EL1}, + {"ttbr0_el2", TTBR0_EL2}, + {"ttbr0_el3", TTBR0_EL3}, + {"ttbr1_el1", TTBR1_EL1}, + {"tcr_el1", TCR_EL1}, + {"tcr_el2", TCR_EL2}, + {"tcr_el3", TCR_EL3}, + {"vttbr_el2", VTTBR_EL2}, + {"vtcr_el2", VTCR_EL2}, + {"dacr32_el2", DACR32_EL2}, + {"spsr_el1", SPSR_EL1}, + {"spsr_el2", SPSR_EL2}, + {"spsr_el3", SPSR_EL3}, + {"elr_el1", ELR_EL1}, + {"elr_el2", ELR_EL2}, + {"elr_el3", ELR_EL3}, + {"sp_el0", SP_EL0}, + {"sp_el1", SP_EL1}, + {"sp_el2", SP_EL2}, + {"spsel", SPSel}, + {"nzcv", NZCV}, + {"daif", DAIF}, + {"currentel", CurrentEL}, + {"spsr_irq", SPSR_irq}, + {"spsr_abt", SPSR_abt}, + {"spsr_und", SPSR_und}, + {"spsr_fiq", SPSR_fiq}, + {"fpcr", FPCR}, + {"fpsr", FPSR}, + {"dspsr_el0", DSPSR_EL0}, + {"dlr_el0", DLR_EL0}, + {"ifsr32_el2", IFSR32_EL2}, + {"afsr0_el1", AFSR0_EL1}, + {"afsr0_el2", AFSR0_EL2}, + {"afsr0_el3", AFSR0_EL3}, + {"afsr1_el1", AFSR1_EL1}, + {"afsr1_el2", AFSR1_EL2}, + {"afsr1_el3", AFSR1_EL3}, + {"esr_el1", ESR_EL1}, + {"esr_el2", ESR_EL2}, + {"esr_el3", ESR_EL3}, + {"fpexc32_el2", FPEXC32_EL2}, + {"far_el1", FAR_EL1}, + {"far_el2", FAR_EL2}, + {"far_el3", FAR_EL3}, + {"hpfar_el2", HPFAR_EL2}, + {"par_el1", PAR_EL1}, + {"pmcr_el0", PMCR_EL0}, + {"pmcntenset_el0", PMCNTENSET_EL0}, + {"pmcntenclr_el0", PMCNTENCLR_EL0}, + {"pmovsclr_el0", PMOVSCLR_EL0}, + {"pmselr_el0", PMSELR_EL0}, + {"pmccntr_el0", PMCCNTR_EL0}, + {"pmxevtyper_el0", PMXEVTYPER_EL0}, + {"pmxevcntr_el0", PMXEVCNTR_EL0}, + {"pmuserenr_el0", PMUSERENR_EL0}, + {"pmintenset_el1", PMINTENSET_EL1}, + {"pmintenclr_el1", PMINTENCLR_EL1}, + {"pmovsset_el0", PMOVSSET_EL0}, + {"mair_el1", MAIR_EL1}, + {"mair_el2", MAIR_EL2}, + {"mair_el3", MAIR_EL3}, + {"amair_el1", AMAIR_EL1}, + {"amair_el2", AMAIR_EL2}, + {"amair_el3", AMAIR_EL3}, + {"vbar_el1", VBAR_EL1}, + {"vbar_el2", VBAR_EL2}, + {"vbar_el3", VBAR_EL3}, + {"rmr_el1", RMR_EL1}, + {"rmr_el2", RMR_EL2}, + {"rmr_el3", RMR_EL3}, + {"contextidr_el1", CONTEXTIDR_EL1}, + {"tpidr_el0", TPIDR_EL0}, + {"tpidr_el2", TPIDR_EL2}, + {"tpidr_el3", TPIDR_EL3}, + {"tpidrro_el0", TPIDRRO_EL0}, + {"tpidr_el1", TPIDR_EL1}, + {"cntfrq_el0", CNTFRQ_EL0}, + {"cntvoff_el2", CNTVOFF_EL2}, + {"cntkctl_el1", CNTKCTL_EL1}, + {"cnthctl_el2", CNTHCTL_EL2}, + {"cntp_tval_el0", CNTP_TVAL_EL0}, + 
{"cnthp_tval_el2", CNTHP_TVAL_EL2}, + {"cntps_tval_el1", CNTPS_TVAL_EL1}, + {"cntp_ctl_el0", CNTP_CTL_EL0}, + {"cnthp_ctl_el2", CNTHP_CTL_EL2}, + {"cntps_ctl_el1", CNTPS_CTL_EL1}, + {"cntp_cval_el0", CNTP_CVAL_EL0}, + {"cnthp_cval_el2", CNTHP_CVAL_EL2}, + {"cntps_cval_el1", CNTPS_CVAL_EL1}, + {"cntv_tval_el0", CNTV_TVAL_EL0}, + {"cntv_ctl_el0", CNTV_CTL_EL0}, + {"cntv_cval_el0", CNTV_CVAL_EL0}, + {"pmevcntr0_el0", PMEVCNTR0_EL0}, + {"pmevcntr1_el0", PMEVCNTR1_EL0}, + {"pmevcntr2_el0", PMEVCNTR2_EL0}, + {"pmevcntr3_el0", PMEVCNTR3_EL0}, + {"pmevcntr4_el0", PMEVCNTR4_EL0}, + {"pmevcntr5_el0", PMEVCNTR5_EL0}, + {"pmevcntr6_el0", PMEVCNTR6_EL0}, + {"pmevcntr7_el0", PMEVCNTR7_EL0}, + {"pmevcntr8_el0", PMEVCNTR8_EL0}, + {"pmevcntr9_el0", PMEVCNTR9_EL0}, + {"pmevcntr10_el0", PMEVCNTR10_EL0}, + {"pmevcntr11_el0", PMEVCNTR11_EL0}, + {"pmevcntr12_el0", PMEVCNTR12_EL0}, + {"pmevcntr13_el0", PMEVCNTR13_EL0}, + {"pmevcntr14_el0", PMEVCNTR14_EL0}, + {"pmevcntr15_el0", PMEVCNTR15_EL0}, + {"pmevcntr16_el0", PMEVCNTR16_EL0}, + {"pmevcntr17_el0", PMEVCNTR17_EL0}, + {"pmevcntr18_el0", PMEVCNTR18_EL0}, + {"pmevcntr19_el0", PMEVCNTR19_EL0}, + {"pmevcntr20_el0", PMEVCNTR20_EL0}, + {"pmevcntr21_el0", PMEVCNTR21_EL0}, + {"pmevcntr22_el0", PMEVCNTR22_EL0}, + {"pmevcntr23_el0", PMEVCNTR23_EL0}, + {"pmevcntr24_el0", PMEVCNTR24_EL0}, + {"pmevcntr25_el0", PMEVCNTR25_EL0}, + {"pmevcntr26_el0", PMEVCNTR26_EL0}, + {"pmevcntr27_el0", PMEVCNTR27_EL0}, + {"pmevcntr28_el0", PMEVCNTR28_EL0}, + {"pmevcntr29_el0", PMEVCNTR29_EL0}, + {"pmevcntr30_el0", PMEVCNTR30_EL0}, + {"pmccfiltr_el0", PMCCFILTR_EL0}, + {"pmevtyper0_el0", PMEVTYPER0_EL0}, + {"pmevtyper1_el0", PMEVTYPER1_EL0}, + {"pmevtyper2_el0", PMEVTYPER2_EL0}, + {"pmevtyper3_el0", PMEVTYPER3_EL0}, + {"pmevtyper4_el0", PMEVTYPER4_EL0}, + {"pmevtyper5_el0", PMEVTYPER5_EL0}, + {"pmevtyper6_el0", PMEVTYPER6_EL0}, + {"pmevtyper7_el0", PMEVTYPER7_EL0}, + {"pmevtyper8_el0", PMEVTYPER8_EL0}, + {"pmevtyper9_el0", PMEVTYPER9_EL0}, + {"pmevtyper10_el0", PMEVTYPER10_EL0}, + {"pmevtyper11_el0", PMEVTYPER11_EL0}, + {"pmevtyper12_el0", PMEVTYPER12_EL0}, + {"pmevtyper13_el0", PMEVTYPER13_EL0}, + {"pmevtyper14_el0", PMEVTYPER14_EL0}, + {"pmevtyper15_el0", PMEVTYPER15_EL0}, + {"pmevtyper16_el0", PMEVTYPER16_EL0}, + {"pmevtyper17_el0", PMEVTYPER17_EL0}, + {"pmevtyper18_el0", PMEVTYPER18_EL0}, + {"pmevtyper19_el0", PMEVTYPER19_EL0}, + {"pmevtyper20_el0", PMEVTYPER20_EL0}, + {"pmevtyper21_el0", PMEVTYPER21_EL0}, + {"pmevtyper22_el0", PMEVTYPER22_EL0}, + {"pmevtyper23_el0", PMEVTYPER23_EL0}, + {"pmevtyper24_el0", PMEVTYPER24_EL0}, + {"pmevtyper25_el0", PMEVTYPER25_EL0}, + {"pmevtyper26_el0", PMEVTYPER26_EL0}, + {"pmevtyper27_el0", PMEVTYPER27_EL0}, + {"pmevtyper28_el0", PMEVTYPER28_EL0}, + {"pmevtyper29_el0", PMEVTYPER29_EL0}, + {"pmevtyper30_el0", PMEVTYPER30_EL0}, + + // Trace registers + {"trcprgctlr", TRCPRGCTLR}, + {"trcprocselr", TRCPROCSELR}, + {"trcconfigr", TRCCONFIGR}, + {"trcauxctlr", TRCAUXCTLR}, + {"trceventctl0r", TRCEVENTCTL0R}, + {"trceventctl1r", TRCEVENTCTL1R}, + {"trcstallctlr", TRCSTALLCTLR}, + {"trctsctlr", TRCTSCTLR}, + {"trcsyncpr", TRCSYNCPR}, + {"trcccctlr", TRCCCCTLR}, + {"trcbbctlr", TRCBBCTLR}, + {"trctraceidr", TRCTRACEIDR}, + {"trcqctlr", TRCQCTLR}, + {"trcvictlr", TRCVICTLR}, + {"trcviiectlr", TRCVIIECTLR}, + {"trcvissctlr", TRCVISSCTLR}, + {"trcvipcssctlr", TRCVIPCSSCTLR}, + {"trcvdctlr", TRCVDCTLR}, + {"trcvdsacctlr", TRCVDSACCTLR}, + {"trcvdarcctlr", TRCVDARCCTLR}, + {"trcseqevr0", TRCSEQEVR0}, + {"trcseqevr1", TRCSEQEVR1}, + {"trcseqevr2", TRCSEQEVR2}, + 
{"trcseqrstevr", TRCSEQRSTEVR}, + {"trcseqstr", TRCSEQSTR}, + {"trcextinselr", TRCEXTINSELR}, + {"trccntrldvr0", TRCCNTRLDVR0}, + {"trccntrldvr1", TRCCNTRLDVR1}, + {"trccntrldvr2", TRCCNTRLDVR2}, + {"trccntrldvr3", TRCCNTRLDVR3}, + {"trccntctlr0", TRCCNTCTLR0}, + {"trccntctlr1", TRCCNTCTLR1}, + {"trccntctlr2", TRCCNTCTLR2}, + {"trccntctlr3", TRCCNTCTLR3}, + {"trccntvr0", TRCCNTVR0}, + {"trccntvr1", TRCCNTVR1}, + {"trccntvr2", TRCCNTVR2}, + {"trccntvr3", TRCCNTVR3}, + {"trcimspec0", TRCIMSPEC0}, + {"trcimspec1", TRCIMSPEC1}, + {"trcimspec2", TRCIMSPEC2}, + {"trcimspec3", TRCIMSPEC3}, + {"trcimspec4", TRCIMSPEC4}, + {"trcimspec5", TRCIMSPEC5}, + {"trcimspec6", TRCIMSPEC6}, + {"trcimspec7", TRCIMSPEC7}, + {"trcrsctlr2", TRCRSCTLR2}, + {"trcrsctlr3", TRCRSCTLR3}, + {"trcrsctlr4", TRCRSCTLR4}, + {"trcrsctlr5", TRCRSCTLR5}, + {"trcrsctlr6", TRCRSCTLR6}, + {"trcrsctlr7", TRCRSCTLR7}, + {"trcrsctlr8", TRCRSCTLR8}, + {"trcrsctlr9", TRCRSCTLR9}, + {"trcrsctlr10", TRCRSCTLR10}, + {"trcrsctlr11", TRCRSCTLR11}, + {"trcrsctlr12", TRCRSCTLR12}, + {"trcrsctlr13", TRCRSCTLR13}, + {"trcrsctlr14", TRCRSCTLR14}, + {"trcrsctlr15", TRCRSCTLR15}, + {"trcrsctlr16", TRCRSCTLR16}, + {"trcrsctlr17", TRCRSCTLR17}, + {"trcrsctlr18", TRCRSCTLR18}, + {"trcrsctlr19", TRCRSCTLR19}, + {"trcrsctlr20", TRCRSCTLR20}, + {"trcrsctlr21", TRCRSCTLR21}, + {"trcrsctlr22", TRCRSCTLR22}, + {"trcrsctlr23", TRCRSCTLR23}, + {"trcrsctlr24", TRCRSCTLR24}, + {"trcrsctlr25", TRCRSCTLR25}, + {"trcrsctlr26", TRCRSCTLR26}, + {"trcrsctlr27", TRCRSCTLR27}, + {"trcrsctlr28", TRCRSCTLR28}, + {"trcrsctlr29", TRCRSCTLR29}, + {"trcrsctlr30", TRCRSCTLR30}, + {"trcrsctlr31", TRCRSCTLR31}, + {"trcssccr0", TRCSSCCR0}, + {"trcssccr1", TRCSSCCR1}, + {"trcssccr2", TRCSSCCR2}, + {"trcssccr3", TRCSSCCR3}, + {"trcssccr4", TRCSSCCR4}, + {"trcssccr5", TRCSSCCR5}, + {"trcssccr6", TRCSSCCR6}, + {"trcssccr7", TRCSSCCR7}, + {"trcsscsr0", TRCSSCSR0}, + {"trcsscsr1", TRCSSCSR1}, + {"trcsscsr2", TRCSSCSR2}, + {"trcsscsr3", TRCSSCSR3}, + {"trcsscsr4", TRCSSCSR4}, + {"trcsscsr5", TRCSSCSR5}, + {"trcsscsr6", TRCSSCSR6}, + {"trcsscsr7", TRCSSCSR7}, + {"trcsspcicr0", TRCSSPCICR0}, + {"trcsspcicr1", TRCSSPCICR1}, + {"trcsspcicr2", TRCSSPCICR2}, + {"trcsspcicr3", TRCSSPCICR3}, + {"trcsspcicr4", TRCSSPCICR4}, + {"trcsspcicr5", TRCSSPCICR5}, + {"trcsspcicr6", TRCSSPCICR6}, + {"trcsspcicr7", TRCSSPCICR7}, + {"trcpdcr", TRCPDCR}, + {"trcacvr0", TRCACVR0}, + {"trcacvr1", TRCACVR1}, + {"trcacvr2", TRCACVR2}, + {"trcacvr3", TRCACVR3}, + {"trcacvr4", TRCACVR4}, + {"trcacvr5", TRCACVR5}, + {"trcacvr6", TRCACVR6}, + {"trcacvr7", TRCACVR7}, + {"trcacvr8", TRCACVR8}, + {"trcacvr9", TRCACVR9}, + {"trcacvr10", TRCACVR10}, + {"trcacvr11", TRCACVR11}, + {"trcacvr12", TRCACVR12}, + {"trcacvr13", TRCACVR13}, + {"trcacvr14", TRCACVR14}, + {"trcacvr15", TRCACVR15}, + {"trcacatr0", TRCACATR0}, + {"trcacatr1", TRCACATR1}, + {"trcacatr2", TRCACATR2}, + {"trcacatr3", TRCACATR3}, + {"trcacatr4", TRCACATR4}, + {"trcacatr5", TRCACATR5}, + {"trcacatr6", TRCACATR6}, + {"trcacatr7", TRCACATR7}, + {"trcacatr8", TRCACATR8}, + {"trcacatr9", TRCACATR9}, + {"trcacatr10", TRCACATR10}, + {"trcacatr11", TRCACATR11}, + {"trcacatr12", TRCACATR12}, + {"trcacatr13", TRCACATR13}, + {"trcacatr14", TRCACATR14}, + {"trcacatr15", TRCACATR15}, + {"trcdvcvr0", TRCDVCVR0}, + {"trcdvcvr1", TRCDVCVR1}, + {"trcdvcvr2", TRCDVCVR2}, + {"trcdvcvr3", TRCDVCVR3}, + {"trcdvcvr4", TRCDVCVR4}, + {"trcdvcvr5", TRCDVCVR5}, + {"trcdvcvr6", TRCDVCVR6}, + {"trcdvcvr7", TRCDVCVR7}, + {"trcdvcmr0", TRCDVCMR0}, + {"trcdvcmr1", TRCDVCMR1}, + 
{"trcdvcmr2", TRCDVCMR2}, + {"trcdvcmr3", TRCDVCMR3}, + {"trcdvcmr4", TRCDVCMR4}, + {"trcdvcmr5", TRCDVCMR5}, + {"trcdvcmr6", TRCDVCMR6}, + {"trcdvcmr7", TRCDVCMR7}, + {"trccidcvr0", TRCCIDCVR0}, + {"trccidcvr1", TRCCIDCVR1}, + {"trccidcvr2", TRCCIDCVR2}, + {"trccidcvr3", TRCCIDCVR3}, + {"trccidcvr4", TRCCIDCVR4}, + {"trccidcvr5", TRCCIDCVR5}, + {"trccidcvr6", TRCCIDCVR6}, + {"trccidcvr7", TRCCIDCVR7}, + {"trcvmidcvr0", TRCVMIDCVR0}, + {"trcvmidcvr1", TRCVMIDCVR1}, + {"trcvmidcvr2", TRCVMIDCVR2}, + {"trcvmidcvr3", TRCVMIDCVR3}, + {"trcvmidcvr4", TRCVMIDCVR4}, + {"trcvmidcvr5", TRCVMIDCVR5}, + {"trcvmidcvr6", TRCVMIDCVR6}, + {"trcvmidcvr7", TRCVMIDCVR7}, + {"trccidcctlr0", TRCCIDCCTLR0}, + {"trccidcctlr1", TRCCIDCCTLR1}, + {"trcvmidcctlr0", TRCVMIDCCTLR0}, + {"trcvmidcctlr1", TRCVMIDCCTLR1}, + {"trcitctrl", TRCITCTRL}, + {"trcclaimset", TRCCLAIMSET}, + {"trcclaimclr", TRCCLAIMCLR}, + + // GICv3 registers + {"icc_bpr1_el1", ICC_BPR1_EL1}, + {"icc_bpr0_el1", ICC_BPR0_EL1}, + {"icc_pmr_el1", ICC_PMR_EL1}, + {"icc_ctlr_el1", ICC_CTLR_EL1}, + {"icc_ctlr_el3", ICC_CTLR_EL3}, + {"icc_sre_el1", ICC_SRE_EL1}, + {"icc_sre_el2", ICC_SRE_EL2}, + {"icc_sre_el3", ICC_SRE_EL3}, + {"icc_igrpen0_el1", ICC_IGRPEN0_EL1}, + {"icc_igrpen1_el1", ICC_IGRPEN1_EL1}, + {"icc_igrpen1_el3", ICC_IGRPEN1_EL3}, + {"icc_seien_el1", ICC_SEIEN_EL1}, + {"icc_ap0r0_el1", ICC_AP0R0_EL1}, + {"icc_ap0r1_el1", ICC_AP0R1_EL1}, + {"icc_ap0r2_el1", ICC_AP0R2_EL1}, + {"icc_ap0r3_el1", ICC_AP0R3_EL1}, + {"icc_ap1r0_el1", ICC_AP1R0_EL1}, + {"icc_ap1r1_el1", ICC_AP1R1_EL1}, + {"icc_ap1r2_el1", ICC_AP1R2_EL1}, + {"icc_ap1r3_el1", ICC_AP1R3_EL1}, + {"ich_ap0r0_el2", ICH_AP0R0_EL2}, + {"ich_ap0r1_el2", ICH_AP0R1_EL2}, + {"ich_ap0r2_el2", ICH_AP0R2_EL2}, + {"ich_ap0r3_el2", ICH_AP0R3_EL2}, + {"ich_ap1r0_el2", ICH_AP1R0_EL2}, + {"ich_ap1r1_el2", ICH_AP1R1_EL2}, + {"ich_ap1r2_el2", ICH_AP1R2_EL2}, + {"ich_ap1r3_el2", ICH_AP1R3_EL2}, + {"ich_hcr_el2", ICH_HCR_EL2}, + {"ich_misr_el2", ICH_MISR_EL2}, + {"ich_vmcr_el2", ICH_VMCR_EL2}, + {"ich_vseir_el2", ICH_VSEIR_EL2}, + {"ich_lr0_el2", ICH_LR0_EL2}, + {"ich_lr1_el2", ICH_LR1_EL2}, + {"ich_lr2_el2", ICH_LR2_EL2}, + {"ich_lr3_el2", ICH_LR3_EL2}, + {"ich_lr4_el2", ICH_LR4_EL2}, + {"ich_lr5_el2", ICH_LR5_EL2}, + {"ich_lr6_el2", ICH_LR6_EL2}, + {"ich_lr7_el2", ICH_LR7_EL2}, + {"ich_lr8_el2", ICH_LR8_EL2}, + {"ich_lr9_el2", ICH_LR9_EL2}, + {"ich_lr10_el2", ICH_LR10_EL2}, + {"ich_lr11_el2", ICH_LR11_EL2}, + {"ich_lr12_el2", ICH_LR12_EL2}, + {"ich_lr13_el2", ICH_LR13_EL2}, + {"ich_lr14_el2", ICH_LR14_EL2}, + {"ich_lr15_el2", ICH_LR15_EL2} +}; + +const AArch64NamedImmMapper::Mapping +AArch64SysReg::SysRegMapper::CycloneSysRegPairs[] = { + {"cpm_ioacc_ctl_el3", CPM_IOACC_CTL_EL3} +}; + +uint32_t +AArch64SysReg::SysRegMapper::fromString(StringRef Name, bool &Valid) const { + std::string NameLower = Name.lower(); + + // First search the registers shared by all + for (unsigned i = 0; i < array_lengthof(SysRegPairs); ++i) { + if (SysRegPairs[i].Name == NameLower) { + Valid = true; + return SysRegPairs[i].Value; + } + } + + // Next search for target specific registers + if (FeatureBits & AArch64::ProcCyclone) { + for (unsigned i = 0; i < array_lengthof(CycloneSysRegPairs); ++i) { + if (CycloneSysRegPairs[i].Name == NameLower) { + Valid = true; + return CycloneSysRegPairs[i].Value; + } + } + } + + // Now try the instruction-specific registers (either read-only or + // write-only). 
+ for (unsigned i = 0; i < NumInstPairs; ++i) { + if (InstPairs[i].Name == NameLower) { + Valid = true; + return InstPairs[i].Value; + } + } + + // Try to parse an S____ register name, where the bits + // are: 11 xxx 1x11 xxxx xxx + Regex GenericRegPattern("^s3_([0-7])_c(1[15])_c([0-9]|1[0-5])_([0-7])$"); + + SmallVector Ops; + if (!GenericRegPattern.match(NameLower, &Ops)) { + Valid = false; + return -1; + } + + uint32_t Op0 = 3, Op1 = 0, CRn = 0, CRm = 0, Op2 = 0; + uint32_t Bits; + Ops[1].getAsInteger(10, Op1); + Ops[2].getAsInteger(10, CRn); + Ops[3].getAsInteger(10, CRm); + Ops[4].getAsInteger(10, Op2); + Bits = (Op0 << 14) | (Op1 << 11) | (CRn << 7) | (CRm << 3) | Op2; + + Valid = true; + return Bits; +} + +std::string +AArch64SysReg::SysRegMapper::toString(uint32_t Bits, bool &Valid) const { + // First search the registers shared by all + for (unsigned i = 0; i < array_lengthof(SysRegPairs); ++i) { + if (SysRegPairs[i].Value == Bits) { + Valid = true; + return SysRegPairs[i].Name; + } + } + + // Next search for target specific registers + if (FeatureBits & AArch64::ProcCyclone) { + for (unsigned i = 0; i < array_lengthof(CycloneSysRegPairs); ++i) { + if (CycloneSysRegPairs[i].Value == Bits) { + Valid = true; + return CycloneSysRegPairs[i].Name; + } + } + } + + // Now try the instruction-specific registers (either read-only or + // write-only). + for (unsigned i = 0; i < NumInstPairs; ++i) { + if (InstPairs[i].Value == Bits) { + Valid = true; + return InstPairs[i].Name; + } + } + + uint32_t Op0 = (Bits >> 14) & 0x3; + uint32_t Op1 = (Bits >> 11) & 0x7; + uint32_t CRn = (Bits >> 7) & 0xf; + uint32_t CRm = (Bits >> 3) & 0xf; + uint32_t Op2 = Bits & 0x7; + + // Only combinations matching: 11 xxx 1x11 xxxx xxx are valid for a generic + // name. + if (Op0 != 3 || (CRn != 11 && CRn != 15)) { + Valid = false; + return ""; + } + + assert(Op0 == 3 && (CRn == 11 || CRn == 15) && "Invalid generic sysreg"); + + Valid = true; + return "s3_" + utostr(Op1) + "_c" + utostr(CRn) + + "_c" + utostr(CRm) + "_" + utostr(Op2); +} + +const AArch64NamedImmMapper::Mapping AArch64TLBI::TLBIMapper::TLBIPairs[] = { + {"ipas2e1is", IPAS2E1IS}, + {"ipas2le1is", IPAS2LE1IS}, + {"vmalle1is", VMALLE1IS}, + {"alle2is", ALLE2IS}, + {"alle3is", ALLE3IS}, + {"vae1is", VAE1IS}, + {"vae2is", VAE2IS}, + {"vae3is", VAE3IS}, + {"aside1is", ASIDE1IS}, + {"vaae1is", VAAE1IS}, + {"alle1is", ALLE1IS}, + {"vale1is", VALE1IS}, + {"vale2is", VALE2IS}, + {"vale3is", VALE3IS}, + {"vmalls12e1is", VMALLS12E1IS}, + {"vaale1is", VAALE1IS}, + {"ipas2e1", IPAS2E1}, + {"ipas2le1", IPAS2LE1}, + {"vmalle1", VMALLE1}, + {"alle2", ALLE2}, + {"alle3", ALLE3}, + {"vae1", VAE1}, + {"vae2", VAE2}, + {"vae3", VAE3}, + {"aside1", ASIDE1}, + {"vaae1", VAAE1}, + {"alle1", ALLE1}, + {"vale1", VALE1}, + {"vale2", VALE2}, + {"vale3", VALE3}, + {"vmalls12e1", VMALLS12E1}, + {"vaale1", VAALE1} +}; + +AArch64TLBI::TLBIMapper::TLBIMapper() + : AArch64NamedImmMapper(TLBIPairs, 0) {} diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/lib/Target/AArch64/Utils/AArch64BaseInfo.h new file mode 100644 index 00000000000..9e4c389cc2e --- /dev/null +++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.h @@ -0,0 +1,1294 @@ +//===-- AArch64BaseInfo.h - Top level definitions for AArch64 ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
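Both the named tables and the generic s3_<op1>_c<n>_c<m>_<op2> fallback above reduce to the same 16-bit packing of the five system-register operand fields, (Op0 << 14) | (Op1 << 11) | (CRn << 7) | (CRm << 3) | Op2, which is also the layout spelled out in the bit-pattern comments throughout AArch64BaseInfo.h. A minimal standalone sketch of that packing (the helper name packSysReg is illustrative only, not part of the tree):

    // Sketch of the 16-bit sysreg encoding used by fromString()/toString():
    // (Op0 << 14) | (Op1 << 11) | (CRn << 7) | (CRm << 3) | Op2.
    #include <cassert>
    #include <cstdint>

    static uint32_t packSysReg(uint32_t Op0, uint32_t Op1, uint32_t CRn,
                               uint32_t CRm, uint32_t Op2) {
      return (Op0 << 14) | (Op1 << 11) | (CRn << 7) | (CRm << 3) | Op2;
    }

    int main() {
      // NZCV is op0=3, op1=3, CRn=4, CRm=2, op2=0; compare the
      // "11 011 0100 0010 000" comment next to NZCV = 0xda10 below.
      assert(packSysReg(3, 3, 4, 2, 0) == 0xda10);

      // A name with no alias, e.g. "s3_0_c11_c5_1", uses the same packing:
      // op0=3, op1=0, CRn=11, CRm=5, op2=1.
      uint32_t Bits = packSysReg(3, 0, 11, 5, 1);
      assert(((Bits >> 14) & 0x3) == 3);  // Op0 field round-trips
      assert(((Bits >> 7) & 0xf) == 11);  // CRn field round-trips
      return 0;
    }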
+// +//===----------------------------------------------------------------------===// +// +// This file contains small standalone helper functions and enum definitions for +// the AArch64 target useful for the compiler back-end and the MC libraries. +// As such, it deliberately does not include references to LLVM core +// code gen types, passes, etc.. +// +//===----------------------------------------------------------------------===// + +#ifndef AArch64BASEINFO_H +#define AArch64BASEINFO_H + +// FIXME: Is it easiest to fix this layering violation by moving the .inc +// #includes from AArch64MCTargetDesc.h to here? +#include "MCTargetDesc/AArch64MCTargetDesc.h" // For AArch64::X0 and friends. +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/Support/ErrorHandling.h" + +namespace llvm { + +inline static unsigned getWRegFromXReg(unsigned Reg) { + switch (Reg) { + case AArch64::X0: return AArch64::W0; + case AArch64::X1: return AArch64::W1; + case AArch64::X2: return AArch64::W2; + case AArch64::X3: return AArch64::W3; + case AArch64::X4: return AArch64::W4; + case AArch64::X5: return AArch64::W5; + case AArch64::X6: return AArch64::W6; + case AArch64::X7: return AArch64::W7; + case AArch64::X8: return AArch64::W8; + case AArch64::X9: return AArch64::W9; + case AArch64::X10: return AArch64::W10; + case AArch64::X11: return AArch64::W11; + case AArch64::X12: return AArch64::W12; + case AArch64::X13: return AArch64::W13; + case AArch64::X14: return AArch64::W14; + case AArch64::X15: return AArch64::W15; + case AArch64::X16: return AArch64::W16; + case AArch64::X17: return AArch64::W17; + case AArch64::X18: return AArch64::W18; + case AArch64::X19: return AArch64::W19; + case AArch64::X20: return AArch64::W20; + case AArch64::X21: return AArch64::W21; + case AArch64::X22: return AArch64::W22; + case AArch64::X23: return AArch64::W23; + case AArch64::X24: return AArch64::W24; + case AArch64::X25: return AArch64::W25; + case AArch64::X26: return AArch64::W26; + case AArch64::X27: return AArch64::W27; + case AArch64::X28: return AArch64::W28; + case AArch64::FP: return AArch64::W29; + case AArch64::LR: return AArch64::W30; + case AArch64::SP: return AArch64::WSP; + case AArch64::XZR: return AArch64::WZR; + } + // For anything else, return it unchanged. 
+ return Reg; +} + +inline static unsigned getXRegFromWReg(unsigned Reg) { + switch (Reg) { + case AArch64::W0: return AArch64::X0; + case AArch64::W1: return AArch64::X1; + case AArch64::W2: return AArch64::X2; + case AArch64::W3: return AArch64::X3; + case AArch64::W4: return AArch64::X4; + case AArch64::W5: return AArch64::X5; + case AArch64::W6: return AArch64::X6; + case AArch64::W7: return AArch64::X7; + case AArch64::W8: return AArch64::X8; + case AArch64::W9: return AArch64::X9; + case AArch64::W10: return AArch64::X10; + case AArch64::W11: return AArch64::X11; + case AArch64::W12: return AArch64::X12; + case AArch64::W13: return AArch64::X13; + case AArch64::W14: return AArch64::X14; + case AArch64::W15: return AArch64::X15; + case AArch64::W16: return AArch64::X16; + case AArch64::W17: return AArch64::X17; + case AArch64::W18: return AArch64::X18; + case AArch64::W19: return AArch64::X19; + case AArch64::W20: return AArch64::X20; + case AArch64::W21: return AArch64::X21; + case AArch64::W22: return AArch64::X22; + case AArch64::W23: return AArch64::X23; + case AArch64::W24: return AArch64::X24; + case AArch64::W25: return AArch64::X25; + case AArch64::W26: return AArch64::X26; + case AArch64::W27: return AArch64::X27; + case AArch64::W28: return AArch64::X28; + case AArch64::W29: return AArch64::FP; + case AArch64::W30: return AArch64::LR; + case AArch64::WSP: return AArch64::SP; + case AArch64::WZR: return AArch64::XZR; + } + // For anything else, return it unchanged. + return Reg; +} + +static inline unsigned getBRegFromDReg(unsigned Reg) { + switch (Reg) { + case AArch64::D0: return AArch64::B0; + case AArch64::D1: return AArch64::B1; + case AArch64::D2: return AArch64::B2; + case AArch64::D3: return AArch64::B3; + case AArch64::D4: return AArch64::B4; + case AArch64::D5: return AArch64::B5; + case AArch64::D6: return AArch64::B6; + case AArch64::D7: return AArch64::B7; + case AArch64::D8: return AArch64::B8; + case AArch64::D9: return AArch64::B9; + case AArch64::D10: return AArch64::B10; + case AArch64::D11: return AArch64::B11; + case AArch64::D12: return AArch64::B12; + case AArch64::D13: return AArch64::B13; + case AArch64::D14: return AArch64::B14; + case AArch64::D15: return AArch64::B15; + case AArch64::D16: return AArch64::B16; + case AArch64::D17: return AArch64::B17; + case AArch64::D18: return AArch64::B18; + case AArch64::D19: return AArch64::B19; + case AArch64::D20: return AArch64::B20; + case AArch64::D21: return AArch64::B21; + case AArch64::D22: return AArch64::B22; + case AArch64::D23: return AArch64::B23; + case AArch64::D24: return AArch64::B24; + case AArch64::D25: return AArch64::B25; + case AArch64::D26: return AArch64::B26; + case AArch64::D27: return AArch64::B27; + case AArch64::D28: return AArch64::B28; + case AArch64::D29: return AArch64::B29; + case AArch64::D30: return AArch64::B30; + case AArch64::D31: return AArch64::B31; + } + // For anything else, return it unchanged. 
+ return Reg; +} + + +static inline unsigned getDRegFromBReg(unsigned Reg) { + switch (Reg) { + case AArch64::B0: return AArch64::D0; + case AArch64::B1: return AArch64::D1; + case AArch64::B2: return AArch64::D2; + case AArch64::B3: return AArch64::D3; + case AArch64::B4: return AArch64::D4; + case AArch64::B5: return AArch64::D5; + case AArch64::B6: return AArch64::D6; + case AArch64::B7: return AArch64::D7; + case AArch64::B8: return AArch64::D8; + case AArch64::B9: return AArch64::D9; + case AArch64::B10: return AArch64::D10; + case AArch64::B11: return AArch64::D11; + case AArch64::B12: return AArch64::D12; + case AArch64::B13: return AArch64::D13; + case AArch64::B14: return AArch64::D14; + case AArch64::B15: return AArch64::D15; + case AArch64::B16: return AArch64::D16; + case AArch64::B17: return AArch64::D17; + case AArch64::B18: return AArch64::D18; + case AArch64::B19: return AArch64::D19; + case AArch64::B20: return AArch64::D20; + case AArch64::B21: return AArch64::D21; + case AArch64::B22: return AArch64::D22; + case AArch64::B23: return AArch64::D23; + case AArch64::B24: return AArch64::D24; + case AArch64::B25: return AArch64::D25; + case AArch64::B26: return AArch64::D26; + case AArch64::B27: return AArch64::D27; + case AArch64::B28: return AArch64::D28; + case AArch64::B29: return AArch64::D29; + case AArch64::B30: return AArch64::D30; + case AArch64::B31: return AArch64::D31; + } + // For anything else, return it unchanged. + return Reg; +} + +namespace AArch64CC { + +// The CondCodes constants map directly to the 4-bit encoding of the condition +// field for predicated instructions. +enum CondCode { // Meaning (integer) Meaning (floating-point) + EQ = 0x0, // Equal Equal + NE = 0x1, // Not equal Not equal, or unordered + HS = 0x2, // Unsigned higher or same >, ==, or unordered + LO = 0x3, // Unsigned lower Less than + MI = 0x4, // Minus, negative Less than + PL = 0x5, // Plus, positive or zero >, ==, or unordered + VS = 0x6, // Overflow Unordered + VC = 0x7, // No overflow Not unordered + HI = 0x8, // Unsigned higher Greater than, or unordered + LS = 0x9, // Unsigned lower or same Less than or equal + GE = 0xa, // Greater than or equal Greater than or equal + LT = 0xb, // Less than Less than, or unordered + GT = 0xc, // Greater than Greater than + LE = 0xd, // Less than or equal <, ==, or unordered + AL = 0xe, // Always (unconditional) Always (unconditional) + NV = 0xf, // Always (unconditional) Always (unconditional) + // Note the NV exists purely to disassemble 0b1111. Execution is "always". 
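One property of these 4-bit encodings worth noting: each condition and its logical inverse differ only in the low bit (EQ/NE, HS/LO, MI/PL, VS/VC, HI/LS, GE/LT, GT/LE), which is what the getInvertedCondCode() switch further down spells out case by case. A small standalone sketch of that property (the enum here is a trimmed illustrative copy, not the LLVM definition):

    // Inverting a condition code in the range EQ..LE toggles the low bit;
    // AL/NV (0xe/0xf) are excluded, as in getInvertedCondCode().
    #include <cassert>

    enum CondCode { EQ = 0x0, NE = 0x1, HS = 0x2, LO = 0x3, GE = 0xa, LT = 0xb };

    static CondCode invert(CondCode Code) {
      return static_cast<CondCode>(Code ^ 0x1); // valid for EQ..LE only
    }

    int main() {
      assert(invert(EQ) == NE);
      assert(invert(HS) == LO);
      assert(invert(GE) == LT);
      return 0;
    }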
+ Invalid +}; + +inline static const char *getCondCodeName(CondCode Code) { + switch (Code) { + default: llvm_unreachable("Unknown condition code"); + case EQ: return "eq"; + case NE: return "ne"; + case HS: return "hs"; + case LO: return "lo"; + case MI: return "mi"; + case PL: return "pl"; + case VS: return "vs"; + case VC: return "vc"; + case HI: return "hi"; + case LS: return "ls"; + case GE: return "ge"; + case LT: return "lt"; + case GT: return "gt"; + case LE: return "le"; + case AL: return "al"; + case NV: return "nv"; + } +} + +inline static CondCode getInvertedCondCode(CondCode Code) { + switch (Code) { + default: llvm_unreachable("Unknown condition code"); + case EQ: return NE; + case NE: return EQ; + case HS: return LO; + case LO: return HS; + case MI: return PL; + case PL: return MI; + case VS: return VC; + case VC: return VS; + case HI: return LS; + case LS: return HI; + case GE: return LT; + case LT: return GE; + case GT: return LE; + case LE: return GT; + } +} + +/// Given a condition code, return NZCV flags that would satisfy that condition. +/// The flag bits are in the format expected by the ccmp instructions. +/// Note that many different flag settings can satisfy a given condition code, +/// this function just returns one of them. +inline static unsigned getNZCVToSatisfyCondCode(CondCode Code) { + // NZCV flags encoded as expected by ccmp instructions, ARMv8 ISA 5.5.7. + enum { N = 8, Z = 4, C = 2, V = 1 }; + switch (Code) { + default: llvm_unreachable("Unknown condition code"); + case EQ: return Z; // Z == 1 + case NE: return 0; // Z == 0 + case HS: return C; // C == 1 + case LO: return 0; // C == 0 + case MI: return N; // N == 1 + case PL: return 0; // N == 0 + case VS: return V; // V == 1 + case VC: return 0; // V == 0 + case HI: return C; // C == 1 && Z == 0 + case LS: return 0; // C == 0 || Z == 1 + case GE: return 0; // N == V + case LT: return N; // N != V + case GT: return 0; // Z == 0 && N == V + case LE: return Z; // Z == 1 || N != V + } +} +} // end namespace AArch64CC + +/// Instances of this class can perform bidirectional mapping from random +/// identifier strings to operand encodings. For example "MSR" takes a named +/// system-register which must be encoded somehow and decoded for printing. This +/// central location means that the information for those transformations is not +/// duplicated and remains in sync. +/// +/// FIXME: currently the algorithm is a completely unoptimised linear +/// search. Obviously this could be improved, but we would probably want to work +/// out just how often these instructions are emitted before working on it. It +/// might even be optimal to just reorder the tables for the common instructions +/// rather than changing the algorithm. +struct AArch64NamedImmMapper { + struct Mapping { + const char *Name; + uint32_t Value; + }; + + template + AArch64NamedImmMapper(const Mapping (&Pairs)[N], uint32_t TooBigImm) + : Pairs(&Pairs[0]), NumPairs(N), TooBigImm(TooBigImm) {} + + StringRef toString(uint32_t Value, bool &Valid) const; + uint32_t fromString(StringRef Name, bool &Valid) const; + + /// Many of the instructions allow an alternative assembly form consisting of + /// a simple immediate. Currently the only valid forms are ranges [0, N) where + /// N being 0 indicates no immediate syntax-form is allowed. 
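As the comment above notes, each mapper is just a pair of linear searches over a small name/value table. A stripped-down standalone sketch of that scheme, using two real TLBI entries from this patch (vmalle1is = 0x4418, vae1is = 0x4419); the free functions here are illustrative and not part of the tree:

    // Bidirectional name <-> encoding lookup by linear search, mirroring the
    // AArch64NamedImmMapper approach on a two-entry table.
    #include <cassert>
    #include <cstdint>
    #include <cstring>

    struct Mapping { const char *Name; uint32_t Value; };

    static const Mapping TLBIPairs[] = {
      {"vmalle1is", 0x4418},
      {"vae1is",    0x4419},
    };

    static bool fromString(const char *Name, uint32_t &Value) {
      for (const Mapping &M : TLBIPairs)
        if (std::strcmp(M.Name, Name) == 0) { Value = M.Value; return true; }
      return false;
    }

    static const char *toString(uint32_t Value) {
      for (const Mapping &M : TLBIPairs)
        if (M.Value == Value) return M.Name;
      return nullptr;
    }

    int main() {
      uint32_t V;
      assert(fromString("vmalle1is", V) && V == 0x4418);
      assert(std::strcmp(toString(0x4419), "vae1is") == 0);
      return 0;
    }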
+ bool validImm(uint32_t Value) const; +protected: + const Mapping *Pairs; + size_t NumPairs; + uint32_t TooBigImm; +}; + +namespace AArch64AT { + enum ATValues { + Invalid = -1, // Op0 Op1 CRn CRm Op2 + S1E1R = 0x43c0, // 01 000 0111 1000 000 + S1E2R = 0x63c0, // 01 100 0111 1000 000 + S1E3R = 0x73c0, // 01 110 0111 1000 000 + S1E1W = 0x43c1, // 01 000 0111 1000 001 + S1E2W = 0x63c1, // 01 100 0111 1000 001 + S1E3W = 0x73c1, // 01 110 0111 1000 001 + S1E0R = 0x43c2, // 01 000 0111 1000 010 + S1E0W = 0x43c3, // 01 000 0111 1000 011 + S12E1R = 0x63c4, // 01 100 0111 1000 100 + S12E1W = 0x63c5, // 01 100 0111 1000 101 + S12E0R = 0x63c6, // 01 100 0111 1000 110 + S12E0W = 0x63c7 // 01 100 0111 1000 111 + }; + + struct ATMapper : AArch64NamedImmMapper { + const static Mapping ATPairs[]; + + ATMapper(); + }; + +} +namespace AArch64DB { + enum DBValues { + Invalid = -1, + OSHLD = 0x1, + OSHST = 0x2, + OSH = 0x3, + NSHLD = 0x5, + NSHST = 0x6, + NSH = 0x7, + ISHLD = 0x9, + ISHST = 0xa, + ISH = 0xb, + LD = 0xd, + ST = 0xe, + SY = 0xf + }; + + struct DBarrierMapper : AArch64NamedImmMapper { + const static Mapping DBarrierPairs[]; + + DBarrierMapper(); + }; +} + +namespace AArch64DC { + enum DCValues { + Invalid = -1, // Op1 CRn CRm Op2 + ZVA = 0x5ba1, // 01 011 0111 0100 001 + IVAC = 0x43b1, // 01 000 0111 0110 001 + ISW = 0x43b2, // 01 000 0111 0110 010 + CVAC = 0x5bd1, // 01 011 0111 1010 001 + CSW = 0x43d2, // 01 000 0111 1010 010 + CVAU = 0x5bd9, // 01 011 0111 1011 001 + CIVAC = 0x5bf1, // 01 011 0111 1110 001 + CISW = 0x43f2 // 01 000 0111 1110 010 + }; + + struct DCMapper : AArch64NamedImmMapper { + const static Mapping DCPairs[]; + + DCMapper(); + }; + +} + +namespace AArch64IC { + enum ICValues { + Invalid = -1, // Op1 CRn CRm Op2 + IALLUIS = 0x0388, // 000 0111 0001 000 + IALLU = 0x03a8, // 000 0111 0101 000 + IVAU = 0x1ba9 // 011 0111 0101 001 + }; + + + struct ICMapper : AArch64NamedImmMapper { + const static Mapping ICPairs[]; + + ICMapper(); + }; + + static inline bool NeedsRegister(ICValues Val) { + return Val == IVAU; + } +} + +namespace AArch64ISB { + enum ISBValues { + Invalid = -1, + SY = 0xf + }; + struct ISBMapper : AArch64NamedImmMapper { + const static Mapping ISBPairs[]; + + ISBMapper(); + }; +} + +namespace AArch64PRFM { + enum PRFMValues { + Invalid = -1, + PLDL1KEEP = 0x00, + PLDL1STRM = 0x01, + PLDL2KEEP = 0x02, + PLDL2STRM = 0x03, + PLDL3KEEP = 0x04, + PLDL3STRM = 0x05, + PLIL1KEEP = 0x08, + PLIL1STRM = 0x09, + PLIL2KEEP = 0x0a, + PLIL2STRM = 0x0b, + PLIL3KEEP = 0x0c, + PLIL3STRM = 0x0d, + PSTL1KEEP = 0x10, + PSTL1STRM = 0x11, + PSTL2KEEP = 0x12, + PSTL2STRM = 0x13, + PSTL3KEEP = 0x14, + PSTL3STRM = 0x15 + }; + + struct PRFMMapper : AArch64NamedImmMapper { + const static Mapping PRFMPairs[]; + + PRFMMapper(); + }; +} + +namespace AArch64PState { + enum PStateValues { + Invalid = -1, + SPSel = 0x05, + DAIFSet = 0x1e, + DAIFClr = 0x1f + }; + + struct PStateMapper : AArch64NamedImmMapper { + const static Mapping PStatePairs[]; + + PStateMapper(); + }; + +} + +namespace AArch64SE { + enum ShiftExtSpecifiers { + Invalid = -1, + LSL, + MSL, + LSR, + ASR, + ROR, + + UXTB, + UXTH, + UXTW, + UXTX, + + SXTB, + SXTH, + SXTW, + SXTX + }; +} + +namespace AArch64Layout { + enum VectorLayout { + Invalid = -1, + VL_8B, + VL_4H, + VL_2S, + VL_1D, + + VL_16B, + VL_8H, + VL_4S, + VL_2D, + + // Bare layout for the 128-bit vector + // (only show ".b", ".h", ".s", ".d" without vector number) + VL_B, + VL_H, + VL_S, + VL_D + }; +} + +inline static const char * 
+AArch64VectorLayoutToString(AArch64Layout::VectorLayout Layout) { + switch (Layout) { + case AArch64Layout::VL_8B: return ".8b"; + case AArch64Layout::VL_4H: return ".4h"; + case AArch64Layout::VL_2S: return ".2s"; + case AArch64Layout::VL_1D: return ".1d"; + case AArch64Layout::VL_16B: return ".16b"; + case AArch64Layout::VL_8H: return ".8h"; + case AArch64Layout::VL_4S: return ".4s"; + case AArch64Layout::VL_2D: return ".2d"; + case AArch64Layout::VL_B: return ".b"; + case AArch64Layout::VL_H: return ".h"; + case AArch64Layout::VL_S: return ".s"; + case AArch64Layout::VL_D: return ".d"; + default: llvm_unreachable("Unknown Vector Layout"); + } +} + +inline static AArch64Layout::VectorLayout +AArch64StringToVectorLayout(StringRef LayoutStr) { + return StringSwitch(LayoutStr) + .Case(".8b", AArch64Layout::VL_8B) + .Case(".4h", AArch64Layout::VL_4H) + .Case(".2s", AArch64Layout::VL_2S) + .Case(".1d", AArch64Layout::VL_1D) + .Case(".16b", AArch64Layout::VL_16B) + .Case(".8h", AArch64Layout::VL_8H) + .Case(".4s", AArch64Layout::VL_4S) + .Case(".2d", AArch64Layout::VL_2D) + .Case(".b", AArch64Layout::VL_B) + .Case(".h", AArch64Layout::VL_H) + .Case(".s", AArch64Layout::VL_S) + .Case(".d", AArch64Layout::VL_D) + .Default(AArch64Layout::Invalid); +} + +namespace AArch64SysReg { + enum SysRegROValues { + MDCCSR_EL0 = 0x9808, // 10 011 0000 0001 000 + DBGDTRRX_EL0 = 0x9828, // 10 011 0000 0101 000 + MDRAR_EL1 = 0x8080, // 10 000 0001 0000 000 + OSLSR_EL1 = 0x808c, // 10 000 0001 0001 100 + DBGAUTHSTATUS_EL1 = 0x83f6, // 10 000 0111 1110 110 + PMCEID0_EL0 = 0xdce6, // 11 011 1001 1100 110 + PMCEID1_EL0 = 0xdce7, // 11 011 1001 1100 111 + MIDR_EL1 = 0xc000, // 11 000 0000 0000 000 + CCSIDR_EL1 = 0xc800, // 11 001 0000 0000 000 + CLIDR_EL1 = 0xc801, // 11 001 0000 0000 001 + CTR_EL0 = 0xd801, // 11 011 0000 0000 001 + MPIDR_EL1 = 0xc005, // 11 000 0000 0000 101 + REVIDR_EL1 = 0xc006, // 11 000 0000 0000 110 + AIDR_EL1 = 0xc807, // 11 001 0000 0000 111 + DCZID_EL0 = 0xd807, // 11 011 0000 0000 111 + ID_PFR0_EL1 = 0xc008, // 11 000 0000 0001 000 + ID_PFR1_EL1 = 0xc009, // 11 000 0000 0001 001 + ID_DFR0_EL1 = 0xc00a, // 11 000 0000 0001 010 + ID_AFR0_EL1 = 0xc00b, // 11 000 0000 0001 011 + ID_MMFR0_EL1 = 0xc00c, // 11 000 0000 0001 100 + ID_MMFR1_EL1 = 0xc00d, // 11 000 0000 0001 101 + ID_MMFR2_EL1 = 0xc00e, // 11 000 0000 0001 110 + ID_MMFR3_EL1 = 0xc00f, // 11 000 0000 0001 111 + ID_ISAR0_EL1 = 0xc010, // 11 000 0000 0010 000 + ID_ISAR1_EL1 = 0xc011, // 11 000 0000 0010 001 + ID_ISAR2_EL1 = 0xc012, // 11 000 0000 0010 010 + ID_ISAR3_EL1 = 0xc013, // 11 000 0000 0010 011 + ID_ISAR4_EL1 = 0xc014, // 11 000 0000 0010 100 + ID_ISAR5_EL1 = 0xc015, // 11 000 0000 0010 101 + ID_A64PFR0_EL1 = 0xc020, // 11 000 0000 0100 000 + ID_A64PFR1_EL1 = 0xc021, // 11 000 0000 0100 001 + ID_A64DFR0_EL1 = 0xc028, // 11 000 0000 0101 000 + ID_A64DFR1_EL1 = 0xc029, // 11 000 0000 0101 001 + ID_A64AFR0_EL1 = 0xc02c, // 11 000 0000 0101 100 + ID_A64AFR1_EL1 = 0xc02d, // 11 000 0000 0101 101 + ID_A64ISAR0_EL1 = 0xc030, // 11 000 0000 0110 000 + ID_A64ISAR1_EL1 = 0xc031, // 11 000 0000 0110 001 + ID_A64MMFR0_EL1 = 0xc038, // 11 000 0000 0111 000 + ID_A64MMFR1_EL1 = 0xc039, // 11 000 0000 0111 001 + MVFR0_EL1 = 0xc018, // 11 000 0000 0011 000 + MVFR1_EL1 = 0xc019, // 11 000 0000 0011 001 + MVFR2_EL1 = 0xc01a, // 11 000 0000 0011 010 + RVBAR_EL1 = 0xc601, // 11 000 1100 0000 001 + RVBAR_EL2 = 0xe601, // 11 100 1100 0000 001 + RVBAR_EL3 = 0xf601, // 11 110 1100 0000 001 + ISR_EL1 = 0xc608, // 11 000 1100 0001 000 + CNTPCT_EL0 = 
0xdf01, // 11 011 1110 0000 001 + CNTVCT_EL0 = 0xdf02, // 11 011 1110 0000 010 + + // Trace registers + TRCSTATR = 0x8818, // 10 001 0000 0011 000 + TRCIDR8 = 0x8806, // 10 001 0000 0000 110 + TRCIDR9 = 0x880e, // 10 001 0000 0001 110 + TRCIDR10 = 0x8816, // 10 001 0000 0010 110 + TRCIDR11 = 0x881e, // 10 001 0000 0011 110 + TRCIDR12 = 0x8826, // 10 001 0000 0100 110 + TRCIDR13 = 0x882e, // 10 001 0000 0101 110 + TRCIDR0 = 0x8847, // 10 001 0000 1000 111 + TRCIDR1 = 0x884f, // 10 001 0000 1001 111 + TRCIDR2 = 0x8857, // 10 001 0000 1010 111 + TRCIDR3 = 0x885f, // 10 001 0000 1011 111 + TRCIDR4 = 0x8867, // 10 001 0000 1100 111 + TRCIDR5 = 0x886f, // 10 001 0000 1101 111 + TRCIDR6 = 0x8877, // 10 001 0000 1110 111 + TRCIDR7 = 0x887f, // 10 001 0000 1111 111 + TRCOSLSR = 0x888c, // 10 001 0001 0001 100 + TRCPDSR = 0x88ac, // 10 001 0001 0101 100 + TRCDEVAFF0 = 0x8bd6, // 10 001 0111 1010 110 + TRCDEVAFF1 = 0x8bde, // 10 001 0111 1011 110 + TRCLSR = 0x8bee, // 10 001 0111 1101 110 + TRCAUTHSTATUS = 0x8bf6, // 10 001 0111 1110 110 + TRCDEVARCH = 0x8bfe, // 10 001 0111 1111 110 + TRCDEVID = 0x8b97, // 10 001 0111 0010 111 + TRCDEVTYPE = 0x8b9f, // 10 001 0111 0011 111 + TRCPIDR4 = 0x8ba7, // 10 001 0111 0100 111 + TRCPIDR5 = 0x8baf, // 10 001 0111 0101 111 + TRCPIDR6 = 0x8bb7, // 10 001 0111 0110 111 + TRCPIDR7 = 0x8bbf, // 10 001 0111 0111 111 + TRCPIDR0 = 0x8bc7, // 10 001 0111 1000 111 + TRCPIDR1 = 0x8bcf, // 10 001 0111 1001 111 + TRCPIDR2 = 0x8bd7, // 10 001 0111 1010 111 + TRCPIDR3 = 0x8bdf, // 10 001 0111 1011 111 + TRCCIDR0 = 0x8be7, // 10 001 0111 1100 111 + TRCCIDR1 = 0x8bef, // 10 001 0111 1101 111 + TRCCIDR2 = 0x8bf7, // 10 001 0111 1110 111 + TRCCIDR3 = 0x8bff, // 10 001 0111 1111 111 + + // GICv3 registers + ICC_IAR1_EL1 = 0xc660, // 11 000 1100 1100 000 + ICC_IAR0_EL1 = 0xc640, // 11 000 1100 1000 000 + ICC_HPPIR1_EL1 = 0xc662, // 11 000 1100 1100 010 + ICC_HPPIR0_EL1 = 0xc642, // 11 000 1100 1000 010 + ICC_RPR_EL1 = 0xc65b, // 11 000 1100 1011 011 + ICH_VTR_EL2 = 0xe659, // 11 100 1100 1011 001 + ICH_EISR_EL2 = 0xe65b, // 11 100 1100 1011 011 + ICH_ELSR_EL2 = 0xe65d // 11 100 1100 1011 101 + }; + + enum SysRegWOValues { + DBGDTRTX_EL0 = 0x9828, // 10 011 0000 0101 000 + OSLAR_EL1 = 0x8084, // 10 000 0001 0000 100 + PMSWINC_EL0 = 0xdce4, // 11 011 1001 1100 100 + + // Trace Registers + TRCOSLAR = 0x8884, // 10 001 0001 0000 100 + TRCLAR = 0x8be6, // 10 001 0111 1100 110 + + // GICv3 registers + ICC_EOIR1_EL1 = 0xc661, // 11 000 1100 1100 001 + ICC_EOIR0_EL1 = 0xc641, // 11 000 1100 1000 001 + ICC_DIR_EL1 = 0xc659, // 11 000 1100 1011 001 + ICC_SGI1R_EL1 = 0xc65d, // 11 000 1100 1011 101 + ICC_ASGI1R_EL1 = 0xc65e, // 11 000 1100 1011 110 + ICC_SGI0R_EL1 = 0xc65f // 11 000 1100 1011 111 + }; + + enum SysRegValues { + Invalid = -1, // Op0 Op1 CRn CRm Op2 + OSDTRRX_EL1 = 0x8002, // 10 000 0000 0000 010 + OSDTRTX_EL1 = 0x801a, // 10 000 0000 0011 010 + TEECR32_EL1 = 0x9000, // 10 010 0000 0000 000 + MDCCINT_EL1 = 0x8010, // 10 000 0000 0010 000 + MDSCR_EL1 = 0x8012, // 10 000 0000 0010 010 + DBGDTR_EL0 = 0x9820, // 10 011 0000 0100 000 + OSECCR_EL1 = 0x8032, // 10 000 0000 0110 010 + DBGVCR32_EL2 = 0xa038, // 10 100 0000 0111 000 + DBGBVR0_EL1 = 0x8004, // 10 000 0000 0000 100 + DBGBVR1_EL1 = 0x800c, // 10 000 0000 0001 100 + DBGBVR2_EL1 = 0x8014, // 10 000 0000 0010 100 + DBGBVR3_EL1 = 0x801c, // 10 000 0000 0011 100 + DBGBVR4_EL1 = 0x8024, // 10 000 0000 0100 100 + DBGBVR5_EL1 = 0x802c, // 10 000 0000 0101 100 + DBGBVR6_EL1 = 0x8034, // 10 000 0000 0110 100 + DBGBVR7_EL1 = 0x803c, 
// 10 000 0000 0111 100 + DBGBVR8_EL1 = 0x8044, // 10 000 0000 1000 100 + DBGBVR9_EL1 = 0x804c, // 10 000 0000 1001 100 + DBGBVR10_EL1 = 0x8054, // 10 000 0000 1010 100 + DBGBVR11_EL1 = 0x805c, // 10 000 0000 1011 100 + DBGBVR12_EL1 = 0x8064, // 10 000 0000 1100 100 + DBGBVR13_EL1 = 0x806c, // 10 000 0000 1101 100 + DBGBVR14_EL1 = 0x8074, // 10 000 0000 1110 100 + DBGBVR15_EL1 = 0x807c, // 10 000 0000 1111 100 + DBGBCR0_EL1 = 0x8005, // 10 000 0000 0000 101 + DBGBCR1_EL1 = 0x800d, // 10 000 0000 0001 101 + DBGBCR2_EL1 = 0x8015, // 10 000 0000 0010 101 + DBGBCR3_EL1 = 0x801d, // 10 000 0000 0011 101 + DBGBCR4_EL1 = 0x8025, // 10 000 0000 0100 101 + DBGBCR5_EL1 = 0x802d, // 10 000 0000 0101 101 + DBGBCR6_EL1 = 0x8035, // 10 000 0000 0110 101 + DBGBCR7_EL1 = 0x803d, // 10 000 0000 0111 101 + DBGBCR8_EL1 = 0x8045, // 10 000 0000 1000 101 + DBGBCR9_EL1 = 0x804d, // 10 000 0000 1001 101 + DBGBCR10_EL1 = 0x8055, // 10 000 0000 1010 101 + DBGBCR11_EL1 = 0x805d, // 10 000 0000 1011 101 + DBGBCR12_EL1 = 0x8065, // 10 000 0000 1100 101 + DBGBCR13_EL1 = 0x806d, // 10 000 0000 1101 101 + DBGBCR14_EL1 = 0x8075, // 10 000 0000 1110 101 + DBGBCR15_EL1 = 0x807d, // 10 000 0000 1111 101 + DBGWVR0_EL1 = 0x8006, // 10 000 0000 0000 110 + DBGWVR1_EL1 = 0x800e, // 10 000 0000 0001 110 + DBGWVR2_EL1 = 0x8016, // 10 000 0000 0010 110 + DBGWVR3_EL1 = 0x801e, // 10 000 0000 0011 110 + DBGWVR4_EL1 = 0x8026, // 10 000 0000 0100 110 + DBGWVR5_EL1 = 0x802e, // 10 000 0000 0101 110 + DBGWVR6_EL1 = 0x8036, // 10 000 0000 0110 110 + DBGWVR7_EL1 = 0x803e, // 10 000 0000 0111 110 + DBGWVR8_EL1 = 0x8046, // 10 000 0000 1000 110 + DBGWVR9_EL1 = 0x804e, // 10 000 0000 1001 110 + DBGWVR10_EL1 = 0x8056, // 10 000 0000 1010 110 + DBGWVR11_EL1 = 0x805e, // 10 000 0000 1011 110 + DBGWVR12_EL1 = 0x8066, // 10 000 0000 1100 110 + DBGWVR13_EL1 = 0x806e, // 10 000 0000 1101 110 + DBGWVR14_EL1 = 0x8076, // 10 000 0000 1110 110 + DBGWVR15_EL1 = 0x807e, // 10 000 0000 1111 110 + DBGWCR0_EL1 = 0x8007, // 10 000 0000 0000 111 + DBGWCR1_EL1 = 0x800f, // 10 000 0000 0001 111 + DBGWCR2_EL1 = 0x8017, // 10 000 0000 0010 111 + DBGWCR3_EL1 = 0x801f, // 10 000 0000 0011 111 + DBGWCR4_EL1 = 0x8027, // 10 000 0000 0100 111 + DBGWCR5_EL1 = 0x802f, // 10 000 0000 0101 111 + DBGWCR6_EL1 = 0x8037, // 10 000 0000 0110 111 + DBGWCR7_EL1 = 0x803f, // 10 000 0000 0111 111 + DBGWCR8_EL1 = 0x8047, // 10 000 0000 1000 111 + DBGWCR9_EL1 = 0x804f, // 10 000 0000 1001 111 + DBGWCR10_EL1 = 0x8057, // 10 000 0000 1010 111 + DBGWCR11_EL1 = 0x805f, // 10 000 0000 1011 111 + DBGWCR12_EL1 = 0x8067, // 10 000 0000 1100 111 + DBGWCR13_EL1 = 0x806f, // 10 000 0000 1101 111 + DBGWCR14_EL1 = 0x8077, // 10 000 0000 1110 111 + DBGWCR15_EL1 = 0x807f, // 10 000 0000 1111 111 + TEEHBR32_EL1 = 0x9080, // 10 010 0001 0000 000 + OSDLR_EL1 = 0x809c, // 10 000 0001 0011 100 + DBGPRCR_EL1 = 0x80a4, // 10 000 0001 0100 100 + DBGCLAIMSET_EL1 = 0x83c6, // 10 000 0111 1000 110 + DBGCLAIMCLR_EL1 = 0x83ce, // 10 000 0111 1001 110 + CSSELR_EL1 = 0xd000, // 11 010 0000 0000 000 + VPIDR_EL2 = 0xe000, // 11 100 0000 0000 000 + VMPIDR_EL2 = 0xe005, // 11 100 0000 0000 101 + CPACR_EL1 = 0xc082, // 11 000 0001 0000 010 + SCTLR_EL1 = 0xc080, // 11 000 0001 0000 000 + SCTLR_EL2 = 0xe080, // 11 100 0001 0000 000 + SCTLR_EL3 = 0xf080, // 11 110 0001 0000 000 + ACTLR_EL1 = 0xc081, // 11 000 0001 0000 001 + ACTLR_EL2 = 0xe081, // 11 100 0001 0000 001 + ACTLR_EL3 = 0xf081, // 11 110 0001 0000 001 + HCR_EL2 = 0xe088, // 11 100 0001 0001 000 + SCR_EL3 = 0xf088, // 11 110 0001 0001 000 + MDCR_EL2 = 0xe089, 
// 11 100 0001 0001 001 + SDER32_EL3 = 0xf089, // 11 110 0001 0001 001 + CPTR_EL2 = 0xe08a, // 11 100 0001 0001 010 + CPTR_EL3 = 0xf08a, // 11 110 0001 0001 010 + HSTR_EL2 = 0xe08b, // 11 100 0001 0001 011 + HACR_EL2 = 0xe08f, // 11 100 0001 0001 111 + MDCR_EL3 = 0xf099, // 11 110 0001 0011 001 + TTBR0_EL1 = 0xc100, // 11 000 0010 0000 000 + TTBR0_EL2 = 0xe100, // 11 100 0010 0000 000 + TTBR0_EL3 = 0xf100, // 11 110 0010 0000 000 + TTBR1_EL1 = 0xc101, // 11 000 0010 0000 001 + TCR_EL1 = 0xc102, // 11 000 0010 0000 010 + TCR_EL2 = 0xe102, // 11 100 0010 0000 010 + TCR_EL3 = 0xf102, // 11 110 0010 0000 010 + VTTBR_EL2 = 0xe108, // 11 100 0010 0001 000 + VTCR_EL2 = 0xe10a, // 11 100 0010 0001 010 + DACR32_EL2 = 0xe180, // 11 100 0011 0000 000 + SPSR_EL1 = 0xc200, // 11 000 0100 0000 000 + SPSR_EL2 = 0xe200, // 11 100 0100 0000 000 + SPSR_EL3 = 0xf200, // 11 110 0100 0000 000 + ELR_EL1 = 0xc201, // 11 000 0100 0000 001 + ELR_EL2 = 0xe201, // 11 100 0100 0000 001 + ELR_EL3 = 0xf201, // 11 110 0100 0000 001 + SP_EL0 = 0xc208, // 11 000 0100 0001 000 + SP_EL1 = 0xe208, // 11 100 0100 0001 000 + SP_EL2 = 0xf208, // 11 110 0100 0001 000 + SPSel = 0xc210, // 11 000 0100 0010 000 + NZCV = 0xda10, // 11 011 0100 0010 000 + DAIF = 0xda11, // 11 011 0100 0010 001 + CurrentEL = 0xc212, // 11 000 0100 0010 010 + SPSR_irq = 0xe218, // 11 100 0100 0011 000 + SPSR_abt = 0xe219, // 11 100 0100 0011 001 + SPSR_und = 0xe21a, // 11 100 0100 0011 010 + SPSR_fiq = 0xe21b, // 11 100 0100 0011 011 + FPCR = 0xda20, // 11 011 0100 0100 000 + FPSR = 0xda21, // 11 011 0100 0100 001 + DSPSR_EL0 = 0xda28, // 11 011 0100 0101 000 + DLR_EL0 = 0xda29, // 11 011 0100 0101 001 + IFSR32_EL2 = 0xe281, // 11 100 0101 0000 001 + AFSR0_EL1 = 0xc288, // 11 000 0101 0001 000 + AFSR0_EL2 = 0xe288, // 11 100 0101 0001 000 + AFSR0_EL3 = 0xf288, // 11 110 0101 0001 000 + AFSR1_EL1 = 0xc289, // 11 000 0101 0001 001 + AFSR1_EL2 = 0xe289, // 11 100 0101 0001 001 + AFSR1_EL3 = 0xf289, // 11 110 0101 0001 001 + ESR_EL1 = 0xc290, // 11 000 0101 0010 000 + ESR_EL2 = 0xe290, // 11 100 0101 0010 000 + ESR_EL3 = 0xf290, // 11 110 0101 0010 000 + FPEXC32_EL2 = 0xe298, // 11 100 0101 0011 000 + FAR_EL1 = 0xc300, // 11 000 0110 0000 000 + FAR_EL2 = 0xe300, // 11 100 0110 0000 000 + FAR_EL3 = 0xf300, // 11 110 0110 0000 000 + HPFAR_EL2 = 0xe304, // 11 100 0110 0000 100 + PAR_EL1 = 0xc3a0, // 11 000 0111 0100 000 + PMCR_EL0 = 0xdce0, // 11 011 1001 1100 000 + PMCNTENSET_EL0 = 0xdce1, // 11 011 1001 1100 001 + PMCNTENCLR_EL0 = 0xdce2, // 11 011 1001 1100 010 + PMOVSCLR_EL0 = 0xdce3, // 11 011 1001 1100 011 + PMSELR_EL0 = 0xdce5, // 11 011 1001 1100 101 + PMCCNTR_EL0 = 0xdce8, // 11 011 1001 1101 000 + PMXEVTYPER_EL0 = 0xdce9, // 11 011 1001 1101 001 + PMXEVCNTR_EL0 = 0xdcea, // 11 011 1001 1101 010 + PMUSERENR_EL0 = 0xdcf0, // 11 011 1001 1110 000 + PMINTENSET_EL1 = 0xc4f1, // 11 000 1001 1110 001 + PMINTENCLR_EL1 = 0xc4f2, // 11 000 1001 1110 010 + PMOVSSET_EL0 = 0xdcf3, // 11 011 1001 1110 011 + MAIR_EL1 = 0xc510, // 11 000 1010 0010 000 + MAIR_EL2 = 0xe510, // 11 100 1010 0010 000 + MAIR_EL3 = 0xf510, // 11 110 1010 0010 000 + AMAIR_EL1 = 0xc518, // 11 000 1010 0011 000 + AMAIR_EL2 = 0xe518, // 11 100 1010 0011 000 + AMAIR_EL3 = 0xf518, // 11 110 1010 0011 000 + VBAR_EL1 = 0xc600, // 11 000 1100 0000 000 + VBAR_EL2 = 0xe600, // 11 100 1100 0000 000 + VBAR_EL3 = 0xf600, // 11 110 1100 0000 000 + RMR_EL1 = 0xc602, // 11 000 1100 0000 010 + RMR_EL2 = 0xe602, // 11 100 1100 0000 010 + RMR_EL3 = 0xf602, // 11 110 1100 0000 010 + CONTEXTIDR_EL1 = 0xc681, 
// 11 000 1101 0000 001 + TPIDR_EL0 = 0xde82, // 11 011 1101 0000 010 + TPIDR_EL2 = 0xe682, // 11 100 1101 0000 010 + TPIDR_EL3 = 0xf682, // 11 110 1101 0000 010 + TPIDRRO_EL0 = 0xde83, // 11 011 1101 0000 011 + TPIDR_EL1 = 0xc684, // 11 000 1101 0000 100 + CNTFRQ_EL0 = 0xdf00, // 11 011 1110 0000 000 + CNTVOFF_EL2 = 0xe703, // 11 100 1110 0000 011 + CNTKCTL_EL1 = 0xc708, // 11 000 1110 0001 000 + CNTHCTL_EL2 = 0xe708, // 11 100 1110 0001 000 + CNTP_TVAL_EL0 = 0xdf10, // 11 011 1110 0010 000 + CNTHP_TVAL_EL2 = 0xe710, // 11 100 1110 0010 000 + CNTPS_TVAL_EL1 = 0xff10, // 11 111 1110 0010 000 + CNTP_CTL_EL0 = 0xdf11, // 11 011 1110 0010 001 + CNTHP_CTL_EL2 = 0xe711, // 11 100 1110 0010 001 + CNTPS_CTL_EL1 = 0xff11, // 11 111 1110 0010 001 + CNTP_CVAL_EL0 = 0xdf12, // 11 011 1110 0010 010 + CNTHP_CVAL_EL2 = 0xe712, // 11 100 1110 0010 010 + CNTPS_CVAL_EL1 = 0xff12, // 11 111 1110 0010 010 + CNTV_TVAL_EL0 = 0xdf18, // 11 011 1110 0011 000 + CNTV_CTL_EL0 = 0xdf19, // 11 011 1110 0011 001 + CNTV_CVAL_EL0 = 0xdf1a, // 11 011 1110 0011 010 + PMEVCNTR0_EL0 = 0xdf40, // 11 011 1110 1000 000 + PMEVCNTR1_EL0 = 0xdf41, // 11 011 1110 1000 001 + PMEVCNTR2_EL0 = 0xdf42, // 11 011 1110 1000 010 + PMEVCNTR3_EL0 = 0xdf43, // 11 011 1110 1000 011 + PMEVCNTR4_EL0 = 0xdf44, // 11 011 1110 1000 100 + PMEVCNTR5_EL0 = 0xdf45, // 11 011 1110 1000 101 + PMEVCNTR6_EL0 = 0xdf46, // 11 011 1110 1000 110 + PMEVCNTR7_EL0 = 0xdf47, // 11 011 1110 1000 111 + PMEVCNTR8_EL0 = 0xdf48, // 11 011 1110 1001 000 + PMEVCNTR9_EL0 = 0xdf49, // 11 011 1110 1001 001 + PMEVCNTR10_EL0 = 0xdf4a, // 11 011 1110 1001 010 + PMEVCNTR11_EL0 = 0xdf4b, // 11 011 1110 1001 011 + PMEVCNTR12_EL0 = 0xdf4c, // 11 011 1110 1001 100 + PMEVCNTR13_EL0 = 0xdf4d, // 11 011 1110 1001 101 + PMEVCNTR14_EL0 = 0xdf4e, // 11 011 1110 1001 110 + PMEVCNTR15_EL0 = 0xdf4f, // 11 011 1110 1001 111 + PMEVCNTR16_EL0 = 0xdf50, // 11 011 1110 1010 000 + PMEVCNTR17_EL0 = 0xdf51, // 11 011 1110 1010 001 + PMEVCNTR18_EL0 = 0xdf52, // 11 011 1110 1010 010 + PMEVCNTR19_EL0 = 0xdf53, // 11 011 1110 1010 011 + PMEVCNTR20_EL0 = 0xdf54, // 11 011 1110 1010 100 + PMEVCNTR21_EL0 = 0xdf55, // 11 011 1110 1010 101 + PMEVCNTR22_EL0 = 0xdf56, // 11 011 1110 1010 110 + PMEVCNTR23_EL0 = 0xdf57, // 11 011 1110 1010 111 + PMEVCNTR24_EL0 = 0xdf58, // 11 011 1110 1011 000 + PMEVCNTR25_EL0 = 0xdf59, // 11 011 1110 1011 001 + PMEVCNTR26_EL0 = 0xdf5a, // 11 011 1110 1011 010 + PMEVCNTR27_EL0 = 0xdf5b, // 11 011 1110 1011 011 + PMEVCNTR28_EL0 = 0xdf5c, // 11 011 1110 1011 100 + PMEVCNTR29_EL0 = 0xdf5d, // 11 011 1110 1011 101 + PMEVCNTR30_EL0 = 0xdf5e, // 11 011 1110 1011 110 + PMCCFILTR_EL0 = 0xdf7f, // 11 011 1110 1111 111 + PMEVTYPER0_EL0 = 0xdf60, // 11 011 1110 1100 000 + PMEVTYPER1_EL0 = 0xdf61, // 11 011 1110 1100 001 + PMEVTYPER2_EL0 = 0xdf62, // 11 011 1110 1100 010 + PMEVTYPER3_EL0 = 0xdf63, // 11 011 1110 1100 011 + PMEVTYPER4_EL0 = 0xdf64, // 11 011 1110 1100 100 + PMEVTYPER5_EL0 = 0xdf65, // 11 011 1110 1100 101 + PMEVTYPER6_EL0 = 0xdf66, // 11 011 1110 1100 110 + PMEVTYPER7_EL0 = 0xdf67, // 11 011 1110 1100 111 + PMEVTYPER8_EL0 = 0xdf68, // 11 011 1110 1101 000 + PMEVTYPER9_EL0 = 0xdf69, // 11 011 1110 1101 001 + PMEVTYPER10_EL0 = 0xdf6a, // 11 011 1110 1101 010 + PMEVTYPER11_EL0 = 0xdf6b, // 11 011 1110 1101 011 + PMEVTYPER12_EL0 = 0xdf6c, // 11 011 1110 1101 100 + PMEVTYPER13_EL0 = 0xdf6d, // 11 011 1110 1101 101 + PMEVTYPER14_EL0 = 0xdf6e, // 11 011 1110 1101 110 + PMEVTYPER15_EL0 = 0xdf6f, // 11 011 1110 1101 111 + PMEVTYPER16_EL0 = 0xdf70, // 11 011 1110 1110 000 + 
PMEVTYPER17_EL0 = 0xdf71, // 11 011 1110 1110 001 + PMEVTYPER18_EL0 = 0xdf72, // 11 011 1110 1110 010 + PMEVTYPER19_EL0 = 0xdf73, // 11 011 1110 1110 011 + PMEVTYPER20_EL0 = 0xdf74, // 11 011 1110 1110 100 + PMEVTYPER21_EL0 = 0xdf75, // 11 011 1110 1110 101 + PMEVTYPER22_EL0 = 0xdf76, // 11 011 1110 1110 110 + PMEVTYPER23_EL0 = 0xdf77, // 11 011 1110 1110 111 + PMEVTYPER24_EL0 = 0xdf78, // 11 011 1110 1111 000 + PMEVTYPER25_EL0 = 0xdf79, // 11 011 1110 1111 001 + PMEVTYPER26_EL0 = 0xdf7a, // 11 011 1110 1111 010 + PMEVTYPER27_EL0 = 0xdf7b, // 11 011 1110 1111 011 + PMEVTYPER28_EL0 = 0xdf7c, // 11 011 1110 1111 100 + PMEVTYPER29_EL0 = 0xdf7d, // 11 011 1110 1111 101 + PMEVTYPER30_EL0 = 0xdf7e, // 11 011 1110 1111 110 + + // Trace registers + TRCPRGCTLR = 0x8808, // 10 001 0000 0001 000 + TRCPROCSELR = 0x8810, // 10 001 0000 0010 000 + TRCCONFIGR = 0x8820, // 10 001 0000 0100 000 + TRCAUXCTLR = 0x8830, // 10 001 0000 0110 000 + TRCEVENTCTL0R = 0x8840, // 10 001 0000 1000 000 + TRCEVENTCTL1R = 0x8848, // 10 001 0000 1001 000 + TRCSTALLCTLR = 0x8858, // 10 001 0000 1011 000 + TRCTSCTLR = 0x8860, // 10 001 0000 1100 000 + TRCSYNCPR = 0x8868, // 10 001 0000 1101 000 + TRCCCCTLR = 0x8870, // 10 001 0000 1110 000 + TRCBBCTLR = 0x8878, // 10 001 0000 1111 000 + TRCTRACEIDR = 0x8801, // 10 001 0000 0000 001 + TRCQCTLR = 0x8809, // 10 001 0000 0001 001 + TRCVICTLR = 0x8802, // 10 001 0000 0000 010 + TRCVIIECTLR = 0x880a, // 10 001 0000 0001 010 + TRCVISSCTLR = 0x8812, // 10 001 0000 0010 010 + TRCVIPCSSCTLR = 0x881a, // 10 001 0000 0011 010 + TRCVDCTLR = 0x8842, // 10 001 0000 1000 010 + TRCVDSACCTLR = 0x884a, // 10 001 0000 1001 010 + TRCVDARCCTLR = 0x8852, // 10 001 0000 1010 010 + TRCSEQEVR0 = 0x8804, // 10 001 0000 0000 100 + TRCSEQEVR1 = 0x880c, // 10 001 0000 0001 100 + TRCSEQEVR2 = 0x8814, // 10 001 0000 0010 100 + TRCSEQRSTEVR = 0x8834, // 10 001 0000 0110 100 + TRCSEQSTR = 0x883c, // 10 001 0000 0111 100 + TRCEXTINSELR = 0x8844, // 10 001 0000 1000 100 + TRCCNTRLDVR0 = 0x8805, // 10 001 0000 0000 101 + TRCCNTRLDVR1 = 0x880d, // 10 001 0000 0001 101 + TRCCNTRLDVR2 = 0x8815, // 10 001 0000 0010 101 + TRCCNTRLDVR3 = 0x881d, // 10 001 0000 0011 101 + TRCCNTCTLR0 = 0x8825, // 10 001 0000 0100 101 + TRCCNTCTLR1 = 0x882d, // 10 001 0000 0101 101 + TRCCNTCTLR2 = 0x8835, // 10 001 0000 0110 101 + TRCCNTCTLR3 = 0x883d, // 10 001 0000 0111 101 + TRCCNTVR0 = 0x8845, // 10 001 0000 1000 101 + TRCCNTVR1 = 0x884d, // 10 001 0000 1001 101 + TRCCNTVR2 = 0x8855, // 10 001 0000 1010 101 + TRCCNTVR3 = 0x885d, // 10 001 0000 1011 101 + TRCIMSPEC0 = 0x8807, // 10 001 0000 0000 111 + TRCIMSPEC1 = 0x880f, // 10 001 0000 0001 111 + TRCIMSPEC2 = 0x8817, // 10 001 0000 0010 111 + TRCIMSPEC3 = 0x881f, // 10 001 0000 0011 111 + TRCIMSPEC4 = 0x8827, // 10 001 0000 0100 111 + TRCIMSPEC5 = 0x882f, // 10 001 0000 0101 111 + TRCIMSPEC6 = 0x8837, // 10 001 0000 0110 111 + TRCIMSPEC7 = 0x883f, // 10 001 0000 0111 111 + TRCRSCTLR2 = 0x8890, // 10 001 0001 0010 000 + TRCRSCTLR3 = 0x8898, // 10 001 0001 0011 000 + TRCRSCTLR4 = 0x88a0, // 10 001 0001 0100 000 + TRCRSCTLR5 = 0x88a8, // 10 001 0001 0101 000 + TRCRSCTLR6 = 0x88b0, // 10 001 0001 0110 000 + TRCRSCTLR7 = 0x88b8, // 10 001 0001 0111 000 + TRCRSCTLR8 = 0x88c0, // 10 001 0001 1000 000 + TRCRSCTLR9 = 0x88c8, // 10 001 0001 1001 000 + TRCRSCTLR10 = 0x88d0, // 10 001 0001 1010 000 + TRCRSCTLR11 = 0x88d8, // 10 001 0001 1011 000 + TRCRSCTLR12 = 0x88e0, // 10 001 0001 1100 000 + TRCRSCTLR13 = 0x88e8, // 10 001 0001 1101 000 + TRCRSCTLR14 = 0x88f0, // 10 001 0001 1110 000 + 
TRCRSCTLR15 = 0x88f8, // 10 001 0001 1111 000 + TRCRSCTLR16 = 0x8881, // 10 001 0001 0000 001 + TRCRSCTLR17 = 0x8889, // 10 001 0001 0001 001 + TRCRSCTLR18 = 0x8891, // 10 001 0001 0010 001 + TRCRSCTLR19 = 0x8899, // 10 001 0001 0011 001 + TRCRSCTLR20 = 0x88a1, // 10 001 0001 0100 001 + TRCRSCTLR21 = 0x88a9, // 10 001 0001 0101 001 + TRCRSCTLR22 = 0x88b1, // 10 001 0001 0110 001 + TRCRSCTLR23 = 0x88b9, // 10 001 0001 0111 001 + TRCRSCTLR24 = 0x88c1, // 10 001 0001 1000 001 + TRCRSCTLR25 = 0x88c9, // 10 001 0001 1001 001 + TRCRSCTLR26 = 0x88d1, // 10 001 0001 1010 001 + TRCRSCTLR27 = 0x88d9, // 10 001 0001 1011 001 + TRCRSCTLR28 = 0x88e1, // 10 001 0001 1100 001 + TRCRSCTLR29 = 0x88e9, // 10 001 0001 1101 001 + TRCRSCTLR30 = 0x88f1, // 10 001 0001 1110 001 + TRCRSCTLR31 = 0x88f9, // 10 001 0001 1111 001 + TRCSSCCR0 = 0x8882, // 10 001 0001 0000 010 + TRCSSCCR1 = 0x888a, // 10 001 0001 0001 010 + TRCSSCCR2 = 0x8892, // 10 001 0001 0010 010 + TRCSSCCR3 = 0x889a, // 10 001 0001 0011 010 + TRCSSCCR4 = 0x88a2, // 10 001 0001 0100 010 + TRCSSCCR5 = 0x88aa, // 10 001 0001 0101 010 + TRCSSCCR6 = 0x88b2, // 10 001 0001 0110 010 + TRCSSCCR7 = 0x88ba, // 10 001 0001 0111 010 + TRCSSCSR0 = 0x88c2, // 10 001 0001 1000 010 + TRCSSCSR1 = 0x88ca, // 10 001 0001 1001 010 + TRCSSCSR2 = 0x88d2, // 10 001 0001 1010 010 + TRCSSCSR3 = 0x88da, // 10 001 0001 1011 010 + TRCSSCSR4 = 0x88e2, // 10 001 0001 1100 010 + TRCSSCSR5 = 0x88ea, // 10 001 0001 1101 010 + TRCSSCSR6 = 0x88f2, // 10 001 0001 1110 010 + TRCSSCSR7 = 0x88fa, // 10 001 0001 1111 010 + TRCSSPCICR0 = 0x8883, // 10 001 0001 0000 011 + TRCSSPCICR1 = 0x888b, // 10 001 0001 0001 011 + TRCSSPCICR2 = 0x8893, // 10 001 0001 0010 011 + TRCSSPCICR3 = 0x889b, // 10 001 0001 0011 011 + TRCSSPCICR4 = 0x88a3, // 10 001 0001 0100 011 + TRCSSPCICR5 = 0x88ab, // 10 001 0001 0101 011 + TRCSSPCICR6 = 0x88b3, // 10 001 0001 0110 011 + TRCSSPCICR7 = 0x88bb, // 10 001 0001 0111 011 + TRCPDCR = 0x88a4, // 10 001 0001 0100 100 + TRCACVR0 = 0x8900, // 10 001 0010 0000 000 + TRCACVR1 = 0x8910, // 10 001 0010 0010 000 + TRCACVR2 = 0x8920, // 10 001 0010 0100 000 + TRCACVR3 = 0x8930, // 10 001 0010 0110 000 + TRCACVR4 = 0x8940, // 10 001 0010 1000 000 + TRCACVR5 = 0x8950, // 10 001 0010 1010 000 + TRCACVR6 = 0x8960, // 10 001 0010 1100 000 + TRCACVR7 = 0x8970, // 10 001 0010 1110 000 + TRCACVR8 = 0x8901, // 10 001 0010 0000 001 + TRCACVR9 = 0x8911, // 10 001 0010 0010 001 + TRCACVR10 = 0x8921, // 10 001 0010 0100 001 + TRCACVR11 = 0x8931, // 10 001 0010 0110 001 + TRCACVR12 = 0x8941, // 10 001 0010 1000 001 + TRCACVR13 = 0x8951, // 10 001 0010 1010 001 + TRCACVR14 = 0x8961, // 10 001 0010 1100 001 + TRCACVR15 = 0x8971, // 10 001 0010 1110 001 + TRCACATR0 = 0x8902, // 10 001 0010 0000 010 + TRCACATR1 = 0x8912, // 10 001 0010 0010 010 + TRCACATR2 = 0x8922, // 10 001 0010 0100 010 + TRCACATR3 = 0x8932, // 10 001 0010 0110 010 + TRCACATR4 = 0x8942, // 10 001 0010 1000 010 + TRCACATR5 = 0x8952, // 10 001 0010 1010 010 + TRCACATR6 = 0x8962, // 10 001 0010 1100 010 + TRCACATR7 = 0x8972, // 10 001 0010 1110 010 + TRCACATR8 = 0x8903, // 10 001 0010 0000 011 + TRCACATR9 = 0x8913, // 10 001 0010 0010 011 + TRCACATR10 = 0x8923, // 10 001 0010 0100 011 + TRCACATR11 = 0x8933, // 10 001 0010 0110 011 + TRCACATR12 = 0x8943, // 10 001 0010 1000 011 + TRCACATR13 = 0x8953, // 10 001 0010 1010 011 + TRCACATR14 = 0x8963, // 10 001 0010 1100 011 + TRCACATR15 = 0x8973, // 10 001 0010 1110 011 + TRCDVCVR0 = 0x8904, // 10 001 0010 0000 100 + TRCDVCVR1 = 0x8924, // 10 001 0010 0100 100 + TRCDVCVR2 = 
0x8944, // 10 001 0010 1000 100 + TRCDVCVR3 = 0x8964, // 10 001 0010 1100 100 + TRCDVCVR4 = 0x8905, // 10 001 0010 0000 101 + TRCDVCVR5 = 0x8925, // 10 001 0010 0100 101 + TRCDVCVR6 = 0x8945, // 10 001 0010 1000 101 + TRCDVCVR7 = 0x8965, // 10 001 0010 1100 101 + TRCDVCMR0 = 0x8906, // 10 001 0010 0000 110 + TRCDVCMR1 = 0x8926, // 10 001 0010 0100 110 + TRCDVCMR2 = 0x8946, // 10 001 0010 1000 110 + TRCDVCMR3 = 0x8966, // 10 001 0010 1100 110 + TRCDVCMR4 = 0x8907, // 10 001 0010 0000 111 + TRCDVCMR5 = 0x8927, // 10 001 0010 0100 111 + TRCDVCMR6 = 0x8947, // 10 001 0010 1000 111 + TRCDVCMR7 = 0x8967, // 10 001 0010 1100 111 + TRCCIDCVR0 = 0x8980, // 10 001 0011 0000 000 + TRCCIDCVR1 = 0x8990, // 10 001 0011 0010 000 + TRCCIDCVR2 = 0x89a0, // 10 001 0011 0100 000 + TRCCIDCVR3 = 0x89b0, // 10 001 0011 0110 000 + TRCCIDCVR4 = 0x89c0, // 10 001 0011 1000 000 + TRCCIDCVR5 = 0x89d0, // 10 001 0011 1010 000 + TRCCIDCVR6 = 0x89e0, // 10 001 0011 1100 000 + TRCCIDCVR7 = 0x89f0, // 10 001 0011 1110 000 + TRCVMIDCVR0 = 0x8981, // 10 001 0011 0000 001 + TRCVMIDCVR1 = 0x8991, // 10 001 0011 0010 001 + TRCVMIDCVR2 = 0x89a1, // 10 001 0011 0100 001 + TRCVMIDCVR3 = 0x89b1, // 10 001 0011 0110 001 + TRCVMIDCVR4 = 0x89c1, // 10 001 0011 1000 001 + TRCVMIDCVR5 = 0x89d1, // 10 001 0011 1010 001 + TRCVMIDCVR6 = 0x89e1, // 10 001 0011 1100 001 + TRCVMIDCVR7 = 0x89f1, // 10 001 0011 1110 001 + TRCCIDCCTLR0 = 0x8982, // 10 001 0011 0000 010 + TRCCIDCCTLR1 = 0x898a, // 10 001 0011 0001 010 + TRCVMIDCCTLR0 = 0x8992, // 10 001 0011 0010 010 + TRCVMIDCCTLR1 = 0x899a, // 10 001 0011 0011 010 + TRCITCTRL = 0x8b84, // 10 001 0111 0000 100 + TRCCLAIMSET = 0x8bc6, // 10 001 0111 1000 110 + TRCCLAIMCLR = 0x8bce, // 10 001 0111 1001 110 + + // GICv3 registers + ICC_BPR1_EL1 = 0xc663, // 11 000 1100 1100 011 + ICC_BPR0_EL1 = 0xc643, // 11 000 1100 1000 011 + ICC_PMR_EL1 = 0xc230, // 11 000 0100 0110 000 + ICC_CTLR_EL1 = 0xc664, // 11 000 1100 1100 100 + ICC_CTLR_EL3 = 0xf664, // 11 110 1100 1100 100 + ICC_SRE_EL1 = 0xc665, // 11 000 1100 1100 101 + ICC_SRE_EL2 = 0xe64d, // 11 100 1100 1001 101 + ICC_SRE_EL3 = 0xf665, // 11 110 1100 1100 101 + ICC_IGRPEN0_EL1 = 0xc666, // 11 000 1100 1100 110 + ICC_IGRPEN1_EL1 = 0xc667, // 11 000 1100 1100 111 + ICC_IGRPEN1_EL3 = 0xf667, // 11 110 1100 1100 111 + ICC_SEIEN_EL1 = 0xc668, // 11 000 1100 1101 000 + ICC_AP0R0_EL1 = 0xc644, // 11 000 1100 1000 100 + ICC_AP0R1_EL1 = 0xc645, // 11 000 1100 1000 101 + ICC_AP0R2_EL1 = 0xc646, // 11 000 1100 1000 110 + ICC_AP0R3_EL1 = 0xc647, // 11 000 1100 1000 111 + ICC_AP1R0_EL1 = 0xc648, // 11 000 1100 1001 000 + ICC_AP1R1_EL1 = 0xc649, // 11 000 1100 1001 001 + ICC_AP1R2_EL1 = 0xc64a, // 11 000 1100 1001 010 + ICC_AP1R3_EL1 = 0xc64b, // 11 000 1100 1001 011 + ICH_AP0R0_EL2 = 0xe640, // 11 100 1100 1000 000 + ICH_AP0R1_EL2 = 0xe641, // 11 100 1100 1000 001 + ICH_AP0R2_EL2 = 0xe642, // 11 100 1100 1000 010 + ICH_AP0R3_EL2 = 0xe643, // 11 100 1100 1000 011 + ICH_AP1R0_EL2 = 0xe648, // 11 100 1100 1001 000 + ICH_AP1R1_EL2 = 0xe649, // 11 100 1100 1001 001 + ICH_AP1R2_EL2 = 0xe64a, // 11 100 1100 1001 010 + ICH_AP1R3_EL2 = 0xe64b, // 11 100 1100 1001 011 + ICH_HCR_EL2 = 0xe658, // 11 100 1100 1011 000 + ICH_MISR_EL2 = 0xe65a, // 11 100 1100 1011 010 + ICH_VMCR_EL2 = 0xe65f, // 11 100 1100 1011 111 + ICH_VSEIR_EL2 = 0xe64c, // 11 100 1100 1001 100 + ICH_LR0_EL2 = 0xe660, // 11 100 1100 1100 000 + ICH_LR1_EL2 = 0xe661, // 11 100 1100 1100 001 + ICH_LR2_EL2 = 0xe662, // 11 100 1100 1100 010 + ICH_LR3_EL2 = 0xe663, // 11 100 1100 1100 011 + ICH_LR4_EL2 = 
0xe664, // 11 100 1100 1100 100 + ICH_LR5_EL2 = 0xe665, // 11 100 1100 1100 101 + ICH_LR6_EL2 = 0xe666, // 11 100 1100 1100 110 + ICH_LR7_EL2 = 0xe667, // 11 100 1100 1100 111 + ICH_LR8_EL2 = 0xe668, // 11 100 1100 1101 000 + ICH_LR9_EL2 = 0xe669, // 11 100 1100 1101 001 + ICH_LR10_EL2 = 0xe66a, // 11 100 1100 1101 010 + ICH_LR11_EL2 = 0xe66b, // 11 100 1100 1101 011 + ICH_LR12_EL2 = 0xe66c, // 11 100 1100 1101 100 + ICH_LR13_EL2 = 0xe66d, // 11 100 1100 1101 101 + ICH_LR14_EL2 = 0xe66e, // 11 100 1100 1101 110 + ICH_LR15_EL2 = 0xe66f, // 11 100 1100 1101 111 + }; + + // Cyclone specific system registers + enum CycloneSysRegValues { + CPM_IOACC_CTL_EL3 = 0xff90 + }; + + // Note that these do not inherit from AArch64NamedImmMapper. This class is + // sufficiently different in its behaviour that I don't believe it's worth + // burdening the common AArch64NamedImmMapper with abstractions only needed in + // this one case. + struct SysRegMapper { + static const AArch64NamedImmMapper::Mapping SysRegPairs[]; + static const AArch64NamedImmMapper::Mapping CycloneSysRegPairs[]; + + const AArch64NamedImmMapper::Mapping *InstPairs; + size_t NumInstPairs; + uint64_t FeatureBits; + + SysRegMapper(uint64_t FeatureBits) : FeatureBits(FeatureBits) { } + uint32_t fromString(StringRef Name, bool &Valid) const; + std::string toString(uint32_t Bits, bool &Valid) const; + }; + + struct MSRMapper : SysRegMapper { + static const AArch64NamedImmMapper::Mapping MSRPairs[]; + MSRMapper(uint64_t FeatureBits); + }; + + struct MRSMapper : SysRegMapper { + static const AArch64NamedImmMapper::Mapping MRSPairs[]; + MRSMapper(uint64_t FeatureBits); + }; + + uint32_t ParseGenericRegister(StringRef Name, bool &Valid); +} + +namespace AArch64TLBI { + enum TLBIValues { + Invalid = -1, // Op0 Op1 CRn CRm Op2 + IPAS2E1IS = 0x6401, // 01 100 1000 0000 001 + IPAS2LE1IS = 0x6405, // 01 100 1000 0000 101 + VMALLE1IS = 0x4418, // 01 000 1000 0011 000 + ALLE2IS = 0x6418, // 01 100 1000 0011 000 + ALLE3IS = 0x7418, // 01 110 1000 0011 000 + VAE1IS = 0x4419, // 01 000 1000 0011 001 + VAE2IS = 0x6419, // 01 100 1000 0011 001 + VAE3IS = 0x7419, // 01 110 1000 0011 001 + ASIDE1IS = 0x441a, // 01 000 1000 0011 010 + VAAE1IS = 0x441b, // 01 000 1000 0011 011 + ALLE1IS = 0x641c, // 01 100 1000 0011 100 + VALE1IS = 0x441d, // 01 000 1000 0011 101 + VALE2IS = 0x641d, // 01 100 1000 0011 101 + VALE3IS = 0x741d, // 01 110 1000 0011 101 + VMALLS12E1IS = 0x641e, // 01 100 1000 0011 110 + VAALE1IS = 0x441f, // 01 000 1000 0011 111 + IPAS2E1 = 0x6421, // 01 100 1000 0100 001 + IPAS2LE1 = 0x6425, // 01 100 1000 0100 101 + VMALLE1 = 0x4438, // 01 000 1000 0111 000 + ALLE2 = 0x6438, // 01 100 1000 0111 000 + ALLE3 = 0x7438, // 01 110 1000 0111 000 + VAE1 = 0x4439, // 01 000 1000 0111 001 + VAE2 = 0x6439, // 01 100 1000 0111 001 + VAE3 = 0x7439, // 01 110 1000 0111 001 + ASIDE1 = 0x443a, // 01 000 1000 0111 010 + VAAE1 = 0x443b, // 01 000 1000 0111 011 + ALLE1 = 0x643c, // 01 100 1000 0111 100 + VALE1 = 0x443d, // 01 000 1000 0111 101 + VALE2 = 0x643d, // 01 100 1000 0111 101 + VALE3 = 0x743d, // 01 110 1000 0111 101 + VMALLS12E1 = 0x643e, // 01 100 1000 0111 110 + VAALE1 = 0x443f // 01 000 1000 0111 111 + }; + + struct TLBIMapper : AArch64NamedImmMapper { + const static Mapping TLBIPairs[]; + + TLBIMapper(); + }; + + static inline bool NeedsRegister(TLBIValues Val) { + switch (Val) { + case VMALLE1IS: + case ALLE2IS: + case ALLE3IS: + case ALLE1IS: + case VMALLS12E1IS: + case VMALLE1: + case ALLE2: + case ALLE3: + case ALLE1: + case VMALLS12E1: + 
return false; + default: + return true; + } + } +} + +namespace AArch64II { + /// Target Operand Flag enum. + enum TOF { + //===------------------------------------------------------------------===// + // AArch64 Specific MachineOperand flags. + + MO_NO_FLAG, + + MO_FRAGMENT = 0x7, + + /// MO_PAGE - A symbol operand with this flag represents the pc-relative + /// offset of the 4K page containing the symbol. This is used with the + /// ADRP instruction. + MO_PAGE = 1, + + /// MO_PAGEOFF - A symbol operand with this flag represents the offset of + /// that symbol within a 4K page. This offset is added to the page address + /// to produce the complete address. + MO_PAGEOFF = 2, + + /// MO_G3 - A symbol operand with this flag (granule 3) represents the high + /// 16-bits of a 64-bit address, used in a MOVZ or MOVK instruction + MO_G3 = 3, + + /// MO_G2 - A symbol operand with this flag (granule 2) represents the bits + /// 32-47 of a 64-bit address, used in a MOVZ or MOVK instruction + MO_G2 = 4, + + /// MO_G1 - A symbol operand with this flag (granule 1) represents the bits + /// 16-31 of a 64-bit address, used in a MOVZ or MOVK instruction + MO_G1 = 5, + + /// MO_G0 - A symbol operand with this flag (granule 0) represents the bits + /// 0-15 of a 64-bit address, used in a MOVZ or MOVK instruction + MO_G0 = 6, + + /// MO_GOT - This flag indicates that a symbol operand represents the + /// address of the GOT entry for the symbol, rather than the address of + /// the symbol itself. + MO_GOT = 8, + + /// MO_NC - Indicates whether the linker is expected to check the symbol + /// reference for overflow. For example in an ADRP/ADD pair of relocations + /// the ADRP usually does check, but not the ADD. + MO_NC = 0x10, + + /// MO_TLS - Indicates that the operand being accessed is some kind of + /// thread-local symbol. On Darwin, only one type of thread-local access + /// exists (pre linker-relaxation), but on ELF the TLSModel used for the + /// referee will affect interpretation. + MO_TLS = 0x20 + }; +} // end namespace AArch64II + +} // end namespace llvm + +#endif diff --git a/lib/Target/AArch64/Utils/CMakeLists.txt b/lib/Target/AArch64/Utils/CMakeLists.txt new file mode 100644 index 00000000000..8ee03a7571b --- /dev/null +++ b/lib/Target/AArch64/Utils/CMakeLists.txt @@ -0,0 +1,3 @@ +add_llvm_library(LLVMAArch64Utils + AArch64BaseInfo.cpp + ) diff --git a/lib/Target/AArch64/Utils/LLVMBuild.txt b/lib/Target/AArch64/Utils/LLVMBuild.txt new file mode 100644 index 00000000000..bcefeb672f7 --- /dev/null +++ b/lib/Target/AArch64/Utils/LLVMBuild.txt @@ -0,0 +1,23 @@ +;===- ./lib/Target/AArch64/Utils/LLVMBuild.txt ----------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. 
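
A quick aside on the numeric values in the system-register and TLBI tables above: each encoding is simply the five fields from the trailing comment (Op0, Op1, CRn, CRm, Op2, which are 2, 3, 4, 4 and 3 bits wide) packed into a 16-bit value. A minimal sketch, with an invented helper name that is not part of AArch64BaseInfo:

#include <cstdint>

// Layout: Op0[15:14] Op1[13:11] CRn[10:7] CRm[6:3] Op2[2:0]
static inline uint16_t encodeSysReg(unsigned Op0, unsigned Op1, unsigned CRn,
                                    unsigned CRm, unsigned Op2) {
  return static_cast<uint16_t>((Op0 << 14) | (Op1 << 11) | (CRn << 7) |
                               (CRm << 3) | Op2);
}

// For example, encodeSysReg(0b11, 0b000, 0b0100, 0b0110, 0b000) == 0xc230,
// which matches ICC_PMR_EL1 above. The SysRegMapper declared above converts
// between such encodings and register names in both directions.
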
+; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = AArch64Utils +parent = AArch64 +required_libraries = Support +add_to_library_groups = AArch64 diff --git a/lib/Target/AArch64/Utils/Makefile b/lib/Target/AArch64/Utils/Makefile new file mode 100644 index 00000000000..0b80f82f2b9 --- /dev/null +++ b/lib/Target/AArch64/Utils/Makefile @@ -0,0 +1,16 @@ +##===- lib/Target/AArch64/Utils/Makefile -------------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## +LEVEL = ../../../.. +LIBRARYNAME = LLVMAArch64Utils + +# Hack: we need to include 'main' AArch64 target directory to grab private +# headers +CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. + +include $(LEVEL)/Makefile.common diff --git a/lib/Target/ARM64/ARM64.h b/lib/Target/ARM64/ARM64.h deleted file mode 100644 index debb9002eb4..00000000000 --- a/lib/Target/ARM64/ARM64.h +++ /dev/null @@ -1,48 +0,0 @@ -//===-- ARM64.h - Top-level interface for ARM64 representation --*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains the entry points for global functions defined in the LLVM -// ARM64 back-end. -// -//===----------------------------------------------------------------------===// - -#ifndef TARGET_ARM64_H -#define TARGET_ARM64_H - -#include "Utils/ARM64BaseInfo.h" -#include "MCTargetDesc/ARM64MCTargetDesc.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/Support/DataTypes.h" - -namespace llvm { - -class ARM64TargetMachine; -class FunctionPass; -class MachineFunctionPass; - -FunctionPass *createARM64DeadRegisterDefinitions(); -FunctionPass *createARM64ConditionalCompares(); -FunctionPass *createARM64AdvSIMDScalar(); -FunctionPass *createARM64BranchRelaxation(); -FunctionPass *createARM64ISelDag(ARM64TargetMachine &TM, - CodeGenOpt::Level OptLevel); -FunctionPass *createARM64StorePairSuppressPass(); -FunctionPass *createARM64ExpandPseudoPass(); -FunctionPass *createARM64LoadStoreOptimizationPass(); -ModulePass *createARM64PromoteConstantPass(); -FunctionPass *createARM64AddressTypePromotionPass(); -/// \brief Creates an ARM-specific Target Transformation Info pass. -ImmutablePass *createARM64TargetTransformInfoPass(const ARM64TargetMachine *TM); - -FunctionPass *createARM64CleanupLocalDynamicTLSPass(); - -FunctionPass *createARM64CollectLOHPass(); -} // end namespace llvm - -#endif diff --git a/lib/Target/ARM64/ARM64.td b/lib/Target/ARM64/ARM64.td deleted file mode 100644 index c473205f17c..00000000000 --- a/lib/Target/ARM64/ARM64.td +++ /dev/null @@ -1,134 +0,0 @@ -//===- ARM64.td - Describe the ARM64 Target Machine --------*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -// Target-independent interfaces which we are implementing -//===----------------------------------------------------------------------===// - -include "llvm/Target/Target.td" - -//===----------------------------------------------------------------------===// -// ARM64 Subtarget features. -// - -def FeatureFPARMv8 : SubtargetFeature<"fp-armv8", "HasFPARMv8", "true", - "Enable ARMv8 FP">; - -def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true", - "Enable Advanced SIMD instructions", [FeatureFPARMv8]>; - -def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true", - "Enable cryptographic instructions">; - -def FeatureCRC : SubtargetFeature<"crc", "HasCRC", "true", - "Enable ARMv8 CRC-32 checksum instructions">; - -/// Cyclone has register move instructions which are "free". -def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true", - "Has zero-cycle register moves">; - -/// Cyclone has instructions which zero registers for "free". -def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true", - "Has zero-cycle zeroing instructions">; - -//===----------------------------------------------------------------------===// -// Register File Description -//===----------------------------------------------------------------------===// - -include "ARM64RegisterInfo.td" -include "ARM64CallingConvention.td" - -//===----------------------------------------------------------------------===// -// Instruction Descriptions -//===----------------------------------------------------------------------===// - -include "ARM64Schedule.td" -include "ARM64InstrInfo.td" - -def ARM64InstrInfo : InstrInfo; - -//===----------------------------------------------------------------------===// -// ARM64 Processors supported. 
-// -include "ARM64SchedA53.td" -include "ARM64SchedCyclone.td" - -def ProcA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53", - "Cortex-A53 ARM processors", - [FeatureFPARMv8, - FeatureNEON, - FeatureCrypto, - FeatureCRC]>; - -def ProcA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57", - "Cortex-A57 ARM processors", - [FeatureFPARMv8, - FeatureNEON, - FeatureCrypto, - FeatureCRC]>; - -def ProcCyclone : SubtargetFeature<"cyclone", "ARMProcFamily", "Cyclone", - "Cyclone", - [FeatureFPARMv8, - FeatureNEON, - FeatureCrypto, - FeatureCRC, - FeatureZCRegMove, FeatureZCZeroing]>; - -def : ProcessorModel<"generic", NoSchedModel, [FeatureFPARMv8, - FeatureNEON, - FeatureCRC]>; - -def : ProcessorModel<"cortex-a53", CortexA53Model, [ProcA53]>; -def : ProcessorModel<"cortex-a57", NoSchedModel, [ProcA57]>; -def : ProcessorModel<"cyclone", CycloneModel, [ProcCyclone]>; - -//===----------------------------------------------------------------------===// -// Assembly parser -//===----------------------------------------------------------------------===// - -def GenericAsmParserVariant : AsmParserVariant { - int Variant = 0; - string Name = "generic"; -} - -def AppleAsmParserVariant : AsmParserVariant { - int Variant = 1; - string Name = "apple-neon"; -} - -//===----------------------------------------------------------------------===// -// Assembly printer -//===----------------------------------------------------------------------===// -// ARM64 Uses the MC printer for asm output, so make sure the TableGen -// AsmWriter bits get associated with the correct class. -def GenericAsmWriter : AsmWriter { - string AsmWriterClassName = "InstPrinter"; - int Variant = 0; - bit isMCAsmWriter = 1; -} - -def AppleAsmWriter : AsmWriter { - let AsmWriterClassName = "AppleInstPrinter"; - int Variant = 1; - int isMCAsmWriter = 1; -} - -//===----------------------------------------------------------------------===// -// Target Declaration -//===----------------------------------------------------------------------===// - -def ARM64 : Target { - let InstructionSet = ARM64InstrInfo; - let AssemblyParserVariants = [GenericAsmParserVariant, AppleAsmParserVariant]; - let AssemblyWriters = [GenericAsmWriter, AppleAsmWriter]; -} diff --git a/lib/Target/ARM64/ARM64AddressTypePromotion.cpp b/lib/Target/ARM64/ARM64AddressTypePromotion.cpp deleted file mode 100644 index be2b5eed2ad..00000000000 --- a/lib/Target/ARM64/ARM64AddressTypePromotion.cpp +++ /dev/null @@ -1,493 +0,0 @@ - -//===-- ARM64AddressTypePromotion.cpp --- Promote type for addr accesses -===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This pass tries to promote the computations use to obtained a sign extended -// value used into memory accesses. -// E.g. -// a = add nsw i32 b, 3 -// d = sext i32 a to i64 -// e = getelementptr ..., i64 d -// -// => -// f = sext i32 b to i64 -// a = add nsw i64 f, 3 -// e = getelementptr ..., i64 a -// -// This is legal to do so if the computations are markers with either nsw or nuw -// markers. -// Moreover, the current heuristic is simple: it does not create new sext -// operations, i.e., it gives up when a sext would have forked (e.g., if -// a = add i32 b, c, two sexts are required to promote the computation). -// -// FIXME: This pass may be useful for other targets too. 
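
One clarifying note on the legality condition stated in the pass header above: without the nsw/nuw guarantee, hoisting the sign extension can change the computed value, because the narrow add may wrap. A small standalone illustration in plain C++, not part of the pass:

#include <cassert>
#include <cstdint>

int main() {
  int32_t b = INT32_MAX; // 2147483647
  // Original order: the 32-bit add wraps (two's complement), then the result
  // is sign-extended to 64 bits.
  int64_t before = int64_t(int32_t(uint32_t(b) + 3u)); // -2147483646
  // After hoisting the sext: the add is done in 64 bits and cannot wrap.
  int64_t after = int64_t(b) + 3;                      // 2147483650
  assert(before != after); // hence the nsw/nuw requirement above
  return 0;
}
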
-// ===---------------------------------------------------------------------===// - -#include "ARM64.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/Dominators.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/Module.h" -#include "llvm/IR/Operator.h" -#include "llvm/Pass.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" - -using namespace llvm; - -#define DEBUG_TYPE "arm64-type-promotion" - -static cl::opt -EnableAddressTypePromotion("arm64-type-promotion", cl::Hidden, - cl::desc("Enable the type promotion pass"), - cl::init(true)); -static cl::opt -EnableMerge("arm64-type-promotion-merge", cl::Hidden, - cl::desc("Enable merging of redundant sexts when one is dominating" - " the other."), - cl::init(true)); - -//===----------------------------------------------------------------------===// -// ARM64AddressTypePromotion -//===----------------------------------------------------------------------===// - -namespace llvm { -void initializeARM64AddressTypePromotionPass(PassRegistry &); -} - -namespace { -class ARM64AddressTypePromotion : public FunctionPass { - -public: - static char ID; - ARM64AddressTypePromotion() - : FunctionPass(ID), Func(nullptr), ConsideredSExtType(nullptr) { - initializeARM64AddressTypePromotionPass(*PassRegistry::getPassRegistry()); - } - - const char *getPassName() const override { - return "ARM64 Address Type Promotion"; - } - - /// Iterate over the functions and promote the computation of interesting - // sext instructions. - bool runOnFunction(Function &F) override; - -private: - /// The current function. - Function *Func; - /// Filter out all sexts that does not have this type. - /// Currently initialized with Int64Ty. - Type *ConsideredSExtType; - - // This transformation requires dominator info. - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - AU.addRequired(); - AU.addPreserved(); - FunctionPass::getAnalysisUsage(AU); - } - - typedef SmallPtrSet SetOfInstructions; - typedef SmallVector Instructions; - typedef DenseMap ValueToInsts; - - /// Check if it is profitable to move a sext through this instruction. - /// Currently, we consider it is profitable if: - /// - Inst is used only once (no need to insert truncate). - /// - Inst has only one operand that will require a sext operation (we do - /// do not create new sext operation). - bool shouldGetThrough(const Instruction *Inst); - - /// Check if it is possible and legal to move a sext through this - /// instruction. - /// Current heuristic considers that we can get through: - /// - Arithmetic operation marked with the nsw or nuw flag. - /// - Other sext operation. - /// - Truncate operation if it was just dropping sign extended bits. - bool canGetThrough(const Instruction *Inst); - - /// Move sext operations through safe to sext instructions. - bool propagateSignExtension(Instructions &SExtInsts); - - /// Is this sext should be considered for code motion. - /// We look for sext with ConsideredSExtType and uses in at least one - // GetElementPtrInst. - bool shouldConsiderSExt(const Instruction *SExt) const; - - /// Collect all interesting sext operations, i.e., the ones with the right - /// type and used in memory accesses. 
- /// More precisely, a sext instruction is considered as interesting if it - /// is used in a "complex" getelementptr or it exits at least another - /// sext instruction that sign extended the same initial value. - /// A getelementptr is considered as "complex" if it has more than 2 - // operands. - void analyzeSExtension(Instructions &SExtInsts); - - /// Merge redundant sign extension operations in common dominator. - void mergeSExts(ValueToInsts &ValToSExtendedUses, - SetOfInstructions &ToRemove); -}; -} // end anonymous namespace. - -char ARM64AddressTypePromotion::ID = 0; - -INITIALIZE_PASS_BEGIN(ARM64AddressTypePromotion, "arm64-type-promotion", - "ARM64 Type Promotion Pass", false, false) -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_END(ARM64AddressTypePromotion, "arm64-type-promotion", - "ARM64 Type Promotion Pass", false, false) - -FunctionPass *llvm::createARM64AddressTypePromotionPass() { - return new ARM64AddressTypePromotion(); -} - -bool ARM64AddressTypePromotion::canGetThrough(const Instruction *Inst) { - if (isa(Inst)) - return true; - - const BinaryOperator *BinOp = dyn_cast(Inst); - if (BinOp && isa(BinOp) && - (BinOp->hasNoUnsignedWrap() || BinOp->hasNoSignedWrap())) - return true; - - // sext(trunc(sext)) --> sext - if (isa(Inst) && isa(Inst->getOperand(0))) { - const Instruction *Opnd = cast(Inst->getOperand(0)); - // Check that the truncate just drop sign extended bits. - if (Inst->getType()->getIntegerBitWidth() >= - Opnd->getOperand(0)->getType()->getIntegerBitWidth() && - Inst->getOperand(0)->getType()->getIntegerBitWidth() <= - ConsideredSExtType->getIntegerBitWidth()) - return true; - } - - return false; -} - -bool ARM64AddressTypePromotion::shouldGetThrough(const Instruction *Inst) { - // If the type of the sext is the same as the considered one, this sext - // will become useless. - // Otherwise, we will have to do something to preserve the original value, - // unless it is used once. - if (isa(Inst) && - (Inst->getType() == ConsideredSExtType || Inst->hasOneUse())) - return true; - - // If the Inst is used more that once, we may need to insert truncate - // operations and we don't do that at the moment. - if (!Inst->hasOneUse()) - return false; - - // This truncate is used only once, thus if we can get thourgh, it will become - // useless. - if (isa(Inst)) - return true; - - // If both operands are not constant, a new sext will be created here. - // Current heuristic is: each step should be profitable. - // Therefore we don't allow to increase the number of sext even if it may - // be profitable later on. - if (isa(Inst) && isa(Inst->getOperand(1))) - return true; - - return false; -} - -static bool shouldSExtOperand(const Instruction *Inst, int OpIdx) { - if (isa(Inst) && OpIdx == 0) - return false; - return true; -} - -bool -ARM64AddressTypePromotion::shouldConsiderSExt(const Instruction *SExt) const { - if (SExt->getType() != ConsideredSExtType) - return false; - - for (const Use &U : SExt->uses()) { - if (isa(*U)) - return true; - } - - return false; -} - -// Input: -// - SExtInsts contains all the sext instructions that are use direclty in -// GetElementPtrInst, i.e., access to memory. -// Algorithm: -// - For each sext operation in SExtInsts: -// Let var be the operand of sext. -// while it is profitable (see shouldGetThrough), legal, and safe -// (see canGetThrough) to move sext through var's definition: -// * promote the type of var's definition. -// * fold var into sext uses. -// * move sext above var's definition. 
-// * update sext operand to use the operand of var that should be sign -// extended (by construction there is only one). -// -// E.g., -// a = ... i32 c, 3 -// b = sext i32 a to i64 <- is it legal/safe/profitable to get through 'a' -// ... -// = b -// => Yes, update the code -// b = sext i32 c to i64 -// a = ... i64 b, 3 -// ... -// = a -// Iterate on 'c'. -bool -ARM64AddressTypePromotion::propagateSignExtension(Instructions &SExtInsts) { - DEBUG(dbgs() << "*** Propagate Sign Extension ***\n"); - - bool LocalChange = false; - SetOfInstructions ToRemove; - ValueToInsts ValToSExtendedUses; - while (!SExtInsts.empty()) { - // Get through simple chain. - Instruction *SExt = SExtInsts.pop_back_val(); - - DEBUG(dbgs() << "Consider:\n" << *SExt << '\n'); - - // If this SExt has already been merged continue. - if (SExt->use_empty() && ToRemove.count(SExt)) { - DEBUG(dbgs() << "No uses => marked as delete\n"); - continue; - } - - // Now try to get through the chain of definitions. - while (isa(SExt->getOperand(0))) { - Instruction *Inst = dyn_cast(SExt->getOperand(0)); - DEBUG(dbgs() << "Try to get through:\n" << *Inst << '\n'); - if (!canGetThrough(Inst) || !shouldGetThrough(Inst)) { - // We cannot get through something that is not an Instruction - // or not safe to SExt. - DEBUG(dbgs() << "Cannot get through\n"); - break; - } - - LocalChange = true; - // If this is a sign extend, it becomes useless. - if (isa(Inst) || isa(Inst)) { - DEBUG(dbgs() << "SExt or trunc, mark it as to remove\n"); - // We cannot use replaceAllUsesWith here because we may trigger some - // assertion on the type as all involved sext operation may have not - // been moved yet. - while (!Inst->use_empty()) { - Value::use_iterator UseIt = Inst->use_begin(); - Instruction *UseInst = dyn_cast(*UseIt); - assert(UseInst && "Use of sext is not an Instruction!"); - UseInst->setOperand(UseIt->getOperandNo(), SExt); - } - ToRemove.insert(Inst); - SExt->setOperand(0, Inst->getOperand(0)); - SExt->moveBefore(Inst); - continue; - } - - // Get through the Instruction: - // 1. Update its type. - // 2. Replace the uses of SExt by Inst. - // 3. Sign extend each operand that needs to be sign extended. - - // Step #1. - Inst->mutateType(SExt->getType()); - // Step #2. - SExt->replaceAllUsesWith(Inst); - // Step #3. - Instruction *SExtForOpnd = SExt; - - DEBUG(dbgs() << "Propagate SExt to operands\n"); - for (int OpIdx = 0, EndOpIdx = Inst->getNumOperands(); OpIdx != EndOpIdx; - ++OpIdx) { - DEBUG(dbgs() << "Operand:\n" << *(Inst->getOperand(OpIdx)) << '\n'); - if (Inst->getOperand(OpIdx)->getType() == SExt->getType() || - !shouldSExtOperand(Inst, OpIdx)) { - DEBUG(dbgs() << "No need to propagate\n"); - continue; - } - // Check if we can statically sign extend the operand. - Value *Opnd = Inst->getOperand(OpIdx); - if (const ConstantInt *Cst = dyn_cast(Opnd)) { - DEBUG(dbgs() << "Statically sign extend\n"); - Inst->setOperand(OpIdx, ConstantInt::getSigned(SExt->getType(), - Cst->getSExtValue())); - continue; - } - // UndefValue are typed, so we have to statically sign extend them. - if (isa(Opnd)) { - DEBUG(dbgs() << "Statically sign extend\n"); - Inst->setOperand(OpIdx, UndefValue::get(SExt->getType())); - continue; - } - - // Otherwise we have to explicity sign extend it. - assert(SExtForOpnd && - "Only one operand should have been sign extended"); - - SExtForOpnd->setOperand(0, Opnd); - - DEBUG(dbgs() << "Move before:\n" << *Inst << "\nSign extend\n"); - // Move the sign extension before the insertion point. 
- SExtForOpnd->moveBefore(Inst); - Inst->setOperand(OpIdx, SExtForOpnd); - // If more sext are required, new instructions will have to be created. - SExtForOpnd = nullptr; - } - if (SExtForOpnd == SExt) { - DEBUG(dbgs() << "Sign extension is useless now\n"); - ToRemove.insert(SExt); - break; - } - } - - // If the use is already of the right type, connect its uses to its argument - // and delete it. - // This can happen for an Instruction which all uses are sign extended. - if (!ToRemove.count(SExt) && - SExt->getType() == SExt->getOperand(0)->getType()) { - DEBUG(dbgs() << "Sign extension is useless, attach its use to " - "its argument\n"); - SExt->replaceAllUsesWith(SExt->getOperand(0)); - ToRemove.insert(SExt); - } else - ValToSExtendedUses[SExt->getOperand(0)].push_back(SExt); - } - - if (EnableMerge) - mergeSExts(ValToSExtendedUses, ToRemove); - - // Remove all instructions marked as ToRemove. - for (Instruction *I: ToRemove) - I->eraseFromParent(); - return LocalChange; -} - -void ARM64AddressTypePromotion::mergeSExts(ValueToInsts &ValToSExtendedUses, - SetOfInstructions &ToRemove) { - DominatorTree &DT = getAnalysis().getDomTree(); - - for (auto &Entry : ValToSExtendedUses) { - Instructions &Insts = Entry.second; - Instructions CurPts; - for (Instruction *Inst : Insts) { - if (ToRemove.count(Inst)) - continue; - bool inserted = false; - for (auto Pt : CurPts) { - if (DT.dominates(Inst, Pt)) { - DEBUG(dbgs() << "Replace all uses of:\n" << *Pt << "\nwith:\n" - << *Inst << '\n'); - (Pt)->replaceAllUsesWith(Inst); - ToRemove.insert(Pt); - Pt = Inst; - inserted = true; - break; - } - if (!DT.dominates(Pt, Inst)) - // Give up if we need to merge in a common dominator as the - // expermients show it is not profitable. - continue; - - DEBUG(dbgs() << "Replace all uses of:\n" << *Inst << "\nwith:\n" - << *Pt << '\n'); - Inst->replaceAllUsesWith(Pt); - ToRemove.insert(Inst); - inserted = true; - break; - } - if (!inserted) - CurPts.push_back(Inst); - } - } -} - -void ARM64AddressTypePromotion::analyzeSExtension(Instructions &SExtInsts) { - DEBUG(dbgs() << "*** Analyze Sign Extensions ***\n"); - - DenseMap SeenChains; - - for (auto &BB : *Func) { - for (auto &II : BB) { - Instruction *SExt = &II; - - // Collect all sext operation per type. - if (!isa(SExt) || !shouldConsiderSExt(SExt)) - continue; - - DEBUG(dbgs() << "Found:\n" << (*SExt) << '\n'); - - // Cases where we actually perform the optimization: - // 1. SExt is used in a getelementptr with more than 2 operand => - // likely we can merge some computation if they are done on 64 bits. - // 2. The beginning of the SExt chain is SExt several time. => - // code sharing is possible. - - bool insert = false; - // #1. - for (const Use &U : SExt->uses()) { - const Instruction *Inst = dyn_cast(U); - if (Inst && Inst->getNumOperands() > 2) { - DEBUG(dbgs() << "Interesting use in GetElementPtrInst\n" << *Inst - << '\n'); - insert = true; - break; - } - } - - // #2. - // Check the head of the chain. 
- Instruction *Inst = SExt; - Value *Last; - do { - int OpdIdx = 0; - const BinaryOperator *BinOp = dyn_cast(Inst); - if (BinOp && isa(BinOp->getOperand(0))) - OpdIdx = 1; - Last = Inst->getOperand(OpdIdx); - Inst = dyn_cast(Last); - } while (Inst && canGetThrough(Inst) && shouldGetThrough(Inst)); - - DEBUG(dbgs() << "Head of the chain:\n" << *Last << '\n'); - DenseMap::iterator AlreadySeen = - SeenChains.find(Last); - if (insert || AlreadySeen != SeenChains.end()) { - DEBUG(dbgs() << "Insert\n"); - SExtInsts.push_back(SExt); - if (AlreadySeen != SeenChains.end() && AlreadySeen->second != nullptr) { - DEBUG(dbgs() << "Insert chain member\n"); - SExtInsts.push_back(AlreadySeen->second); - SeenChains[Last] = nullptr; - } - } else { - DEBUG(dbgs() << "Record its chain membership\n"); - SeenChains[Last] = SExt; - } - } - } -} - -bool ARM64AddressTypePromotion::runOnFunction(Function &F) { - if (!EnableAddressTypePromotion || F.isDeclaration()) - return false; - Func = &F; - ConsideredSExtType = Type::getInt64Ty(Func->getContext()); - - DEBUG(dbgs() << "*** " << getPassName() << ": " << Func->getName() << '\n'); - - Instructions SExtInsts; - analyzeSExtension(SExtInsts); - return propagateSignExtension(SExtInsts); -} diff --git a/lib/Target/ARM64/ARM64AdvSIMDScalarPass.cpp b/lib/Target/ARM64/ARM64AdvSIMDScalarPass.cpp deleted file mode 100644 index 5950a8f18e1..00000000000 --- a/lib/Target/ARM64/ARM64AdvSIMDScalarPass.cpp +++ /dev/null @@ -1,385 +0,0 @@ -//===-- ARM64AdvSIMDScalar.cpp - Replace dead defs w/ zero reg --===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// When profitable, replace GPR targeting i64 instructions with their -// AdvSIMD scalar equivalents. Generally speaking, "profitable" is defined -// as minimizing the number of cross-class register copies. -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -// TODO: Graph based predicate heuristics. -// Walking the instruction list linearly will get many, perhaps most, of -// the cases, but to do a truly thorough job of this, we need a more -// wholistic approach. -// -// This optimization is very similar in spirit to the register allocator's -// spill placement, only here we're determining where to place cross-class -// register copies rather than spills. As such, a similar approach is -// called for. -// -// We want to build up a set of graphs of all instructions which are candidates -// for transformation along with instructions which generate their inputs and -// consume their outputs. For each edge in the graph, we assign a weight -// based on whether there is a copy required there (weight zero if not) and -// the block frequency of the block containing the defining or using -// instruction, whichever is less. Our optimization is then a graph problem -// to minimize the total weight of all the graphs, then transform instructions -// and add or remove copy instructions as called for to implement the -// solution. 
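
The graph formulation sketched in the TODO above is not implemented by this pass, but the edge weight it describes is easy to pin down. A hypothetical sketch with invented names, purely to make the description concrete:

#include <algorithm>
#include <cstdint>

// Weight of one edge in the proposed copy-placement graph: zero if the edge
// needs no cross-class copy, otherwise the block frequency of the cheaper of
// the defining and using blocks, as described in the TODO above.
static uint64_t crossClassCopyWeight(bool CopyRequired, uint64_t DefBlockFreq,
                                     uint64_t UseBlockFreq) {
  if (!CopyRequired)
    return 0;
  return std::min(DefBlockFreq, UseBlockFreq);
}
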
-//===----------------------------------------------------------------------===// - -#include "ARM64.h" -#include "ARM64InstrInfo.h" -#include "ARM64RegisterInfo.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -using namespace llvm; - -#define DEBUG_TYPE "arm64-simd-scalar" - -// Allow forcing all i64 operations with equivalent SIMD instructions to use -// them. For stress-testing the transformation function. -static cl::opt -TransformAll("arm64-simd-scalar-force-all", - cl::desc("Force use of AdvSIMD scalar instructions everywhere"), - cl::init(false), cl::Hidden); - -STATISTIC(NumScalarInsnsUsed, "Number of scalar instructions used"); -STATISTIC(NumCopiesDeleted, "Number of cross-class copies deleted"); -STATISTIC(NumCopiesInserted, "Number of cross-class copies inserted"); - -namespace { -class ARM64AdvSIMDScalar : public MachineFunctionPass { - MachineRegisterInfo *MRI; - const ARM64InstrInfo *TII; - -private: - // isProfitableToTransform - Predicate function to determine whether an - // instruction should be transformed to its equivalent AdvSIMD scalar - // instruction. "add Xd, Xn, Xm" ==> "add Dd, Da, Db", for example. - bool isProfitableToTransform(const MachineInstr *MI) const; - - // transformInstruction - Perform the transformation of an instruction - // to its equivalant AdvSIMD scalar instruction. Update inputs and outputs - // to be the correct register class, minimizing cross-class copies. - void transformInstruction(MachineInstr *MI); - - // processMachineBasicBlock - Main optimzation loop. - bool processMachineBasicBlock(MachineBasicBlock *MBB); - -public: - static char ID; // Pass identification, replacement for typeid. - explicit ARM64AdvSIMDScalar() : MachineFunctionPass(ID) {} - - bool runOnMachineFunction(MachineFunction &F) override; - - const char *getPassName() const override { - return "AdvSIMD Scalar Operation Optimization"; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - MachineFunctionPass::getAnalysisUsage(AU); - } -}; -char ARM64AdvSIMDScalar::ID = 0; -} // end anonymous namespace - -static bool isGPR64(unsigned Reg, unsigned SubReg, - const MachineRegisterInfo *MRI) { - if (SubReg) - return false; - if (TargetRegisterInfo::isVirtualRegister(Reg)) - return MRI->getRegClass(Reg)->hasSuperClassEq(&ARM64::GPR64RegClass); - return ARM64::GPR64RegClass.contains(Reg); -} - -static bool isFPR64(unsigned Reg, unsigned SubReg, - const MachineRegisterInfo *MRI) { - if (TargetRegisterInfo::isVirtualRegister(Reg)) - return (MRI->getRegClass(Reg)->hasSuperClassEq(&ARM64::FPR64RegClass) && - SubReg == 0) || - (MRI->getRegClass(Reg)->hasSuperClassEq(&ARM64::FPR128RegClass) && - SubReg == ARM64::dsub); - // Physical register references just check the register class directly. - return (ARM64::FPR64RegClass.contains(Reg) && SubReg == 0) || - (ARM64::FPR128RegClass.contains(Reg) && SubReg == ARM64::dsub); -} - -// getSrcFromCopy - Get the original source register for a GPR64 <--> FPR64 -// copy instruction. Return zero_reg if the instruction is not a copy. 
-static unsigned getSrcFromCopy(const MachineInstr *MI, - const MachineRegisterInfo *MRI, - unsigned &SubReg) { - SubReg = 0; - // The "FMOV Xd, Dn" instruction is the typical form. - if (MI->getOpcode() == ARM64::FMOVDXr || MI->getOpcode() == ARM64::FMOVXDr) - return MI->getOperand(1).getReg(); - // A lane zero extract "UMOV.d Xd, Vn[0]" is equivalent. We shouldn't see - // these at this stage, but it's easy to check for. - if (MI->getOpcode() == ARM64::UMOVvi64 && MI->getOperand(2).getImm() == 0) { - SubReg = ARM64::dsub; - return MI->getOperand(1).getReg(); - } - // Or just a plain COPY instruction. This can be directly to/from FPR64, - // or it can be a dsub subreg reference to an FPR128. - if (MI->getOpcode() == ARM64::COPY) { - if (isFPR64(MI->getOperand(0).getReg(), MI->getOperand(0).getSubReg(), - MRI) && - isGPR64(MI->getOperand(1).getReg(), MI->getOperand(1).getSubReg(), MRI)) - return MI->getOperand(1).getReg(); - if (isGPR64(MI->getOperand(0).getReg(), MI->getOperand(0).getSubReg(), - MRI) && - isFPR64(MI->getOperand(1).getReg(), MI->getOperand(1).getSubReg(), - MRI)) { - SubReg = MI->getOperand(1).getSubReg(); - return MI->getOperand(1).getReg(); - } - } - - // Otherwise, this is some other kind of instruction. - return 0; -} - -// getTransformOpcode - For any opcode for which there is an AdvSIMD equivalent -// that we're considering transforming to, return that AdvSIMD opcode. For all -// others, return the original opcode. -static int getTransformOpcode(unsigned Opc) { - switch (Opc) { - default: - break; - // FIXME: Lots more possibilities. - case ARM64::ADDXrr: - return ARM64::ADDv1i64; - case ARM64::SUBXrr: - return ARM64::SUBv1i64; - } - // No AdvSIMD equivalent, so just return the original opcode. - return Opc; -} - -static bool isTransformable(const MachineInstr *MI) { - int Opc = MI->getOpcode(); - return Opc != getTransformOpcode(Opc); -} - -// isProfitableToTransform - Predicate function to determine whether an -// instruction should be transformed to its equivalent AdvSIMD scalar -// instruction. "add Xd, Xn, Xm" ==> "add Dd, Da, Db", for example. -bool ARM64AdvSIMDScalar::isProfitableToTransform(const MachineInstr *MI) const { - // If this instruction isn't eligible to be transformed (no SIMD equivalent), - // early exit since that's the common case. - if (!isTransformable(MI)) - return false; - - // Count the number of copies we'll need to add and approximate the number - // of copies that a transform will enable us to remove. - unsigned NumNewCopies = 3; - unsigned NumRemovableCopies = 0; - - unsigned OrigSrc0 = MI->getOperand(1).getReg(); - unsigned OrigSrc1 = MI->getOperand(2).getReg(); - unsigned Src0 = 0, SubReg0; - unsigned Src1 = 0, SubReg1; - if (!MRI->def_empty(OrigSrc0)) { - MachineRegisterInfo::def_instr_iterator Def = - MRI->def_instr_begin(OrigSrc0); - assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!"); - Src0 = getSrcFromCopy(&*Def, MRI, SubReg0); - // If the source was from a copy, we don't need to insert a new copy. - if (Src0) - --NumNewCopies; - // If there are no other users of the original source, we can delete - // that instruction. 
- if (Src0 && MRI->hasOneNonDBGUse(OrigSrc0)) - ++NumRemovableCopies; - } - if (!MRI->def_empty(OrigSrc1)) { - MachineRegisterInfo::def_instr_iterator Def = - MRI->def_instr_begin(OrigSrc1); - assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!"); - Src1 = getSrcFromCopy(&*Def, MRI, SubReg1); - if (Src1) - --NumNewCopies; - // If there are no other users of the original source, we can delete - // that instruction. - if (Src1 && MRI->hasOneNonDBGUse(OrigSrc1)) - ++NumRemovableCopies; - } - - // If any of the uses of the original instructions is a cross class copy, - // that's a copy that will be removable if we transform. Likewise, if - // any of the uses is a transformable instruction, it's likely the tranforms - // will chain, enabling us to save a copy there, too. This is an aggressive - // heuristic that approximates the graph based cost analysis described above. - unsigned Dst = MI->getOperand(0).getReg(); - bool AllUsesAreCopies = true; - for (MachineRegisterInfo::use_instr_nodbg_iterator - Use = MRI->use_instr_nodbg_begin(Dst), - E = MRI->use_instr_nodbg_end(); - Use != E; ++Use) { - unsigned SubReg; - if (getSrcFromCopy(&*Use, MRI, SubReg) || isTransformable(&*Use)) - ++NumRemovableCopies; - // If the use is an INSERT_SUBREG, that's still something that can - // directly use the FPR64, so we don't invalidate AllUsesAreCopies. It's - // preferable to have it use the FPR64 in most cases, as if the source - // vector is an IMPLICIT_DEF, the INSERT_SUBREG just goes away entirely. - // Ditto for a lane insert. - else if (Use->getOpcode() == ARM64::INSERT_SUBREG || - Use->getOpcode() == ARM64::INSvi64gpr) - ; - else - AllUsesAreCopies = false; - } - // If all of the uses of the original destination register are copies to - // FPR64, then we won't end up having a new copy back to GPR64 either. - if (AllUsesAreCopies) - --NumNewCopies; - - // If a transform will not increase the number of cross-class copies required, - // return true. - if (NumNewCopies <= NumRemovableCopies) - return true; - - // Finally, even if we otherwise wouldn't transform, check if we're forcing - // transformation of everything. - return TransformAll; -} - -static MachineInstr *insertCopy(const ARM64InstrInfo *TII, MachineInstr *MI, - unsigned Dst, unsigned Src, bool IsKill) { - MachineInstrBuilder MIB = - BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(ARM64::COPY), - Dst) - .addReg(Src, getKillRegState(IsKill)); - DEBUG(dbgs() << " adding copy: " << *MIB); - ++NumCopiesInserted; - return MIB; -} - -// transformInstruction - Perform the transformation of an instruction -// to its equivalant AdvSIMD scalar instruction. Update inputs and outputs -// to be the correct register class, minimizing cross-class copies. -void ARM64AdvSIMDScalar::transformInstruction(MachineInstr *MI) { - DEBUG(dbgs() << "Scalar transform: " << *MI); - - MachineBasicBlock *MBB = MI->getParent(); - int OldOpc = MI->getOpcode(); - int NewOpc = getTransformOpcode(OldOpc); - assert(OldOpc != NewOpc && "transform an instruction to itself?!"); - - // Check if we need a copy for the source registers. 
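
The profitability test in isProfitableToTransform above is, in effect, simple copy accounting. A standalone distillation under that reading; the function and parameter names are invented and are not part of the pass:

// Start from the worst case of three new copies (two inputs plus the result),
// subtract the inputs that already arrive through GPR64 <--> FPR64 copies and,
// when every use of the result is itself such a copy, the result copy too;
// transform only if that does not exceed the copies the transform removes.
static bool worthTransforming(unsigned SrcsAlreadyFromCopies, // 0, 1 or 2
                              unsigned RemovableCopies,
                              bool AllUsesAreCopies) {
  unsigned NumNewCopies = 3 - SrcsAlreadyFromCopies;
  if (AllUsesAreCopies)
    --NumNewCopies;
  return NumNewCopies <= RemovableCopies;
}
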
- unsigned OrigSrc0 = MI->getOperand(1).getReg(); - unsigned OrigSrc1 = MI->getOperand(2).getReg(); - unsigned Src0 = 0, SubReg0; - unsigned Src1 = 0, SubReg1; - if (!MRI->def_empty(OrigSrc0)) { - MachineRegisterInfo::def_instr_iterator Def = - MRI->def_instr_begin(OrigSrc0); - assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!"); - Src0 = getSrcFromCopy(&*Def, MRI, SubReg0); - // If there are no other users of the original source, we can delete - // that instruction. - if (Src0 && MRI->hasOneNonDBGUse(OrigSrc0)) { - assert(Src0 && "Can't delete copy w/o a valid original source!"); - Def->eraseFromParent(); - ++NumCopiesDeleted; - } - } - if (!MRI->def_empty(OrigSrc1)) { - MachineRegisterInfo::def_instr_iterator Def = - MRI->def_instr_begin(OrigSrc1); - assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!"); - Src1 = getSrcFromCopy(&*Def, MRI, SubReg1); - // If there are no other users of the original source, we can delete - // that instruction. - if (Src1 && MRI->hasOneNonDBGUse(OrigSrc1)) { - assert(Src1 && "Can't delete copy w/o a valid original source!"); - Def->eraseFromParent(); - ++NumCopiesDeleted; - } - } - // If we weren't able to reference the original source directly, create a - // copy. - if (!Src0) { - SubReg0 = 0; - Src0 = MRI->createVirtualRegister(&ARM64::FPR64RegClass); - insertCopy(TII, MI, Src0, OrigSrc0, true); - } - if (!Src1) { - SubReg1 = 0; - Src1 = MRI->createVirtualRegister(&ARM64::FPR64RegClass); - insertCopy(TII, MI, Src1, OrigSrc1, true); - } - - // Create a vreg for the destination. - // FIXME: No need to do this if the ultimate user expects an FPR64. - // Check for that and avoid the copy if possible. - unsigned Dst = MRI->createVirtualRegister(&ARM64::FPR64RegClass); - - // For now, all of the new instructions have the same simple three-register - // form, so no need to special case based on what instruction we're - // building. - BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(NewOpc), Dst) - .addReg(Src0, getKillRegState(true), SubReg0) - .addReg(Src1, getKillRegState(true), SubReg1); - - // Now copy the result back out to a GPR. - // FIXME: Try to avoid this if all uses could actually just use the FPR64 - // directly. - insertCopy(TII, MI, MI->getOperand(0).getReg(), Dst, true); - - // Erase the old instruction. - MI->eraseFromParent(); - - ++NumScalarInsnsUsed; -} - -// processMachineBasicBlock - Main optimzation loop. -bool ARM64AdvSIMDScalar::processMachineBasicBlock(MachineBasicBlock *MBB) { - bool Changed = false; - for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;) { - MachineInstr *MI = I; - ++I; - if (isProfitableToTransform(MI)) { - transformInstruction(MI); - Changed = true; - } - } - return Changed; -} - -// runOnMachineFunction - Pass entry point from PassManager. -bool ARM64AdvSIMDScalar::runOnMachineFunction(MachineFunction &mf) { - bool Changed = false; - DEBUG(dbgs() << "***** ARM64AdvSIMDScalar *****\n"); - - const TargetMachine &TM = mf.getTarget(); - MRI = &mf.getRegInfo(); - TII = static_cast(TM.getInstrInfo()); - - // Just check things on a one-block-at-a-time basis. - for (MachineFunction::iterator I = mf.begin(), E = mf.end(); I != E; ++I) - if (processMachineBasicBlock(I)) - Changed = true; - return Changed; -} - -// createARM64AdvSIMDScalar - Factory function used by ARM64TargetMachine -// to add the pass to the PassManager. 
-FunctionPass *llvm::createARM64AdvSIMDScalar() { - return new ARM64AdvSIMDScalar(); -} diff --git a/lib/Target/ARM64/ARM64AsmPrinter.cpp b/lib/Target/ARM64/ARM64AsmPrinter.cpp deleted file mode 100644 index 7e17985bf4a..00000000000 --- a/lib/Target/ARM64/ARM64AsmPrinter.cpp +++ /dev/null @@ -1,514 +0,0 @@ -//===-- ARM64AsmPrinter.cpp - ARM64 LLVM assembly writer ------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains a printer that converts from our internal representation -// of machine-dependent LLVM code to the ARM64 assembly language. -// -//===----------------------------------------------------------------------===// - -#include "ARM64.h" -#include "ARM64MachineFunctionInfo.h" -#include "ARM64MCInstLower.h" -#include "ARM64RegisterInfo.h" -#include "ARM64Subtarget.h" -#include "InstPrinter/ARM64InstPrinter.h" -#include "llvm/ADT/SmallString.h" -#include "llvm/ADT/StringSwitch.h" -#include "llvm/ADT/Twine.h" -#include "llvm/CodeGen/AsmPrinter.h" -#include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/StackMaps.h" -#include "llvm/CodeGen/MachineModuleInfoImpls.h" -#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/DebugInfo.h" -#include "llvm/MC/MCAsmInfo.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCInst.h" -#include "llvm/MC/MCInstBuilder.h" -#include "llvm/MC/MCLinkerOptimizationHint.h" -#include "llvm/MC/MCStreamer.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/TargetRegistry.h" -using namespace llvm; - -#define DEBUG_TYPE "asm-printer" - -namespace { - -class ARM64AsmPrinter : public AsmPrinter { - /// Subtarget - Keep a pointer to the ARM64Subtarget around so that we can - /// make the right decision when printing asm code for different targets. - const ARM64Subtarget *Subtarget; - - ARM64MCInstLower MCInstLowering; - StackMaps SM; - -public: - ARM64AsmPrinter(TargetMachine &TM, MCStreamer &Streamer) - : AsmPrinter(TM, Streamer), Subtarget(&TM.getSubtarget()), - MCInstLowering(OutContext, *Mang, *this), SM(*this), ARM64FI(nullptr), - LOHLabelCounter(0) {} - - const char *getPassName() const override { return "ARM64 Assembly Printer"; } - - /// \brief Wrapper for MCInstLowering.lowerOperand() for the - /// tblgen'erated pseudo lowering. - bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const { - return MCInstLowering.lowerOperand(MO, MCOp); - } - - void LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM, - const MachineInstr &MI); - void LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM, - const MachineInstr &MI); - /// \brief tblgen'erated driver function for lowering simple MI->MC - /// pseudo instructions. 
- bool emitPseudoExpansionLowering(MCStreamer &OutStreamer, - const MachineInstr *MI); - - void EmitInstruction(const MachineInstr *MI) override; - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AsmPrinter::getAnalysisUsage(AU); - AU.setPreservesAll(); - } - - bool runOnMachineFunction(MachineFunction &F) override { - ARM64FI = F.getInfo(); - return AsmPrinter::runOnMachineFunction(F); - } - -private: - MachineLocation getDebugValueLocation(const MachineInstr *MI) const; - void printOperand(const MachineInstr *MI, unsigned OpNum, raw_ostream &O); - bool printAsmMRegister(const MachineOperand &MO, char Mode, raw_ostream &O); - bool printAsmRegInClass(const MachineOperand &MO, - const TargetRegisterClass *RC, bool isVector, - raw_ostream &O); - - bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, - unsigned AsmVariant, const char *ExtraCode, - raw_ostream &O) override; - bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum, - unsigned AsmVariant, const char *ExtraCode, - raw_ostream &O) override; - - void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS); - - void EmitFunctionBodyEnd() override; - - MCSymbol *GetCPISymbol(unsigned CPID) const override; - void EmitEndOfAsmFile(Module &M) override; - ARM64FunctionInfo *ARM64FI; - - /// \brief Emit the LOHs contained in ARM64FI. - void EmitLOHs(); - - typedef std::map MInstToMCSymbol; - MInstToMCSymbol LOHInstToLabel; - unsigned LOHLabelCounter; -}; - -} // end of anonymous namespace - -//===----------------------------------------------------------------------===// - -void ARM64AsmPrinter::EmitEndOfAsmFile(Module &M) { - if (Subtarget->isTargetMachO()) { - // Funny Darwin hack: This flag tells the linker that no global symbols - // contain code that falls through to other global symbols (e.g. the obvious - // implementation of multiple entry points). If this doesn't occur, the - // linker can safely perform dead code stripping. Since LLVM never - // generates code that does this, it is always safe to set. - OutStreamer.EmitAssemblerFlag(MCAF_SubsectionsViaSymbols); - SM.serializeToStackMapSection(); - } - - // Emit a .data.rel section containing any stubs that were created. - if (Subtarget->isTargetELF()) { - const TargetLoweringObjectFileELF &TLOFELF = - static_cast(getObjFileLowering()); - - MachineModuleInfoELF &MMIELF = MMI->getObjFileInfo(); - - // Output stubs for external and common global variables. - MachineModuleInfoELF::SymbolListTy Stubs = MMIELF.GetGVStubList(); - if (!Stubs.empty()) { - OutStreamer.SwitchSection(TLOFELF.getDataRelSection()); - const DataLayout *TD = TM.getDataLayout(); - - for (unsigned i = 0, e = Stubs.size(); i != e; ++i) { - OutStreamer.EmitLabel(Stubs[i].first); - OutStreamer.EmitSymbolValue(Stubs[i].second.getPointer(), - TD->getPointerSize(0)); - } - Stubs.clear(); - } - } - -} - -MachineLocation -ARM64AsmPrinter::getDebugValueLocation(const MachineInstr *MI) const { - MachineLocation Location; - assert(MI->getNumOperands() == 4 && "Invalid no. of machine operands!"); - // Frame address. Currently handles register +- offset only. - if (MI->getOperand(0).isReg() && MI->getOperand(1).isImm()) - Location.set(MI->getOperand(0).getReg(), MI->getOperand(1).getImm()); - else { - DEBUG(dbgs() << "DBG_VALUE instruction ignored! 
" << *MI << "\n"); - } - return Location; -} - -void ARM64AsmPrinter::EmitLOHs() { - SmallVector MCArgs; - - for (const auto &D : ARM64FI->getLOHContainer()) { - for (const MachineInstr *MI : D.getArgs()) { - MInstToMCSymbol::iterator LabelIt = LOHInstToLabel.find(MI); - assert(LabelIt != LOHInstToLabel.end() && - "Label hasn't been inserted for LOH related instruction"); - MCArgs.push_back(LabelIt->second); - } - OutStreamer.EmitLOHDirective(D.getKind(), MCArgs); - MCArgs.clear(); - } -} - -void ARM64AsmPrinter::EmitFunctionBodyEnd() { - if (!ARM64FI->getLOHRelated().empty()) - EmitLOHs(); -} - -/// GetCPISymbol - Return the symbol for the specified constant pool entry. -MCSymbol *ARM64AsmPrinter::GetCPISymbol(unsigned CPID) const { - // Darwin uses a linker-private symbol name for constant-pools (to - // avoid addends on the relocation?), ELF has no such concept and - // uses a normal private symbol. - if (getDataLayout().getLinkerPrivateGlobalPrefix()[0]) - return OutContext.GetOrCreateSymbol( - Twine(getDataLayout().getLinkerPrivateGlobalPrefix()) + "CPI" + - Twine(getFunctionNumber()) + "_" + Twine(CPID)); - - return OutContext.GetOrCreateSymbol( - Twine(getDataLayout().getPrivateGlobalPrefix()) + "CPI" + - Twine(getFunctionNumber()) + "_" + Twine(CPID)); -} - -void ARM64AsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNum, - raw_ostream &O) { - const MachineOperand &MO = MI->getOperand(OpNum); - switch (MO.getType()) { - default: - assert(0 && ""); - case MachineOperand::MO_Register: { - unsigned Reg = MO.getReg(); - assert(TargetRegisterInfo::isPhysicalRegister(Reg)); - assert(!MO.getSubReg() && "Subregs should be eliminated!"); - O << ARM64InstPrinter::getRegisterName(Reg); - break; - } - case MachineOperand::MO_Immediate: { - int64_t Imm = MO.getImm(); - O << '#' << Imm; - break; - } - } -} - -bool ARM64AsmPrinter::printAsmMRegister(const MachineOperand &MO, char Mode, - raw_ostream &O) { - unsigned Reg = MO.getReg(); - switch (Mode) { - default: - return true; // Unknown mode. - case 'w': - Reg = getWRegFromXReg(Reg); - break; - case 'x': - Reg = getXRegFromWReg(Reg); - break; - } - - O << ARM64InstPrinter::getRegisterName(Reg); - return false; -} - -// Prints the register in MO using class RC using the offset in the -// new register class. This should not be used for cross class -// printing. -bool ARM64AsmPrinter::printAsmRegInClass(const MachineOperand &MO, - const TargetRegisterClass *RC, - bool isVector, raw_ostream &O) { - assert(MO.isReg() && "Should only get here with a register!"); - const ARM64RegisterInfo *RI = - static_cast(TM.getRegisterInfo()); - unsigned Reg = MO.getReg(); - unsigned RegToPrint = RC->getRegister(RI->getEncodingValue(Reg)); - assert(RI->regsOverlap(RegToPrint, Reg)); - O << ARM64InstPrinter::getRegisterName( - RegToPrint, isVector ? ARM64::vreg : ARM64::NoRegAltName); - return false; -} - -bool ARM64AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, - unsigned AsmVariant, - const char *ExtraCode, raw_ostream &O) { - const MachineOperand &MO = MI->getOperand(OpNum); - // Does this asm operand have a single letter operand modifier? - if (ExtraCode && ExtraCode[0]) { - if (ExtraCode[1] != 0) - return true; // Unknown modifier. - - switch (ExtraCode[0]) { - default: - return true; // Unknown modifier. - case 'w': // Print W register - case 'x': // Print X register - if (MO.isReg()) - return printAsmMRegister(MO, ExtraCode[0], O); - if (MO.isImm() && MO.getImm() == 0) { - unsigned Reg = ExtraCode[0] == 'w' ? 
ARM64::WZR : ARM64::XZR; - O << ARM64InstPrinter::getRegisterName(Reg); - return false; - } - printOperand(MI, OpNum, O); - return false; - case 'b': // Print B register. - case 'h': // Print H register. - case 's': // Print S register. - case 'd': // Print D register. - case 'q': // Print Q register. - if (MO.isReg()) { - const TargetRegisterClass *RC; - switch (ExtraCode[0]) { - case 'b': - RC = &ARM64::FPR8RegClass; - break; - case 'h': - RC = &ARM64::FPR16RegClass; - break; - case 's': - RC = &ARM64::FPR32RegClass; - break; - case 'd': - RC = &ARM64::FPR64RegClass; - break; - case 'q': - RC = &ARM64::FPR128RegClass; - break; - default: - return true; - } - return printAsmRegInClass(MO, RC, false /* vector */, O); - } - printOperand(MI, OpNum, O); - return false; - } - } - - // According to ARM, we should emit x and v registers unless we have a - // modifier. - if (MO.isReg()) { - unsigned Reg = MO.getReg(); - - // If this is a w or x register, print an x register. - if (ARM64::GPR32allRegClass.contains(Reg) || - ARM64::GPR64allRegClass.contains(Reg)) - return printAsmMRegister(MO, 'x', O); - - // If this is a b, h, s, d, or q register, print it as a v register. - return printAsmRegInClass(MO, &ARM64::FPR128RegClass, true /* vector */, O); - } - - printOperand(MI, OpNum, O); - return false; -} - -bool ARM64AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, - unsigned OpNum, unsigned AsmVariant, - const char *ExtraCode, - raw_ostream &O) { - if (ExtraCode && ExtraCode[0]) - return true; // Unknown modifier. - - const MachineOperand &MO = MI->getOperand(OpNum); - assert(MO.isReg() && "unexpected inline asm memory operand"); - O << "[" << ARM64InstPrinter::getRegisterName(MO.getReg()) << "]"; - return false; -} - -void ARM64AsmPrinter::PrintDebugValueComment(const MachineInstr *MI, - raw_ostream &OS) { - unsigned NOps = MI->getNumOperands(); - assert(NOps == 4); - OS << '\t' << MAI->getCommentString() << "DEBUG_VALUE: "; - // cast away const; DIetc do not take const operands for some reason. - DIVariable V(const_cast(MI->getOperand(NOps - 1).getMetadata())); - OS << V.getName(); - OS << " <- "; - // Frame address. Currently handles register +- offset only. - assert(MI->getOperand(0).isReg() && MI->getOperand(1).isImm()); - OS << '['; - printOperand(MI, 0, OS); - OS << '+'; - printOperand(MI, 1, OS); - OS << ']'; - OS << "+"; - printOperand(MI, NOps - 2, OS); -} - -void ARM64AsmPrinter::LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM, - const MachineInstr &MI) { - unsigned NumNOPBytes = MI.getOperand(1).getImm(); - - SM.recordStackMap(MI); - // Emit padding. 
- assert(NumNOPBytes % 4 == 0 && "Invalid number of NOP bytes requested!"); - for (unsigned i = 0; i < NumNOPBytes; i += 4) - EmitToStreamer(OutStreamer, MCInstBuilder(ARM64::HINT).addImm(0)); -} - -// Lower a patchpoint of the form: -// [], , , , -void ARM64AsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM, - const MachineInstr &MI) { - SM.recordPatchPoint(MI); - - PatchPointOpers Opers(&MI); - - int64_t CallTarget = Opers.getMetaOper(PatchPointOpers::TargetPos).getImm(); - unsigned EncodedBytes = 0; - if (CallTarget) { - assert((CallTarget & 0xFFFFFFFFFFFF) == CallTarget && - "High 16 bits of call target should be zero."); - unsigned ScratchReg = MI.getOperand(Opers.getNextScratchIdx()).getReg(); - EncodedBytes = 16; - // Materialize the jump address: - EmitToStreamer(OutStreamer, MCInstBuilder(ARM64::MOVZWi) - .addReg(ScratchReg) - .addImm((CallTarget >> 32) & 0xFFFF) - .addImm(32)); - EmitToStreamer(OutStreamer, MCInstBuilder(ARM64::MOVKWi) - .addReg(ScratchReg) - .addReg(ScratchReg) - .addImm((CallTarget >> 16) & 0xFFFF) - .addImm(16)); - EmitToStreamer(OutStreamer, MCInstBuilder(ARM64::MOVKWi) - .addReg(ScratchReg) - .addReg(ScratchReg) - .addImm(CallTarget & 0xFFFF) - .addImm(0)); - EmitToStreamer(OutStreamer, MCInstBuilder(ARM64::BLR).addReg(ScratchReg)); - } - // Emit padding. - unsigned NumBytes = Opers.getMetaOper(PatchPointOpers::NBytesPos).getImm(); - assert(NumBytes >= EncodedBytes && - "Patchpoint can't request size less than the length of a call."); - assert((NumBytes - EncodedBytes) % 4 == 0 && - "Invalid number of NOP bytes requested!"); - for (unsigned i = EncodedBytes; i < NumBytes; i += 4) - EmitToStreamer(OutStreamer, MCInstBuilder(ARM64::HINT).addImm(0)); -} - -// Simple pseudo-instructions have their lowering (with expansion to real -// instructions) auto-generated. -#include "ARM64GenMCPseudoLowering.inc" - -void ARM64AsmPrinter::EmitInstruction(const MachineInstr *MI) { - // Do any auto-generated pseudo lowerings. - if (emitPseudoExpansionLowering(OutStreamer, MI)) - return; - - if (ARM64FI->getLOHRelated().count(MI)) { - // Generate a label for LOH related instruction - MCSymbol *LOHLabel = GetTempSymbol("loh", LOHLabelCounter++); - // Associate the instruction with the label - LOHInstToLabel[MI] = LOHLabel; - OutStreamer.EmitLabel(LOHLabel); - } - - // Do any manual lowerings. - switch (MI->getOpcode()) { - default: - break; - case ARM64::DBG_VALUE: { - if (isVerbose() && OutStreamer.hasRawTextSupport()) { - SmallString<128> TmpStr; - raw_svector_ostream OS(TmpStr); - PrintDebugValueComment(MI, OS); - OutStreamer.EmitRawText(StringRef(OS.str())); - } - return; - } - - // Tail calls use pseudo instructions so they have the proper code-gen - // attributes (isCall, isReturn, etc.). We lower them to the real - // instruction here. - case ARM64::TCRETURNri: { - MCInst TmpInst; - TmpInst.setOpcode(ARM64::BR); - TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg())); - EmitToStreamer(OutStreamer, TmpInst); - return; - } - case ARM64::TCRETURNdi: { - MCOperand Dest; - MCInstLowering.lowerOperand(MI->getOperand(0), Dest); - MCInst TmpInst; - TmpInst.setOpcode(ARM64::B); - TmpInst.addOperand(Dest); - EmitToStreamer(OutStreamer, TmpInst); - return; - } - case ARM64::TLSDESC_BLR: { - MCOperand Callee, Sym; - MCInstLowering.lowerOperand(MI->getOperand(0), Callee); - MCInstLowering.lowerOperand(MI->getOperand(1), Sym); - - // First emit a relocation-annotation. 
This expands to no code, but requests - // the following instruction gets an R_AARCH64_TLSDESC_CALL. - MCInst TLSDescCall; - TLSDescCall.setOpcode(ARM64::TLSDESCCALL); - TLSDescCall.addOperand(Sym); - EmitToStreamer(OutStreamer, TLSDescCall); - - // Other than that it's just a normal indirect call to the function loaded - // from the descriptor. - MCInst BLR; - BLR.setOpcode(ARM64::BLR); - BLR.addOperand(Callee); - EmitToStreamer(OutStreamer, BLR); - - return; - } - - case TargetOpcode::STACKMAP: - return LowerSTACKMAP(OutStreamer, SM, *MI); - - case TargetOpcode::PATCHPOINT: - return LowerPATCHPOINT(OutStreamer, SM, *MI); - } - - // Finally, do the automated lowerings for everything else. - MCInst TmpInst; - MCInstLowering.Lower(MI, TmpInst); - EmitToStreamer(OutStreamer, TmpInst); -} - -// Force static initialization. -extern "C" void LLVMInitializeARM64AsmPrinter() { - RegisterAsmPrinter X(TheARM64leTarget); - RegisterAsmPrinter Y(TheARM64beTarget); - - RegisterAsmPrinter Z(TheAArch64leTarget); - RegisterAsmPrinter W(TheAArch64beTarget); -} diff --git a/lib/Target/ARM64/ARM64BranchRelaxation.cpp b/lib/Target/ARM64/ARM64BranchRelaxation.cpp deleted file mode 100644 index 73be3504790..00000000000 --- a/lib/Target/ARM64/ARM64BranchRelaxation.cpp +++ /dev/null @@ -1,509 +0,0 @@ -//===-- ARM64BranchRelaxation.cpp - ARM64 branch relaxation ---------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -//===----------------------------------------------------------------------===// - -#include "ARM64.h" -#include "ARM64InstrInfo.h" -#include "ARM64MachineFunctionInfo.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/Format.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/Support/CommandLine.h" -using namespace llvm; - -#define DEBUG_TYPE "arm64-branch-relax" - -static cl::opt -BranchRelaxation("arm64-branch-relax", cl::Hidden, cl::init(true), - cl::desc("Relax out of range conditional branches")); - -static cl::opt -TBZDisplacementBits("arm64-tbz-offset-bits", cl::Hidden, cl::init(14), - cl::desc("Restrict range of TB[N]Z instructions (DEBUG)")); - -static cl::opt -CBZDisplacementBits("arm64-cbz-offset-bits", cl::Hidden, cl::init(19), - cl::desc("Restrict range of CB[N]Z instructions (DEBUG)")); - -static cl::opt -BCCDisplacementBits("arm64-bcc-offset-bits", cl::Hidden, cl::init(19), - cl::desc("Restrict range of Bcc instructions (DEBUG)")); - -STATISTIC(NumSplit, "Number of basic blocks split"); -STATISTIC(NumRelaxed, "Number of conditional branches relaxed"); - -namespace { -class ARM64BranchRelaxation : public MachineFunctionPass { - /// BasicBlockInfo - Information about the offset and size of a single - /// basic block. - struct BasicBlockInfo { - /// Offset - Distance from the beginning of the function to the beginning - /// of this basic block. - /// - /// The offset is always aligned as required by the basic block. - unsigned Offset; - - /// Size - Size of the basic block in bytes. If the block contains - /// inline assembly, this is a worst case estimate. 
- /// - /// The size does not include any alignment padding whether from the - /// beginning of the block, or from an aligned jump table at the end. - unsigned Size; - - BasicBlockInfo() : Offset(0), Size(0) {} - - /// Compute the offset immediately following this block. If LogAlign is - /// specified, return the offset the successor block will get if it has - /// this alignment. - unsigned postOffset(unsigned LogAlign = 0) const { - unsigned PO = Offset + Size; - unsigned Align = 1 << LogAlign; - return (PO + Align - 1) / Align * Align; - } - }; - - SmallVector BlockInfo; - - MachineFunction *MF; - const ARM64InstrInfo *TII; - - bool relaxBranchInstructions(); - void scanFunction(); - MachineBasicBlock *splitBlockBeforeInstr(MachineInstr *MI); - void adjustBlockOffsets(MachineBasicBlock &MBB); - bool isBlockInRange(MachineInstr *MI, MachineBasicBlock *BB, unsigned Disp); - bool fixupConditionalBranch(MachineInstr *MI); - void computeBlockSize(const MachineBasicBlock &MBB); - unsigned getInstrOffset(MachineInstr *MI) const; - void dumpBBs(); - void verify(); - -public: - static char ID; - ARM64BranchRelaxation() : MachineFunctionPass(ID) {} - - bool runOnMachineFunction(MachineFunction &MF) override; - - const char *getPassName() const override { - return "ARM64 branch relaxation pass"; - } -}; -char ARM64BranchRelaxation::ID = 0; -} - -/// verify - check BBOffsets, BBSizes, alignment of islands -void ARM64BranchRelaxation::verify() { -#ifndef NDEBUG - unsigned PrevNum = MF->begin()->getNumber(); - for (MachineBasicBlock &MBB : *MF) { - unsigned Align = MBB.getAlignment(); - unsigned Num = MBB.getNumber(); - assert(BlockInfo[Num].Offset % (1u << Align) == 0); - assert(!Num || BlockInfo[PrevNum].postOffset() <= BlockInfo[Num].Offset); - PrevNum = Num; - } -#endif -} - -/// print block size and offset information - debugging -void ARM64BranchRelaxation::dumpBBs() { - for (auto &MBB : *MF) { - const BasicBlockInfo &BBI = BlockInfo[MBB.getNumber()]; - dbgs() << format("BB#%u\toffset=%08x\t", MBB.getNumber(), BBI.Offset) - << format("size=%#x\n", BBI.Size); - } -} - -/// BBHasFallthrough - Return true if the specified basic block can fallthrough -/// into the block immediately after it. -static bool BBHasFallthrough(MachineBasicBlock *MBB) { - // Get the next machine basic block in the function. - MachineFunction::iterator MBBI = MBB; - // Can't fall off end of function. - MachineBasicBlock *NextBB = std::next(MBBI); - if (NextBB == MBB->getParent()->end()) - return false; - - for (MachineBasicBlock *S : MBB->successors()) - if (S == NextBB) - return true; - - return false; -} - -/// scanFunction - Do the initial scan of the function, building up -/// information about each block. -void ARM64BranchRelaxation::scanFunction() { - BlockInfo.clear(); - BlockInfo.resize(MF->getNumBlockIDs()); - - // First thing, compute the size of all basic blocks, and see if the function - // has any inline assembly in it. If so, we have to be conservative about - // alignment assumptions, as we don't know for sure the size of any - // instructions in the inline assembly. - for (MachineBasicBlock &MBB : *MF) - computeBlockSize(MBB); - - // Compute block offsets and known bits. - adjustBlockOffsets(*MF->begin()); -} - -/// computeBlockSize - Compute the size for MBB. -/// This function updates BlockInfo directly. 
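The postOffset() helper in the deleted BasicBlockInfo struct rounds the end of a block up to the alignment its successor requires. A minimal standalone sketch of that rounding arithmetic, not part of the patch; the sample offsets and alignment are invented for illustration:

#include <cassert>
#include <cstdio>

// Round Offset + Size up to a 2^LogAlign boundary, mirroring
// BasicBlockInfo::postOffset() in the deleted pass.
static unsigned postOffset(unsigned Offset, unsigned Size, unsigned LogAlign = 0) {
  unsigned PO = Offset + Size;
  unsigned Align = 1u << LogAlign;
  return (PO + Align - 1) / Align * Align;
}

int main() {
  // A 20-byte block starting at offset 4, with a successor aligned to 16 bytes.
  assert(postOffset(4, 20) == 24);     // no extra alignment requested
  assert(postOffset(4, 20, 4) == 32);  // rounded up to the next 16-byte boundary
  std::printf("post offsets: %u %u\n", postOffset(4, 20), postOffset(4, 20, 4));
  return 0;
}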
-void ARM64BranchRelaxation::computeBlockSize(const MachineBasicBlock &MBB) { - unsigned Size = 0; - for (const MachineInstr &MI : MBB) - Size += TII->GetInstSizeInBytes(&MI); - BlockInfo[MBB.getNumber()].Size = Size; -} - -/// getInstrOffset - Return the current offset of the specified machine -/// instruction from the start of the function. This offset changes as stuff is -/// moved around inside the function. -unsigned ARM64BranchRelaxation::getInstrOffset(MachineInstr *MI) const { - MachineBasicBlock *MBB = MI->getParent(); - - // The offset is composed of two things: the sum of the sizes of all MBB's - // before this instruction's block, and the offset from the start of the block - // it is in. - unsigned Offset = BlockInfo[MBB->getNumber()].Offset; - - // Sum instructions before MI in MBB. - for (MachineBasicBlock::iterator I = MBB->begin(); &*I != MI; ++I) { - assert(I != MBB->end() && "Didn't find MI in its own basic block?"); - Offset += TII->GetInstSizeInBytes(I); - } - return Offset; -} - -void ARM64BranchRelaxation::adjustBlockOffsets(MachineBasicBlock &Start) { - unsigned PrevNum = Start.getNumber(); - for (auto &MBB : make_range(MachineFunction::iterator(Start), MF->end())) { - unsigned Num = MBB.getNumber(); - if (!Num) // block zero is never changed from offset zero. - continue; - // Get the offset and known bits at the end of the layout predecessor. - // Include the alignment of the current block. - unsigned LogAlign = MBB.getAlignment(); - BlockInfo[Num].Offset = BlockInfo[PrevNum].postOffset(LogAlign); - PrevNum = Num; - } -} - -/// Split the basic block containing MI into two blocks, which are joined by -/// an unconditional branch. Update data structures and renumber blocks to -/// account for this change and returns the newly created block. -/// NOTE: Successor list of the original BB is out of date after this function, -/// and must be updated by the caller! Other transforms follow using this -/// utility function, so no point updating now rather than waiting. -MachineBasicBlock * -ARM64BranchRelaxation::splitBlockBeforeInstr(MachineInstr *MI) { - MachineBasicBlock *OrigBB = MI->getParent(); - - // Create a new MBB for the code after the OrigBB. - MachineBasicBlock *NewBB = - MF->CreateMachineBasicBlock(OrigBB->getBasicBlock()); - MachineFunction::iterator MBBI = OrigBB; - ++MBBI; - MF->insert(MBBI, NewBB); - - // Splice the instructions starting with MI over to NewBB. - NewBB->splice(NewBB->end(), OrigBB, MI, OrigBB->end()); - - // Add an unconditional branch from OrigBB to NewBB. - // Note the new unconditional branch is not being recorded. - // There doesn't seem to be meaningful DebugInfo available; this doesn't - // correspond to anything in the source. - BuildMI(OrigBB, DebugLoc(), TII->get(ARM64::B)).addMBB(NewBB); - - // Insert an entry into BlockInfo to align it properly with the block numbers. - BlockInfo.insert(BlockInfo.begin() + NewBB->getNumber(), BasicBlockInfo()); - - // Figure out how large the OrigBB is. As the first half of the original - // block, it cannot contain a tablejump. The size includes - // the new jump we added. (It should be possible to do this without - // recounting everything, but it's very confusing, and this is rarely - // executed.) - computeBlockSize(*OrigBB); - - // Figure out how large the NewMBB is. As the second half of the original - // block, it may contain a tablejump. - computeBlockSize(*NewBB); - - // All BBOffsets following these blocks must be modified. 
- adjustBlockOffsets(*OrigBB); - - ++NumSplit; - - return NewBB; -} - -/// isBlockInRange - Returns true if the distance between specific MI and -/// specific BB can fit in MI's displacement field. -bool ARM64BranchRelaxation::isBlockInRange(MachineInstr *MI, - MachineBasicBlock *DestBB, - unsigned Bits) { - unsigned MaxOffs = ((1 << (Bits - 1)) - 1) << 2; - unsigned BrOffset = getInstrOffset(MI); - unsigned DestOffset = BlockInfo[DestBB->getNumber()].Offset; - - DEBUG(dbgs() << "Branch of destination BB#" << DestBB->getNumber() - << " from BB#" << MI->getParent()->getNumber() - << " max delta=" << MaxOffs << " from " << getInstrOffset(MI) - << " to " << DestOffset << " offset " - << int(DestOffset - BrOffset) << "\t" << *MI); - - // Branch before the Dest. - if (BrOffset <= DestOffset) - return (DestOffset - BrOffset <= MaxOffs); - return (BrOffset - DestOffset <= MaxOffs); -} - -static bool isConditionalBranch(unsigned Opc) { - switch (Opc) { - default: - return false; - case ARM64::TBZW: - case ARM64::TBNZW: - case ARM64::TBZX: - case ARM64::TBNZX: - case ARM64::CBZW: - case ARM64::CBNZW: - case ARM64::CBZX: - case ARM64::CBNZX: - case ARM64::Bcc: - return true; - } -} - -static MachineBasicBlock *getDestBlock(MachineInstr *MI) { - switch (MI->getOpcode()) { - default: - assert(0 && "unexpected opcode!"); - case ARM64::TBZW: - case ARM64::TBNZW: - case ARM64::TBZX: - case ARM64::TBNZX: - return MI->getOperand(2).getMBB(); - case ARM64::CBZW: - case ARM64::CBNZW: - case ARM64::CBZX: - case ARM64::CBNZX: - case ARM64::Bcc: - return MI->getOperand(1).getMBB(); - } -} - -static unsigned getOppositeConditionOpcode(unsigned Opc) { - switch (Opc) { - default: - assert(0 && "unexpected opcode!"); - case ARM64::TBNZW: return ARM64::TBZW; - case ARM64::TBNZX: return ARM64::TBZX; - case ARM64::TBZW: return ARM64::TBNZW; - case ARM64::TBZX: return ARM64::TBNZX; - case ARM64::CBNZW: return ARM64::CBZW; - case ARM64::CBNZX: return ARM64::CBZX; - case ARM64::CBZW: return ARM64::CBNZW; - case ARM64::CBZX: return ARM64::CBNZX; - case ARM64::Bcc: return ARM64::Bcc; // Condition is an operand for Bcc. - } -} - -static unsigned getBranchDisplacementBits(unsigned Opc) { - switch (Opc) { - default: - assert(0 && "unexpected opcode!"); - case ARM64::TBNZW: - case ARM64::TBZW: - case ARM64::TBNZX: - case ARM64::TBZX: - return TBZDisplacementBits; - case ARM64::CBNZW: - case ARM64::CBZW: - case ARM64::CBNZX: - case ARM64::CBZX: - return CBZDisplacementBits; - case ARM64::Bcc: - return BCCDisplacementBits; - } -} - -static inline void invertBccCondition(MachineInstr *MI) { - assert(MI->getOpcode() == ARM64::Bcc && "Unexpected opcode!"); - ARM64CC::CondCode CC = (ARM64CC::CondCode)MI->getOperand(0).getImm(); - CC = ARM64CC::getInvertedCondCode(CC); - MI->getOperand(0).setImm((int64_t)CC); -} - -/// fixupConditionalBranch - Fix up a conditional branch whose destination is -/// too far away to fit in its displacement field. It is converted to an inverse -/// conditional branch + an unconditional branch to the destination. -bool ARM64BranchRelaxation::fixupConditionalBranch(MachineInstr *MI) { - MachineBasicBlock *DestBB = getDestBlock(MI); - - // Add an unconditional branch to the destination and invert the branch - // condition to jump over it: - // tbz L1 - // => - // tbnz L2 - // b L1 - // L2: - - // If the branch is at the end of its MBB and that has a fall-through block, - // direct the updated conditional branch to the fall-through block. Otherwise, - // split the MBB before the next instruction. 
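The isBlockInRange() check above reduces to asking whether a byte displacement fits the branch's immediate field: the immediate encodes a signed word offset, so the pass computes the reach as ((1 << (Bits - 1)) - 1) << 2. A standalone sketch of that arithmetic using the default widths from the cl::opts above (14 bits for TB[N]Z, 19 for CB[N]Z and B.cc); the sample instruction and target offsets are made up for illustration:

#include <cstdint>
#include <cstdio>
#include <cstdlib>

// Maximum byte displacement encodable by a conditional branch whose immediate
// field holds a signed word offset of 'Bits' bits, as computed by the pass.
static uint64_t maxBranchOffset(unsigned Bits) {
  return ((1ull << (Bits - 1)) - 1) << 2;
}

static bool inRange(uint64_t BrOffset, uint64_t DestOffset, unsigned Bits) {
  uint64_t Max = maxBranchOffset(Bits);
  return BrOffset <= DestOffset ? DestOffset - BrOffset <= Max
                                : BrOffset - DestOffset <= Max;
}

int main() {
  std::printf("TB[N]Z reach:      +/-%llu bytes\n",
              (unsigned long long)maxBranchOffset(14)); // roughly 32 KiB
  std::printf("CB[N]Z/B.cc reach: +/-%llu bytes\n",
              (unsigned long long)maxBranchOffset(19)); // roughly 1 MiB
  // A TBZ at offset 0x1000 targeting 0x11000 (64 KiB away) is out of range,
  // so fixupConditionalBranch would invert it and add an unconditional B.
  std::printf("needs relaxation: %s\n",
              inRange(0x1000, 0x11000, 14) ? "no" : "yes");
  return EXIT_SUCCESS;
}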
- MachineBasicBlock *MBB = MI->getParent(); - MachineInstr *BMI = &MBB->back(); - bool NeedSplit = (BMI != MI) || !BBHasFallthrough(MBB); - - if (BMI != MI) { - if (std::next(MachineBasicBlock::iterator(MI)) == - std::prev(MBB->getLastNonDebugInstr()) && - BMI->getOpcode() == ARM64::B) { - // Last MI in the BB is an unconditional branch. Can we simply invert the - // condition and swap destinations: - // beq L1 - // b L2 - // => - // bne L2 - // b L1 - MachineBasicBlock *NewDest = BMI->getOperand(0).getMBB(); - if (isBlockInRange(MI, NewDest, - getBranchDisplacementBits(MI->getOpcode()))) { - DEBUG(dbgs() << " Invert condition and swap its destination with " - << *BMI); - BMI->getOperand(0).setMBB(DestBB); - unsigned OpNum = - (MI->getOpcode() == ARM64::TBZW || MI->getOpcode() == ARM64::TBNZW || - MI->getOpcode() == ARM64::TBZX || MI->getOpcode() == ARM64::TBNZX) - ? 2 - : 1; - MI->getOperand(OpNum).setMBB(NewDest); - MI->setDesc(TII->get(getOppositeConditionOpcode(MI->getOpcode()))); - if (MI->getOpcode() == ARM64::Bcc) - invertBccCondition(MI); - return true; - } - } - } - - if (NeedSplit) { - // Analyze the branch so we know how to update the successor lists. - MachineBasicBlock *TBB, *FBB; - SmallVector Cond; - TII->AnalyzeBranch(*MBB, TBB, FBB, Cond, false); - - MachineBasicBlock *NewBB = splitBlockBeforeInstr(MI); - // No need for the branch to the next block. We're adding an unconditional - // branch to the destination. - int delta = TII->GetInstSizeInBytes(&MBB->back()); - BlockInfo[MBB->getNumber()].Size -= delta; - MBB->back().eraseFromParent(); - // BlockInfo[SplitBB].Offset is wrong temporarily, fixed below - - // Update the successor lists according to the transformation to follow. - // Do it here since if there's no split, no update is needed. - MBB->replaceSuccessor(FBB, NewBB); - NewBB->addSuccessor(FBB); - } - MachineBasicBlock *NextBB = std::next(MachineFunction::iterator(MBB)); - - DEBUG(dbgs() << " Insert B to BB#" << DestBB->getNumber() - << ", invert condition and change dest. to BB#" - << NextBB->getNumber() << "\n"); - - // Insert a new conditional branch and a new unconditional branch. - MachineInstrBuilder MIB = BuildMI( - MBB, DebugLoc(), TII->get(getOppositeConditionOpcode(MI->getOpcode()))) - .addOperand(MI->getOperand(0)); - if (MI->getOpcode() == ARM64::TBZW || MI->getOpcode() == ARM64::TBNZW || - MI->getOpcode() == ARM64::TBZX || MI->getOpcode() == ARM64::TBNZX) - MIB.addOperand(MI->getOperand(1)); - if (MI->getOpcode() == ARM64::Bcc) - invertBccCondition(MIB); - MIB.addMBB(NextBB); - BlockInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(&MBB->back()); - BuildMI(MBB, DebugLoc(), TII->get(ARM64::B)).addMBB(DestBB); - BlockInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(&MBB->back()); - - // Remove the old conditional branch. It may or may not still be in MBB. - BlockInfo[MI->getParent()->getNumber()].Size -= TII->GetInstSizeInBytes(MI); - MI->eraseFromParent(); - - // Finally, keep the block offsets up to date. - adjustBlockOffsets(*MBB); - return true; -} - -bool ARM64BranchRelaxation::relaxBranchInstructions() { - bool Changed = false; - // Relaxing branches involves creating new basic blocks, so re-eval - // end() for termination. 
- for (auto &MBB : *MF) { - MachineInstr *MI = MBB.getFirstTerminator(); - if (isConditionalBranch(MI->getOpcode()) && - !isBlockInRange(MI, getDestBlock(MI), - getBranchDisplacementBits(MI->getOpcode()))) { - fixupConditionalBranch(MI); - ++NumRelaxed; - Changed = true; - } - } - return Changed; -} - -bool ARM64BranchRelaxation::runOnMachineFunction(MachineFunction &mf) { - MF = &mf; - - // If the pass is disabled, just bail early. - if (!BranchRelaxation) - return false; - - DEBUG(dbgs() << "***** ARM64BranchRelaxation *****\n"); - - TII = (const ARM64InstrInfo *)MF->getTarget().getInstrInfo(); - - // Renumber all of the machine basic blocks in the function, guaranteeing that - // the numbers agree with the position of the block in the function. - MF->RenumberBlocks(); - - // Do the initial scan of the function, building up information about the - // sizes of each block. - scanFunction(); - - DEBUG(dbgs() << " Basic blocks before relaxation\n"); - DEBUG(dumpBBs()); - - bool MadeChange = false; - while (relaxBranchInstructions()) - MadeChange = true; - - // After a while, this might be made debug-only, but it is not expensive. - verify(); - - DEBUG(dbgs() << " Basic blocks after relaxation\n"); - DEBUG(dbgs() << '\n'; dumpBBs()); - - BlockInfo.clear(); - - return MadeChange; -} - -/// createARM64BranchRelaxation - returns an instance of the constpool -/// island pass. -FunctionPass *llvm::createARM64BranchRelaxation() { - return new ARM64BranchRelaxation(); -} diff --git a/lib/Target/ARM64/ARM64CallingConv.h b/lib/Target/ARM64/ARM64CallingConv.h deleted file mode 100644 index f24ba59dfb9..00000000000 --- a/lib/Target/ARM64/ARM64CallingConv.h +++ /dev/null @@ -1,94 +0,0 @@ -//=== ARM64CallingConv.h - Custom Calling Convention Routines -*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains the custom routines for the ARM64 Calling Convention that -// aren't done by tablegen. -// -//===----------------------------------------------------------------------===// - -#ifndef ARM64CALLINGCONV_H -#define ARM64CALLINGCONV_H - -#include "ARM64InstrInfo.h" -#include "llvm/IR/CallingConv.h" -#include "llvm/CodeGen/CallingConvLower.h" -#include "llvm/Target/TargetInstrInfo.h" - -namespace llvm { - -/// CC_ARM64_Custom_i1i8i16_Reg - customized handling of passing i1/i8/i16 via -/// register. Here, ValVT can be i1/i8/i16 or i32 depending on whether the -/// argument is already promoted and LocVT is i1/i8/i16. We only promote the -/// argument to i32 if we are sure this argument will be passed in register. -static bool CC_ARM64_Custom_i1i8i16_Reg(unsigned ValNo, MVT ValVT, MVT LocVT, - CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, - CCState &State, - bool IsWebKitJS = false) { - static const MCPhysReg RegList1[] = { ARM64::W0, ARM64::W1, ARM64::W2, - ARM64::W3, ARM64::W4, ARM64::W5, - ARM64::W6, ARM64::W7 }; - static const MCPhysReg RegList2[] = { ARM64::X0, ARM64::X1, ARM64::X2, - ARM64::X3, ARM64::X4, ARM64::X5, - ARM64::X6, ARM64::X7 }; - static const MCPhysReg WebKitRegList1[] = { ARM64::W0 }; - static const MCPhysReg WebKitRegList2[] = { ARM64::X0 }; - - const MCPhysReg *List1 = IsWebKitJS ? WebKitRegList1 : RegList1; - const MCPhysReg *List2 = IsWebKitJS ? 
WebKitRegList2 : RegList2; - - if (unsigned Reg = State.AllocateReg(List1, List2, 8)) { - // Customized extra section for handling i1/i8/i16: - // We need to promote the argument to i32 if it is not done already. - if (ValVT != MVT::i32) { - if (ArgFlags.isSExt()) - LocInfo = CCValAssign::SExt; - else if (ArgFlags.isZExt()) - LocInfo = CCValAssign::ZExt; - else - LocInfo = CCValAssign::AExt; - ValVT = MVT::i32; - } - // Set LocVT to i32 as well if passing via register. - LocVT = MVT::i32; - State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); - return true; - } - return false; -} - -/// CC_ARM64_WebKit_JS_i1i8i16_Reg - customized handling of passing i1/i8/i16 -/// via register. This behaves the same as CC_ARM64_Custom_i1i8i16_Reg, but only -/// uses the first register. -static bool CC_ARM64_WebKit_JS_i1i8i16_Reg(unsigned ValNo, MVT ValVT, MVT LocVT, - CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, - CCState &State) { - return CC_ARM64_Custom_i1i8i16_Reg(ValNo, ValVT, LocVT, LocInfo, ArgFlags, - State, true); -} - -/// CC_ARM64_Custom_i1i8i16_Stack: customized handling of passing i1/i8/i16 on -/// stack. Here, ValVT can be i1/i8/i16 or i32 depending on whether the argument -/// is already promoted and LocVT is i1/i8/i16. If ValVT is already promoted, -/// it will be truncated back to i1/i8/i16. -static bool CC_ARM64_Custom_i1i8i16_Stack(unsigned ValNo, MVT ValVT, MVT LocVT, - CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, - CCState &State) { - unsigned Space = ((LocVT == MVT::i1 || LocVT == MVT::i8) ? 1 : 2); - unsigned Offset12 = State.AllocateStack(Space, Space); - ValVT = LocVT; - State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset12, LocVT, LocInfo)); - return true; -} - -} // End llvm namespace - -#endif diff --git a/lib/Target/ARM64/ARM64CallingConvention.td b/lib/Target/ARM64/ARM64CallingConvention.td deleted file mode 100644 index 0ef5601718d..00000000000 --- a/lib/Target/ARM64/ARM64CallingConvention.td +++ /dev/null @@ -1,236 +0,0 @@ -//===- ARM64CallingConv.td - Calling Conventions for ARM64 -*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This describes the calling conventions for ARM64 architecture. -// -//===----------------------------------------------------------------------===// - -/// CCIfAlign - Match of the original alignment of the arg -class CCIfAlign : - CCIf; -/// CCIfBigEndian - Match only if we're in big endian mode. -class CCIfBigEndian : - CCIf<"State.getTarget().getDataLayout()->isBigEndian()", A>; - -//===----------------------------------------------------------------------===// -// ARM AAPCS64 Calling Convention -//===----------------------------------------------------------------------===// - -def CC_ARM64_AAPCS : CallingConv<[ - CCIfType<[v2f32], CCBitConvertToType>, - CCIfType<[v2f64, v4f32], CCBitConvertToType>, - - // Big endian vectors must be passed as if they were 1-element vectors so that - // their lanes are in a consistent order. - CCIfBigEndian>>, - CCIfBigEndian>>, - - // An SRet is passed in X8, not X0 like a normal pointer parameter. - CCIfSRet>>, - - // Put ByVal arguments directly on the stack. Minimum size and alignment of a - // slot is 64-bit. - CCIfByVal>, - - // Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers, - // up to eight each of GPR and FPR. 
- CCIfType<[i1, i8, i16], CCCustom<"CC_ARM64_Custom_i1i8i16_Reg">>, - CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7], - [X0, X1, X2, X3, X4, X5, X6, X7]>>, - // i128 is split to two i64s, we can't fit half to register X7. - CCIfType<[i64], CCIfSplit>>, - - // i128 is split to two i64s, and its stack alignment is 16 bytes. - CCIfType<[i64], CCIfSplit>>, - - CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7], - [W0, W1, W2, W3, W4, W5, W6, W7]>>, - CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7], - [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, - CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], - [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, - CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32], - CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], - [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, - CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64], - CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, - - // If more than will fit in registers, pass them on the stack instead. - CCIfType<[i1, i8, i16], CCAssignToStack<8, 8>>, - CCIfType<[i32, f32], CCAssignToStack<8, 8>>, - CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8], - CCAssignToStack<8, 8>>, - CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64], - CCAssignToStack<16, 16>> -]>; - -def RetCC_ARM64_AAPCS : CallingConv<[ - CCIfType<[v2f32], CCBitConvertToType>, - CCIfType<[v2f64, v4f32], CCBitConvertToType>, - - // Big endian vectors must be passed as if they were 1-element vectors so that - // their lanes are in a consistent order. - CCIfBigEndian>>, - CCIfBigEndian>>, - - CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7], - [X0, X1, X2, X3, X4, X5, X6, X7]>>, - CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7], - [W0, W1, W2, W3, W4, W5, W6, W7]>>, - CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7], - [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, - CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], - [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, - CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32], - CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], - [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, - CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64], - CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>> -]>; - - -// Darwin uses a calling convention which differs in only two ways -// from the standard one at this level: -// + i128s (i.e. split i64s) don't need even registers. -// + Stack slots are sized as needed rather than being at least 64-bit. -def CC_ARM64_DarwinPCS : CallingConv<[ - CCIfType<[v2f32], CCBitConvertToType>, - CCIfType<[v2f64, v4f32, f128], CCBitConvertToType>, - - // An SRet is passed in X8, not X0 like a normal pointer parameter. - CCIfSRet>>, - - // Put ByVal arguments directly on the stack. Minimum size and alignment of a - // slot is 64-bit. - CCIfByVal>, - - // Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers, - // up to eight each of GPR and FPR. - CCIfType<[i1, i8, i16], CCCustom<"CC_ARM64_Custom_i1i8i16_Reg">>, - CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7], - [X0, X1, X2, X3, X4, X5, X6, X7]>>, - // i128 is split to two i64s, we can't fit half to register X7. - CCIfType<[i64], - CCIfSplit>>, - // i128 is split to two i64s, and its stack alignment is 16 bytes. 
- CCIfType<[i64], CCIfSplit>>, - - CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7], - [W0, W1, W2, W3, W4, W5, W6, W7]>>, - CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7], - [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, - CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], - [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, - CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32], - CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], - [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, - CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64], - CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, - - // If more than will fit in registers, pass them on the stack instead. - CCIfType<[i1, i8, i16], CCCustom<"CC_ARM64_Custom_i1i8i16_Stack">>, - CCIfType<[i32, f32], CCAssignToStack<4, 4>>, - CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8], - CCAssignToStack<8, 8>>, - CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64], CCAssignToStack<16, 16>> -]>; - -def CC_ARM64_DarwinPCS_VarArg : CallingConv<[ - CCIfType<[v2f32], CCBitConvertToType>, - CCIfType<[v2f64, v4f32, f128], CCBitConvertToType>, - - // Handle all scalar types as either i64 or f64. - CCIfType<[i8, i16, i32], CCPromoteToType>, - CCIfType<[f32], CCPromoteToType>, - - // Everything is on the stack. - // i128 is split to two i64s, and its stack alignment is 16 bytes. - CCIfType<[i64], CCIfSplit>>, - CCIfType<[i64, f64, v1i64, v2i32, v4i16, v8i8, v1f64, v2f32], CCAssignToStack<8, 8>>, - CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64], CCAssignToStack<16, 16>> -]>; - -// The WebKit_JS calling convention only passes the first argument (the callee) -// in register and the remaining arguments on stack. We allow 32bit stack slots, -// so that WebKit can write partial values in the stack and define the other -// 32bit quantity as undef. -def CC_ARM64_WebKit_JS : CallingConv<[ - // Handle i1, i8, i16, i32, and i64 passing in register X0 (W0). - CCIfType<[i1, i8, i16], CCCustom<"CC_ARM64_WebKit_JS_i1i8i16_Reg">>, - CCIfType<[i32], CCAssignToRegWithShadow<[W0], [X0]>>, - CCIfType<[i64], CCAssignToRegWithShadow<[X0], [W0]>>, - - // Pass the remaining arguments on the stack instead. - CCIfType<[i1, i8, i16], CCAssignToStack<4, 4>>, - CCIfType<[i32, f32], CCAssignToStack<4, 4>>, - CCIfType<[i64, f64], CCAssignToStack<8, 8>> -]>; - -def RetCC_ARM64_WebKit_JS : CallingConv<[ - CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7], - [X0, X1, X2, X3, X4, X5, X6, X7]>>, - CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7], - [W0, W1, W2, W3, W4, W5, W6, W7]>>, - CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7], - [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, - CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], - [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>> -]>; - -// FIXME: LR is only callee-saved in the sense that *we* preserve it and are -// presumably a callee to someone. External functions may not do so, but this -// is currently safe since BL has LR as an implicit-def and what happens after a -// tail call doesn't matter. -// -// It would be better to model its preservation semantics properly (create a -// vreg on entry, use it in RET & tail call generation; make that vreg def if we -// end up saving LR as part of a call frame). Watch this space... 
-def CSR_ARM64_AAPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22, - X23, X24, X25, X26, X27, X28, - D8, D9, D10, D11, - D12, D13, D14, D15)>; - -// Constructors and destructors return 'this' in the iOS 64-bit C++ ABI; since -// 'this' and the pointer return value are both passed in X0 in these cases, -// this can be partially modelled by treating X0 as a callee-saved register; -// only the resulting RegMask is used; the SaveList is ignored -// -// (For generic ARM 64-bit ABI code, clang will not generate constructors or -// destructors with 'this' returns, so this RegMask will not be used in that -// case) -def CSR_ARM64_AAPCS_ThisReturn : CalleeSavedRegs<(add CSR_ARM64_AAPCS, X0)>; - -// The function used by Darwin to obtain the address of a thread-local variable -// guarantees more than a normal AAPCS function. x16 and x17 are used on the -// fast path for calculation, but other registers except X0 (argument/return) -// and LR (it is a call, after all) are preserved. -def CSR_ARM64_TLS_Darwin - : CalleeSavedRegs<(add (sub (sequence "X%u", 1, 28), X16, X17), - FP, - (sequence "Q%u", 0, 31))>; - -// The ELF stub used for TLS-descriptor access saves every feasible -// register. Only X0 and LR are clobbered. -def CSR_ARM64_TLS_ELF - : CalleeSavedRegs<(add (sequence "X%u", 1, 28), FP, - (sequence "Q%u", 0, 31))>; - -def CSR_ARM64_AllRegs - : CalleeSavedRegs<(add (sequence "W%u", 0, 30), WSP, - (sequence "X%u", 0, 28), FP, LR, SP, - (sequence "B%u", 0, 31), (sequence "H%u", 0, 31), - (sequence "S%u", 0, 31), (sequence "D%u", 0, 31), - (sequence "Q%u", 0, 31))>; - diff --git a/lib/Target/ARM64/ARM64CleanupLocalDynamicTLSPass.cpp b/lib/Target/ARM64/ARM64CleanupLocalDynamicTLSPass.cpp deleted file mode 100644 index dce1301b92e..00000000000 --- a/lib/Target/ARM64/ARM64CleanupLocalDynamicTLSPass.cpp +++ /dev/null @@ -1,147 +0,0 @@ -//===-- ARM64CleanupLocalDynamicTLSPass.cpp -----------------------*- C++ -*-=// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// Local-dynamic access to thread-local variables proceeds in three stages. -// -// 1. The offset of this Module's thread-local area from TPIDR_EL0 is calculated -// in much the same way as a general-dynamic TLS-descriptor access against -// the special symbol _TLS_MODULE_BASE. -// 2. The variable's offset from _TLS_MODULE_BASE_ is calculated using -// instructions with "dtprel" modifiers. -// 3. These two are added, together with TPIDR_EL0, to obtain the variable's -// true address. -// -// This is only better than general-dynamic access to the variable if two or -// more of the first stage TLS-descriptor calculations can be combined. This -// pass looks through a function and performs such combinations. 
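The cleanup pass described in the header above only pays off when a function performs at least two local-dynamic TLS accesses, since the TLSDESC call that computes the _TLS_MODULE_BASE_ address can then be shared. A hypothetical source snippet, not taken from the patch, that would produce such a function when compiled as position-independent code for an ELF target with something like -fPIC -ftls-model=local-dynamic:

// Two internal thread-local variables: under the local-dynamic TLS model both
// accesses need the module's TLS base, so the pass can reuse the result of one
// TLSDESC_BLR call instead of emitting two.
static thread_local int Counter = 0;
static thread_local int HighWater = 0;

int bump(int N) {
  Counter += N;
  if (Counter > HighWater)
    HighWater = Counter;
  return HighWater;
}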
-// -//===----------------------------------------------------------------------===// -#include "ARM64.h" -#include "ARM64InstrInfo.h" -#include "ARM64MachineFunctionInfo.h" -#include "ARM64TargetMachine.h" -#include "llvm/CodeGen/MachineDominators.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -using namespace llvm; - -namespace { -struct LDTLSCleanup : public MachineFunctionPass { - static char ID; - LDTLSCleanup() : MachineFunctionPass(ID) {} - - bool runOnMachineFunction(MachineFunction &MF) override { - ARM64FunctionInfo *AFI = MF.getInfo(); - if (AFI->getNumLocalDynamicTLSAccesses() < 2) { - // No point folding accesses if there isn't at least two. - return false; - } - - MachineDominatorTree *DT = &getAnalysis(); - return VisitNode(DT->getRootNode(), 0); - } - - // Visit the dominator subtree rooted at Node in pre-order. - // If TLSBaseAddrReg is non-null, then use that to replace any - // TLS_base_addr instructions. Otherwise, create the register - // when the first such instruction is seen, and then use it - // as we encounter more instructions. - bool VisitNode(MachineDomTreeNode *Node, unsigned TLSBaseAddrReg) { - MachineBasicBlock *BB = Node->getBlock(); - bool Changed = false; - - // Traverse the current block. - for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; - ++I) { - switch (I->getOpcode()) { - case ARM64::TLSDESC_BLR: - // Make sure it's a local dynamic access. - if (!I->getOperand(1).isSymbol() || - strcmp(I->getOperand(1).getSymbolName(), "_TLS_MODULE_BASE_")) - break; - - if (TLSBaseAddrReg) - I = replaceTLSBaseAddrCall(I, TLSBaseAddrReg); - else - I = setRegister(I, &TLSBaseAddrReg); - Changed = true; - break; - default: - break; - } - } - - // Visit the children of this block in the dominator tree. - for (MachineDomTreeNode *N : *Node) { - Changed |= VisitNode(N, TLSBaseAddrReg); - } - - return Changed; - } - - // Replace the TLS_base_addr instruction I with a copy from - // TLSBaseAddrReg, returning the new instruction. - MachineInstr *replaceTLSBaseAddrCall(MachineInstr *I, - unsigned TLSBaseAddrReg) { - MachineFunction *MF = I->getParent()->getParent(); - const ARM64TargetMachine *TM = - static_cast(&MF->getTarget()); - const ARM64InstrInfo *TII = TM->getInstrInfo(); - - // Insert a Copy from TLSBaseAddrReg to x0, which is where the rest of the - // code sequence assumes the address will be. - MachineInstr *Copy = - BuildMI(*I->getParent(), I, I->getDebugLoc(), - TII->get(TargetOpcode::COPY), ARM64::X0).addReg(TLSBaseAddrReg); - - // Erase the TLS_base_addr instruction. - I->eraseFromParent(); - - return Copy; - } - - // Create a virtal register in *TLSBaseAddrReg, and populate it by - // inserting a copy instruction after I. Returns the new instruction. - MachineInstr *setRegister(MachineInstr *I, unsigned *TLSBaseAddrReg) { - MachineFunction *MF = I->getParent()->getParent(); - const ARM64TargetMachine *TM = - static_cast(&MF->getTarget()); - const ARM64InstrInfo *TII = TM->getInstrInfo(); - - // Create a virtual register for the TLS base address. - MachineRegisterInfo &RegInfo = MF->getRegInfo(); - *TLSBaseAddrReg = RegInfo.createVirtualRegister(&ARM64::GPR64RegClass); - - // Insert a copy from X0 to TLSBaseAddrReg for later. 
- MachineInstr *Next = I->getNextNode(); - MachineInstr *Copy = BuildMI(*I->getParent(), Next, I->getDebugLoc(), - TII->get(TargetOpcode::COPY), - *TLSBaseAddrReg).addReg(ARM64::X0); - - return Copy; - } - - const char *getPassName() const override { - return "Local Dynamic TLS Access Clean-up"; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - AU.addRequired(); - MachineFunctionPass::getAnalysisUsage(AU); - } -}; -} - -char LDTLSCleanup::ID = 0; -FunctionPass *llvm::createARM64CleanupLocalDynamicTLSPass() { - return new LDTLSCleanup(); -} diff --git a/lib/Target/ARM64/ARM64CollectLOH.cpp b/lib/Target/ARM64/ARM64CollectLOH.cpp deleted file mode 100644 index 8b48f3ae9b2..00000000000 --- a/lib/Target/ARM64/ARM64CollectLOH.cpp +++ /dev/null @@ -1,1117 +0,0 @@ -//===-------------- ARM64CollectLOH.cpp - ARM64 collect LOH pass --*- C++ -*-=// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains a pass that collect the Linker Optimization Hint (LOH). -// This pass should be run at the very end of the compilation flow, just before -// assembly printer. -// To be useful for the linker, the LOH must be printed into the assembly file. -// -// A LOH describes a sequence of instructions that may be optimized by the -// linker. -// This same sequence cannot be optimized by the compiler because some of -// the information will be known at link time. -// For instance, consider the following sequence: -// L1: adrp xA, sym@PAGE -// L2: add xB, xA, sym@PAGEOFF -// L3: ldr xC, [xB, #imm] -// This sequence can be turned into: -// A literal load if sym@PAGE + sym@PAGEOFF + #imm - address(L3) is < 1MB: -// L3: ldr xC, sym+#imm -// It may also be turned into either the following more efficient -// code sequences: -// - If sym@PAGEOFF + #imm fits the encoding space of L3. -// L1: adrp xA, sym@PAGE -// L3: ldr xC, [xB, sym@PAGEOFF + #imm] -// - If sym@PAGE + sym@PAGEOFF - address(L1) < 1MB: -// L1: adr xA, sym -// L3: ldr xC, [xB, #imm] -// -// To be valid a LOH must meet all the requirements needed by all the related -// possible linker transformations. -// For instance, using the running example, the constraints to emit -// ".loh AdrpAddLdr" are: -// - L1, L2, and L3 instructions are of the expected type, i.e., -// respectively ADRP, ADD (immediate), and LD. -// - The result of L1 is used only by L2. -// - The register argument (xA) used in the ADD instruction is defined -// only by L1. -// - The result of L2 is used only by L3. -// - The base address (xB) in L3 is defined only L2. 
-// - The ADRP in L1 and the ADD in L2 must reference the same symbol using -// @PAGE/@PAGEOFF with no additional constants -// -// Currently supported LOHs are: -// * So called non-ADRP-related: -// - .loh AdrpAddLdr L1, L2, L3: -// L1: adrp xA, sym@PAGE -// L2: add xB, xA, sym@PAGEOFF -// L3: ldr xC, [xB, #imm] -// - .loh AdrpLdrGotLdr L1, L2, L3: -// L1: adrp xA, sym@GOTPAGE -// L2: ldr xB, [xA, sym@GOTPAGEOFF] -// L3: ldr xC, [xB, #imm] -// - .loh AdrpLdr L1, L3: -// L1: adrp xA, sym@PAGE -// L3: ldr xC, [xA, sym@PAGEOFF] -// - .loh AdrpAddStr L1, L2, L3: -// L1: adrp xA, sym@PAGE -// L2: add xB, xA, sym@PAGEOFF -// L3: str xC, [xB, #imm] -// - .loh AdrpLdrGotStr L1, L2, L3: -// L1: adrp xA, sym@GOTPAGE -// L2: ldr xB, [xA, sym@GOTPAGEOFF] -// L3: str xC, [xB, #imm] -// - .loh AdrpAdd L1, L2: -// L1: adrp xA, sym@PAGE -// L2: add xB, xA, sym@PAGEOFF -// For all these LOHs, L1, L2, L3 form a simple chain: -// L1 result is used only by L2 and L2 result by L3. -// L3 LOH-related argument is defined only by L2 and L2 LOH-related argument -// by L1. -// All these LOHs aim at using more efficient load/store patterns by folding -// some instructions used to compute the address directly into the load/store. -// -// * So called ADRP-related: -// - .loh AdrpAdrp L2, L1: -// L2: ADRP xA, sym1@PAGE -// L1: ADRP xA, sym2@PAGE -// L2 dominates L1 and xA is not redifined between L2 and L1 -// This LOH aims at getting rid of redundant ADRP instructions. -// -// The overall design for emitting the LOHs is: -// 1. ARM64CollectLOH (this pass) records the LOHs in the ARM64FunctionInfo. -// 2. ARM64AsmPrinter reads the LOHs from ARM64FunctionInfo and it: -// 1. Associates them a label. -// 2. Emits them in a MCStreamer (EmitLOHDirective). -// - The MCMachOStreamer records them into the MCAssembler. -// - The MCAsmStreamer prints them. -// - Other MCStreamers ignore them. -// 3. Closes the MCStreamer: -// - The MachObjectWriter gets them from the MCAssembler and writes -// them in the object file. -// - Other ObjectWriters ignore them. 
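The transformations these hints enable rest on the @PAGE/@PAGEOFF split: ADRP materializes the 4 KiB page of a symbol and ADD (or the load's immediate) supplies the low 12 bits, and the pair can be folded to a single ADR when the target is within about 1 MB of the instruction, as noted above. A standalone sketch of that address arithmetic, not part of the patch; the sample addresses are invented for illustration:

#include <cassert>
#include <cstdint>
#include <cstdio>

// sym@PAGE is the 4 KiB page base, sym@PAGEOFF the low 12 bits.
static uint64_t page(uint64_t Addr) { return Addr & ~UINT64_C(0xfff); }
static uint64_t pageOff(uint64_t Addr) { return Addr & UINT64_C(0xfff); }

// An ADR can replace an ADRP+ADD pair only if the target lies within +/-1 MiB
// of the ADR itself, one of the reachability constraints mentioned above.
static bool adrReachable(uint64_t From, uint64_t Target) {
  int64_t Delta = (int64_t)(Target - From);
  return Delta >= -(1 << 20) && Delta < (1 << 20);
}

int main() {
  uint64_t Sym = 0x100004a50, Adrp = 0x100001000;
  assert(page(Sym) + pageOff(Sym) == Sym); // ADRP+ADD rebuilds the full address
  std::printf("page=0x%llx off=0x%llx adr-ok=%d\n",
              (unsigned long long)page(Sym), (unsigned long long)pageOff(Sym),
              adrReachable(Adrp, Sym));
  return 0;
}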
-//===----------------------------------------------------------------------===// - -#include "ARM64.h" -#include "ARM64InstrInfo.h" -#include "ARM64MachineFunctionInfo.h" -#include "MCTargetDesc/ARM64AddressingModes.h" -#include "llvm/ADT/BitVector.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/MapVector.h" -#include "llvm/ADT/SetVector.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/CodeGen/MachineBasicBlock.h" -#include "llvm/CodeGen/MachineDominators.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetRegisterInfo.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/ADT/Statistic.h" -using namespace llvm; - -#define DEBUG_TYPE "arm64-collect-loh" - -static cl::opt -PreCollectRegister("arm64-collect-loh-pre-collect-register", cl::Hidden, - cl::desc("Restrict analysis to registers invovled" - " in LOHs"), - cl::init(true)); - -static cl::opt -BasicBlockScopeOnly("arm64-collect-loh-bb-only", cl::Hidden, - cl::desc("Restrict analysis at basic block scope"), - cl::init(true)); - -STATISTIC(NumADRPSimpleCandidate, - "Number of simplifiable ADRP dominate by another"); -STATISTIC(NumADRPComplexCandidate2, - "Number of simplifiable ADRP reachable by 2 defs"); -STATISTIC(NumADRPComplexCandidate3, - "Number of simplifiable ADRP reachable by 3 defs"); -STATISTIC(NumADRPComplexCandidateOther, - "Number of simplifiable ADRP reachable by 4 or more defs"); -STATISTIC(NumADDToSTRWithImm, - "Number of simplifiable STR with imm reachable by ADD"); -STATISTIC(NumLDRToSTRWithImm, - "Number of simplifiable STR with imm reachable by LDR"); -STATISTIC(NumADDToSTR, "Number of simplifiable STR reachable by ADD"); -STATISTIC(NumLDRToSTR, "Number of simplifiable STR reachable by LDR"); -STATISTIC(NumADDToLDRWithImm, - "Number of simplifiable LDR with imm reachable by ADD"); -STATISTIC(NumLDRToLDRWithImm, - "Number of simplifiable LDR with imm reachable by LDR"); -STATISTIC(NumADDToLDR, "Number of simplifiable LDR reachable by ADD"); -STATISTIC(NumLDRToLDR, "Number of simplifiable LDR reachable by LDR"); -STATISTIC(NumADRPToLDR, "Number of simplifiable LDR reachable by ADRP"); -STATISTIC(NumCplxLvl1, "Number of complex case of level 1"); -STATISTIC(NumTooCplxLvl1, "Number of too complex case of level 1"); -STATISTIC(NumCplxLvl2, "Number of complex case of level 2"); -STATISTIC(NumTooCplxLvl2, "Number of too complex case of level 2"); -STATISTIC(NumADRSimpleCandidate, "Number of simplifiable ADRP + ADD"); -STATISTIC(NumADRComplexCandidate, "Number of too complex ADRP + ADD"); - -namespace llvm { -void initializeARM64CollectLOHPass(PassRegistry &); -} - -namespace { -struct ARM64CollectLOH : public MachineFunctionPass { - static char ID; - ARM64CollectLOH() : MachineFunctionPass(ID) { - initializeARM64CollectLOHPass(*PassRegistry::getPassRegistry()); - } - - bool runOnMachineFunction(MachineFunction &MF) override; - - const char *getPassName() const override { - return "ARM64 Collect Linker Optimization Hint (LOH)"; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesAll(); - MachineFunctionPass::getAnalysisUsage(AU); - AU.addRequired(); - } - -private: -}; - -/// A set of MachineInstruction. 
-typedef SetVector SetOfMachineInstr; -/// Map a basic block to a set of instructions per register. -/// This is used to represent the exposed uses of a basic block -/// per register. -typedef MapVector -BlockToSetOfInstrsPerColor; -/// Map a basic block to an instruction per register. -/// This is used to represent the live-out definitions of a basic block -/// per register. -typedef MapVector -BlockToInstrPerColor; -/// Map an instruction to a set of instructions. Used to represent the -/// mapping def to reachable uses or use to definitions. -typedef MapVector InstrToInstrs; -/// Map a basic block to a BitVector. -/// This is used to record the kill registers per basic block. -typedef MapVector BlockToRegSet; - -/// Map a register to a dense id. -typedef DenseMap MapRegToId; -/// Map a dense id to a register. Used for debug purposes. -typedef SmallVector MapIdToReg; -} // end anonymous namespace. - -char ARM64CollectLOH::ID = 0; - -INITIALIZE_PASS_BEGIN(ARM64CollectLOH, "arm64-collect-loh", - "ARM64 Collect Linker Optimization Hint (LOH)", false, - false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) -INITIALIZE_PASS_END(ARM64CollectLOH, "arm64-collect-loh", - "ARM64 Collect Linker Optimization Hint (LOH)", false, - false) - -/// Given a couple (MBB, reg) get the corresponding set of instruction from -/// the given "sets". -/// If this couple does not reference any set, an empty set is added to "sets" -/// for this couple and returned. -/// \param nbRegs is used internally allocate some memory. It must be consistent -/// with the way sets is used. -static SetOfMachineInstr &getSet(BlockToSetOfInstrsPerColor &sets, - const MachineBasicBlock &MBB, unsigned reg, - unsigned nbRegs) { - SetOfMachineInstr *result; - BlockToSetOfInstrsPerColor::iterator it = sets.find(&MBB); - if (it != sets.end()) - result = it->second; - else - result = sets[&MBB] = new SetOfMachineInstr[nbRegs]; - - return result[reg]; -} - -/// Given a couple (reg, MI) get the corresponding set of instructions from the -/// the given "sets". -/// This is used to get the uses record in sets of a definition identified by -/// MI and reg, i.e., MI defines reg. -/// If the couple does not reference anything, an empty set is added to -/// "sets[reg]". -/// \pre set[reg] is valid. -static SetOfMachineInstr &getUses(InstrToInstrs *sets, unsigned reg, - const MachineInstr &MI) { - return sets[reg][&MI]; -} - -/// Same as getUses but does not modify the input map: sets. -/// \return NULL if the couple (reg, MI) is not in sets. -static const SetOfMachineInstr *getUses(const InstrToInstrs *sets, unsigned reg, - const MachineInstr &MI) { - InstrToInstrs::const_iterator Res = sets[reg].find(&MI); - if (Res != sets[reg].end()) - return &(Res->second); - return nullptr; -} - -/// Initialize the reaching definition algorithm: -/// For each basic block BB in MF, record: -/// - its kill set. -/// - its reachable uses (uses that are exposed to BB's predecessors). -/// - its the generated definitions. -/// \param DummyOp if not NULL, specifies a Dummy Operation to be added to -/// the list of uses of exposed defintions. -/// \param ADRPMode specifies to only consider ADRP instructions for generated -/// definition. It also consider definitions of ADRP instructions as uses and -/// ignore other uses. The ADRPMode is used to collect the information for LHO -/// that involve ADRP operation only. 
-static void initReachingDef(MachineFunction &MF, - InstrToInstrs *ColorOpToReachedUses, - BlockToInstrPerColor &Gen, BlockToRegSet &Kill, - BlockToSetOfInstrsPerColor &ReachableUses, - const MapRegToId &RegToId, - const MachineInstr *DummyOp, bool ADRPMode) { - const TargetMachine &TM = MF.getTarget(); - const TargetRegisterInfo *TRI = TM.getRegisterInfo(); - - unsigned NbReg = RegToId.size(); - - for (MachineBasicBlock &MBB : MF) { - const MachineInstr **&BBGen = Gen[&MBB]; - BBGen = new const MachineInstr *[NbReg]; - memset(BBGen, 0, sizeof(const MachineInstr *) * NbReg); - - BitVector &BBKillSet = Kill[&MBB]; - BBKillSet.resize(NbReg); - for (const MachineInstr &MI : MBB) { - bool IsADRP = MI.getOpcode() == ARM64::ADRP; - - // Process uses first. - if (IsADRP || !ADRPMode) - for (const MachineOperand &MO : MI.operands()) { - // Treat ADRP def as use, as the goal of the analysis is to find - // ADRP defs reached by other ADRP defs. - if (!MO.isReg() || (!ADRPMode && !MO.isUse()) || - (ADRPMode && (!IsADRP || !MO.isDef()))) - continue; - unsigned CurReg = MO.getReg(); - MapRegToId::const_iterator ItCurRegId = RegToId.find(CurReg); - if (ItCurRegId == RegToId.end()) - continue; - CurReg = ItCurRegId->second; - - // if CurReg has not been defined, this use is reachable. - if (!BBGen[CurReg] && !BBKillSet.test(CurReg)) - getSet(ReachableUses, MBB, CurReg, NbReg).insert(&MI); - // current basic block definition for this color, if any, is in Gen. - if (BBGen[CurReg]) - getUses(ColorOpToReachedUses, CurReg, *BBGen[CurReg]).insert(&MI); - } - - // Process clobbers. - for (const MachineOperand &MO : MI.operands()) { - if (!MO.isRegMask()) - continue; - // Clobbers kill the related colors. - const uint32_t *PreservedRegs = MO.getRegMask(); - - // Set generated regs. - for (const auto Entry : RegToId) { - unsigned Reg = Entry.second; - // Use the global register ID when querying APIs external to this - // pass. - if (MachineOperand::clobbersPhysReg(PreservedRegs, Entry.first)) { - // Do not register clobbered definition for no ADRP. - // This definition is not used anyway (otherwise register - // allocation is wrong). - BBGen[Reg] = ADRPMode ? &MI : nullptr; - BBKillSet.set(Reg); - } - } - } - - // Process register defs. - for (const MachineOperand &MO : MI.operands()) { - if (!MO.isReg() || !MO.isDef()) - continue; - unsigned CurReg = MO.getReg(); - MapRegToId::const_iterator ItCurRegId = RegToId.find(CurReg); - if (ItCurRegId == RegToId.end()) - continue; - - for (MCRegAliasIterator AI(CurReg, TRI, true); AI.isValid(); ++AI) { - MapRegToId::const_iterator ItRegId = RegToId.find(*AI); - assert(ItRegId != RegToId.end() && - "Sub-register of an " - "involved register, not recorded as involved!"); - BBKillSet.set(ItRegId->second); - BBGen[ItRegId->second] = &MI; - } - BBGen[ItCurRegId->second] = &MI; - } - } - - // If we restrict our analysis to basic block scope, conservatively add a - // dummy - // use for each generated value. 
- if (!ADRPMode && DummyOp && !MBB.succ_empty()) - for (unsigned CurReg = 0; CurReg < NbReg; ++CurReg) - if (BBGen[CurReg]) - getUses(ColorOpToReachedUses, CurReg, *BBGen[CurReg]).insert(DummyOp); - } -} - -/// Reaching def core algorithm: -/// while an Out has changed -/// for each bb -/// for each color -/// In[bb][color] = U Out[bb.predecessors][color] -/// insert reachableUses[bb][color] in each in[bb][color] -/// op.reachedUses -/// -/// Out[bb] = Gen[bb] U (In[bb] - Kill[bb]) -static void reachingDefAlgorithm(MachineFunction &MF, - InstrToInstrs *ColorOpToReachedUses, - BlockToSetOfInstrsPerColor &In, - BlockToSetOfInstrsPerColor &Out, - BlockToInstrPerColor &Gen, BlockToRegSet &Kill, - BlockToSetOfInstrsPerColor &ReachableUses, - unsigned NbReg) { - bool HasChanged; - do { - HasChanged = false; - for (MachineBasicBlock &MBB : MF) { - unsigned CurReg; - for (CurReg = 0; CurReg < NbReg; ++CurReg) { - SetOfMachineInstr &BBInSet = getSet(In, MBB, CurReg, NbReg); - SetOfMachineInstr &BBReachableUses = - getSet(ReachableUses, MBB, CurReg, NbReg); - SetOfMachineInstr &BBOutSet = getSet(Out, MBB, CurReg, NbReg); - unsigned Size = BBOutSet.size(); - // In[bb][color] = U Out[bb.predecessors][color] - for (MachineBasicBlock *PredMBB : MBB.predecessors()) { - SetOfMachineInstr &PredOutSet = getSet(Out, *PredMBB, CurReg, NbReg); - BBInSet.insert(PredOutSet.begin(), PredOutSet.end()); - } - // insert reachableUses[bb][color] in each in[bb][color] op.reachedses - for (const MachineInstr *MI : BBInSet) { - SetOfMachineInstr &OpReachedUses = - getUses(ColorOpToReachedUses, CurReg, *MI); - OpReachedUses.insert(BBReachableUses.begin(), BBReachableUses.end()); - } - // Out[bb] = Gen[bb] U (In[bb] - Kill[bb]) - if (!Kill[&MBB].test(CurReg)) - BBOutSet.insert(BBInSet.begin(), BBInSet.end()); - if (Gen[&MBB][CurReg]) - BBOutSet.insert(Gen[&MBB][CurReg]); - HasChanged |= BBOutSet.size() != Size; - } - } - } while (HasChanged); -} - -/// Release all memory dynamically allocated during the reaching -/// definition algorithm. -static void finitReachingDef(BlockToSetOfInstrsPerColor &In, - BlockToSetOfInstrsPerColor &Out, - BlockToInstrPerColor &Gen, - BlockToSetOfInstrsPerColor &ReachableUses) { - for (auto &IT : Out) - delete[] IT.second; - for (auto &IT : In) - delete[] IT.second; - for (auto &IT : ReachableUses) - delete[] IT.second; - for (auto &IT : Gen) - delete[] IT.second; -} - -/// Reaching definition algorithm. -/// \param MF function on which the algorithm will operate. -/// \param[out] ColorOpToReachedUses will contain the result of the reaching -/// def algorithm. -/// \param ADRPMode specify whether the reaching def algorithm should be tuned -/// for ADRP optimization. \see initReachingDef for more details. -/// \param DummyOp if not NULL, the algorithm will work at -/// basic block scope and will set for every exposed definition a use to -/// @p DummyOp. -/// \pre ColorOpToReachedUses is an array of at least number of registers of -/// InstrToInstrs. -static void reachingDef(MachineFunction &MF, - InstrToInstrs *ColorOpToReachedUses, - const MapRegToId &RegToId, bool ADRPMode = false, - const MachineInstr *DummyOp = nullptr) { - // structures: - // For each basic block. - // Out: a set per color of definitions that reach the - // out boundary of this block. - // In: Same as Out but for in boundary. - // Gen: generated color in this block (one operation per color). - // Kill: register set of killed color in this block. 
- // ReachableUses: a set per color of uses (operation) reachable - // for "In" definitions. - BlockToSetOfInstrsPerColor Out, In, ReachableUses; - BlockToInstrPerColor Gen; - BlockToRegSet Kill; - - // Initialize Gen, kill and reachableUses. - initReachingDef(MF, ColorOpToReachedUses, Gen, Kill, ReachableUses, RegToId, - DummyOp, ADRPMode); - - // Algo. - if (!DummyOp) - reachingDefAlgorithm(MF, ColorOpToReachedUses, In, Out, Gen, Kill, - ReachableUses, RegToId.size()); - - // finit. - finitReachingDef(In, Out, Gen, ReachableUses); -} - -#ifndef NDEBUG -/// print the result of the reaching definition algorithm. -static void printReachingDef(const InstrToInstrs *ColorOpToReachedUses, - unsigned NbReg, const TargetRegisterInfo *TRI, - const MapIdToReg &IdToReg) { - unsigned CurReg; - for (CurReg = 0; CurReg < NbReg; ++CurReg) { - if (ColorOpToReachedUses[CurReg].empty()) - continue; - DEBUG(dbgs() << "*** Reg " << PrintReg(IdToReg[CurReg], TRI) << " ***\n"); - - for (const auto &DefsIt : ColorOpToReachedUses[CurReg]) { - DEBUG(dbgs() << "Def:\n"); - DEBUG(DefsIt.first->print(dbgs())); - DEBUG(dbgs() << "Reachable uses:\n"); - for (const MachineInstr *MI : DefsIt.second) { - DEBUG(MI->print(dbgs())); - } - } - } -} -#endif // NDEBUG - -/// Answer the following question: Can Def be one of the definition -/// involved in a part of a LOH? -static bool canDefBePartOfLOH(const MachineInstr *Def) { - unsigned Opc = Def->getOpcode(); - // Accept ADRP, ADDLow and LOADGot. - switch (Opc) { - default: - return false; - case ARM64::ADRP: - return true; - case ARM64::ADDXri: - // Check immediate to see if the immediate is an address. - switch (Def->getOperand(2).getType()) { - default: - return false; - case MachineOperand::MO_GlobalAddress: - case MachineOperand::MO_JumpTableIndex: - case MachineOperand::MO_ConstantPoolIndex: - case MachineOperand::MO_BlockAddress: - return true; - } - case ARM64::LDRXui: - // Check immediate to see if the immediate is an address. - switch (Def->getOperand(2).getType()) { - default: - return false; - case MachineOperand::MO_GlobalAddress: - return true; - } - } - // Unreachable. - return false; -} - -/// Check whether the given instruction can the end of a LOH chain involving a -/// store. -static bool isCandidateStore(const MachineInstr *Instr) { - switch (Instr->getOpcode()) { - default: - return false; - case ARM64::STRBui: - case ARM64::STRHui: - case ARM64::STRWui: - case ARM64::STRXui: - case ARM64::STRSui: - case ARM64::STRDui: - case ARM64::STRQui: - // In case we have str xA, [xA, #imm], this is two different uses - // of xA and we cannot fold, otherwise the xA stored may be wrong, - // even if #imm == 0. - if (Instr->getOperand(0).getReg() != Instr->getOperand(1).getReg()) - return true; - } - return false; -} - -/// Given the result of a reaching definition algorithm in ColorOpToReachedUses, -/// Build the Use to Defs information and filter out obvious non-LOH candidates. -/// In ADRPMode, non-LOH candidates are "uses" with non-ADRP definitions. -/// In non-ADRPMode, non-LOH candidates are "uses" with several definition, -/// i.e., no simple chain. -/// \param ADRPMode -- \see initReachingDef. 
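// For intuition, the shape of chain the filtering below tries to isolate
// (illustrative assembly; the register and symbol names are invented):
//
//     adrp x8, _sym@PAGE          ; ADRP                 -> L1
//     add  x8, x8, _sym@PAGEOFF   ; ADDXri on a global   -> L2
//     ldr  w0, [x8]               ; candidate load ending the chain
//
// When every link is reached by exactly one definition, the chain is later
// turned into a linker optimization hint (e.g. MCLOH_AdrpAddLdr) so the
// linker may relax the three-instruction sequence when the symbol turns out
// to be close enough; uses with several reaching definitions are discarded
// here as non-candidates.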
-static void reachedUsesToDefs(InstrToInstrs &UseToReachingDefs, - const InstrToInstrs *ColorOpToReachedUses, - const MapRegToId &RegToId, - bool ADRPMode = false) { - - SetOfMachineInstr NotCandidate; - unsigned NbReg = RegToId.size(); - MapRegToId::const_iterator EndIt = RegToId.end(); - for (unsigned CurReg = 0; CurReg < NbReg; ++CurReg) { - // If this color is never defined, continue. - if (ColorOpToReachedUses[CurReg].empty()) - continue; - - for (const auto &DefsIt : ColorOpToReachedUses[CurReg]) { - for (const MachineInstr *MI : DefsIt.second) { - const MachineInstr *Def = DefsIt.first; - MapRegToId::const_iterator It; - // if all the reaching defs are not adrp, this use will not be - // simplifiable. - if ((ADRPMode && Def->getOpcode() != ARM64::ADRP) || - (!ADRPMode && !canDefBePartOfLOH(Def)) || - (!ADRPMode && isCandidateStore(MI) && - // store are LOH candidate iff the end of the chain is used as - // base. - ((It = RegToId.find((MI)->getOperand(1).getReg())) == EndIt || - It->second != CurReg))) { - NotCandidate.insert(MI); - continue; - } - // Do not consider self reaching as a simplifiable case for ADRP. - if (!ADRPMode || MI != DefsIt.first) { - UseToReachingDefs[MI].insert(DefsIt.first); - // If UsesIt has several reaching definitions, it is not - // candidate for simplificaton in non-ADRPMode. - if (!ADRPMode && UseToReachingDefs[MI].size() > 1) - NotCandidate.insert(MI); - } - } - } - } - for (const MachineInstr *Elem : NotCandidate) { - DEBUG(dbgs() << "Too many reaching defs: " << *Elem << "\n"); - // It would have been better if we could just remove the entry - // from the map. Because of that, we have to filter the garbage - // (second.empty) in the subsequence analysis. - UseToReachingDefs[Elem].clear(); - } -} - -/// Based on the use to defs information (in ADRPMode), compute the -/// opportunities of LOH ADRP-related. -static void computeADRP(const InstrToInstrs &UseToDefs, - ARM64FunctionInfo &ARM64FI, - const MachineDominatorTree *MDT) { - DEBUG(dbgs() << "*** Compute LOH for ADRP\n"); - for (const auto &Entry : UseToDefs) { - unsigned Size = Entry.second.size(); - if (Size == 0) - continue; - if (Size == 1) { - const MachineInstr *L2 = *Entry.second.begin(); - const MachineInstr *L1 = Entry.first; - if (!MDT->dominates(L2, L1)) { - DEBUG(dbgs() << "Dominance check failed:\n" << *L2 << '\n' << *L1 - << '\n'); - continue; - } - DEBUG(dbgs() << "Record AdrpAdrp:\n" << *L2 << '\n' << *L1 << '\n'); - SmallVector Args; - Args.push_back(L2); - Args.push_back(L1); - ARM64FI.addLOHDirective(MCLOH_AdrpAdrp, Args); - ++NumADRPSimpleCandidate; - } -#ifdef DEBUG - else if (Size == 2) - ++NumADRPComplexCandidate2; - else if (Size == 3) - ++NumADRPComplexCandidate3; - else - ++NumADRPComplexCandidateOther; -#endif - // if Size < 1, the use should have been removed from the candidates - assert(Size >= 1 && "No reaching defs for that use!"); - } -} - -/// Check whether the given instruction can be the end of a LOH chain -/// involving a load. -static bool isCandidateLoad(const MachineInstr *Instr) { - switch (Instr->getOpcode()) { - default: - return false; - case ARM64::LDRSBWui: - case ARM64::LDRSBXui: - case ARM64::LDRSHWui: - case ARM64::LDRSHXui: - case ARM64::LDRSWui: - case ARM64::LDRBui: - case ARM64::LDRHui: - case ARM64::LDRWui: - case ARM64::LDRXui: - case ARM64::LDRSui: - case ARM64::LDRDui: - case ARM64::LDRQui: - if (Instr->getOperand(2).getTargetFlags() & ARM64II::MO_GOT) - return false; - return true; - } - // Unreachable. 
- return false; -} - -/// Check whether the given instruction can load a litteral. -static bool supportLoadFromLiteral(const MachineInstr *Instr) { - switch (Instr->getOpcode()) { - default: - return false; - case ARM64::LDRSWui: - case ARM64::LDRWui: - case ARM64::LDRXui: - case ARM64::LDRSui: - case ARM64::LDRDui: - case ARM64::LDRQui: - return true; - } - // Unreachable. - return false; -} - -/// Check whether the given instruction is a LOH candidate. -/// \param UseToDefs is used to check that Instr is at the end of LOH supported -/// chain. -/// \pre UseToDefs contains only on def per use, i.e., obvious non candidate are -/// already been filtered out. -static bool isCandidate(const MachineInstr *Instr, - const InstrToInstrs &UseToDefs, - const MachineDominatorTree *MDT) { - if (!isCandidateLoad(Instr) && !isCandidateStore(Instr)) - return false; - - const MachineInstr *Def = *UseToDefs.find(Instr)->second.begin(); - if (Def->getOpcode() != ARM64::ADRP) { - // At this point, Def is ADDXri or LDRXui of the right type of - // symbol, because we filtered out the uses that were not defined - // by these kind of instructions (+ ADRP). - - // Check if this forms a simple chain: each intermediate node must - // dominates the next one. - if (!MDT->dominates(Def, Instr)) - return false; - // Move one node up in the simple chain. - if (UseToDefs.find(Def) == - UseToDefs.end() - // The map may contain garbage we have to ignore. - || - UseToDefs.find(Def)->second.empty()) - return false; - Instr = Def; - Def = *UseToDefs.find(Def)->second.begin(); - } - // Check if we reached the top of the simple chain: - // - top is ADRP. - // - check the simple chain property: each intermediate node must - // dominates the next one. - if (Def->getOpcode() == ARM64::ADRP) - return MDT->dominates(Def, Instr); - return false; -} - -static bool registerADRCandidate(const MachineInstr &Use, - const InstrToInstrs &UseToDefs, - const InstrToInstrs *DefsPerColorToUses, - ARM64FunctionInfo &ARM64FI, - SetOfMachineInstr *InvolvedInLOHs, - const MapRegToId &RegToId) { - // Look for opportunities to turn ADRP -> ADD or - // ADRP -> LDR GOTPAGEOFF into ADR. - // If ADRP has more than one use. Give up. - if (Use.getOpcode() != ARM64::ADDXri && - (Use.getOpcode() != ARM64::LDRXui || - !(Use.getOperand(2).getTargetFlags() & ARM64II::MO_GOT))) - return false; - InstrToInstrs::const_iterator It = UseToDefs.find(&Use); - // The map may contain garbage that we need to ignore. - if (It == UseToDefs.end() || It->second.empty()) - return false; - const MachineInstr &Def = **It->second.begin(); - if (Def.getOpcode() != ARM64::ADRP) - return false; - // Check the number of users of ADRP. - const SetOfMachineInstr *Users = - getUses(DefsPerColorToUses, - RegToId.find(Def.getOperand(0).getReg())->second, Def); - if (Users->size() > 1) { - ++NumADRComplexCandidate; - return false; - } - ++NumADRSimpleCandidate; - assert((!InvolvedInLOHs || InvolvedInLOHs->insert(&Def)) && - "ADRP already involved in LOH."); - assert((!InvolvedInLOHs || InvolvedInLOHs->insert(&Use)) && - "ADD already involved in LOH."); - DEBUG(dbgs() << "Record AdrpAdd\n" << Def << '\n' << Use << '\n'); - - SmallVector Args; - Args.push_back(&Def); - Args.push_back(&Use); - - ARM64FI.addLOHDirective(Use.getOpcode() == ARM64::ADDXri ? 
MCLOH_AdrpAdd - : MCLOH_AdrpLdrGot, - Args); - return true; -} - -/// Based on the use to defs information (in non-ADRPMode), compute the -/// opportunities of LOH non-ADRP-related -static void computeOthers(const InstrToInstrs &UseToDefs, - const InstrToInstrs *DefsPerColorToUses, - ARM64FunctionInfo &ARM64FI, const MapRegToId &RegToId, - const MachineDominatorTree *MDT) { - SetOfMachineInstr *InvolvedInLOHs = nullptr; -#ifdef DEBUG - SetOfMachineInstr InvolvedInLOHsStorage; - InvolvedInLOHs = &InvolvedInLOHsStorage; -#endif // DEBUG - DEBUG(dbgs() << "*** Compute LOH for Others\n"); - // ADRP -> ADD/LDR -> LDR/STR pattern. - // Fall back to ADRP -> ADD pattern if we fail to catch the bigger pattern. - - // FIXME: When the statistics are not important, - // This initial filtering loop can be merged into the next loop. - // Currently, we didn't do it to have the same code for both DEBUG and - // NDEBUG builds. Indeed, the iterator of the second loop would need - // to be changed. - SetOfMachineInstr PotentialCandidates; - SetOfMachineInstr PotentialADROpportunities; - for (auto &Use : UseToDefs) { - // If no definition is available, this is a non candidate. - if (Use.second.empty()) - continue; - // Keep only instructions that are load or store and at the end of - // a ADRP -> ADD/LDR/Nothing chain. - // We already filtered out the no-chain cases. - if (!isCandidate(Use.first, UseToDefs, MDT)) { - PotentialADROpportunities.insert(Use.first); - continue; - } - PotentialCandidates.insert(Use.first); - } - - // Make the following distinctions for statistics as the linker does - // know how to decode instructions: - // - ADD/LDR/Nothing make there different patterns. - // - LDR/STR make two different patterns. - // Hence, 6 - 1 base patterns. - // (because ADRP-> Nothing -> STR is not simplifiable) - - // The linker is only able to have a simple semantic, i.e., if pattern A - // do B. - // However, we want to see the opportunity we may miss if we were able to - // catch more complex cases. - - // PotentialCandidates are result of a chain ADRP -> ADD/LDR -> - // A potential candidate becomes a candidate, if its current immediate - // operand is zero and all nodes of the chain have respectively only one user -#ifdef DEBUG - SetOfMachineInstr DefsOfPotentialCandidates; -#endif - for (const MachineInstr *Candidate : PotentialCandidates) { - // Get the definition of the candidate i.e., ADD or LDR. - const MachineInstr *Def = *UseToDefs.find(Candidate)->second.begin(); - // Record the elements of the chain. - const MachineInstr *L1 = Def; - const MachineInstr *L2 = nullptr; - unsigned ImmediateDefOpc = Def->getOpcode(); - if (Def->getOpcode() != ARM64::ADRP) { - // Check the number of users of this node. - const SetOfMachineInstr *Users = - getUses(DefsPerColorToUses, - RegToId.find(Def->getOperand(0).getReg())->second, *Def); - if (Users->size() > 1) { -#ifdef DEBUG - // if all the uses of this def are in potential candidate, this is - // a complex candidate of level 2. - bool IsLevel2 = true; - for (const MachineInstr *MI : *Users) { - if (!PotentialCandidates.count(MI)) { - ++NumTooCplxLvl2; - IsLevel2 = false; - break; - } - } - if (IsLevel2) - ++NumCplxLvl2; -#endif // DEBUG - PotentialADROpportunities.insert(Def); - continue; - } - L2 = Def; - Def = *UseToDefs.find(Def)->second.begin(); - L1 = Def; - } // else the element in the middle of the chain is nothing, thus - // Def already contains the first element of the chain. 
- - // Check the number of users of the first node in the chain, i.e., ADRP - const SetOfMachineInstr *Users = - getUses(DefsPerColorToUses, - RegToId.find(Def->getOperand(0).getReg())->second, *Def); - if (Users->size() > 1) { -#ifdef DEBUG - // if all the uses of this def are in the defs of the potential candidate, - // this is a complex candidate of level 1 - if (DefsOfPotentialCandidates.empty()) { - // lazy init - DefsOfPotentialCandidates = PotentialCandidates; - for (const MachineInstr *Candidate : PotentialCandidates) { - if (!UseToDefs.find(Candidate)->second.empty()) - DefsOfPotentialCandidates.insert( - *UseToDefs.find(Candidate)->second.begin()); - } - } - bool Found = false; - for (auto &Use : *Users) { - if (!DefsOfPotentialCandidates.count(Use)) { - ++NumTooCplxLvl1; - Found = true; - break; - } - } - if (!Found) - ++NumCplxLvl1; -#endif // DEBUG - continue; - } - - bool IsL2Add = (ImmediateDefOpc == ARM64::ADDXri); - // If the chain is three instructions long and ldr is the second element, - // then this ldr must load form GOT, otherwise this is not a correct chain. - if (L2 && !IsL2Add && L2->getOperand(2).getTargetFlags() != ARM64II::MO_GOT) - continue; - SmallVector Args; - MCLOHType Kind; - if (isCandidateLoad(Candidate)) { - if (!L2) { - // At this point, the candidate LOH indicates that the ldr instruction - // may use a direct access to the symbol. There is not such encoding - // for loads of byte and half. - if (!supportLoadFromLiteral(Candidate)) - continue; - - DEBUG(dbgs() << "Record AdrpLdr:\n" << *L1 << '\n' << *Candidate - << '\n'); - Kind = MCLOH_AdrpLdr; - Args.push_back(L1); - Args.push_back(Candidate); - assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L1)) && - "L1 already involved in LOH."); - assert((!InvolvedInLOHs || InvolvedInLOHs->insert(Candidate)) && - "Candidate already involved in LOH."); - ++NumADRPToLDR; - } else { - DEBUG(dbgs() << "Record Adrp" << (IsL2Add ? "Add" : "LdrGot") - << "Ldr:\n" << *L1 << '\n' << *L2 << '\n' << *Candidate - << '\n'); - - Kind = IsL2Add ? MCLOH_AdrpAddLdr : MCLOH_AdrpLdrGotLdr; - Args.push_back(L1); - Args.push_back(L2); - Args.push_back(Candidate); - - PotentialADROpportunities.remove(L2); - assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L1)) && - "L1 already involved in LOH."); - assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L2)) && - "L2 already involved in LOH."); - assert((!InvolvedInLOHs || InvolvedInLOHs->insert(Candidate)) && - "Candidate already involved in LOH."); -#ifdef DEBUG - // get the immediate of the load - if (Candidate->getOperand(2).getImm() == 0) - if (ImmediateDefOpc == ARM64::ADDXri) - ++NumADDToLDR; - else - ++NumLDRToLDR; - else if (ImmediateDefOpc == ARM64::ADDXri) - ++NumADDToLDRWithImm; - else - ++NumLDRToLDRWithImm; -#endif // DEBUG - } - } else { - if (ImmediateDefOpc == ARM64::ADRP) - continue; - else { - - DEBUG(dbgs() << "Record Adrp" << (IsL2Add ? "Add" : "LdrGot") - << "Str:\n" << *L1 << '\n' << *L2 << '\n' << *Candidate - << '\n'); - - Kind = IsL2Add ? 
MCLOH_AdrpAddStr : MCLOH_AdrpLdrGotStr; - Args.push_back(L1); - Args.push_back(L2); - Args.push_back(Candidate); - - PotentialADROpportunities.remove(L2); - assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L1)) && - "L1 already involved in LOH."); - assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L2)) && - "L2 already involved in LOH."); - assert((!InvolvedInLOHs || InvolvedInLOHs->insert(Candidate)) && - "Candidate already involved in LOH."); -#ifdef DEBUG - // get the immediate of the store - if (Candidate->getOperand(2).getImm() == 0) - if (ImmediateDefOpc == ARM64::ADDXri) - ++NumADDToSTR; - else - ++NumLDRToSTR; - else if (ImmediateDefOpc == ARM64::ADDXri) - ++NumADDToSTRWithImm; - else - ++NumLDRToSTRWithImm; -#endif // DEBUG - } - } - ARM64FI.addLOHDirective(Kind, Args); - } - - // Now, we grabbed all the big patterns, check ADR opportunities. - for (const MachineInstr *Candidate : PotentialADROpportunities) - registerADRCandidate(*Candidate, UseToDefs, DefsPerColorToUses, ARM64FI, - InvolvedInLOHs, RegToId); -} - -/// Look for every register defined by potential LOHs candidates. -/// Map these registers with dense id in @p RegToId and vice-versa in -/// @p IdToReg. @p IdToReg is populated only in DEBUG mode. -static void collectInvolvedReg(MachineFunction &MF, MapRegToId &RegToId, - MapIdToReg &IdToReg, - const TargetRegisterInfo *TRI) { - unsigned CurRegId = 0; - if (!PreCollectRegister) { - unsigned NbReg = TRI->getNumRegs(); - for (; CurRegId < NbReg; ++CurRegId) { - RegToId[CurRegId] = CurRegId; - DEBUG(IdToReg.push_back(CurRegId)); - DEBUG(assert(IdToReg[CurRegId] == CurRegId && "Reg index mismatches")); - } - return; - } - - DEBUG(dbgs() << "** Collect Involved Register\n"); - for (const auto &MBB : MF) { - for (const MachineInstr &MI : MBB) { - if (!canDefBePartOfLOH(&MI)) - continue; - - // Process defs - for (MachineInstr::const_mop_iterator IO = MI.operands_begin(), - IOEnd = MI.operands_end(); - IO != IOEnd; ++IO) { - if (!IO->isReg() || !IO->isDef()) - continue; - unsigned CurReg = IO->getReg(); - for (MCRegAliasIterator AI(CurReg, TRI, true); AI.isValid(); ++AI) - if (RegToId.find(*AI) == RegToId.end()) { - DEBUG(IdToReg.push_back(*AI); - assert(IdToReg[CurRegId] == *AI && - "Reg index mismatches insertion index.")); - RegToId[*AI] = CurRegId++; - DEBUG(dbgs() << "Register: " << PrintReg(*AI, TRI) << '\n'); - } - } - } - } -} - -bool ARM64CollectLOH::runOnMachineFunction(MachineFunction &MF) { - const TargetMachine &TM = MF.getTarget(); - const TargetRegisterInfo *TRI = TM.getRegisterInfo(); - const MachineDominatorTree *MDT = &getAnalysis(); - - MapRegToId RegToId; - MapIdToReg IdToReg; - ARM64FunctionInfo *ARM64FI = MF.getInfo(); - assert(ARM64FI && "No MachineFunctionInfo for this function!"); - - DEBUG(dbgs() << "Looking for LOH in " << MF.getName() << '\n'); - - collectInvolvedReg(MF, RegToId, IdToReg, TRI); - if (RegToId.empty()) - return false; - - MachineInstr *DummyOp = nullptr; - if (BasicBlockScopeOnly) { - const ARM64InstrInfo *TII = - static_cast(TM.getInstrInfo()); - // For local analysis, create a dummy operation to record uses that are not - // local. - DummyOp = MF.CreateMachineInstr(TII->get(ARM64::COPY), DebugLoc()); - } - - unsigned NbReg = RegToId.size(); - bool Modified = false; - - // Start with ADRP. - InstrToInstrs *ColorOpToReachedUses = new InstrToInstrs[NbReg]; - - // Compute the reaching def in ADRP mode, meaning ADRP definitions - // are first considered as uses. 
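// (Treating each ADRP definition as a "use" is what lets this first analysis
// chain ADRP instructions together: an ADRP whose only reaching definition is
// another, dominating ADRP becomes an MCLOH_AdrpAdrp pair in computeADRP(),
// and the linker may then fold the second page computation into the first
// when both resolve to the same page.)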
- reachingDef(MF, ColorOpToReachedUses, RegToId, true, DummyOp); - DEBUG(dbgs() << "ADRP reaching defs\n"); - DEBUG(printReachingDef(ColorOpToReachedUses, NbReg, TRI, IdToReg)); - - // Translate the definition to uses map into a use to definitions map to ease - // statistic computation. - InstrToInstrs ADRPToReachingDefs; - reachedUsesToDefs(ADRPToReachingDefs, ColorOpToReachedUses, RegToId, true); - - // Compute LOH for ADRP. - computeADRP(ADRPToReachingDefs, *ARM64FI, MDT); - delete[] ColorOpToReachedUses; - - // Continue with general ADRP -> ADD/LDR -> LDR/STR pattern. - ColorOpToReachedUses = new InstrToInstrs[NbReg]; - - // first perform a regular reaching def analysis. - reachingDef(MF, ColorOpToReachedUses, RegToId, false, DummyOp); - DEBUG(dbgs() << "All reaching defs\n"); - DEBUG(printReachingDef(ColorOpToReachedUses, NbReg, TRI, IdToReg)); - - // Turn that into a use to defs to ease statistic computation. - InstrToInstrs UsesToReachingDefs; - reachedUsesToDefs(UsesToReachingDefs, ColorOpToReachedUses, RegToId, false); - - // Compute other than AdrpAdrp LOH. - computeOthers(UsesToReachingDefs, ColorOpToReachedUses, *ARM64FI, RegToId, - MDT); - delete[] ColorOpToReachedUses; - - if (BasicBlockScopeOnly) - MF.DeleteMachineInstr(DummyOp); - - return Modified; -} - -/// createARM64CollectLOHPass - returns an instance of the Statistic for -/// linker optimization pass. -FunctionPass *llvm::createARM64CollectLOHPass() { - return new ARM64CollectLOH(); -} diff --git a/lib/Target/ARM64/ARM64ConditionalCompares.cpp b/lib/Target/ARM64/ARM64ConditionalCompares.cpp deleted file mode 100644 index 2243cce51a1..00000000000 --- a/lib/Target/ARM64/ARM64ConditionalCompares.cpp +++ /dev/null @@ -1,919 +0,0 @@ -//===-- ARM64ConditionalCompares.cpp --- CCMP formation for ARM64 ---------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements the ARM64ConditionalCompares pass which reduces -// branching and code size by using the conditional compare instructions CCMP, -// CCMN, and FCMP. -// -// The CFG transformations for forming conditional compares are very similar to -// if-conversion, and this pass should run immediately before the early -// if-conversion pass. -// -//===----------------------------------------------------------------------===// - -#include "ARM64.h" -#include "llvm/ADT/BitVector.h" -#include "llvm/ADT/DepthFirstIterator.h" -#include "llvm/ADT/SetVector.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/SparseSet.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/CodeGen/MachineBranchProbabilityInfo.h" -#include "llvm/CodeGen/MachineDominators.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineLoopInfo.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/MachineTraceMetrics.h" -#include "llvm/CodeGen/Passes.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetRegisterInfo.h" -#include "llvm/Target/TargetSubtargetInfo.h" - -using namespace llvm; - -#define DEBUG_TYPE "arm64-ccmp" - -// Absolute maximum number of instructions allowed per speculated block. 
-// This bypasses all other heuristics, so it should be set fairly high. -static cl::opt BlockInstrLimit( - "arm64-ccmp-limit", cl::init(30), cl::Hidden, - cl::desc("Maximum number of instructions per speculated block.")); - -// Stress testing mode - disable heuristics. -static cl::opt Stress("arm64-stress-ccmp", cl::Hidden, - cl::desc("Turn all knobs to 11")); - -STATISTIC(NumConsidered, "Number of ccmps considered"); -STATISTIC(NumPhiRejs, "Number of ccmps rejected (PHI)"); -STATISTIC(NumPhysRejs, "Number of ccmps rejected (Physregs)"); -STATISTIC(NumPhi2Rejs, "Number of ccmps rejected (PHI2)"); -STATISTIC(NumHeadBranchRejs, "Number of ccmps rejected (Head branch)"); -STATISTIC(NumCmpBranchRejs, "Number of ccmps rejected (CmpBB branch)"); -STATISTIC(NumCmpTermRejs, "Number of ccmps rejected (CmpBB is cbz...)"); -STATISTIC(NumImmRangeRejs, "Number of ccmps rejected (Imm out of range)"); -STATISTIC(NumLiveDstRejs, "Number of ccmps rejected (Cmp dest live)"); -STATISTIC(NumMultNZCVUses, "Number of ccmps rejected (NZCV used)"); -STATISTIC(NumUnknNZCVDefs, "Number of ccmps rejected (NZCV def unknown)"); - -STATISTIC(NumSpeculateRejs, "Number of ccmps rejected (Can't speculate)"); - -STATISTIC(NumConverted, "Number of ccmp instructions created"); -STATISTIC(NumCompBranches, "Number of cbz/cbnz branches converted"); - -//===----------------------------------------------------------------------===// -// SSACCmpConv -//===----------------------------------------------------------------------===// -// -// The SSACCmpConv class performs ccmp-conversion on SSA form machine code -// after determining if it is possible. The class contains no heuristics; -// external code should be used to determine when ccmp-conversion is a good -// idea. -// -// CCmp-formation works on a CFG representing chained conditions, typically -// from C's short-circuit || and && operators: -// -// From: Head To: Head -// / | CmpBB -// / | / | -// | CmpBB / | -// | / | Tail | -// | / | | | -// Tail | | | -// | | | | -// ... ... ... ... -// -// The Head block is terminated by a br.cond instruction, and the CmpBB block -// contains compare + br.cond. Tail must be a successor of both. -// -// The cmp-conversion turns the compare instruction in CmpBB into a conditional -// compare, and merges CmpBB into Head, speculatively executing its -// instructions. The ARM64 conditional compare instructions have an immediate -// operand that specifies the NZCV flag values when the condition is false and -// the compare isn't executed. This makes it possible to chain compares with -// different condition codes. -// -// Example: -// -// if (a == 5 || b == 17) -// foo(); -// -// Head: -// cmp w0, #5 -// b.eq Tail -// CmpBB: -// cmp w1, #17 -// b.eq Tail -// ... -// Tail: -// bl _foo -// -// Becomes: -// -// Head: -// cmp w0, #5 -// ccmp w1, #17, 4, ne ; 4 = nZcv -// b.eq Tail -// ... -// Tail: -// bl _foo -// -// The ccmp condition code is the one that would cause the Head terminator to -// branch to CmpBB. -// -// FIXME: It should also be possible to speculate a block on the critical edge -// between Head and Tail, just like if-converting a diamond. -// -// FIXME: Handle PHIs in Tail by turning them into selects (if-conversion). - -namespace { -class SSACCmpConv { - MachineFunction *MF; - const TargetInstrInfo *TII; - const TargetRegisterInfo *TRI; - MachineRegisterInfo *MRI; - -public: - /// The first block containing a conditional branch, dominating everything - /// else. 
- MachineBasicBlock *Head; - - /// The block containing cmp+br.cond with a successor shared with Head. - MachineBasicBlock *CmpBB; - - /// The common successor for Head and CmpBB. - MachineBasicBlock *Tail; - - /// The compare instruction in CmpBB that can be converted to a ccmp. - MachineInstr *CmpMI; - -private: - /// The branch condition in Head as determined by AnalyzeBranch. - SmallVector HeadCond; - - /// The condition code that makes Head branch to CmpBB. - ARM64CC::CondCode HeadCmpBBCC; - - /// The branch condition in CmpBB. - SmallVector CmpBBCond; - - /// The condition code that makes CmpBB branch to Tail. - ARM64CC::CondCode CmpBBTailCC; - - /// Check if the Tail PHIs are trivially convertible. - bool trivialTailPHIs(); - - /// Remove CmpBB from the Tail PHIs. - void updateTailPHIs(); - - /// Check if an operand defining DstReg is dead. - bool isDeadDef(unsigned DstReg); - - /// Find the compare instruction in MBB that controls the conditional branch. - /// Return NULL if a convertible instruction can't be found. - MachineInstr *findConvertibleCompare(MachineBasicBlock *MBB); - - /// Return true if all non-terminator instructions in MBB can be safely - /// speculated. - bool canSpeculateInstrs(MachineBasicBlock *MBB, const MachineInstr *CmpMI); - -public: - /// runOnMachineFunction - Initialize per-function data structures. - void runOnMachineFunction(MachineFunction &MF) { - this->MF = &MF; - TII = MF.getTarget().getInstrInfo(); - TRI = MF.getTarget().getRegisterInfo(); - MRI = &MF.getRegInfo(); - } - - /// If the sub-CFG headed by MBB can be cmp-converted, initialize the - /// internal state, and return true. - bool canConvert(MachineBasicBlock *MBB); - - /// Cmo-convert the last block passed to canConvertCmp(), assuming - /// it is possible. Add any erased blocks to RemovedBlocks. - void convert(SmallVectorImpl &RemovedBlocks); - - /// Return the expected code size delta if the conversion into a - /// conditional compare is performed. - int expectedCodeSizeDelta() const; -}; -} // end anonymous namespace - -// Check that all PHIs in Tail are selecting the same value from Head and CmpBB. -// This means that no if-conversion is required when merging CmpBB into Head. -bool SSACCmpConv::trivialTailPHIs() { - for (auto &I : *Tail) { - if (!I.isPHI()) - break; - unsigned HeadReg = 0, CmpBBReg = 0; - // PHI operands come in (VReg, MBB) pairs. - for (unsigned oi = 1, oe = I.getNumOperands(); oi != oe; oi += 2) { - MachineBasicBlock *MBB = I.getOperand(oi + 1).getMBB(); - unsigned Reg = I.getOperand(oi).getReg(); - if (MBB == Head) { - assert((!HeadReg || HeadReg == Reg) && "Inconsistent PHI operands"); - HeadReg = Reg; - } - if (MBB == CmpBB) { - assert((!CmpBBReg || CmpBBReg == Reg) && "Inconsistent PHI operands"); - CmpBBReg = Reg; - } - } - if (HeadReg != CmpBBReg) - return false; - } - return true; -} - -// Assuming that trivialTailPHIs() is true, update the Tail PHIs by simply -// removing the CmpBB operands. The Head operands will be identical. -void SSACCmpConv::updateTailPHIs() { - for (auto &I : *Tail) { - if (!I.isPHI()) - break; - // I is a PHI. It can have multiple entries for CmpBB. - for (unsigned oi = I.getNumOperands(); oi > 2; oi -= 2) { - // PHI operands are (Reg, MBB) at (oi-2, oi-1). - if (I.getOperand(oi - 1).getMBB() == CmpBB) { - I.RemoveOperand(oi - 1); - I.RemoveOperand(oi - 2); - } - } - } -} - -// This pass runs before the ARM64DeadRegisterDefinitions pass, so compares are -// still writing virtual registers without any uses. 
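// A note on the branch-condition encoding consumed by parseCond() below (the
// layout is inferred from that code and from how AnalyzeBranch is used here,
// so treat the exact format as an assumption): a plain b.cond is described by
// a single operand carrying the condition code, e.g. Cond = { NE }, while the
// compare-and-branch forms arrive as three operands, e.g.
// Cond = { -1, CBNZW, Rn }; parseCond() maps CBZ*/CBNZ* to EQ/NE so that the
// branch can later be rewritten as a ccmp against #0 plus a b.cond.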
-bool SSACCmpConv::isDeadDef(unsigned DstReg) { - // Writes to the zero register are dead. - if (DstReg == ARM64::WZR || DstReg == ARM64::XZR) - return true; - if (!TargetRegisterInfo::isVirtualRegister(DstReg)) - return false; - // A virtual register def without any uses will be marked dead later, and - // eventually replaced by the zero register. - return MRI->use_nodbg_empty(DstReg); -} - -// Parse a condition code returned by AnalyzeBranch, and compute the CondCode -// corresponding to TBB. -// Return -static bool parseCond(ArrayRef Cond, ARM64CC::CondCode &CC) { - // A normal br.cond simply has the condition code. - if (Cond[0].getImm() != -1) { - assert(Cond.size() == 1 && "Unknown Cond array format"); - CC = (ARM64CC::CondCode)(int)Cond[0].getImm(); - return true; - } - // For tbz and cbz instruction, the opcode is next. - switch (Cond[1].getImm()) { - default: - // This includes tbz / tbnz branches which can't be converted to - // ccmp + br.cond. - return false; - case ARM64::CBZW: - case ARM64::CBZX: - assert(Cond.size() == 3 && "Unknown Cond array format"); - CC = ARM64CC::EQ; - return true; - case ARM64::CBNZW: - case ARM64::CBNZX: - assert(Cond.size() == 3 && "Unknown Cond array format"); - CC = ARM64CC::NE; - return true; - } -} - -MachineInstr *SSACCmpConv::findConvertibleCompare(MachineBasicBlock *MBB) { - MachineBasicBlock::iterator I = MBB->getFirstTerminator(); - if (I == MBB->end()) - return nullptr; - // The terminator must be controlled by the flags. - if (!I->readsRegister(ARM64::NZCV)) { - switch (I->getOpcode()) { - case ARM64::CBZW: - case ARM64::CBZX: - case ARM64::CBNZW: - case ARM64::CBNZX: - // These can be converted into a ccmp against #0. - return I; - } - ++NumCmpTermRejs; - DEBUG(dbgs() << "Flags not used by terminator: " << *I); - return nullptr; - } - - // Now find the instruction controlling the terminator. - for (MachineBasicBlock::iterator B = MBB->begin(); I != B;) { - --I; - assert(!I->isTerminator() && "Spurious terminator"); - switch (I->getOpcode()) { - // cmp is an alias for subs with a dead destination register. - case ARM64::SUBSWri: - case ARM64::SUBSXri: - // cmn is an alias for adds with a dead destination register. - case ARM64::ADDSWri: - case ARM64::ADDSXri: - // Check that the immediate operand is within range, ccmp wants a uimm5. - // Rd = SUBSri Rn, imm, shift - if (I->getOperand(3).getImm() || !isUInt<5>(I->getOperand(2).getImm())) { - DEBUG(dbgs() << "Immediate out of range for ccmp: " << *I); - ++NumImmRangeRejs; - return nullptr; - } - // Fall through. - case ARM64::SUBSWrr: - case ARM64::SUBSXrr: - case ARM64::ADDSWrr: - case ARM64::ADDSXrr: - if (isDeadDef(I->getOperand(0).getReg())) - return I; - DEBUG(dbgs() << "Can't convert compare with live destination: " << *I); - ++NumLiveDstRejs; - return nullptr; - case ARM64::FCMPSrr: - case ARM64::FCMPDrr: - case ARM64::FCMPESrr: - case ARM64::FCMPEDrr: - return I; - } - - // Check for flag reads and clobbers. - MIOperands::PhysRegInfo PRI = - MIOperands(I).analyzePhysReg(ARM64::NZCV, TRI); - - if (PRI.Reads) { - // The ccmp doesn't produce exactly the same flags as the original - // compare, so reject the transform if there are uses of the flags - // besides the terminators. 
- DEBUG(dbgs() << "Can't create ccmp with multiple uses: " << *I); - ++NumMultNZCVUses; - return nullptr; - } - - if (PRI.Clobbers) { - DEBUG(dbgs() << "Not convertible compare: " << *I); - ++NumUnknNZCVDefs; - return nullptr; - } - } - DEBUG(dbgs() << "Flags not defined in BB#" << MBB->getNumber() << '\n'); - return nullptr; -} - -/// Determine if all the instructions in MBB can safely -/// be speculated. The terminators are not considered. -/// -/// Only CmpMI is allowed to clobber the flags. -/// -bool SSACCmpConv::canSpeculateInstrs(MachineBasicBlock *MBB, - const MachineInstr *CmpMI) { - // Reject any live-in physregs. It's probably NZCV/EFLAGS, and very hard to - // get right. - if (!MBB->livein_empty()) { - DEBUG(dbgs() << "BB#" << MBB->getNumber() << " has live-ins.\n"); - return false; - } - - unsigned InstrCount = 0; - - // Check all instructions, except the terminators. It is assumed that - // terminators never have side effects or define any used register values. - for (auto &I : make_range(MBB->begin(), MBB->getFirstTerminator())) { - if (I.isDebugValue()) - continue; - - if (++InstrCount > BlockInstrLimit && !Stress) { - DEBUG(dbgs() << "BB#" << MBB->getNumber() << " has more than " - << BlockInstrLimit << " instructions.\n"); - return false; - } - - // There shouldn't normally be any phis in a single-predecessor block. - if (I.isPHI()) { - DEBUG(dbgs() << "Can't hoist: " << I); - return false; - } - - // Don't speculate loads. Note that it may be possible and desirable to - // speculate GOT or constant pool loads that are guaranteed not to trap, - // but we don't support that for now. - if (I.mayLoad()) { - DEBUG(dbgs() << "Won't speculate load: " << I); - return false; - } - - // We never speculate stores, so an AA pointer isn't necessary. - bool DontMoveAcrossStore = true; - if (!I.isSafeToMove(TII, nullptr, DontMoveAcrossStore)) { - DEBUG(dbgs() << "Can't speculate: " << I); - return false; - } - - // Only CmpMI is allowed to clobber the flags. - if (&I != CmpMI && I.modifiesRegister(ARM64::NZCV, TRI)) { - DEBUG(dbgs() << "Clobbers flags: " << I); - return false; - } - } - return true; -} - -/// Analyze the sub-cfg rooted in MBB, and return true if it is a potential -/// candidate for cmp-conversion. Fill out the internal state. -/// -bool SSACCmpConv::canConvert(MachineBasicBlock *MBB) { - Head = MBB; - Tail = CmpBB = nullptr; - - if (Head->succ_size() != 2) - return false; - MachineBasicBlock *Succ0 = Head->succ_begin()[0]; - MachineBasicBlock *Succ1 = Head->succ_begin()[1]; - - // CmpBB can only have a single predecessor. Tail is allowed many. - if (Succ0->pred_size() != 1) - std::swap(Succ0, Succ1); - - // Succ0 is our candidate for CmpBB. - if (Succ0->pred_size() != 1 || Succ0->succ_size() != 2) - return false; - - CmpBB = Succ0; - Tail = Succ1; - - if (!CmpBB->isSuccessor(Tail)) - return false; - - // The CFG topology checks out. - DEBUG(dbgs() << "\nTriangle: BB#" << Head->getNumber() << " -> BB#" - << CmpBB->getNumber() << " -> BB#" << Tail->getNumber() << '\n'); - ++NumConsidered; - - // Tail is allowed to have many predecessors, but we can't handle PHIs yet. - // - // FIXME: Real PHIs could be if-converted as long as the CmpBB values are - // defined before The CmpBB cmp clobbers the flags. Alternatively, it should - // always be safe to sink the ccmp down to immediately before the CmpBB - // terminators. 
- if (!trivialTailPHIs()) { - DEBUG(dbgs() << "Can't handle phis in Tail.\n"); - ++NumPhiRejs; - return false; - } - - if (!Tail->livein_empty()) { - DEBUG(dbgs() << "Can't handle live-in physregs in Tail.\n"); - ++NumPhysRejs; - return false; - } - - // CmpBB should never have PHIs since Head is its only predecessor. - // FIXME: Clean them up if it happens. - if (!CmpBB->empty() && CmpBB->front().isPHI()) { - DEBUG(dbgs() << "Can't handle phis in CmpBB.\n"); - ++NumPhi2Rejs; - return false; - } - - if (!CmpBB->livein_empty()) { - DEBUG(dbgs() << "Can't handle live-in physregs in CmpBB.\n"); - ++NumPhysRejs; - return false; - } - - // The branch we're looking to eliminate must be analyzable. - HeadCond.clear(); - MachineBasicBlock *TBB = nullptr, *FBB = nullptr; - if (TII->AnalyzeBranch(*Head, TBB, FBB, HeadCond)) { - DEBUG(dbgs() << "Head branch not analyzable.\n"); - ++NumHeadBranchRejs; - return false; - } - - // This is weird, probably some sort of degenerate CFG, or an edge to a - // landing pad. - if (!TBB || HeadCond.empty()) { - DEBUG(dbgs() << "AnalyzeBranch didn't find conditional branch in Head.\n"); - ++NumHeadBranchRejs; - return false; - } - - if (!parseCond(HeadCond, HeadCmpBBCC)) { - DEBUG(dbgs() << "Unsupported branch type on Head\n"); - ++NumHeadBranchRejs; - return false; - } - - // Make sure the branch direction is right. - if (TBB != CmpBB) { - assert(TBB == Tail && "Unexpected TBB"); - HeadCmpBBCC = ARM64CC::getInvertedCondCode(HeadCmpBBCC); - } - - CmpBBCond.clear(); - TBB = FBB = nullptr; - if (TII->AnalyzeBranch(*CmpBB, TBB, FBB, CmpBBCond)) { - DEBUG(dbgs() << "CmpBB branch not analyzable.\n"); - ++NumCmpBranchRejs; - return false; - } - - if (!TBB || CmpBBCond.empty()) { - DEBUG(dbgs() << "AnalyzeBranch didn't find conditional branch in CmpBB.\n"); - ++NumCmpBranchRejs; - return false; - } - - if (!parseCond(CmpBBCond, CmpBBTailCC)) { - DEBUG(dbgs() << "Unsupported branch type on CmpBB\n"); - ++NumCmpBranchRejs; - return false; - } - - if (TBB != Tail) - CmpBBTailCC = ARM64CC::getInvertedCondCode(CmpBBTailCC); - - DEBUG(dbgs() << "Head->CmpBB on " << ARM64CC::getCondCodeName(HeadCmpBBCC) - << ", CmpBB->Tail on " << ARM64CC::getCondCodeName(CmpBBTailCC) - << '\n'); - - CmpMI = findConvertibleCompare(CmpBB); - if (!CmpMI) - return false; - - if (!canSpeculateInstrs(CmpBB, CmpMI)) { - ++NumSpeculateRejs; - return false; - } - return true; -} - -void SSACCmpConv::convert(SmallVectorImpl &RemovedBlocks) { - DEBUG(dbgs() << "Merging BB#" << CmpBB->getNumber() << " into BB#" - << Head->getNumber() << ":\n" << *CmpBB); - - // All CmpBB instructions are moved into Head, and CmpBB is deleted. - // Update the CFG first. - updateTailPHIs(); - Head->removeSuccessor(CmpBB); - CmpBB->removeSuccessor(Tail); - Head->transferSuccessorsAndUpdatePHIs(CmpBB); - DebugLoc TermDL = Head->getFirstTerminator()->getDebugLoc(); - TII->RemoveBranch(*Head); - - // If the Head terminator was one of the cbz / tbz branches with built-in - // compare, we need to insert an explicit compare instruction in its place. - if (HeadCond[0].getImm() == -1) { - ++NumCompBranches; - unsigned Opc = 0; - switch (HeadCond[1].getImm()) { - case ARM64::CBZW: - case ARM64::CBNZW: - Opc = ARM64::SUBSWri; - break; - case ARM64::CBZX: - case ARM64::CBNZX: - Opc = ARM64::SUBSXri; - break; - default: - llvm_unreachable("Cannot convert Head branch"); - } - const MCInstrDesc &MCID = TII->get(Opc); - // Create a dummy virtual register for the SUBS def. 
- unsigned DestReg = - MRI->createVirtualRegister(TII->getRegClass(MCID, 0, TRI, *MF)); - // Insert a SUBS Rn, #0 instruction instead of the cbz / cbnz. - BuildMI(*Head, Head->end(), TermDL, MCID) - .addReg(DestReg, RegState::Define | RegState::Dead) - .addOperand(HeadCond[2]) - .addImm(0) - .addImm(0); - // SUBS uses the GPR*sp register classes. - MRI->constrainRegClass(HeadCond[2].getReg(), - TII->getRegClass(MCID, 1, TRI, *MF)); - } - - Head->splice(Head->end(), CmpBB, CmpBB->begin(), CmpBB->end()); - - // Now replace CmpMI with a ccmp instruction that also considers the incoming - // flags. - unsigned Opc = 0; - unsigned FirstOp = 1; // First CmpMI operand to copy. - bool isZBranch = false; // CmpMI is a cbz/cbnz instruction. - switch (CmpMI->getOpcode()) { - default: - llvm_unreachable("Unknown compare opcode"); - case ARM64::SUBSWri: Opc = ARM64::CCMPWi; break; - case ARM64::SUBSWrr: Opc = ARM64::CCMPWr; break; - case ARM64::SUBSXri: Opc = ARM64::CCMPXi; break; - case ARM64::SUBSXrr: Opc = ARM64::CCMPXr; break; - case ARM64::ADDSWri: Opc = ARM64::CCMNWi; break; - case ARM64::ADDSWrr: Opc = ARM64::CCMNWr; break; - case ARM64::ADDSXri: Opc = ARM64::CCMNXi; break; - case ARM64::ADDSXrr: Opc = ARM64::CCMNXr; break; - case ARM64::FCMPSrr: Opc = ARM64::FCCMPSrr; FirstOp = 0; break; - case ARM64::FCMPDrr: Opc = ARM64::FCCMPDrr; FirstOp = 0; break; - case ARM64::FCMPESrr: Opc = ARM64::FCCMPESrr; FirstOp = 0; break; - case ARM64::FCMPEDrr: Opc = ARM64::FCCMPEDrr; FirstOp = 0; break; - case ARM64::CBZW: - case ARM64::CBNZW: - Opc = ARM64::CCMPWi; - FirstOp = 0; - isZBranch = true; - break; - case ARM64::CBZX: - case ARM64::CBNZX: - Opc = ARM64::CCMPXi; - FirstOp = 0; - isZBranch = true; - break; - } - - // The ccmp instruction should set the flags according to the comparison when - // Head would have branched to CmpBB. - // The NZCV immediate operand should provide flags for the case where Head - // would have branched to Tail. These flags should cause the new Head - // terminator to branch to tail. - unsigned NZCV = ARM64CC::getNZCVToSatisfyCondCode(CmpBBTailCC); - const MCInstrDesc &MCID = TII->get(Opc); - MRI->constrainRegClass(CmpMI->getOperand(FirstOp).getReg(), - TII->getRegClass(MCID, 0, TRI, *MF)); - if (CmpMI->getOperand(FirstOp + 1).isReg()) - MRI->constrainRegClass(CmpMI->getOperand(FirstOp + 1).getReg(), - TII->getRegClass(MCID, 1, TRI, *MF)); - MachineInstrBuilder MIB = - BuildMI(*Head, CmpMI, CmpMI->getDebugLoc(), MCID) - .addOperand(CmpMI->getOperand(FirstOp)); // Register Rn - if (isZBranch) - MIB.addImm(0); // cbz/cbnz Rn -> ccmp Rn, #0 - else - MIB.addOperand(CmpMI->getOperand(FirstOp + 1)); // Register Rm / Immediate - MIB.addImm(NZCV).addImm(HeadCmpBBCC); - - // If CmpMI was a terminator, we need a new conditional branch to replace it. - // This now becomes a Head terminator. - if (isZBranch) { - bool isNZ = CmpMI->getOpcode() == ARM64::CBNZW || - CmpMI->getOpcode() == ARM64::CBNZX; - BuildMI(*Head, CmpMI, CmpMI->getDebugLoc(), TII->get(ARM64::Bcc)) - .addImm(isNZ ? ARM64CC::NE : ARM64CC::EQ) - .addOperand(CmpMI->getOperand(1)); // Branch target. 
- } - CmpMI->eraseFromParent(); - Head->updateTerminator(); - - RemovedBlocks.push_back(CmpBB); - CmpBB->eraseFromParent(); - DEBUG(dbgs() << "Result:\n" << *Head); - ++NumConverted; -} - -int SSACCmpConv::expectedCodeSizeDelta() const { - int delta = 0; - // If the Head terminator was one of the cbz / tbz branches with built-in - // compare, we need to insert an explicit compare instruction in its place - // plus a branch instruction. - if (HeadCond[0].getImm() == -1) { - switch (HeadCond[1].getImm()) { - case ARM64::CBZW: - case ARM64::CBNZW: - case ARM64::CBZX: - case ARM64::CBNZX: - // Therefore delta += 1 - delta = 1; - break; - default: - llvm_unreachable("Cannot convert Head branch"); - } - } - // If the Cmp terminator was one of the cbz / tbz branches with - // built-in compare, it will be turned into a compare instruction - // into Head, but we do not save any instruction. - // Otherwise, we save the branch instruction. - switch (CmpMI->getOpcode()) { - default: - --delta; - break; - case ARM64::CBZW: - case ARM64::CBNZW: - case ARM64::CBZX: - case ARM64::CBNZX: - break; - } - return delta; -} - -//===----------------------------------------------------------------------===// -// ARM64ConditionalCompares Pass -//===----------------------------------------------------------------------===// - -namespace { -class ARM64ConditionalCompares : public MachineFunctionPass { - const TargetInstrInfo *TII; - const TargetRegisterInfo *TRI; - const MCSchedModel *SchedModel; - // Does the proceeded function has Oz attribute. - bool MinSize; - MachineRegisterInfo *MRI; - MachineDominatorTree *DomTree; - MachineLoopInfo *Loops; - MachineTraceMetrics *Traces; - MachineTraceMetrics::Ensemble *MinInstr; - SSACCmpConv CmpConv; - -public: - static char ID; - ARM64ConditionalCompares() : MachineFunctionPass(ID) {} - void getAnalysisUsage(AnalysisUsage &AU) const override; - bool runOnMachineFunction(MachineFunction &MF) override; - const char *getPassName() const override { - return "ARM64 Conditional Compares"; - } - -private: - bool tryConvert(MachineBasicBlock *); - void updateDomTree(ArrayRef Removed); - void updateLoops(ArrayRef Removed); - void invalidateTraces(); - bool shouldConvert(); -}; -} // end anonymous namespace - -char ARM64ConditionalCompares::ID = 0; - -namespace llvm { -void initializeARM64ConditionalComparesPass(PassRegistry &); -} - -INITIALIZE_PASS_BEGIN(ARM64ConditionalCompares, "arm64-ccmp", "ARM64 CCMP Pass", - false, false) -INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) -INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics) -INITIALIZE_PASS_END(ARM64ConditionalCompares, "arm64-ccmp", "ARM64 CCMP Pass", - false, false) - -FunctionPass *llvm::createARM64ConditionalCompares() { - return new ARM64ConditionalCompares(); -} - -void ARM64ConditionalCompares::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired(); - AU.addRequired(); - AU.addPreserved(); - AU.addRequired(); - AU.addPreserved(); - AU.addRequired(); - AU.addPreserved(); - MachineFunctionPass::getAnalysisUsage(AU); -} - -/// Update the dominator tree after if-conversion erased some blocks. -void -ARM64ConditionalCompares::updateDomTree(ArrayRef Removed) { - // convert() removes CmpBB which was previously dominated by Head. - // CmpBB children should be transferred to Head. 
- MachineDomTreeNode *HeadNode = DomTree->getNode(CmpConv.Head); - for (unsigned i = 0, e = Removed.size(); i != e; ++i) { - MachineDomTreeNode *Node = DomTree->getNode(Removed[i]); - assert(Node != HeadNode && "Cannot erase the head node"); - assert(Node->getIDom() == HeadNode && "CmpBB should be dominated by Head"); - while (Node->getNumChildren()) - DomTree->changeImmediateDominator(Node->getChildren().back(), HeadNode); - DomTree->eraseNode(Removed[i]); - } -} - -/// Update LoopInfo after if-conversion. -void -ARM64ConditionalCompares::updateLoops(ArrayRef Removed) { - if (!Loops) - return; - for (unsigned i = 0, e = Removed.size(); i != e; ++i) - Loops->removeBlock(Removed[i]); -} - -/// Invalidate MachineTraceMetrics before if-conversion. -void ARM64ConditionalCompares::invalidateTraces() { - Traces->invalidate(CmpConv.Head); - Traces->invalidate(CmpConv.CmpBB); -} - -/// Apply cost model and heuristics to the if-conversion in IfConv. -/// Return true if the conversion is a good idea. -/// -bool ARM64ConditionalCompares::shouldConvert() { - // Stress testing mode disables all cost considerations. - if (Stress) - return true; - if (!MinInstr) - MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount); - - // Head dominates CmpBB, so it is always included in its trace. - MachineTraceMetrics::Trace Trace = MinInstr->getTrace(CmpConv.CmpBB); - - // If code size is the main concern - if (MinSize) { - int CodeSizeDelta = CmpConv.expectedCodeSizeDelta(); - DEBUG(dbgs() << "Code size delta: " << CodeSizeDelta << '\n'); - // If we are minimizing the code size, do the conversion whatever - // the cost is. - if (CodeSizeDelta < 0) - return true; - if (CodeSizeDelta > 0) { - DEBUG(dbgs() << "Code size is increasing, give up on this one.\n"); - return false; - } - // CodeSizeDelta == 0, continue with the regular heuristics - } - - // Heuristic: The compare conversion delays the execution of the branch - // instruction because we must wait for the inputs to the second compare as - // well. The branch has no dependent instructions, but delaying it increases - // the cost of a misprediction. - // - // Set a limit on the delay we will accept. - unsigned DelayLimit = SchedModel->MispredictPenalty * 3 / 4; - - // Instruction depths can be computed for all trace instructions above CmpBB. - unsigned HeadDepth = - Trace.getInstrCycles(CmpConv.Head->getFirstTerminator()).Depth; - unsigned CmpBBDepth = - Trace.getInstrCycles(CmpConv.CmpBB->getFirstTerminator()).Depth; - DEBUG(dbgs() << "Head depth: " << HeadDepth - << "\nCmpBB depth: " << CmpBBDepth << '\n'); - if (CmpBBDepth > HeadDepth + DelayLimit) { - DEBUG(dbgs() << "Branch delay would be larger than " << DelayLimit - << " cycles.\n"); - return false; - } - - // Check the resource depth at the bottom of CmpBB - these instructions will - // be speculated. - unsigned ResDepth = Trace.getResourceDepth(true); - DEBUG(dbgs() << "Resources: " << ResDepth << '\n'); - - // Heuristic: The speculatively executed instructions must all be able to - // merge into the Head block. The Head critical path should dominate the - // resource cost of the speculated instructions. 
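// Worked example with made-up numbers: a MispredictPenalty of 16 cycles gives
// DelayLimit = 16 * 3 / 4 = 12, so a CmpBB terminator depth of 30 against a
// Head terminator depth of 15 (a 15-cycle extra delay) was already rejected
// above; here, a speculated resource depth of 20 against a Head depth of 15
// fails the ResDepth > HeadDepth check below and also gives up.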
- if (ResDepth > HeadDepth) { - DEBUG(dbgs() << "Too many instructions to speculate.\n"); - return false; - } - return true; -} - -bool ARM64ConditionalCompares::tryConvert(MachineBasicBlock *MBB) { - bool Changed = false; - while (CmpConv.canConvert(MBB) && shouldConvert()) { - invalidateTraces(); - SmallVector RemovedBlocks; - CmpConv.convert(RemovedBlocks); - Changed = true; - updateDomTree(RemovedBlocks); - updateLoops(RemovedBlocks); - } - return Changed; -} - -bool ARM64ConditionalCompares::runOnMachineFunction(MachineFunction &MF) { - DEBUG(dbgs() << "********** ARM64 Conditional Compares **********\n" - << "********** Function: " << MF.getName() << '\n'); - TII = MF.getTarget().getInstrInfo(); - TRI = MF.getTarget().getRegisterInfo(); - SchedModel = - MF.getTarget().getSubtarget().getSchedModel(); - MRI = &MF.getRegInfo(); - DomTree = &getAnalysis(); - Loops = getAnalysisIfAvailable(); - Traces = &getAnalysis(); - MinInstr = nullptr; - MinSize = MF.getFunction()->getAttributes().hasAttribute( - AttributeSet::FunctionIndex, Attribute::MinSize); - - bool Changed = false; - CmpConv.runOnMachineFunction(MF); - - // Visit blocks in dominator tree pre-order. The pre-order enables multiple - // cmp-conversions from the same head block. - // Note that updateDomTree() modifies the children of the DomTree node - // currently being visited. The df_iterator supports that; it doesn't look at - // child_begin() / child_end() until after a node has been visited. - for (auto *I : depth_first(DomTree)) - if (tryConvert(I->getBlock())) - Changed = true; - - return Changed; -} diff --git a/lib/Target/ARM64/ARM64DeadRegisterDefinitionsPass.cpp b/lib/Target/ARM64/ARM64DeadRegisterDefinitionsPass.cpp deleted file mode 100644 index e8f03ec833f..00000000000 --- a/lib/Target/ARM64/ARM64DeadRegisterDefinitionsPass.cpp +++ /dev/null @@ -1,134 +0,0 @@ -//===-- ARM64DeadRegisterDefinitions.cpp - Replace dead defs w/ zero reg --===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// When allowed by the instruction, replace a dead definition of a GPR with -// the zero register. This makes the code a bit friendlier towards the -// hardware's register renamer. -//===----------------------------------------------------------------------===// - -#include "ARM64.h" -#include "ARM64RegisterInfo.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineInstr.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -using namespace llvm; - -#define DEBUG_TYPE "arm64-dead-defs" - -STATISTIC(NumDeadDefsReplaced, "Number of dead definitions replaced"); - -namespace { -class ARM64DeadRegisterDefinitions : public MachineFunctionPass { -private: - const TargetRegisterInfo *TRI; - bool implicitlyDefinesOverlappingReg(unsigned Reg, const MachineInstr &MI); - bool processMachineBasicBlock(MachineBasicBlock &MBB); - bool usesFrameIndex(const MachineInstr &MI); -public: - static char ID; // Pass identification, replacement for typeid. 
- explicit ARM64DeadRegisterDefinitions() : MachineFunctionPass(ID) {} - - virtual bool runOnMachineFunction(MachineFunction &F) override; - - const char *getPassName() const override { return "Dead register definitions"; } - - virtual void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - MachineFunctionPass::getAnalysisUsage(AU); - } -}; -char ARM64DeadRegisterDefinitions::ID = 0; -} // end anonymous namespace - -bool ARM64DeadRegisterDefinitions::implicitlyDefinesOverlappingReg( - unsigned Reg, const MachineInstr &MI) { - for (const MachineOperand &MO : MI.implicit_operands()) - if (MO.isReg() && MO.isDef()) - if (TRI->regsOverlap(Reg, MO.getReg())) - return true; - return false; -} - -bool ARM64DeadRegisterDefinitions::usesFrameIndex(const MachineInstr &MI) { - for (const MachineOperand &Op : MI.uses()) - if (Op.isFI()) - return true; - return false; -} - -bool -ARM64DeadRegisterDefinitions::processMachineBasicBlock(MachineBasicBlock &MBB) { - bool Changed = false; - for (MachineInstr &MI : MBB) { - if (usesFrameIndex(MI)) { - // We need to skip this instruction because while it appears to have a - // dead def it uses a frame index which might expand into a multi - // instruction sequence during EPI. - DEBUG(dbgs() << " Ignoring, operand is frame index\n"); - continue; - } - for (int i = 0, e = MI.getDesc().getNumDefs(); i != e; ++i) { - MachineOperand &MO = MI.getOperand(i); - if (MO.isReg() && MO.isDead() && MO.isDef()) { - assert(!MO.isImplicit() && "Unexpected implicit def!"); - DEBUG(dbgs() << " Dead def operand #" << i << " in:\n "; - MI.print(dbgs())); - // Be careful not to change the register if it's a tied operand. - if (MI.isRegTiedToUseOperand(i)) { - DEBUG(dbgs() << " Ignoring, def is tied operand.\n"); - continue; - } - // Don't change the register if there's an implicit def of a subreg or - // supperreg. - if (implicitlyDefinesOverlappingReg(MO.getReg(), MI)) { - DEBUG(dbgs() << " Ignoring, implicitly defines overlap reg.\n"); - continue; - } - // Make sure the instruction take a register class that contains - // the zero register and replace it if so. - unsigned NewReg; - switch (MI.getDesc().OpInfo[i].RegClass) { - default: - DEBUG(dbgs() << " Ignoring, register is not a GPR.\n"); - continue; - case ARM64::GPR32RegClassID: - NewReg = ARM64::WZR; - break; - case ARM64::GPR64RegClassID: - NewReg = ARM64::XZR; - break; - } - DEBUG(dbgs() << " Replacing with zero register. New:\n "); - MO.setReg(NewReg); - DEBUG(MI.print(dbgs())); - ++NumDeadDefsReplaced; - } - } - } - return Changed; -} - -// Scan the function for instructions that have a dead definition of a -// register. Replace that register with the zero register when possible. 
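Reduced to its core decision, the switch in processMachineBasicBlock() above replaces a dead definition with WZR or XZR only when the operand's register class is one of the two GPR classes. The enum and helper below are invented for illustration and stand in for the real ARM64::GPR32RegClassID / ARM64::GPR64RegClassID cases.

// Illustrative sketch, not part of ARM64DeadRegisterDefinitionsPass.cpp.
enum class GPRClass { GPR32, GPR64, Other };
// Only the 32- and 64-bit GPR classes have a zero register; any other dead
// definition is left untouched, as in the pass above.
static const char *zeroRegisterFor(GPRClass RC) {
  switch (RC) {
  case GPRClass::GPR32: return "WZR";
  case GPRClass::GPR64: return "XZR";
  default:              return nullptr;
  }
}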
-bool ARM64DeadRegisterDefinitions::runOnMachineFunction(MachineFunction &MF) { - TRI = MF.getTarget().getRegisterInfo(); - bool Changed = false; - DEBUG(dbgs() << "***** ARM64DeadRegisterDefinitions *****\n"); - - for (auto &MBB : MF) - if (processMachineBasicBlock(MBB)) - Changed = true; - return Changed; -} - -FunctionPass *llvm::createARM64DeadRegisterDefinitions() { - return new ARM64DeadRegisterDefinitions(); -} diff --git a/lib/Target/ARM64/ARM64ExpandPseudoInsts.cpp b/lib/Target/ARM64/ARM64ExpandPseudoInsts.cpp deleted file mode 100644 index a4b5d31314e..00000000000 --- a/lib/Target/ARM64/ARM64ExpandPseudoInsts.cpp +++ /dev/null @@ -1,745 +0,0 @@ -//===-- ARM64ExpandPseudoInsts.cpp - Expand pseudo instructions ---*- C++ -*-=// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains a pass that expands pseudo instructions into target -// instructions to allow proper scheduling and other late optimizations. This -// pass should be run after register allocation but before the post-regalloc -// scheduling pass. -// -//===----------------------------------------------------------------------===// - -#include "MCTargetDesc/ARM64AddressingModes.h" -#include "ARM64InstrInfo.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/Support/MathExtras.h" -using namespace llvm; - -namespace { -class ARM64ExpandPseudo : public MachineFunctionPass { -public: - static char ID; - ARM64ExpandPseudo() : MachineFunctionPass(ID) {} - - const ARM64InstrInfo *TII; - - bool runOnMachineFunction(MachineFunction &Fn) override; - - const char *getPassName() const override { - return "ARM64 pseudo instruction expansion pass"; - } - -private: - bool expandMBB(MachineBasicBlock &MBB); - bool expandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI); - bool expandMOVImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - unsigned BitSize); -}; -char ARM64ExpandPseudo::ID = 0; -} - -/// \brief Transfer implicit operands on the pseudo instruction to the -/// instructions created from the expansion. -static void transferImpOps(MachineInstr &OldMI, MachineInstrBuilder &UseMI, - MachineInstrBuilder &DefMI) { - const MCInstrDesc &Desc = OldMI.getDesc(); - for (unsigned i = Desc.getNumOperands(), e = OldMI.getNumOperands(); i != e; - ++i) { - const MachineOperand &MO = OldMI.getOperand(i); - assert(MO.isReg() && MO.getReg()); - if (MO.isUse()) - UseMI.addOperand(MO); - else - DefMI.addOperand(MO); - } -} - -/// \brief Helper function which extracts the specified 16-bit chunk from a -/// 64-bit value. -static uint64_t getChunk(uint64_t Imm, unsigned ChunkIdx) { - assert(ChunkIdx < 4 && "Out of range chunk index specified!"); - - return (Imm >> (ChunkIdx * 16)) & 0xFFFF; -} - -/// \brief Helper function which replicates a 16-bit chunk within a 64-bit -/// value. Indices correspond to element numbers in a v4i16. -static uint64_t replicateChunk(uint64_t Imm, unsigned FromIdx, unsigned ToIdx) { - assert((FromIdx < 4) && (ToIdx < 4) && "Out of range chunk index specified!"); - const unsigned ShiftAmt = ToIdx * 16; - - // Replicate the source chunk to the destination position. - const uint64_t Chunk = getChunk(Imm, FromIdx) << ShiftAmt; - // Clear the destination chunk. - Imm &= ~(0xFFFFLL << ShiftAmt); - // Insert the replicated chunk. 
- return Imm | Chunk; -} - -/// \brief Helper function which tries to materialize a 64-bit value with an -/// ORR + MOVK instruction sequence. -static bool tryOrrMovk(uint64_t UImm, uint64_t OrrImm, MachineInstr &MI, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator &MBBI, - const ARM64InstrInfo *TII, unsigned ChunkIdx) { - assert(ChunkIdx < 4 && "Out of range chunk index specified!"); - const unsigned ShiftAmt = ChunkIdx * 16; - - uint64_t Encoding; - if (ARM64_AM::processLogicalImmediate(OrrImm, 64, Encoding)) { - // Create the ORR-immediate instruction. - MachineInstrBuilder MIB = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::ORRXri)) - .addOperand(MI.getOperand(0)) - .addReg(ARM64::XZR) - .addImm(Encoding); - - // Create the MOVK instruction. - const unsigned Imm16 = getChunk(UImm, ChunkIdx); - const unsigned DstReg = MI.getOperand(0).getReg(); - const bool DstIsDead = MI.getOperand(0).isDead(); - MachineInstrBuilder MIB1 = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::MOVKXi)) - .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) - .addReg(DstReg) - .addImm(Imm16) - .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, ShiftAmt)); - - transferImpOps(MI, MIB, MIB1); - MI.eraseFromParent(); - return true; - } - - return false; -} - -/// \brief Check whether the given 16-bit chunk replicated to full 64-bit width -/// can be materialized with an ORR instruction. -static bool canUseOrr(uint64_t Chunk, uint64_t &Encoding) { - Chunk = (Chunk << 48) | (Chunk << 32) | (Chunk << 16) | Chunk; - - return ARM64_AM::processLogicalImmediate(Chunk, 64, Encoding); -} - -/// \brief Check for identical 16-bit chunks within the constant and if so -/// materialize them with a single ORR instruction. The remaining one or two -/// 16-bit chunks will be materialized with MOVK instructions. -/// -/// This allows us to materialize constants like |A|B|A|A| or |A|B|C|A| (order -/// of the chunks doesn't matter), assuming |A|A|A|A| can be materialized with -/// an ORR instruction. -/// -static bool tryToreplicateChunks(uint64_t UImm, MachineInstr &MI, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator &MBBI, - const ARM64InstrInfo *TII) { - typedef DenseMap CountMap; - CountMap Counts; - - // Scan the constant and count how often every chunk occurs. - for (unsigned Idx = 0; Idx < 4; ++Idx) - ++Counts[getChunk(UImm, Idx)]; - - // Traverse the chunks to find one which occurs more than once. - for (CountMap::const_iterator Chunk = Counts.begin(), End = Counts.end(); - Chunk != End; ++Chunk) { - const uint64_t ChunkVal = Chunk->first; - const unsigned Count = Chunk->second; - - uint64_t Encoding = 0; - - // We are looking for chunks which have two or three instances and can be - // materialized with an ORR instruction. - if ((Count != 2 && Count != 3) || !canUseOrr(ChunkVal, Encoding)) - continue; - - const bool CountThree = Count == 3; - // Create the ORR-immediate instruction. - MachineInstrBuilder MIB = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::ORRXri)) - .addOperand(MI.getOperand(0)) - .addReg(ARM64::XZR) - .addImm(Encoding); - - const unsigned DstReg = MI.getOperand(0).getReg(); - const bool DstIsDead = MI.getOperand(0).isDead(); - - unsigned ShiftAmt = 0; - uint64_t Imm16 = 0; - // Find the first chunk not materialized with the ORR instruction. - for (; ShiftAmt < 64; ShiftAmt += 16) { - Imm16 = (UImm >> ShiftAmt) & 0xFFFF; - - if (Imm16 != ChunkVal) - break; - } - - // Create the first MOVK instruction. 
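canUseOrr() above first broadcasts the candidate 16-bit chunk to all four positions and only then asks whether the result is a valid logical (ORR) immediate. A self-contained restatement of that broadcast, with an invented helper name:

// Illustrative sketch, not part of ARM64ExpandPseudoInsts.cpp.
#include <cstdint>
// Replicate one 16-bit chunk into all four chunk positions of a 64-bit value,
// mirroring the shift/or expression in canUseOrr().
static uint64_t splat16(uint64_t Chunk) {
  Chunk &= 0xFFFF;
  return (Chunk << 48) | (Chunk << 32) | (Chunk << 16) | Chunk;
}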
- MachineInstrBuilder MIB1 = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::MOVKXi)) - .addReg(DstReg, - RegState::Define | getDeadRegState(DstIsDead && CountThree)) - .addReg(DstReg) - .addImm(Imm16) - .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, ShiftAmt)); - - // In case we have three instances the whole constant is now materialized - // and we can exit. - if (CountThree) { - transferImpOps(MI, MIB, MIB1); - MI.eraseFromParent(); - return true; - } - - // Find the remaining chunk which needs to be materialized. - for (ShiftAmt += 16; ShiftAmt < 64; ShiftAmt += 16) { - Imm16 = (UImm >> ShiftAmt) & 0xFFFF; - - if (Imm16 != ChunkVal) - break; - } - - // Create the second MOVK instruction. - MachineInstrBuilder MIB2 = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::MOVKXi)) - .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) - .addReg(DstReg) - .addImm(Imm16) - .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, ShiftAmt)); - - transferImpOps(MI, MIB, MIB2); - MI.eraseFromParent(); - return true; - } - - return false; -} - -/// \brief Check whether this chunk matches the pattern '1...0...'. This pattern -/// starts a contiguous sequence of ones if we look at the bits from the LSB -/// towards the MSB. -static bool isStartChunk(uint64_t Chunk) { - if (Chunk == 0 || Chunk == UINT64_MAX) - return false; - - return (CountLeadingOnes_64(Chunk) + countTrailingZeros(Chunk)) == 64; -} - -/// \brief Check whether this chunk matches the pattern '0...1...' This pattern -/// ends a contiguous sequence of ones if we look at the bits from the LSB -/// towards the MSB. -static bool isEndChunk(uint64_t Chunk) { - if (Chunk == 0 || Chunk == UINT64_MAX) - return false; - - return (countLeadingZeros(Chunk) + CountTrailingOnes_64(Chunk)) == 64; -} - -/// \brief Clear or set all bits in the chunk at the given index. -static uint64_t updateImm(uint64_t Imm, unsigned Idx, bool Clear) { - const uint64_t Mask = 0xFFFF; - - if (Clear) - // Clear chunk in the immediate. - Imm &= ~(Mask << (Idx * 16)); - else - // Set all bits in the immediate for the particular chunk. - Imm |= Mask << (Idx * 16); - - return Imm; -} - -/// \brief Check whether the constant contains a sequence of contiguous ones, -/// which might be interrupted by one or two chunks. If so, materialize the -/// sequence of contiguous ones with an ORR instruction. -/// Materialize the chunks which are either interrupting the sequence or outside -/// of the sequence with a MOVK instruction. -/// -/// Assuming S is a chunk which starts the sequence (1...0...), E is a chunk -/// which ends the sequence (0...1...). Then we are looking for constants which -/// contain at least one S and E chunk. -/// E.g. |E|A|B|S|, |A|E|B|S| or |A|B|E|S|. -/// -/// We are also looking for constants like |S|A|B|E| where the contiguous -/// sequence of ones wraps around the MSB into the LSB. -/// -static bool trySequenceOfOnes(uint64_t UImm, MachineInstr &MI, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator &MBBI, - const ARM64InstrInfo *TII) { - const int NotSet = -1; - const uint64_t Mask = 0xFFFF; - - int StartIdx = NotSet; - int EndIdx = NotSet; - // Try to find the chunks which start/end a contiguous sequence of ones. - for (int Idx = 0; Idx < 4; ++Idx) { - int64_t Chunk = getChunk(UImm, Idx); - // Sign extend the 16-bit chunk to 64-bit. - Chunk = (Chunk << 48) >> 48; - - if (isStartChunk(Chunk)) - StartIdx = Idx; - else if (isEndChunk(Chunk)) - EndIdx = Idx; - } - - // Early exit in case we can't find a start/end chunk. 
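The start/end classification used by trySequenceOfOnes() above can be restated with the C++20 <bit> helpers; the names below are invented, and the original code uses LLVM's CountLeadingOnes_64, countLeadingZeros and friends instead.

// Illustrative sketch, not part of ARM64ExpandPseudoInsts.cpp.
#include <bit>
#include <cstdint>
// '1...0...': ones growing down from the MSB, i.e. the chunk that starts a
// run of ones when scanning from the LSB towards the MSB.
static bool isStartChunkDemo(uint64_t Chunk) {
  if (Chunk == 0 || Chunk == UINT64_MAX)
    return false;
  return std::countl_one(Chunk) + std::countr_zero(Chunk) == 64;
}
// '0...1...': ones growing up from the LSB, i.e. the chunk that ends the run.
static bool isEndChunkDemo(uint64_t Chunk) {
  if (Chunk == 0 || Chunk == UINT64_MAX)
    return false;
  return std::countl_zero(Chunk) + std::countr_one(Chunk) == 64;
}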
- if (StartIdx == NotSet || EndIdx == NotSet) - return false; - - // Outside of the contiguous sequence of ones everything needs to be zero. - uint64_t Outside = 0; - // Chunks between the start and end chunk need to have all their bits set. - uint64_t Inside = Mask; - - // If our contiguous sequence of ones wraps around from the MSB into the LSB, - // just swap indices and pretend we are materializing a contiguous sequence - // of zeros surrounded by a contiguous sequence of ones. - if (StartIdx > EndIdx) { - std::swap(StartIdx, EndIdx); - std::swap(Outside, Inside); - } - - uint64_t OrrImm = UImm; - int FirstMovkIdx = NotSet; - int SecondMovkIdx = NotSet; - - // Find out which chunks we need to patch up to obtain a contiguous sequence - // of ones. - for (int Idx = 0; Idx < 4; ++Idx) { - const uint64_t Chunk = getChunk(UImm, Idx); - - // Check whether we are looking at a chunk which is not part of the - // contiguous sequence of ones. - if ((Idx < StartIdx || EndIdx < Idx) && Chunk != Outside) { - OrrImm = updateImm(OrrImm, Idx, Outside == 0); - - // Remember the index we need to patch. - if (FirstMovkIdx == NotSet) - FirstMovkIdx = Idx; - else - SecondMovkIdx = Idx; - - // Check whether we are looking a chunk which is part of the contiguous - // sequence of ones. - } else if (Idx > StartIdx && Idx < EndIdx && Chunk != Inside) { - OrrImm = updateImm(OrrImm, Idx, Inside != Mask); - - // Remember the index we need to patch. - if (FirstMovkIdx == NotSet) - FirstMovkIdx = Idx; - else - SecondMovkIdx = Idx; - } - } - assert(FirstMovkIdx != NotSet && "Constant materializable with single ORR!"); - - // Create the ORR-immediate instruction. - uint64_t Encoding = 0; - ARM64_AM::processLogicalImmediate(OrrImm, 64, Encoding); - MachineInstrBuilder MIB = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::ORRXri)) - .addOperand(MI.getOperand(0)) - .addReg(ARM64::XZR) - .addImm(Encoding); - - const unsigned DstReg = MI.getOperand(0).getReg(); - const bool DstIsDead = MI.getOperand(0).isDead(); - - const bool SingleMovk = SecondMovkIdx == NotSet; - // Create the first MOVK instruction. - MachineInstrBuilder MIB1 = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::MOVKXi)) - .addReg(DstReg, - RegState::Define | getDeadRegState(DstIsDead && SingleMovk)) - .addReg(DstReg) - .addImm(getChunk(UImm, FirstMovkIdx)) - .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, FirstMovkIdx * 16)); - - // Early exit in case we only need to emit a single MOVK instruction. - if (SingleMovk) { - transferImpOps(MI, MIB, MIB1); - MI.eraseFromParent(); - return true; - } - - // Create the second MOVK instruction. - MachineInstrBuilder MIB2 = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::MOVKXi)) - .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) - .addReg(DstReg) - .addImm(getChunk(UImm, SecondMovkIdx)) - .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, SecondMovkIdx * 16)); - - transferImpOps(MI, MIB, MIB2); - MI.eraseFromParent(); - return true; -} - -/// \brief Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more -/// real move-immediate instructions to synthesize the immediate. -bool ARM64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - unsigned BitSize) { - MachineInstr &MI = *MBBI; - uint64_t Imm = MI.getOperand(1).getImm(); - const unsigned Mask = 0xFFFF; - - // Try a MOVI instruction (aka ORR-immediate with the zero register). 
- uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize); - uint64_t Encoding; - if (ARM64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) { - unsigned Opc = (BitSize == 32 ? ARM64::ORRWri : ARM64::ORRXri); - MachineInstrBuilder MIB = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc)) - .addOperand(MI.getOperand(0)) - .addReg(BitSize == 32 ? ARM64::WZR : ARM64::XZR) - .addImm(Encoding); - transferImpOps(MI, MIB, MIB); - MI.eraseFromParent(); - return true; - } - - // Scan the immediate and count the number of 16-bit chunks which are either - // all ones or all zeros. - unsigned OneChunks = 0; - unsigned ZeroChunks = 0; - for (unsigned Shift = 0; Shift < BitSize; Shift += 16) { - const unsigned Chunk = (Imm >> Shift) & Mask; - if (Chunk == Mask) - OneChunks++; - else if (Chunk == 0) - ZeroChunks++; - } - - // Since we can't materialize the constant with a single ORR instruction, - // let's see whether we can materialize 3/4 of the constant with an ORR - // instruction and use an additional MOVK instruction to materialize the - // remaining 1/4. - // - // We are looking for constants with a pattern like: |A|X|B|X| or |X|A|X|B|. - // - // E.g. assuming |A|X|A|X| is a pattern which can be materialized with ORR, - // we would create the following instruction sequence: - // - // ORR x0, xzr, |A|X|A|X| - // MOVK x0, |B|, LSL #16 - // - // Only look at 64-bit constants which can't be materialized with a single - // instruction e.g. which have less than either three all zero or all one - // chunks. - // - // Ignore 32-bit constants here, they always can be materialized with a - // MOVZ/MOVN + MOVK pair. Since the 32-bit constant can't be materialized - // with a single ORR, the best sequence we can achieve is a ORR + MOVK pair. - // Thus we fall back to the default code below which in the best case creates - // a single MOVZ/MOVN instruction (in case one chunk is all zero or all one). - // - if (BitSize == 64 && OneChunks < 3 && ZeroChunks < 3) { - // If we interpret the 64-bit constant as a v4i16, are elements 0 and 2 - // identical? - if (getChunk(UImm, 0) == getChunk(UImm, 2)) { - // See if we can come up with a constant which can be materialized with - // ORR-immediate by replicating element 3 into element 1. - uint64_t OrrImm = replicateChunk(UImm, 3, 1); - if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 1)) - return true; - - // See if we can come up with a constant which can be materialized with - // ORR-immediate by replicating element 1 into element 3. - OrrImm = replicateChunk(UImm, 1, 3); - if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 3)) - return true; - - // If we interpret the 64-bit constant as a v4i16, are elements 1 and 3 - // identical? - } else if (getChunk(UImm, 1) == getChunk(UImm, 3)) { - // See if we can come up with a constant which can be materialized with - // ORR-immediate by replicating element 2 into element 0. - uint64_t OrrImm = replicateChunk(UImm, 2, 0); - if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 0)) - return true; - - // See if we can come up with a constant which can be materialized with - // ORR-immediate by replicating element 1 into element 3. - OrrImm = replicateChunk(UImm, 0, 2); - if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 2)) - return true; - } - } - - // Check for identical 16-bit chunks within the constant and if so materialize - // them with a single ORR instruction. The remaining one or two 16-bit chunks - // will be materialized with MOVK instructions. 
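Earlier in expandMOVImm() the immediate is scanned to count how many of its 16-bit chunks are all ones or all zeros; those counts steer the choice between the ORR+MOVK strategies and the MOVZ/MOVN+MOVK fallback. A standalone restatement with an invented function name:

// Illustrative sketch, not part of ARM64ExpandPseudoInsts.cpp.
#include <cstdint>
// Count the all-ones and all-zero 16-bit chunks of a 32- or 64-bit immediate.
static void countChunks(uint64_t Imm, unsigned BitSize,
                        unsigned &OneChunks, unsigned &ZeroChunks) {
  OneChunks = ZeroChunks = 0;
  for (unsigned Shift = 0; Shift < BitSize; Shift += 16) {
    const unsigned Chunk = (Imm >> Shift) & 0xFFFF;
    if (Chunk == 0xFFFF)
      ++OneChunks;
    else if (Chunk == 0)
      ++ZeroChunks;
  }
}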
- if (BitSize == 64 && tryToreplicateChunks(UImm, MI, MBB, MBBI, TII)) - return true; - - // Check whether the constant contains a sequence of contiguous ones, which - // might be interrupted by one or two chunks. If so, materialize the sequence - // of contiguous ones with an ORR instruction. Materialize the chunks which - // are either interrupting the sequence or outside of the sequence with a - // MOVK instruction. - if (BitSize == 64 && trySequenceOfOnes(UImm, MI, MBB, MBBI, TII)) - return true; - - // Use a MOVZ or MOVN instruction to set the high bits, followed by one or - // more MOVK instructions to insert additional 16-bit portions into the - // lower bits. - bool isNeg = false; - - // Use MOVN to materialize the high bits if we have more all one chunks - // than all zero chunks. - if (OneChunks > ZeroChunks) { - isNeg = true; - Imm = ~Imm; - } - - unsigned FirstOpc; - if (BitSize == 32) { - Imm &= (1LL << 32) - 1; - FirstOpc = (isNeg ? ARM64::MOVNWi : ARM64::MOVZWi); - } else { - FirstOpc = (isNeg ? ARM64::MOVNXi : ARM64::MOVZXi); - } - unsigned Shift = 0; // LSL amount for high bits with MOVZ/MOVN - unsigned LastShift = 0; // LSL amount for last MOVK - if (Imm != 0) { - unsigned LZ = countLeadingZeros(Imm); - unsigned TZ = countTrailingZeros(Imm); - Shift = ((63 - LZ) / 16) * 16; - LastShift = (TZ / 16) * 16; - } - unsigned Imm16 = (Imm >> Shift) & Mask; - unsigned DstReg = MI.getOperand(0).getReg(); - bool DstIsDead = MI.getOperand(0).isDead(); - MachineInstrBuilder MIB1 = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(FirstOpc)) - .addReg(DstReg, RegState::Define | - getDeadRegState(DstIsDead && Shift == LastShift)) - .addImm(Imm16) - .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, Shift)); - - // If a MOVN was used for the high bits of a negative value, flip the rest - // of the bits back for use with MOVK. - if (isNeg) - Imm = ~Imm; - - if (Shift == LastShift) { - transferImpOps(MI, MIB1, MIB1); - MI.eraseFromParent(); - return true; - } - - MachineInstrBuilder MIB2; - unsigned Opc = (BitSize == 32 ? ARM64::MOVKWi : ARM64::MOVKXi); - while (Shift != LastShift) { - Shift -= 16; - Imm16 = (Imm >> Shift) & Mask; - if (Imm16 == (isNeg ? Mask : 0)) - continue; // This 16-bit portion is already set correctly. - MIB2 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc)) - .addReg(DstReg, - RegState::Define | - getDeadRegState(DstIsDead && Shift == LastShift)) - .addReg(DstReg) - .addImm(Imm16) - .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, Shift)); - } - - transferImpOps(MI, MIB1, MIB2); - MI.eraseFromParent(); - return true; -} - -/// \brief If MBBI references a pseudo instruction that should be expanded here, -/// do the expansion and return true. Otherwise return false. 
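The MOVZ/MOVN + MOVK tail above derives the first and last LSL amounts from the highest and lowest non-zero 16-bit chunks; when the two coincide a single instruction suffices. A hedged restatement using C++20 <bit> with invented names (the original calls LLVM's countLeadingZeros/countTrailingZeros):

// Illustrative sketch, not part of ARM64ExpandPseudoInsts.cpp.
#include <bit>
#include <cstdint>
// Shift is the LSL amount of the MOVZ/MOVN for the highest interesting chunk,
// LastShift that of the final MOVK for the lowest one.
static void movShiftRange(uint64_t Imm, unsigned &Shift, unsigned &LastShift) {
  Shift = LastShift = 0;
  if (Imm == 0)
    return;
  Shift = ((63 - std::countl_zero(Imm)) / 16) * 16;
  LastShift = (std::countr_zero(Imm) / 16) * 16;
}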
-bool ARM64ExpandPseudo::expandMI(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI) { - MachineInstr &MI = *MBBI; - unsigned Opcode = MI.getOpcode(); - switch (Opcode) { - default: - break; - - case ARM64::ADDWrr: - case ARM64::SUBWrr: - case ARM64::ADDXrr: - case ARM64::SUBXrr: - case ARM64::ADDSWrr: - case ARM64::SUBSWrr: - case ARM64::ADDSXrr: - case ARM64::SUBSXrr: - case ARM64::ANDWrr: - case ARM64::ANDXrr: - case ARM64::BICWrr: - case ARM64::BICXrr: - case ARM64::ANDSWrr: - case ARM64::ANDSXrr: - case ARM64::BICSWrr: - case ARM64::BICSXrr: - case ARM64::EONWrr: - case ARM64::EONXrr: - case ARM64::EORWrr: - case ARM64::EORXrr: - case ARM64::ORNWrr: - case ARM64::ORNXrr: - case ARM64::ORRWrr: - case ARM64::ORRXrr: { - unsigned Opcode; - switch (MI.getOpcode()) { - default: - return false; - case ARM64::ADDWrr: Opcode = ARM64::ADDWrs; break; - case ARM64::SUBWrr: Opcode = ARM64::SUBWrs; break; - case ARM64::ADDXrr: Opcode = ARM64::ADDXrs; break; - case ARM64::SUBXrr: Opcode = ARM64::SUBXrs; break; - case ARM64::ADDSWrr: Opcode = ARM64::ADDSWrs; break; - case ARM64::SUBSWrr: Opcode = ARM64::SUBSWrs; break; - case ARM64::ADDSXrr: Opcode = ARM64::ADDSXrs; break; - case ARM64::SUBSXrr: Opcode = ARM64::SUBSXrs; break; - case ARM64::ANDWrr: Opcode = ARM64::ANDWrs; break; - case ARM64::ANDXrr: Opcode = ARM64::ANDXrs; break; - case ARM64::BICWrr: Opcode = ARM64::BICWrs; break; - case ARM64::BICXrr: Opcode = ARM64::BICXrs; break; - case ARM64::ANDSWrr: Opcode = ARM64::ANDSWrs; break; - case ARM64::ANDSXrr: Opcode = ARM64::ANDSXrs; break; - case ARM64::BICSWrr: Opcode = ARM64::BICSWrs; break; - case ARM64::BICSXrr: Opcode = ARM64::BICSXrs; break; - case ARM64::EONWrr: Opcode = ARM64::EONWrs; break; - case ARM64::EONXrr: Opcode = ARM64::EONXrs; break; - case ARM64::EORWrr: Opcode = ARM64::EORWrs; break; - case ARM64::EORXrr: Opcode = ARM64::EORXrs; break; - case ARM64::ORNWrr: Opcode = ARM64::ORNWrs; break; - case ARM64::ORNXrr: Opcode = ARM64::ORNXrs; break; - case ARM64::ORRWrr: Opcode = ARM64::ORRWrs; break; - case ARM64::ORRXrr: Opcode = ARM64::ORRXrs; break; - } - MachineInstrBuilder MIB1 = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opcode), - MI.getOperand(0).getReg()) - .addOperand(MI.getOperand(1)) - .addOperand(MI.getOperand(2)) - .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, 0)); - transferImpOps(MI, MIB1, MIB1); - MI.eraseFromParent(); - return true; - } - - case ARM64::FCVTSHpseudo: { - MachineOperand Src = MI.getOperand(1); - Src.setImplicit(); - unsigned SrcH = TII->getRegisterInfo().getSubReg(Src.getReg(), ARM64::hsub); - auto MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::FCVTSHr)) - .addOperand(MI.getOperand(0)) - .addReg(SrcH, RegState::Undef) - .addOperand(Src); - transferImpOps(MI, MIB, MIB); - MI.eraseFromParent(); - return true; - } - case ARM64::LOADgot: { - // Expand into ADRP + LDR. 
- unsigned DstReg = MI.getOperand(0).getReg(); - const MachineOperand &MO1 = MI.getOperand(1); - unsigned Flags = MO1.getTargetFlags(); - MachineInstrBuilder MIB1 = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::ADRP), DstReg); - MachineInstrBuilder MIB2 = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::LDRXui)) - .addOperand(MI.getOperand(0)) - .addReg(DstReg); - - if (MO1.isGlobal()) { - MIB1.addGlobalAddress(MO1.getGlobal(), 0, Flags | ARM64II::MO_PAGE); - MIB2.addGlobalAddress(MO1.getGlobal(), 0, - Flags | ARM64II::MO_PAGEOFF | ARM64II::MO_NC); - } else if (MO1.isSymbol()) { - MIB1.addExternalSymbol(MO1.getSymbolName(), Flags | ARM64II::MO_PAGE); - MIB2.addExternalSymbol(MO1.getSymbolName(), - Flags | ARM64II::MO_PAGEOFF | ARM64II::MO_NC); - } else { - assert(MO1.isCPI() && - "Only expect globals, externalsymbols, or constant pools"); - MIB1.addConstantPoolIndex(MO1.getIndex(), MO1.getOffset(), - Flags | ARM64II::MO_PAGE); - MIB2.addConstantPoolIndex(MO1.getIndex(), MO1.getOffset(), - Flags | ARM64II::MO_PAGEOFF | ARM64II::MO_NC); - } - - transferImpOps(MI, MIB1, MIB2); - MI.eraseFromParent(); - return true; - } - - case ARM64::MOVaddr: - case ARM64::MOVaddrJT: - case ARM64::MOVaddrCP: - case ARM64::MOVaddrBA: - case ARM64::MOVaddrTLS: - case ARM64::MOVaddrEXT: { - // Expand into ADRP + ADD. - unsigned DstReg = MI.getOperand(0).getReg(); - MachineInstrBuilder MIB1 = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::ADRP), DstReg) - .addOperand(MI.getOperand(1)); - - MachineInstrBuilder MIB2 = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::ADDXri)) - .addOperand(MI.getOperand(0)) - .addReg(DstReg) - .addOperand(MI.getOperand(2)) - .addImm(0); - - transferImpOps(MI, MIB1, MIB2); - MI.eraseFromParent(); - return true; - } - - case ARM64::MOVi32imm: - return expandMOVImm(MBB, MBBI, 32); - case ARM64::MOVi64imm: - return expandMOVImm(MBB, MBBI, 64); - case ARM64::RET_ReallyLR: - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::RET)) - .addReg(ARM64::LR); - MI.eraseFromParent(); - return true; - } - return false; -} - -/// \brief Iterate over the instructions in basic block MBB and expand any -/// pseudo instructions. Return true if anything was modified. -bool ARM64ExpandPseudo::expandMBB(MachineBasicBlock &MBB) { - bool Modified = false; - - MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); - while (MBBI != E) { - MachineBasicBlock::iterator NMBBI = std::next(MBBI); - Modified |= expandMI(MBB, MBBI); - MBBI = NMBBI; - } - - return Modified; -} - -bool ARM64ExpandPseudo::runOnMachineFunction(MachineFunction &MF) { - TII = static_cast(MF.getTarget().getInstrInfo()); - - bool Modified = false; - for (auto &MBB : MF) - Modified |= expandMBB(MBB); - return Modified; -} - -/// \brief Returns an instance of the pseudo instruction expansion pass. -FunctionPass *llvm::createARM64ExpandPseudoPass() { - return new ARM64ExpandPseudo(); -} diff --git a/lib/Target/ARM64/ARM64FastISel.cpp b/lib/Target/ARM64/ARM64FastISel.cpp deleted file mode 100644 index f4bf616559a..00000000000 --- a/lib/Target/ARM64/ARM64FastISel.cpp +++ /dev/null @@ -1,1977 +0,0 @@ -//===-- ARM6464FastISel.cpp - ARM64 FastISel implementation ---------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines the ARM64-specific support for the FastISel class. 
Some -// of the target-specific code is generated by tablegen in the file -// ARM64GenFastISel.inc, which is #included here. -// -//===----------------------------------------------------------------------===// - -#include "ARM64.h" -#include "ARM64TargetMachine.h" -#include "ARM64Subtarget.h" -#include "ARM64CallingConv.h" -#include "MCTargetDesc/ARM64AddressingModes.h" -#include "llvm/CodeGen/CallingConvLower.h" -#include "llvm/CodeGen/FastISel.h" -#include "llvm/CodeGen/FunctionLoweringInfo.h" -#include "llvm/CodeGen/MachineConstantPool.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/IR/CallingConv.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/DerivedTypes.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/GetElementPtrTypeIterator.h" -#include "llvm/IR/GlobalAlias.h" -#include "llvm/IR/GlobalVariable.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Operator.h" -#include "llvm/Support/CommandLine.h" -using namespace llvm; - -namespace { - -class ARM64FastISel : public FastISel { - - class Address { - public: - typedef enum { - RegBase, - FrameIndexBase - } BaseKind; - - private: - BaseKind Kind; - union { - unsigned Reg; - int FI; - } Base; - int64_t Offset; - - public: - Address() : Kind(RegBase), Offset(0) { Base.Reg = 0; } - void setKind(BaseKind K) { Kind = K; } - BaseKind getKind() const { return Kind; } - bool isRegBase() const { return Kind == RegBase; } - bool isFIBase() const { return Kind == FrameIndexBase; } - void setReg(unsigned Reg) { - assert(isRegBase() && "Invalid base register access!"); - Base.Reg = Reg; - } - unsigned getReg() const { - assert(isRegBase() && "Invalid base register access!"); - return Base.Reg; - } - void setFI(unsigned FI) { - assert(isFIBase() && "Invalid base frame index access!"); - Base.FI = FI; - } - unsigned getFI() const { - assert(isFIBase() && "Invalid base frame index access!"); - return Base.FI; - } - void setOffset(int64_t O) { Offset = O; } - int64_t getOffset() { return Offset; } - - bool isValid() { return isFIBase() || (isRegBase() && getReg() != 0); } - }; - - /// Subtarget - Keep a pointer to the ARM64Subtarget around so that we can - /// make the right decision when generating code for different targets. - const ARM64Subtarget *Subtarget; - LLVMContext *Context; - -private: - // Selection routines. - bool SelectLoad(const Instruction *I); - bool SelectStore(const Instruction *I); - bool SelectBranch(const Instruction *I); - bool SelectIndirectBr(const Instruction *I); - bool SelectCmp(const Instruction *I); - bool SelectSelect(const Instruction *I); - bool SelectFPExt(const Instruction *I); - bool SelectFPTrunc(const Instruction *I); - bool SelectFPToInt(const Instruction *I, bool Signed); - bool SelectIntToFP(const Instruction *I, bool Signed); - bool SelectRem(const Instruction *I, unsigned ISDOpcode); - bool SelectCall(const Instruction *I, const char *IntrMemName); - bool SelectIntrinsicCall(const IntrinsicInst &I); - bool SelectRet(const Instruction *I); - bool SelectTrunc(const Instruction *I); - bool SelectIntExt(const Instruction *I); - bool SelectMul(const Instruction *I); - - // Utility helper routines. 
- bool isTypeLegal(Type *Ty, MVT &VT); - bool isLoadStoreTypeLegal(Type *Ty, MVT &VT); - bool ComputeAddress(const Value *Obj, Address &Addr); - bool SimplifyAddress(Address &Addr, MVT VT, int64_t ScaleFactor, - bool UseUnscaled); - void AddLoadStoreOperands(Address &Addr, const MachineInstrBuilder &MIB, - unsigned Flags, bool UseUnscaled); - bool IsMemCpySmall(uint64_t Len, unsigned Alignment); - bool TryEmitSmallMemCpy(Address Dest, Address Src, uint64_t Len, - unsigned Alignment); - // Emit functions. - bool EmitCmp(Value *Src1Value, Value *Src2Value, bool isZExt); - bool EmitLoad(MVT VT, unsigned &ResultReg, Address Addr, - bool UseUnscaled = false); - bool EmitStore(MVT VT, unsigned SrcReg, Address Addr, - bool UseUnscaled = false); - unsigned EmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, bool isZExt); - unsigned Emiti1Ext(unsigned SrcReg, MVT DestVT, bool isZExt); - - unsigned ARM64MaterializeFP(const ConstantFP *CFP, MVT VT); - unsigned ARM64MaterializeGV(const GlobalValue *GV); - - // Call handling routines. -private: - CCAssignFn *CCAssignFnForCall(CallingConv::ID CC) const; - bool ProcessCallArgs(SmallVectorImpl &Args, - SmallVectorImpl &ArgRegs, - SmallVectorImpl &ArgVTs, - SmallVectorImpl &ArgFlags, - SmallVectorImpl &RegArgs, CallingConv::ID CC, - unsigned &NumBytes); - bool FinishCall(MVT RetVT, SmallVectorImpl &UsedRegs, - const Instruction *I, CallingConv::ID CC, unsigned &NumBytes); - -public: - // Backend specific FastISel code. - unsigned TargetMaterializeAlloca(const AllocaInst *AI) override; - unsigned TargetMaterializeConstant(const Constant *C) override; - - explicit ARM64FastISel(FunctionLoweringInfo &funcInfo, - const TargetLibraryInfo *libInfo) - : FastISel(funcInfo, libInfo) { - Subtarget = &TM.getSubtarget(); - Context = &funcInfo.Fn->getContext(); - } - - bool TargetSelectInstruction(const Instruction *I) override; - -#include "ARM64GenFastISel.inc" -}; - -} // end anonymous namespace - -#include "ARM64GenCallingConv.inc" - -CCAssignFn *ARM64FastISel::CCAssignFnForCall(CallingConv::ID CC) const { - if (CC == CallingConv::WebKit_JS) - return CC_ARM64_WebKit_JS; - return Subtarget->isTargetDarwin() ? CC_ARM64_DarwinPCS : CC_ARM64_AAPCS; -} - -unsigned ARM64FastISel::TargetMaterializeAlloca(const AllocaInst *AI) { - assert(TLI.getValueType(AI->getType(), true) == MVT::i64 && - "Alloca should always return a pointer."); - - // Don't handle dynamic allocas. - if (!FuncInfo.StaticAllocaMap.count(AI)) - return 0; - - DenseMap::iterator SI = - FuncInfo.StaticAllocaMap.find(AI); - - if (SI != FuncInfo.StaticAllocaMap.end()) { - unsigned ResultReg = createResultReg(&ARM64::GPR64RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::ADDXri), - ResultReg) - .addFrameIndex(SI->second) - .addImm(0) - .addImm(0); - return ResultReg; - } - - return 0; -} - -unsigned ARM64FastISel::ARM64MaterializeFP(const ConstantFP *CFP, MVT VT) { - if (VT != MVT::f32 && VT != MVT::f64) - return 0; - - const APFloat Val = CFP->getValueAPF(); - bool is64bit = (VT == MVT::f64); - - // This checks to see if we can use FMOV instructions to materialize - // a constant, otherwise we have to materialize via the constant pool. 
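ARM64MaterializeFP(), which continues below, boils down to a two-way choice: encodable constants become a single FMOV-immediate, everything else is loaded from the constant pool via ADRP plus a page-offset load. The enum and helper below are invented for illustration; isFPImmLegal() is the real TargetLowering query that drives the decision.

// Illustrative sketch, not part of ARM64FastISel.cpp.
enum class FPMatStrategy { FMovImm, ConstantPool };
// FMOV #imm only covers the small set of encodable FP constants; the rest go
// through the constant pool (ADRP + LDRDui/LDRSui), as in the code below.
static FPMatStrategy chooseFPMaterialization(bool IsEncodableFPImm) {
  return IsEncodableFPImm ? FPMatStrategy::FMovImm
                          : FPMatStrategy::ConstantPool;
}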
- if (TLI.isFPImmLegal(Val, VT)) { - int Imm; - unsigned Opc; - if (is64bit) { - Imm = ARM64_AM::getFP64Imm(Val); - Opc = ARM64::FMOVDi; - } else { - Imm = ARM64_AM::getFP32Imm(Val); - Opc = ARM64::FMOVSi; - } - unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT)); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) - .addImm(Imm); - return ResultReg; - } - - // Materialize via constant pool. MachineConstantPool wants an explicit - // alignment. - unsigned Align = DL.getPrefTypeAlignment(CFP->getType()); - if (Align == 0) - Align = DL.getTypeAllocSize(CFP->getType()); - - unsigned Idx = MCP.getConstantPoolIndex(cast(CFP), Align); - unsigned ADRPReg = createResultReg(&ARM64::GPR64commonRegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::ADRP), - ADRPReg).addConstantPoolIndex(Idx, 0, ARM64II::MO_PAGE); - - unsigned Opc = is64bit ? ARM64::LDRDui : ARM64::LDRSui; - unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT)); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) - .addReg(ADRPReg) - .addConstantPoolIndex(Idx, 0, ARM64II::MO_PAGEOFF | ARM64II::MO_NC); - return ResultReg; -} - -unsigned ARM64FastISel::ARM64MaterializeGV(const GlobalValue *GV) { - // We can't handle thread-local variables quickly yet. Unfortunately we have - // to peer through any aliases to find out if that rule applies. - const GlobalValue *TLSGV = GV; - if (const GlobalAlias *GA = dyn_cast(GV)) - TLSGV = GA->getAliasee(); - - if (const GlobalVariable *GVar = dyn_cast(TLSGV)) - if (GVar->isThreadLocal()) - return 0; - - unsigned char OpFlags = Subtarget->ClassifyGlobalReference(GV, TM); - - EVT DestEVT = TLI.getValueType(GV->getType(), true); - if (!DestEVT.isSimple()) - return 0; - - unsigned ADRPReg = createResultReg(&ARM64::GPR64commonRegClass); - unsigned ResultReg; - - if (OpFlags & ARM64II::MO_GOT) { - // ADRP + LDRX - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::ADRP), - ADRPReg) - .addGlobalAddress(GV, 0, ARM64II::MO_GOT | ARM64II::MO_PAGE); - - ResultReg = createResultReg(&ARM64::GPR64RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::LDRXui), - ResultReg) - .addReg(ADRPReg) - .addGlobalAddress(GV, 0, ARM64II::MO_GOT | ARM64II::MO_PAGEOFF | - ARM64II::MO_NC); - } else { - // ADRP + ADDX - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::ADRP), - ADRPReg).addGlobalAddress(GV, 0, ARM64II::MO_PAGE); - - ResultReg = createResultReg(&ARM64::GPR64spRegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::ADDXri), - ResultReg) - .addReg(ADRPReg) - .addGlobalAddress(GV, 0, ARM64II::MO_PAGEOFF | ARM64II::MO_NC) - .addImm(0); - } - return ResultReg; -} - -unsigned ARM64FastISel::TargetMaterializeConstant(const Constant *C) { - EVT CEVT = TLI.getValueType(C->getType(), true); - - // Only handle simple types. - if (!CEVT.isSimple()) - return 0; - MVT VT = CEVT.getSimpleVT(); - - // FIXME: Handle ConstantInt. - if (const ConstantFP *CFP = dyn_cast(C)) - return ARM64MaterializeFP(CFP, VT); - else if (const GlobalValue *GV = dyn_cast(C)) - return ARM64MaterializeGV(GV); - - return 0; -} - -// Computes the address to get to an object. 
-bool ARM64FastISel::ComputeAddress(const Value *Obj, Address &Addr) { - const User *U = nullptr; - unsigned Opcode = Instruction::UserOp1; - if (const Instruction *I = dyn_cast(Obj)) { - // Don't walk into other basic blocks unless the object is an alloca from - // another block, otherwise it may not have a virtual register assigned. - if (FuncInfo.StaticAllocaMap.count(static_cast(Obj)) || - FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) { - Opcode = I->getOpcode(); - U = I; - } - } else if (const ConstantExpr *C = dyn_cast(Obj)) { - Opcode = C->getOpcode(); - U = C; - } - - if (const PointerType *Ty = dyn_cast(Obj->getType())) - if (Ty->getAddressSpace() > 255) - // Fast instruction selection doesn't support the special - // address spaces. - return false; - - switch (Opcode) { - default: - break; - case Instruction::BitCast: { - // Look through bitcasts. - return ComputeAddress(U->getOperand(0), Addr); - } - case Instruction::IntToPtr: { - // Look past no-op inttoptrs. - if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy()) - return ComputeAddress(U->getOperand(0), Addr); - break; - } - case Instruction::PtrToInt: { - // Look past no-op ptrtoints. - if (TLI.getValueType(U->getType()) == TLI.getPointerTy()) - return ComputeAddress(U->getOperand(0), Addr); - break; - } - case Instruction::GetElementPtr: { - Address SavedAddr = Addr; - uint64_t TmpOffset = Addr.getOffset(); - - // Iterate through the GEP folding the constants into offsets where - // we can. - gep_type_iterator GTI = gep_type_begin(U); - for (User::const_op_iterator i = U->op_begin() + 1, e = U->op_end(); i != e; - ++i, ++GTI) { - const Value *Op = *i; - if (StructType *STy = dyn_cast(*GTI)) { - const StructLayout *SL = DL.getStructLayout(STy); - unsigned Idx = cast(Op)->getZExtValue(); - TmpOffset += SL->getElementOffset(Idx); - } else { - uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType()); - for (;;) { - if (const ConstantInt *CI = dyn_cast(Op)) { - // Constant-offset addressing. - TmpOffset += CI->getSExtValue() * S; - break; - } - if (canFoldAddIntoGEP(U, Op)) { - // A compatible add with a constant operand. Fold the constant. - ConstantInt *CI = - cast(cast(Op)->getOperand(1)); - TmpOffset += CI->getSExtValue() * S; - // Iterate on the other operand. - Op = cast(Op)->getOperand(0); - continue; - } - // Unsupported - goto unsupported_gep; - } - } - } - - // Try to grab the base operand now. - Addr.setOffset(TmpOffset); - if (ComputeAddress(U->getOperand(0), Addr)) - return true; - - // We failed, restore everything and try the other options. - Addr = SavedAddr; - - unsupported_gep: - break; - } - case Instruction::Alloca: { - const AllocaInst *AI = cast(Obj); - DenseMap::iterator SI = - FuncInfo.StaticAllocaMap.find(AI); - if (SI != FuncInfo.StaticAllocaMap.end()) { - Addr.setKind(Address::FrameIndexBase); - Addr.setFI(SI->second); - return true; - } - break; - } - } - - // Try to get this in a register if nothing else has worked. - if (!Addr.isValid()) - Addr.setReg(getRegForValue(Obj)); - return Addr.isValid(); -} - -bool ARM64FastISel::isTypeLegal(Type *Ty, MVT &VT) { - EVT evt = TLI.getValueType(Ty, true); - - // Only handle simple types. - if (evt == MVT::Other || !evt.isSimple()) - return false; - VT = evt.getSimpleVT(); - - // This is a legal type, but it's not something we handle in fast-isel. - if (VT == MVT::f128) - return false; - - // Handle all other legal types, i.e. a register that will directly hold this - // value. 
- return TLI.isTypeLegal(VT); -} - -bool ARM64FastISel::isLoadStoreTypeLegal(Type *Ty, MVT &VT) { - if (isTypeLegal(Ty, VT)) - return true; - - // If this is a type than can be sign or zero-extended to a basic operation - // go ahead and accept it now. For stores, this reflects truncation. - if (VT == MVT::i1 || VT == MVT::i8 || VT == MVT::i16) - return true; - - return false; -} - -bool ARM64FastISel::SimplifyAddress(Address &Addr, MVT VT, int64_t ScaleFactor, - bool UseUnscaled) { - bool needsLowering = false; - int64_t Offset = Addr.getOffset(); - switch (VT.SimpleTy) { - default: - return false; - case MVT::i1: - case MVT::i8: - case MVT::i16: - case MVT::i32: - case MVT::i64: - case MVT::f32: - case MVT::f64: - if (!UseUnscaled) - // Using scaled, 12-bit, unsigned immediate offsets. - needsLowering = ((Offset & 0xfff) != Offset); - else - // Using unscaled, 9-bit, signed immediate offsets. - needsLowering = (Offset > 256 || Offset < -256); - break; - } - - // FIXME: If this is a stack pointer and the offset needs to be simplified - // then put the alloca address into a register, set the base type back to - // register and continue. This should almost never happen. - if (needsLowering && Addr.getKind() == Address::FrameIndexBase) { - return false; - } - - // Since the offset is too large for the load/store instruction get the - // reg+offset into a register. - if (needsLowering) { - uint64_t UnscaledOffset = Addr.getOffset() * ScaleFactor; - unsigned ResultReg = FastEmit_ri_(MVT::i64, ISD::ADD, Addr.getReg(), false, - UnscaledOffset, MVT::i64); - if (ResultReg == 0) - return false; - Addr.setReg(ResultReg); - Addr.setOffset(0); - } - return true; -} - -void ARM64FastISel::AddLoadStoreOperands(Address &Addr, - const MachineInstrBuilder &MIB, - unsigned Flags, bool UseUnscaled) { - int64_t Offset = Addr.getOffset(); - // Frame base works a bit differently. Handle it separately. - if (Addr.getKind() == Address::FrameIndexBase) { - int FI = Addr.getFI(); - // FIXME: We shouldn't be using getObjectSize/getObjectAlignment. The size - // and alignment should be based on the VT. - MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand( - MachinePointerInfo::getFixedStack(FI, Offset), Flags, - MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); - // Now add the rest of the operands. - MIB.addFrameIndex(FI).addImm(Offset).addMemOperand(MMO); - } else { - // Now add the rest of the operands. - MIB.addReg(Addr.getReg()); - MIB.addImm(Offset); - } -} - -bool ARM64FastISel::EmitLoad(MVT VT, unsigned &ResultReg, Address Addr, - bool UseUnscaled) { - // Negative offsets require unscaled, 9-bit, signed immediate offsets. - // Otherwise, we try using scaled, 12-bit, unsigned immediate offsets. - if (!UseUnscaled && Addr.getOffset() < 0) - UseUnscaled = true; - - unsigned Opc; - const TargetRegisterClass *RC; - bool VTIsi1 = false; - int64_t ScaleFactor = 0; - switch (VT.SimpleTy) { - default: - return false; - case MVT::i1: - VTIsi1 = true; - // Intentional fall-through. - case MVT::i8: - Opc = UseUnscaled ? ARM64::LDURBBi : ARM64::LDRBBui; - RC = &ARM64::GPR32RegClass; - ScaleFactor = 1; - break; - case MVT::i16: - Opc = UseUnscaled ? ARM64::LDURHHi : ARM64::LDRHHui; - RC = &ARM64::GPR32RegClass; - ScaleFactor = 2; - break; - case MVT::i32: - Opc = UseUnscaled ? ARM64::LDURWi : ARM64::LDRWui; - RC = &ARM64::GPR32RegClass; - ScaleFactor = 4; - break; - case MVT::i64: - Opc = UseUnscaled ? 
ARM64::LDURXi : ARM64::LDRXui; - RC = &ARM64::GPR64RegClass; - ScaleFactor = 8; - break; - case MVT::f32: - Opc = UseUnscaled ? ARM64::LDURSi : ARM64::LDRSui; - RC = TLI.getRegClassFor(VT); - ScaleFactor = 4; - break; - case MVT::f64: - Opc = UseUnscaled ? ARM64::LDURDi : ARM64::LDRDui; - RC = TLI.getRegClassFor(VT); - ScaleFactor = 8; - break; - } - // Scale the offset. - if (!UseUnscaled) { - int64_t Offset = Addr.getOffset(); - if (Offset & (ScaleFactor - 1)) - // Retry using an unscaled, 9-bit, signed immediate offset. - return EmitLoad(VT, ResultReg, Addr, /*UseUnscaled*/ true); - - Addr.setOffset(Offset / ScaleFactor); - } - - // Simplify this down to something we can handle. - if (!SimplifyAddress(Addr, VT, UseUnscaled ? 1 : ScaleFactor, UseUnscaled)) - return false; - - // Create the base instruction, then add the operands. - ResultReg = createResultReg(RC); - MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(Opc), ResultReg); - AddLoadStoreOperands(Addr, MIB, MachineMemOperand::MOLoad, UseUnscaled); - - // Loading an i1 requires special handling. - if (VTIsi1) { - MRI.constrainRegClass(ResultReg, &ARM64::GPR32RegClass); - unsigned ANDReg = createResultReg(&ARM64::GPR32spRegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::ANDWri), - ANDReg) - .addReg(ResultReg) - .addImm(ARM64_AM::encodeLogicalImmediate(1, 32)); - ResultReg = ANDReg; - } - return true; -} - -bool ARM64FastISel::SelectLoad(const Instruction *I) { - MVT VT; - // Verify we have a legal type before going any further. Currently, we handle - // simple types that will directly fit in a register (i32/f32/i64/f64) or - // those that can be sign or zero-extended to a basic operation (i1/i8/i16). - if (!isLoadStoreTypeLegal(I->getType(), VT) || cast(I)->isAtomic()) - return false; - - // See if we can handle this address. - Address Addr; - if (!ComputeAddress(I->getOperand(0), Addr)) - return false; - - unsigned ResultReg; - if (!EmitLoad(VT, ResultReg, Addr)) - return false; - - UpdateValueMap(I, ResultReg); - return true; -} - -bool ARM64FastISel::EmitStore(MVT VT, unsigned SrcReg, Address Addr, - bool UseUnscaled) { - // Negative offsets require unscaled, 9-bit, signed immediate offsets. - // Otherwise, we try using scaled, 12-bit, unsigned immediate offsets. - if (!UseUnscaled && Addr.getOffset() < 0) - UseUnscaled = true; - - unsigned StrOpc; - bool VTIsi1 = false; - int64_t ScaleFactor = 0; - // Using scaled, 12-bit, unsigned immediate offsets. - switch (VT.SimpleTy) { - default: - return false; - case MVT::i1: - VTIsi1 = true; - case MVT::i8: - StrOpc = UseUnscaled ? ARM64::STURBBi : ARM64::STRBBui; - ScaleFactor = 1; - break; - case MVT::i16: - StrOpc = UseUnscaled ? ARM64::STURHHi : ARM64::STRHHui; - ScaleFactor = 2; - break; - case MVT::i32: - StrOpc = UseUnscaled ? ARM64::STURWi : ARM64::STRWui; - ScaleFactor = 4; - break; - case MVT::i64: - StrOpc = UseUnscaled ? ARM64::STURXi : ARM64::STRXui; - ScaleFactor = 8; - break; - case MVT::f32: - StrOpc = UseUnscaled ? ARM64::STURSi : ARM64::STRSui; - ScaleFactor = 4; - break; - case MVT::f64: - StrOpc = UseUnscaled ? ARM64::STURDi : ARM64::STRDui; - ScaleFactor = 8; - break; - } - // Scale the offset. - if (!UseUnscaled) { - int64_t Offset = Addr.getOffset(); - if (Offset & (ScaleFactor - 1)) - // Retry using an unscaled, 9-bit, signed immediate offset. 
- return EmitStore(VT, SrcReg, Addr, /*UseUnscaled*/ true); - - Addr.setOffset(Offset / ScaleFactor); - } - - // Simplify this down to something we can handle. - if (!SimplifyAddress(Addr, VT, UseUnscaled ? 1 : ScaleFactor, UseUnscaled)) - return false; - - // Storing an i1 requires special handling. - if (VTIsi1) { - MRI.constrainRegClass(SrcReg, &ARM64::GPR32RegClass); - unsigned ANDReg = createResultReg(&ARM64::GPR32spRegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::ANDWri), - ANDReg) - .addReg(SrcReg) - .addImm(ARM64_AM::encodeLogicalImmediate(1, 32)); - SrcReg = ANDReg; - } - // Create the base instruction, then add the operands. - MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(StrOpc)).addReg(SrcReg); - AddLoadStoreOperands(Addr, MIB, MachineMemOperand::MOStore, UseUnscaled); - return true; -} - -bool ARM64FastISel::SelectStore(const Instruction *I) { - MVT VT; - Value *Op0 = I->getOperand(0); - // Verify we have a legal type before going any further. Currently, we handle - // simple types that will directly fit in a register (i32/f32/i64/f64) or - // those that can be sign or zero-extended to a basic operation (i1/i8/i16). - if (!isLoadStoreTypeLegal(Op0->getType(), VT) || - cast(I)->isAtomic()) - return false; - - // Get the value to be stored into a register. - unsigned SrcReg = getRegForValue(Op0); - if (SrcReg == 0) - return false; - - // See if we can handle this address. - Address Addr; - if (!ComputeAddress(I->getOperand(1), Addr)) - return false; - - if (!EmitStore(VT, SrcReg, Addr)) - return false; - return true; -} - -static ARM64CC::CondCode getCompareCC(CmpInst::Predicate Pred) { - switch (Pred) { - case CmpInst::FCMP_ONE: - case CmpInst::FCMP_UEQ: - default: - // AL is our "false" for now. The other two need more compares. - return ARM64CC::AL; - case CmpInst::ICMP_EQ: - case CmpInst::FCMP_OEQ: - return ARM64CC::EQ; - case CmpInst::ICMP_SGT: - case CmpInst::FCMP_OGT: - return ARM64CC::GT; - case CmpInst::ICMP_SGE: - case CmpInst::FCMP_OGE: - return ARM64CC::GE; - case CmpInst::ICMP_UGT: - case CmpInst::FCMP_UGT: - return ARM64CC::HI; - case CmpInst::FCMP_OLT: - return ARM64CC::MI; - case CmpInst::ICMP_ULE: - case CmpInst::FCMP_OLE: - return ARM64CC::LS; - case CmpInst::FCMP_ORD: - return ARM64CC::VC; - case CmpInst::FCMP_UNO: - return ARM64CC::VS; - case CmpInst::FCMP_UGE: - return ARM64CC::PL; - case CmpInst::ICMP_SLT: - case CmpInst::FCMP_ULT: - return ARM64CC::LT; - case CmpInst::ICMP_SLE: - case CmpInst::FCMP_ULE: - return ARM64CC::LE; - case CmpInst::FCMP_UNE: - case CmpInst::ICMP_NE: - return ARM64CC::NE; - case CmpInst::ICMP_UGE: - return ARM64CC::HS; - case CmpInst::ICMP_ULT: - return ARM64CC::LO; - } -} - -bool ARM64FastISel::SelectBranch(const Instruction *I) { - const BranchInst *BI = cast(I); - MachineBasicBlock *TBB = FuncInfo.MBBMap[BI->getSuccessor(0)]; - MachineBasicBlock *FBB = FuncInfo.MBBMap[BI->getSuccessor(1)]; - - if (const CmpInst *CI = dyn_cast(BI->getCondition())) { - if (CI->hasOneUse() && (CI->getParent() == I->getParent())) { - // We may not handle every CC for now. - ARM64CC::CondCode CC = getCompareCC(CI->getPredicate()); - if (CC == ARM64CC::AL) - return false; - - // Emit the cmp. - if (!EmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned())) - return false; - - // Emit the branch. 
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::Bcc)) - .addImm(CC) - .addMBB(TBB); - FuncInfo.MBB->addSuccessor(TBB); - - FastEmitBranch(FBB, DbgLoc); - return true; - } - } else if (TruncInst *TI = dyn_cast(BI->getCondition())) { - MVT SrcVT; - if (TI->hasOneUse() && TI->getParent() == I->getParent() && - (isLoadStoreTypeLegal(TI->getOperand(0)->getType(), SrcVT))) { - unsigned CondReg = getRegForValue(TI->getOperand(0)); - if (CondReg == 0) - return false; - - // Issue an extract_subreg to get the lower 32-bits. - if (SrcVT == MVT::i64) - CondReg = FastEmitInst_extractsubreg(MVT::i32, CondReg, /*Kill=*/true, - ARM64::sub_32); - - MRI.constrainRegClass(CondReg, &ARM64::GPR32RegClass); - unsigned ANDReg = createResultReg(&ARM64::GPR32spRegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::ANDWri), - ANDReg) - .addReg(CondReg) - .addImm(ARM64_AM::encodeLogicalImmediate(1, 32)); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::SUBSWri)) - .addReg(ANDReg) - .addReg(ANDReg) - .addImm(0) - .addImm(0); - - unsigned CC = ARM64CC::NE; - if (FuncInfo.MBB->isLayoutSuccessor(TBB)) { - std::swap(TBB, FBB); - CC = ARM64CC::EQ; - } - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::Bcc)) - .addImm(CC) - .addMBB(TBB); - FuncInfo.MBB->addSuccessor(TBB); - FastEmitBranch(FBB, DbgLoc); - return true; - } - } else if (const ConstantInt *CI = - dyn_cast(BI->getCondition())) { - uint64_t Imm = CI->getZExtValue(); - MachineBasicBlock *Target = (Imm == 0) ? FBB : TBB; - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::B)) - .addMBB(Target); - FuncInfo.MBB->addSuccessor(Target); - return true; - } - - unsigned CondReg = getRegForValue(BI->getCondition()); - if (CondReg == 0) - return false; - - // We've been divorced from our compare! Our block was split, and - // now our compare lives in a predecessor block. We musn't - // re-compare here, as the children of the compare aren't guaranteed - // live across the block boundary (we *could* check for this). - // Regardless, the compare has been done in the predecessor block, - // and it left a value for us in a virtual register. Ergo, we test - // the one-bit value left in the virtual register. - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::SUBSWri), - ARM64::WZR) - .addReg(CondReg) - .addImm(0) - .addImm(0); - - unsigned CC = ARM64CC::NE; - if (FuncInfo.MBB->isLayoutSuccessor(TBB)) { - std::swap(TBB, FBB); - CC = ARM64CC::EQ; - } - - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::Bcc)) - .addImm(CC) - .addMBB(TBB); - FuncInfo.MBB->addSuccessor(TBB); - FastEmitBranch(FBB, DbgLoc); - return true; -} - -bool ARM64FastISel::SelectIndirectBr(const Instruction *I) { - const IndirectBrInst *BI = cast(I); - unsigned AddrReg = getRegForValue(BI->getOperand(0)); - if (AddrReg == 0) - return false; - - // Emit the indirect branch. - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::BR)) - .addReg(AddrReg); - - // Make sure the CFG is up-to-date. - for (unsigned i = 0, e = BI->getNumSuccessors(); i != e; ++i) - FuncInfo.MBB->addSuccessor(FuncInfo.MBBMap[BI->getSuccessor(i)]); - - return true; -} - -bool ARM64FastISel::EmitCmp(Value *Src1Value, Value *Src2Value, bool isZExt) { - Type *Ty = Src1Value->getType(); - EVT SrcEVT = TLI.getValueType(Ty, true); - if (!SrcEVT.isSimple()) - return false; - MVT SrcVT = SrcEVT.getSimpleVT(); - - // Check to see if the 2nd operand is a constant that we can encode directly - // in the compare. 
- uint64_t Imm; - bool UseImm = false; - bool isNegativeImm = false; - if (const ConstantInt *ConstInt = dyn_cast(Src2Value)) { - if (SrcVT == MVT::i64 || SrcVT == MVT::i32 || SrcVT == MVT::i16 || - SrcVT == MVT::i8 || SrcVT == MVT::i1) { - const APInt &CIVal = ConstInt->getValue(); - - Imm = (isZExt) ? CIVal.getZExtValue() : CIVal.getSExtValue(); - if (CIVal.isNegative()) { - isNegativeImm = true; - Imm = -Imm; - } - // FIXME: We can handle more immediates using shifts. - UseImm = ((Imm & 0xfff) == Imm); - } - } else if (const ConstantFP *ConstFP = dyn_cast(Src2Value)) { - if (SrcVT == MVT::f32 || SrcVT == MVT::f64) - if (ConstFP->isZero() && !ConstFP->isNegative()) - UseImm = true; - } - - unsigned ZReg; - unsigned CmpOpc; - bool isICmp = true; - bool needsExt = false; - switch (SrcVT.SimpleTy) { - default: - return false; - case MVT::i1: - case MVT::i8: - case MVT::i16: - needsExt = true; - // Intentional fall-through. - case MVT::i32: - ZReg = ARM64::WZR; - if (UseImm) - CmpOpc = isNegativeImm ? ARM64::ADDSWri : ARM64::SUBSWri; - else - CmpOpc = ARM64::SUBSWrr; - break; - case MVT::i64: - ZReg = ARM64::XZR; - if (UseImm) - CmpOpc = isNegativeImm ? ARM64::ADDSXri : ARM64::SUBSXri; - else - CmpOpc = ARM64::SUBSXrr; - break; - case MVT::f32: - isICmp = false; - CmpOpc = UseImm ? ARM64::FCMPSri : ARM64::FCMPSrr; - break; - case MVT::f64: - isICmp = false; - CmpOpc = UseImm ? ARM64::FCMPDri : ARM64::FCMPDrr; - break; - } - - unsigned SrcReg1 = getRegForValue(Src1Value); - if (SrcReg1 == 0) - return false; - - unsigned SrcReg2; - if (!UseImm) { - SrcReg2 = getRegForValue(Src2Value); - if (SrcReg2 == 0) - return false; - } - - // We have i1, i8, or i16, we need to either zero extend or sign extend. - if (needsExt) { - SrcReg1 = EmitIntExt(SrcVT, SrcReg1, MVT::i32, isZExt); - if (SrcReg1 == 0) - return false; - if (!UseImm) { - SrcReg2 = EmitIntExt(SrcVT, SrcReg2, MVT::i32, isZExt); - if (SrcReg2 == 0) - return false; - } - } - - if (isICmp) { - if (UseImm) - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc)) - .addReg(ZReg) - .addReg(SrcReg1) - .addImm(Imm) - .addImm(0); - else - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc)) - .addReg(ZReg) - .addReg(SrcReg1) - .addReg(SrcReg2); - } else { - if (UseImm) - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc)) - .addReg(SrcReg1); - else - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc)) - .addReg(SrcReg1) - .addReg(SrcReg2); - } - return true; -} - -bool ARM64FastISel::SelectCmp(const Instruction *I) { - const CmpInst *CI = cast(I); - - // We may not handle every CC for now. - ARM64CC::CondCode CC = getCompareCC(CI->getPredicate()); - if (CC == ARM64CC::AL) - return false; - - // Emit the cmp. - if (!EmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned())) - return false; - - // Now set a register based on the comparison. 
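EmitCmp() above keeps a constant second operand as an immediate only when it fits the unshifted 12-bit field of SUBS/ADDS, negating negative values and flipping SUBS to ADDS. The helper name below is invented; the test itself mirrors the code.

// Illustrative sketch, not part of ARM64FastISel.cpp.
#include <cstdint>
// True when Imm can be used directly as the unshifted 12-bit immediate of a
// SUBS/ADDS compare; the pass notes that shifted forms could extend this.
static bool fitsUnshiftedImm12(uint64_t Imm) {
  return (Imm & 0xfff) == Imm;
}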
- ARM64CC::CondCode invertedCC = getInvertedCondCode(CC); - unsigned ResultReg = createResultReg(&ARM64::GPR32RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::CSINCWr), - ResultReg) - .addReg(ARM64::WZR) - .addReg(ARM64::WZR) - .addImm(invertedCC); - - UpdateValueMap(I, ResultReg); - return true; -} - -bool ARM64FastISel::SelectSelect(const Instruction *I) { - const SelectInst *SI = cast(I); - - EVT DestEVT = TLI.getValueType(SI->getType(), true); - if (!DestEVT.isSimple()) - return false; - - MVT DestVT = DestEVT.getSimpleVT(); - if (DestVT != MVT::i32 && DestVT != MVT::i64 && DestVT != MVT::f32 && - DestVT != MVT::f64) - return false; - - unsigned CondReg = getRegForValue(SI->getCondition()); - if (CondReg == 0) - return false; - unsigned TrueReg = getRegForValue(SI->getTrueValue()); - if (TrueReg == 0) - return false; - unsigned FalseReg = getRegForValue(SI->getFalseValue()); - if (FalseReg == 0) - return false; - - - MRI.constrainRegClass(CondReg, &ARM64::GPR32RegClass); - unsigned ANDReg = createResultReg(&ARM64::GPR32spRegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::ANDWri), - ANDReg) - .addReg(CondReg) - .addImm(ARM64_AM::encodeLogicalImmediate(1, 32)); - - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::SUBSWri)) - .addReg(ANDReg) - .addReg(ANDReg) - .addImm(0) - .addImm(0); - - unsigned SelectOpc; - switch (DestVT.SimpleTy) { - default: - return false; - case MVT::i32: - SelectOpc = ARM64::CSELWr; - break; - case MVT::i64: - SelectOpc = ARM64::CSELXr; - break; - case MVT::f32: - SelectOpc = ARM64::FCSELSrrr; - break; - case MVT::f64: - SelectOpc = ARM64::FCSELDrrr; - break; - } - - unsigned ResultReg = createResultReg(TLI.getRegClassFor(DestVT)); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SelectOpc), - ResultReg) - .addReg(TrueReg) - .addReg(FalseReg) - .addImm(ARM64CC::NE); - - UpdateValueMap(I, ResultReg); - return true; -} - -bool ARM64FastISel::SelectFPExt(const Instruction *I) { - Value *V = I->getOperand(0); - if (!I->getType()->isDoubleTy() || !V->getType()->isFloatTy()) - return false; - - unsigned Op = getRegForValue(V); - if (Op == 0) - return false; - - unsigned ResultReg = createResultReg(&ARM64::FPR64RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::FCVTDSr), - ResultReg).addReg(Op); - UpdateValueMap(I, ResultReg); - return true; -} - -bool ARM64FastISel::SelectFPTrunc(const Instruction *I) { - Value *V = I->getOperand(0); - if (!I->getType()->isFloatTy() || !V->getType()->isDoubleTy()) - return false; - - unsigned Op = getRegForValue(V); - if (Op == 0) - return false; - - unsigned ResultReg = createResultReg(&ARM64::FPR32RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::FCVTSDr), - ResultReg).addReg(Op); - UpdateValueMap(I, ResultReg); - return true; -} - -// FPToUI and FPToSI -bool ARM64FastISel::SelectFPToInt(const Instruction *I, bool Signed) { - MVT DestVT; - if (!isTypeLegal(I->getType(), DestVT) || DestVT.isVector()) - return false; - - unsigned SrcReg = getRegForValue(I->getOperand(0)); - if (SrcReg == 0) - return false; - - EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType(), true); - if (SrcVT == MVT::f128) - return false; - - unsigned Opc; - if (SrcVT == MVT::f64) { - if (Signed) - Opc = (DestVT == MVT::i32) ? ARM64::FCVTZSUWDr : ARM64::FCVTZSUXDr; - else - Opc = (DestVT == MVT::i32) ? ARM64::FCVTZUUWDr : ARM64::FCVTZUUXDr; - } else { - if (Signed) - Opc = (DestVT == MVT::i32) ? 
ARM64::FCVTZSUWSr : ARM64::FCVTZSUXSr; - else - Opc = (DestVT == MVT::i32) ? ARM64::FCVTZUUWSr : ARM64::FCVTZUUXSr; - } - unsigned ResultReg = createResultReg( - DestVT == MVT::i32 ? &ARM64::GPR32RegClass : &ARM64::GPR64RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) - .addReg(SrcReg); - UpdateValueMap(I, ResultReg); - return true; -} - -bool ARM64FastISel::SelectIntToFP(const Instruction *I, bool Signed) { - MVT DestVT; - if (!isTypeLegal(I->getType(), DestVT) || DestVT.isVector()) - return false; - assert ((DestVT == MVT::f32 || DestVT == MVT::f64) && - "Unexpected value type."); - - unsigned SrcReg = getRegForValue(I->getOperand(0)); - if (SrcReg == 0) - return false; - - EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType(), true); - - // Handle sign-extension. - if (SrcVT == MVT::i16 || SrcVT == MVT::i8 || SrcVT == MVT::i1) { - SrcReg = - EmitIntExt(SrcVT.getSimpleVT(), SrcReg, MVT::i32, /*isZExt*/ !Signed); - if (SrcReg == 0) - return false; - } - - MRI.constrainRegClass(SrcReg, SrcVT == MVT::i64 ? &ARM64::GPR64RegClass - : &ARM64::GPR32RegClass); - - unsigned Opc; - if (SrcVT == MVT::i64) { - if (Signed) - Opc = (DestVT == MVT::f32) ? ARM64::SCVTFUXSri : ARM64::SCVTFUXDri; - else - Opc = (DestVT == MVT::f32) ? ARM64::UCVTFUXSri : ARM64::UCVTFUXDri; - } else { - if (Signed) - Opc = (DestVT == MVT::f32) ? ARM64::SCVTFUWSri : ARM64::SCVTFUWDri; - else - Opc = (DestVT == MVT::f32) ? ARM64::UCVTFUWSri : ARM64::UCVTFUWDri; - } - - unsigned ResultReg = createResultReg(TLI.getRegClassFor(DestVT)); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) - .addReg(SrcReg); - UpdateValueMap(I, ResultReg); - return true; -} - -bool ARM64FastISel::ProcessCallArgs(SmallVectorImpl &Args, - SmallVectorImpl &ArgRegs, - SmallVectorImpl &ArgVTs, - SmallVectorImpl &ArgFlags, - SmallVectorImpl &RegArgs, - CallingConv::ID CC, unsigned &NumBytes) { - SmallVector ArgLocs; - CCState CCInfo(CC, false, *FuncInfo.MF, TM, ArgLocs, *Context); - CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, CCAssignFnForCall(CC)); - - // Get a count of how many bytes are to be pushed on the stack. - NumBytes = CCInfo.getNextStackOffset(); - - // Issue CALLSEQ_START - unsigned AdjStackDown = TII.getCallFrameSetupOpcode(); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown)) - .addImm(NumBytes); - - // Process the args. - for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { - CCValAssign &VA = ArgLocs[i]; - unsigned Arg = ArgRegs[VA.getValNo()]; - MVT ArgVT = ArgVTs[VA.getValNo()]; - - // Handle arg promotion: SExt, ZExt, AExt. - switch (VA.getLocInfo()) { - case CCValAssign::Full: - break; - case CCValAssign::SExt: { - MVT DestVT = VA.getLocVT(); - MVT SrcVT = ArgVT; - Arg = EmitIntExt(SrcVT, Arg, DestVT, /*isZExt*/ false); - if (Arg == 0) - return false; - ArgVT = DestVT; - break; - } - case CCValAssign::AExt: - // Intentional fall-through. - case CCValAssign::ZExt: { - MVT DestVT = VA.getLocVT(); - MVT SrcVT = ArgVT; - Arg = EmitIntExt(SrcVT, Arg, DestVT, /*isZExt*/ true); - if (Arg == 0) - return false; - ArgVT = DestVT; - break; - } - default: - llvm_unreachable("Unknown arg promotion!"); - } - - // Now copy/store arg to correct locations. - if (VA.isRegLoc() && !VA.needsCustom()) { - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(TargetOpcode::COPY), VA.getLocReg()).addReg(Arg); - RegArgs.push_back(VA.getLocReg()); - } else if (VA.needsCustom()) { - // FIXME: Handle custom args. 
- return false; - } else { - assert(VA.isMemLoc() && "Assuming store on stack."); - - // Need to store on the stack. - unsigned ArgSize = VA.getLocVT().getSizeInBits() / 8; - - unsigned BEAlign = 0; - if (ArgSize < 8 && !Subtarget->isLittleEndian()) - BEAlign = 8 - ArgSize; - - Address Addr; - Addr.setKind(Address::RegBase); - Addr.setReg(ARM64::SP); - Addr.setOffset(VA.getLocMemOffset() + BEAlign); - - if (!EmitStore(ArgVT, Arg, Addr)) - return false; - } - } - return true; -} - -bool ARM64FastISel::FinishCall(MVT RetVT, SmallVectorImpl &UsedRegs, - const Instruction *I, CallingConv::ID CC, - unsigned &NumBytes) { - // Issue CALLSEQ_END - unsigned AdjStackUp = TII.getCallFrameDestroyOpcode(); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackUp)) - .addImm(NumBytes) - .addImm(0); - - // Now the return value. - if (RetVT != MVT::isVoid) { - SmallVector RVLocs; - CCState CCInfo(CC, false, *FuncInfo.MF, TM, RVLocs, *Context); - CCInfo.AnalyzeCallResult(RetVT, CCAssignFnForCall(CC)); - - // Only handle a single return value. - if (RVLocs.size() != 1) - return false; - - // Copy all of the result registers out of their specified physreg. - MVT CopyVT = RVLocs[0].getValVT(); - unsigned ResultReg = createResultReg(TLI.getRegClassFor(CopyVT)); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(TargetOpcode::COPY), - ResultReg).addReg(RVLocs[0].getLocReg()); - UsedRegs.push_back(RVLocs[0].getLocReg()); - - // Finally update the result. - UpdateValueMap(I, ResultReg); - } - - return true; -} - -bool ARM64FastISel::SelectCall(const Instruction *I, - const char *IntrMemName = nullptr) { - const CallInst *CI = cast(I); - const Value *Callee = CI->getCalledValue(); - - // Don't handle inline asm or intrinsics. - if (isa(Callee)) - return false; - - // Only handle global variable Callees. - const GlobalValue *GV = dyn_cast(Callee); - if (!GV) - return false; - - // Check the calling convention. - ImmutableCallSite CS(CI); - CallingConv::ID CC = CS.getCallingConv(); - - // Let SDISel handle vararg functions. - PointerType *PT = cast(CS.getCalledValue()->getType()); - FunctionType *FTy = cast(PT->getElementType()); - if (FTy->isVarArg()) - return false; - - // Handle *simple* calls for now. - MVT RetVT; - Type *RetTy = I->getType(); - if (RetTy->isVoidTy()) - RetVT = MVT::isVoid; - else if (!isTypeLegal(RetTy, RetVT)) - return false; - - // Set up the argument vectors. - SmallVector Args; - SmallVector ArgRegs; - SmallVector ArgVTs; - SmallVector ArgFlags; - Args.reserve(CS.arg_size()); - ArgRegs.reserve(CS.arg_size()); - ArgVTs.reserve(CS.arg_size()); - ArgFlags.reserve(CS.arg_size()); - - for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end(); - i != e; ++i) { - // If we're lowering a memory intrinsic instead of a regular call, skip the - // last two arguments, which shouldn't be passed to the underlying function. - if (IntrMemName && e - i <= 2) - break; - - unsigned Arg = getRegForValue(*i); - if (Arg == 0) - return false; - - ISD::ArgFlagsTy Flags; - unsigned AttrInd = i - CS.arg_begin() + 1; - if (CS.paramHasAttr(AttrInd, Attribute::SExt)) - Flags.setSExt(); - if (CS.paramHasAttr(AttrInd, Attribute::ZExt)) - Flags.setZExt(); - - // FIXME: Only handle *easy* calls for now. 
- if (CS.paramHasAttr(AttrInd, Attribute::InReg) || - CS.paramHasAttr(AttrInd, Attribute::StructRet) || - CS.paramHasAttr(AttrInd, Attribute::Nest) || - CS.paramHasAttr(AttrInd, Attribute::ByVal)) - return false; - - MVT ArgVT; - Type *ArgTy = (*i)->getType(); - if (!isTypeLegal(ArgTy, ArgVT) && - !(ArgVT == MVT::i1 || ArgVT == MVT::i8 || ArgVT == MVT::i16)) - return false; - - // We don't handle vector parameters yet. - if (ArgVT.isVector() || ArgVT.getSizeInBits() > 64) - return false; - - unsigned OriginalAlignment = DL.getABITypeAlignment(ArgTy); - Flags.setOrigAlign(OriginalAlignment); - - Args.push_back(*i); - ArgRegs.push_back(Arg); - ArgVTs.push_back(ArgVT); - ArgFlags.push_back(Flags); - } - - // Handle the arguments now that we've gotten them. - SmallVector RegArgs; - unsigned NumBytes; - if (!ProcessCallArgs(Args, ArgRegs, ArgVTs, ArgFlags, RegArgs, CC, NumBytes)) - return false; - - // Issue the call. - MachineInstrBuilder MIB; - MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::BL)); - if (!IntrMemName) - MIB.addGlobalAddress(GV, 0, 0); - else - MIB.addExternalSymbol(IntrMemName, 0); - - // Add implicit physical register uses to the call. - for (unsigned i = 0, e = RegArgs.size(); i != e; ++i) - MIB.addReg(RegArgs[i], RegState::Implicit); - - // Add a register mask with the call-preserved registers. - // Proper defs for return values will be added by setPhysRegsDeadExcept(). - MIB.addRegMask(TRI.getCallPreservedMask(CS.getCallingConv())); - - // Finish off the call including any return values. - SmallVector UsedRegs; - if (!FinishCall(RetVT, UsedRegs, I, CC, NumBytes)) - return false; - - // Set all unused physreg defs as dead. - static_cast(MIB)->setPhysRegsDeadExcept(UsedRegs, TRI); - - return true; -} - -bool ARM64FastISel::IsMemCpySmall(uint64_t Len, unsigned Alignment) { - if (Alignment) - return Len / Alignment <= 4; - else - return Len < 32; -} - -bool ARM64FastISel::TryEmitSmallMemCpy(Address Dest, Address Src, uint64_t Len, - unsigned Alignment) { - // Make sure we don't bloat code by inlining very large memcpy's. - if (!IsMemCpySmall(Len, Alignment)) - return false; - - int64_t UnscaledOffset = 0; - Address OrigDest = Dest; - Address OrigSrc = Src; - - while (Len) { - MVT VT; - if (!Alignment || Alignment >= 8) { - if (Len >= 8) - VT = MVT::i64; - else if (Len >= 4) - VT = MVT::i32; - else if (Len >= 2) - VT = MVT::i16; - else { - VT = MVT::i8; - } - } else { - // Bound based on alignment. - if (Len >= 4 && Alignment == 4) - VT = MVT::i32; - else if (Len >= 2 && Alignment == 2) - VT = MVT::i16; - else { - VT = MVT::i8; - } - } - - bool RV; - unsigned ResultReg; - RV = EmitLoad(VT, ResultReg, Src); - assert(RV == true && "Should be able to handle this load."); - RV = EmitStore(VT, ResultReg, Dest); - assert(RV == true && "Should be able to handle this store."); - (void)RV; - - int64_t Size = VT.getSizeInBits() / 8; - Len -= Size; - UnscaledOffset += Size; - - // We need to recompute the unscaled offset for each iteration. - Dest.setOffset(OrigDest.getOffset() + UnscaledOffset); - Src.setOffset(OrigSrc.getOffset() + UnscaledOffset); - } - - return true; -} - -bool ARM64FastISel::SelectIntrinsicCall(const IntrinsicInst &I) { - // FIXME: Handle more intrinsics. - switch (I.getIntrinsicID()) { - default: - return false; - case Intrinsic::memcpy: - case Intrinsic::memmove: { - const MemTransferInst &MTI = cast(I); - // Don't handle volatile. - if (MTI.isVolatile()) - return false; - - // Disable inlining for memmove before calls to ComputeAddress. 
Otherwise, - // we would emit dead code because we don't currently handle memmoves. - bool isMemCpy = (I.getIntrinsicID() == Intrinsic::memcpy); - if (isa(MTI.getLength()) && isMemCpy) { - // Small memcpy's are common enough that we want to do them without a call - // if possible. - uint64_t Len = cast(MTI.getLength())->getZExtValue(); - unsigned Alignment = MTI.getAlignment(); - if (IsMemCpySmall(Len, Alignment)) { - Address Dest, Src; - if (!ComputeAddress(MTI.getRawDest(), Dest) || - !ComputeAddress(MTI.getRawSource(), Src)) - return false; - if (TryEmitSmallMemCpy(Dest, Src, Len, Alignment)) - return true; - } - } - - if (!MTI.getLength()->getType()->isIntegerTy(64)) - return false; - - if (MTI.getSourceAddressSpace() > 255 || MTI.getDestAddressSpace() > 255) - // Fast instruction selection doesn't support the special - // address spaces. - return false; - - const char *IntrMemName = isa(I) ? "memcpy" : "memmove"; - return SelectCall(&I, IntrMemName); - } - case Intrinsic::memset: { - const MemSetInst &MSI = cast(I); - // Don't handle volatile. - if (MSI.isVolatile()) - return false; - - if (!MSI.getLength()->getType()->isIntegerTy(64)) - return false; - - if (MSI.getDestAddressSpace() > 255) - // Fast instruction selection doesn't support the special - // address spaces. - return false; - - return SelectCall(&I, "memset"); - } - case Intrinsic::trap: { - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::BRK)) - .addImm(1); - return true; - } - } - return false; -} - -bool ARM64FastISel::SelectRet(const Instruction *I) { - const ReturnInst *Ret = cast(I); - const Function &F = *I->getParent()->getParent(); - - if (!FuncInfo.CanLowerReturn) - return false; - - if (F.isVarArg()) - return false; - - // Build a list of return value registers. - SmallVector RetRegs; - - if (Ret->getNumOperands() > 0) { - CallingConv::ID CC = F.getCallingConv(); - SmallVector Outs; - GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI); - - // Analyze operands of the call, assigning locations to each operand. - SmallVector ValLocs; - CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, TM, ValLocs, - I->getContext()); - CCAssignFn *RetCC = CC == CallingConv::WebKit_JS ? RetCC_ARM64_WebKit_JS - : RetCC_ARM64_AAPCS; - CCInfo.AnalyzeReturn(Outs, RetCC); - - // Only handle a single return value for now. - if (ValLocs.size() != 1) - return false; - - CCValAssign &VA = ValLocs[0]; - const Value *RV = Ret->getOperand(0); - - // Don't bother handling odd stuff for now. - if (VA.getLocInfo() != CCValAssign::Full) - return false; - // Only handle register returns for now. - if (!VA.isRegLoc()) - return false; - unsigned Reg = getRegForValue(RV); - if (Reg == 0) - return false; - - unsigned SrcReg = Reg + VA.getValNo(); - unsigned DestReg = VA.getLocReg(); - // Avoid a cross-class copy. This is very unlikely. - if (!MRI.getRegClass(SrcReg)->contains(DestReg)) - return false; - - EVT RVEVT = TLI.getValueType(RV->getType()); - if (!RVEVT.isSimple()) - return false; - - // Vectors (of > 1 lane) in big endian need tricky handling. - if (RVEVT.isVector() && RVEVT.getVectorNumElements() > 1) - return false; - - MVT RVVT = RVEVT.getSimpleVT(); - if (RVVT == MVT::f128) - return false; - MVT DestVT = VA.getValVT(); - // Special handling for extended integers. 
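The branch that follows widens an i1/i8/i16 return value into its 32-bit ABI slot according to the zeroext/signext attribute. A self-contained sketch of that widening, assuming two's-complement uint32_t arithmetic (the helper below is illustrative and not part of the backend):

#include <cstdint>
#include <optional>

// Widths other than 1, 8 and 16 bits are rejected, matching the bail-out below.
static std::optional<uint32_t> widenReturnValue(uint32_t Raw, unsigned Bits,
                                                bool IsZExt) {
  if (Bits != 1 && Bits != 8 && Bits != 16)
    return std::nullopt;
  uint32_t Mask = (1u << Bits) - 1u;
  uint32_t Value = Raw & Mask;
  if (IsZExt)
    return Value;                    // UBFM-style zero extension
  uint32_t Sign = 1u << (Bits - 1);
  return (Value ^ Sign) - Sign;      // SBFM-style sign extension
}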
- if (RVVT != DestVT) { - if (RVVT != MVT::i1 && RVVT != MVT::i8 && RVVT != MVT::i16) - return false; - - if (!Outs[0].Flags.isZExt() && !Outs[0].Flags.isSExt()) - return false; - - bool isZExt = Outs[0].Flags.isZExt(); - SrcReg = EmitIntExt(RVVT, SrcReg, DestVT, isZExt); - if (SrcReg == 0) - return false; - } - - // Make the copy. - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(TargetOpcode::COPY), DestReg).addReg(SrcReg); - - // Add register to return instruction. - RetRegs.push_back(VA.getLocReg()); - } - - MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(ARM64::RET_ReallyLR)); - for (unsigned i = 0, e = RetRegs.size(); i != e; ++i) - MIB.addReg(RetRegs[i], RegState::Implicit); - return true; -} - -bool ARM64FastISel::SelectTrunc(const Instruction *I) { - Type *DestTy = I->getType(); - Value *Op = I->getOperand(0); - Type *SrcTy = Op->getType(); - - EVT SrcEVT = TLI.getValueType(SrcTy, true); - EVT DestEVT = TLI.getValueType(DestTy, true); - if (!SrcEVT.isSimple()) - return false; - if (!DestEVT.isSimple()) - return false; - - MVT SrcVT = SrcEVT.getSimpleVT(); - MVT DestVT = DestEVT.getSimpleVT(); - - if (SrcVT != MVT::i64 && SrcVT != MVT::i32 && SrcVT != MVT::i16 && - SrcVT != MVT::i8) - return false; - if (DestVT != MVT::i32 && DestVT != MVT::i16 && DestVT != MVT::i8 && - DestVT != MVT::i1) - return false; - - unsigned SrcReg = getRegForValue(Op); - if (!SrcReg) - return false; - - // If we're truncating from i64 to a smaller non-legal type then generate an - // AND. Otherwise, we know the high bits are undefined and a truncate doesn't - // generate any code. - if (SrcVT == MVT::i64) { - uint64_t Mask = 0; - switch (DestVT.SimpleTy) { - default: - // Trunc i64 to i32 is handled by the target-independent fast-isel. - return false; - case MVT::i1: - Mask = 0x1; - break; - case MVT::i8: - Mask = 0xff; - break; - case MVT::i16: - Mask = 0xffff; - break; - } - // Issue an extract_subreg to get the lower 32-bits. - unsigned Reg32 = FastEmitInst_extractsubreg(MVT::i32, SrcReg, /*Kill=*/true, - ARM64::sub_32); - MRI.constrainRegClass(Reg32, &ARM64::GPR32RegClass); - // Create the AND instruction which performs the actual truncation. - unsigned ANDReg = createResultReg(&ARM64::GPR32spRegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::ANDWri), - ANDReg) - .addReg(Reg32) - .addImm(ARM64_AM::encodeLogicalImmediate(Mask, 32)); - SrcReg = ANDReg; - } - - UpdateValueMap(I, SrcReg); - return true; -} - -unsigned ARM64FastISel::Emiti1Ext(unsigned SrcReg, MVT DestVT, bool isZExt) { - assert((DestVT == MVT::i8 || DestVT == MVT::i16 || DestVT == MVT::i32 || - DestVT == MVT::i64) && - "Unexpected value type."); - // Handle i8 and i16 as i32. - if (DestVT == MVT::i8 || DestVT == MVT::i16) - DestVT = MVT::i32; - - if (isZExt) { - MRI.constrainRegClass(SrcReg, &ARM64::GPR32RegClass); - unsigned ResultReg = createResultReg(&ARM64::GPR32spRegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::ANDWri), - ResultReg) - .addReg(SrcReg) - .addImm(ARM64_AM::encodeLogicalImmediate(1, 32)); - - if (DestVT == MVT::i64) { - // We're ZExt i1 to i64. The ANDWri Wd, Ws, #1 implicitly clears the - // upper 32 bits. Emit a SUBREG_TO_REG to extend from Wd to Xd. 
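As the comment above says, the AND on the W register already produces the zero-extended 64-bit value, because any write to a W register clears bits 63:32 of the corresponding X register; SUBREG_TO_REG only changes the register class. A small standalone illustration of that architectural rule (plain C++, illustrative names):

#include <cstdint>

// AArch64 rule: writing Wn zeroes the upper 32 bits of Xn.
static uint64_t writeWRegister(uint32_t WValue) {
  return static_cast<uint64_t>(WValue);
}

static uint64_t zeroExtendI1ToI64(uint32_t Source) {
  uint32_t Wd = Source & 0x1u;    // AND Wd, Ws, #1
  return writeWRegister(Wd);      // SUBREG_TO_REG: same bits, wider class
}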
- unsigned Reg64 = MRI.createVirtualRegister(&ARM64::GPR64RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(ARM64::SUBREG_TO_REG), Reg64) - .addImm(0) - .addReg(ResultReg) - .addImm(ARM64::sub_32); - ResultReg = Reg64; - } - return ResultReg; - } else { - if (DestVT == MVT::i64) { - // FIXME: We're SExt i1 to i64. - return 0; - } - unsigned ResultReg = createResultReg(&ARM64::GPR32RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::SBFMWri), - ResultReg) - .addReg(SrcReg) - .addImm(0) - .addImm(0); - return ResultReg; - } -} - -unsigned ARM64FastISel::EmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, - bool isZExt) { - assert(DestVT != MVT::i1 && "ZeroExt/SignExt an i1?"); - unsigned Opc; - unsigned Imm = 0; - - switch (SrcVT.SimpleTy) { - default: - return 0; - case MVT::i1: - return Emiti1Ext(SrcReg, DestVT, isZExt); - case MVT::i8: - if (DestVT == MVT::i64) - Opc = isZExt ? ARM64::UBFMXri : ARM64::SBFMXri; - else - Opc = isZExt ? ARM64::UBFMWri : ARM64::SBFMWri; - Imm = 7; - break; - case MVT::i16: - if (DestVT == MVT::i64) - Opc = isZExt ? ARM64::UBFMXri : ARM64::SBFMXri; - else - Opc = isZExt ? ARM64::UBFMWri : ARM64::SBFMWri; - Imm = 15; - break; - case MVT::i32: - assert(DestVT == MVT::i64 && "IntExt i32 to i32?!?"); - Opc = isZExt ? ARM64::UBFMXri : ARM64::SBFMXri; - Imm = 31; - break; - } - - // Handle i8 and i16 as i32. - if (DestVT == MVT::i8 || DestVT == MVT::i16) - DestVT = MVT::i32; - else if (DestVT == MVT::i64) { - unsigned Src64 = MRI.createVirtualRegister(&ARM64::GPR64RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(ARM64::SUBREG_TO_REG), Src64) - .addImm(0) - .addReg(SrcReg) - .addImm(ARM64::sub_32); - SrcReg = Src64; - } - - unsigned ResultReg = createResultReg(TLI.getRegClassFor(DestVT)); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) - .addReg(SrcReg) - .addImm(0) - .addImm(Imm); - - return ResultReg; -} - -bool ARM64FastISel::SelectIntExt(const Instruction *I) { - // On ARM, in general, integer casts don't involve legal types; this code - // handles promotable integers. The high bits for a type smaller than - // the register size are assumed to be undefined. - Type *DestTy = I->getType(); - Value *Src = I->getOperand(0); - Type *SrcTy = Src->getType(); - - bool isZExt = isa(I); - unsigned SrcReg = getRegForValue(Src); - if (!SrcReg) - return false; - - EVT SrcEVT = TLI.getValueType(SrcTy, true); - EVT DestEVT = TLI.getValueType(DestTy, true); - if (!SrcEVT.isSimple()) - return false; - if (!DestEVT.isSimple()) - return false; - - MVT SrcVT = SrcEVT.getSimpleVT(); - MVT DestVT = DestEVT.getSimpleVT(); - unsigned ResultReg = EmitIntExt(SrcVT, SrcReg, DestVT, isZExt); - if (ResultReg == 0) - return false; - UpdateValueMap(I, ResultReg); - return true; -} - -bool ARM64FastISel::SelectRem(const Instruction *I, unsigned ISDOpcode) { - EVT DestEVT = TLI.getValueType(I->getType(), true); - if (!DestEVT.isSimple()) - return false; - - MVT DestVT = DestEVT.getSimpleVT(); - if (DestVT != MVT::i64 && DestVT != MVT::i32) - return false; - - unsigned DivOpc; - bool is64bit = (DestVT == MVT::i64); - switch (ISDOpcode) { - default: - return false; - case ISD::SREM: - DivOpc = is64bit ? ARM64::SDIVXr : ARM64::SDIVWr; - break; - case ISD::UREM: - DivOpc = is64bit ? ARM64::UDIVXr : ARM64::UDIVWr; - break; - } - unsigned MSubOpc = is64bit ? 
ARM64::MSUBXrrr : ARM64::MSUBWrrr; - unsigned Src0Reg = getRegForValue(I->getOperand(0)); - if (!Src0Reg) - return false; - - unsigned Src1Reg = getRegForValue(I->getOperand(1)); - if (!Src1Reg) - return false; - - unsigned QuotReg = createResultReg(TLI.getRegClassFor(DestVT)); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(DivOpc), QuotReg) - .addReg(Src0Reg) - .addReg(Src1Reg); - // The remainder is computed as numerator - (quotient * denominator) using the - // MSUB instruction. - unsigned ResultReg = createResultReg(TLI.getRegClassFor(DestVT)); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(MSubOpc), ResultReg) - .addReg(QuotReg) - .addReg(Src1Reg) - .addReg(Src0Reg); - UpdateValueMap(I, ResultReg); - return true; -} - -bool ARM64FastISel::SelectMul(const Instruction *I) { - EVT SrcEVT = TLI.getValueType(I->getOperand(0)->getType(), true); - if (!SrcEVT.isSimple()) - return false; - MVT SrcVT = SrcEVT.getSimpleVT(); - - // Must be simple value type. Don't handle vectors. - if (SrcVT != MVT::i64 && SrcVT != MVT::i32 && SrcVT != MVT::i16 && - SrcVT != MVT::i8) - return false; - - unsigned Opc; - unsigned ZReg; - switch (SrcVT.SimpleTy) { - default: - return false; - case MVT::i8: - case MVT::i16: - case MVT::i32: - ZReg = ARM64::WZR; - Opc = ARM64::MADDWrrr; - break; - case MVT::i64: - ZReg = ARM64::XZR; - Opc = ARM64::MADDXrrr; - break; - } - - unsigned Src0Reg = getRegForValue(I->getOperand(0)); - if (!Src0Reg) - return false; - - unsigned Src1Reg = getRegForValue(I->getOperand(1)); - if (!Src1Reg) - return false; - - // Create the base instruction, then add the operands. - unsigned ResultReg = createResultReg(TLI.getRegClassFor(SrcVT)); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) - .addReg(Src0Reg) - .addReg(Src1Reg) - .addReg(ZReg); - UpdateValueMap(I, ResultReg); - return true; -} - -bool ARM64FastISel::TargetSelectInstruction(const Instruction *I) { - switch (I->getOpcode()) { - default: - break; - case Instruction::Load: - return SelectLoad(I); - case Instruction::Store: - return SelectStore(I); - case Instruction::Br: - return SelectBranch(I); - case Instruction::IndirectBr: - return SelectIndirectBr(I); - case Instruction::FCmp: - case Instruction::ICmp: - return SelectCmp(I); - case Instruction::Select: - return SelectSelect(I); - case Instruction::FPExt: - return SelectFPExt(I); - case Instruction::FPTrunc: - return SelectFPTrunc(I); - case Instruction::FPToSI: - return SelectFPToInt(I, /*Signed=*/true); - case Instruction::FPToUI: - return SelectFPToInt(I, /*Signed=*/false); - case Instruction::SIToFP: - return SelectIntToFP(I, /*Signed=*/true); - case Instruction::UIToFP: - return SelectIntToFP(I, /*Signed=*/false); - case Instruction::SRem: - return SelectRem(I, ISD::SREM); - case Instruction::URem: - return SelectRem(I, ISD::UREM); - case Instruction::Call: - if (const IntrinsicInst *II = dyn_cast(I)) - return SelectIntrinsicCall(*II); - return SelectCall(I); - case Instruction::Ret: - return SelectRet(I); - case Instruction::Trunc: - return SelectTrunc(I); - case Instruction::ZExt: - case Instruction::SExt: - return SelectIntExt(I); - case Instruction::Mul: - // FIXME: This really should be handled by the target-independent selector. - return SelectMul(I); - } - return false; - // Silence warnings. 
- (void)&CC_ARM64_DarwinPCS_VarArg; -} - -namespace llvm { -llvm::FastISel *ARM64::createFastISel(FunctionLoweringInfo &funcInfo, - const TargetLibraryInfo *libInfo) { - return new ARM64FastISel(funcInfo, libInfo); -} -} diff --git a/lib/Target/ARM64/ARM64FrameLowering.cpp b/lib/Target/ARM64/ARM64FrameLowering.cpp deleted file mode 100644 index 9c17488ec58..00000000000 --- a/lib/Target/ARM64/ARM64FrameLowering.cpp +++ /dev/null @@ -1,888 +0,0 @@ -//===- ARM64FrameLowering.cpp - ARM64 Frame Lowering -----------*- C++ -*-====// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains the ARM64 implementation of TargetFrameLowering class. -// -//===----------------------------------------------------------------------===// - -#include "ARM64FrameLowering.h" -#include "ARM64InstrInfo.h" -#include "ARM64MachineFunctionInfo.h" -#include "ARM64Subtarget.h" -#include "ARM64TargetMachine.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/Function.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineModuleInfo.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/RegisterScavenging.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/raw_ostream.h" - -using namespace llvm; - -#define DEBUG_TYPE "frame-info" - -static cl::opt EnableRedZone("arm64-redzone", - cl::desc("enable use of redzone on ARM64"), - cl::init(false), cl::Hidden); - -STATISTIC(NumRedZoneFunctions, "Number of functions using red zone"); - -static unsigned estimateStackSize(MachineFunction &MF) { - const MachineFrameInfo *FFI = MF.getFrameInfo(); - int Offset = 0; - for (int i = FFI->getObjectIndexBegin(); i != 0; ++i) { - int FixedOff = -FFI->getObjectOffset(i); - if (FixedOff > Offset) - Offset = FixedOff; - } - for (unsigned i = 0, e = FFI->getObjectIndexEnd(); i != e; ++i) { - if (FFI->isDeadObjectIndex(i)) - continue; - Offset += FFI->getObjectSize(i); - unsigned Align = FFI->getObjectAlignment(i); - // Adjust to alignment boundary - Offset = (Offset + Align - 1) / Align * Align; - } - // This does not include the 16 bytes used for fp and lr. - return (unsigned)Offset; -} - -bool ARM64FrameLowering::canUseRedZone(const MachineFunction &MF) const { - if (!EnableRedZone) - return false; - // Don't use the red zone if the function explicitly asks us not to. - // This is typically used for kernel code. - if (MF.getFunction()->getAttributes().hasAttribute( - AttributeSet::FunctionIndex, Attribute::NoRedZone)) - return false; - - const MachineFrameInfo *MFI = MF.getFrameInfo(); - const ARM64FunctionInfo *AFI = MF.getInfo(); - unsigned NumBytes = AFI->getLocalStackSize(); - - // Note: currently hasFP() is always true for hasCalls(), but that's an - // implementation detail of the current code, not a strict requirement, - // so stay safe here and check both. - if (MFI->hasCalls() || hasFP(MF) || NumBytes > 128) - return false; - return true; -} - -/// hasFP - Return true if the specified function should have a dedicated frame -/// pointer register. 
-bool ARM64FrameLowering::hasFP(const MachineFunction &MF) const { - const MachineFrameInfo *MFI = MF.getFrameInfo(); - -#ifndef NDEBUG - const TargetRegisterInfo *RegInfo = MF.getTarget().getRegisterInfo(); - assert(!RegInfo->needsStackRealignment(MF) && - "No stack realignment on ARM64!"); -#endif - - return (MFI->hasCalls() || MFI->hasVarSizedObjects() || - MFI->isFrameAddressTaken()); -} - -/// hasReservedCallFrame - Under normal circumstances, when a frame pointer is -/// not required, we reserve argument space for call sites in the function -/// immediately on entry to the current function. This eliminates the need for -/// add/sub sp brackets around call sites. Returns true if the call frame is -/// included as part of the stack frame. -bool ARM64FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const { - return !MF.getFrameInfo()->hasVarSizedObjects(); -} - -void ARM64FrameLowering::eliminateCallFramePseudoInstr( - MachineFunction &MF, MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) const { - const ARM64InstrInfo *TII = - static_cast(MF.getTarget().getInstrInfo()); - DebugLoc DL = I->getDebugLoc(); - int Opc = I->getOpcode(); - bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode(); - uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0; - - const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); - if (!TFI->hasReservedCallFrame(MF)) { - unsigned Align = getStackAlignment(); - - int64_t Amount = I->getOperand(0).getImm(); - Amount = RoundUpToAlignment(Amount, Align); - if (!IsDestroy) - Amount = -Amount; - - // N.b. if CalleePopAmount is valid but zero (i.e. callee would pop, but it - // doesn't have to pop anything), then the first operand will be zero too so - // this adjustment is a no-op. - if (CalleePopAmount == 0) { - // FIXME: in-function stack adjustment for calls is limited to 24-bits - // because there's no guaranteed temporary register available. - // - // ADD/SUB (immediate) has only LSL #0 and LSL #12 avaiable. - // 1) For offset <= 12-bit, we use LSL #0 - // 2) For 12-bit <= offset <= 24-bit, we use two instructions. One uses - // LSL #0, and the other uses LSL #12. - // - // Mostly call frames will be allocated at the start of a function so - // this is OK, but it is a limitation that needs dealing with. - assert(Amount > -0xffffff && Amount < 0xffffff && "call frame too large"); - emitFrameOffset(MBB, I, DL, ARM64::SP, ARM64::SP, Amount, TII); - } - } else if (CalleePopAmount != 0) { - // If the calling convention demands that the callee pops arguments from the - // stack, we want to add it back if we have a reserved call frame. - assert(CalleePopAmount < 0xffffff && "call frame too large"); - emitFrameOffset(MBB, I, DL, ARM64::SP, ARM64::SP, -CalleePopAmount, TII); - } - MBB.erase(I); -} - -void -ARM64FrameLowering::emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - unsigned FramePtr) const { - MachineFunction &MF = *MBB.getParent(); - MachineFrameInfo *MFI = MF.getFrameInfo(); - MachineModuleInfo &MMI = MF.getMMI(); - const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo(); - const ARM64InstrInfo *TII = TM.getInstrInfo(); - DebugLoc DL = MBB.findDebugLoc(MBBI); - - // Add callee saved registers to move list. - const std::vector &CSI = MFI->getCalleeSavedInfo(); - if (CSI.empty()) - return; - - const DataLayout *TD = MF.getTarget().getDataLayout(); - bool HasFP = hasFP(MF); - - // Calculate amount of bytes used for return address storing. 
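The 24-bit limit discussed in eliminateCallFramePseudoInstr above comes from splitting the adjustment into at most two ADD/SUB (immediate) instructions, one with LSL #0 and one with LSL #12. A rough standalone sketch of that split (illustrative helper, not the emitFrameOffset implementation):

#include <cstdint>
#include <cassert>

struct SpAdjust {
  uint64_t Lsl12Chunk;  // emitted as ADD/SUB ..., #Lsl12Chunk, lsl #12
  uint64_t Lsl0Chunk;   // emitted as ADD/SUB ..., #Lsl0Chunk
};

// Offsets up to 12 bits need one instruction; up to 24 bits need two.
static SpAdjust splitFrameOffset(uint64_t Bytes) {
  assert(Bytes < (1ULL << 24) && "no scratch register for larger offsets");
  return {Bytes >> 12, Bytes & 0xfffULL};
}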
- int stackGrowth = -TD->getPointerSize(0); - - // Calculate offsets. - int64_t saveAreaOffset = (HasFP ? 2 : 1) * stackGrowth; - unsigned TotalSkipped = 0; - for (const auto &Info : CSI) { - unsigned Reg = Info.getReg(); - int64_t Offset = MFI->getObjectOffset(Info.getFrameIdx()) - - getOffsetOfLocalArea() + saveAreaOffset; - - // Don't output a new CFI directive if we're re-saving the frame pointer or - // link register. This happens when the PrologEpilogInserter has inserted an - // extra "STP" of the frame pointer and link register -- the "emitPrologue" - // method automatically generates the directives when frame pointers are - // used. If we generate CFI directives for the extra "STP"s, the linker will - // lose track of the correct values for the frame pointer and link register. - if (HasFP && (FramePtr == Reg || Reg == ARM64::LR)) { - TotalSkipped += stackGrowth; - continue; - } - - unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true); - unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset( - nullptr, DwarfReg, Offset - TotalSkipped)); - BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); - } -} - -void ARM64FrameLowering::emitPrologue(MachineFunction &MF) const { - MachineBasicBlock &MBB = MF.front(); // Prologue goes in entry BB. - MachineBasicBlock::iterator MBBI = MBB.begin(); - const MachineFrameInfo *MFI = MF.getFrameInfo(); - const Function *Fn = MF.getFunction(); - const ARM64RegisterInfo *RegInfo = TM.getRegisterInfo(); - const ARM64InstrInfo *TII = TM.getInstrInfo(); - MachineModuleInfo &MMI = MF.getMMI(); - ARM64FunctionInfo *AFI = MF.getInfo(); - bool needsFrameMoves = MMI.hasDebugInfo() || Fn->needsUnwindTableEntry(); - bool HasFP = hasFP(MF); - DebugLoc DL = MBB.findDebugLoc(MBBI); - - int NumBytes = (int)MFI->getStackSize(); - if (!AFI->hasStackFrame()) { - assert(!HasFP && "unexpected function without stack frame but with FP"); - - // All of the stack allocation is for locals. - AFI->setLocalStackSize(NumBytes); - - // Label used to tie together the PROLOG_LABEL and the MachineMoves. - MCSymbol *FrameLabel = MMI.getContext().CreateTempSymbol(); - - // REDZONE: If the stack size is less than 128 bytes, we don't need - // to actually allocate. - if (NumBytes && !canUseRedZone(MF)) { - emitFrameOffset(MBB, MBBI, DL, ARM64::SP, ARM64::SP, -NumBytes, TII, - MachineInstr::FrameSetup); - - // Encode the stack size of the leaf function. - unsigned CFIIndex = MMI.addFrameInst( - MCCFIInstruction::createDefCfaOffset(FrameLabel, -NumBytes)); - BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); - } else if (NumBytes) { - ++NumRedZoneFunctions; - } - - return; - } - - // Only set up FP if we actually need to. - int FPOffset = 0; - if (HasFP) { - // First instruction must a) allocate the stack and b) have an immediate - // that is a multiple of -2. - assert((MBBI->getOpcode() == ARM64::STPXpre || - MBBI->getOpcode() == ARM64::STPDpre) && - MBBI->getOperand(3).getReg() == ARM64::SP && - MBBI->getOperand(4).getImm() < 0 && - (MBBI->getOperand(4).getImm() & 1) == 0); - - // Frame pointer is fp = sp - 16. Since the STPXpre subtracts the space - // required for the callee saved register area we get the frame pointer - // by addding that offset - 16 = -getImm()*8 - 2*8 = -(getImm() + 2) * 8. - FPOffset = -(MBBI->getOperand(4).getImm() + 2) * 8; - assert(FPOffset >= 0 && "Bad Framepointer Offset"); - } - - // Move past the saves of the callee-saved registers. 
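The FPOffset formula in emitPrologue above is easiest to see with concrete numbers; the helper below is purely a worked example under the prologue's own invariants (pre-increment STP with a scaled immediate, fp/lr stored at the top of the callee-save area):

#include <cassert>

// "stp x22, x21, [sp, #-48]!" carries getImm() == -6 (scaled by 8 bytes).
// FPOffset = -(getImm() + 2) * 8 skips every callee save except the fp/lr
// pair, so fp ends up addressing the saved frame record.
static int framePointerOffset(int StpImm) {
  assert(StpImm < 0 && (StpImm & 1) == 0 && "prologue invariant");
  return -(StpImm + 2) * 8;
}

// framePointerOffset(-6) == 32: 48 bytes of callee saves, fp = sp + 32.
// framePointerOffset(-2) == 0:  only fp/lr are saved, so fp = sp.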
- while (MBBI->getOpcode() == ARM64::STPXi || - MBBI->getOpcode() == ARM64::STPDi || - MBBI->getOpcode() == ARM64::STPXpre || - MBBI->getOpcode() == ARM64::STPDpre) { - ++MBBI; - NumBytes -= 16; - } - assert(NumBytes >= 0 && "Negative stack allocation size!?"); - if (HasFP) { - // Issue sub fp, sp, FPOffset or - // mov fp,sp when FPOffset is zero. - // Note: All stores of callee-saved registers are marked as "FrameSetup". - // This code marks the instruction(s) that set the FP also. - emitFrameOffset(MBB, MBBI, DL, ARM64::FP, ARM64::SP, FPOffset, TII, - MachineInstr::FrameSetup); - } - - // All of the remaining stack allocations are for locals. - AFI->setLocalStackSize(NumBytes); - - // Allocate space for the rest of the frame. - if (NumBytes) { - // If we're a leaf function, try using the red zone. - if (!canUseRedZone(MF)) - emitFrameOffset(MBB, MBBI, DL, ARM64::SP, ARM64::SP, -NumBytes, TII, - MachineInstr::FrameSetup); - } - - // If we need a base pointer, set it up here. It's whatever the value of the - // stack pointer is at this point. Any variable size objects will be allocated - // after this, so we can still use the base pointer to reference locals. - // - // FIXME: Clarify FrameSetup flags here. - // Note: Use emitFrameOffset() like above for FP if the FrameSetup flag is - // needed. - // - if (RegInfo->hasBasePointer(MF)) - TII->copyPhysReg(MBB, MBBI, DL, ARM64::X19, ARM64::SP, false); - - if (needsFrameMoves) { - const DataLayout *TD = MF.getTarget().getDataLayout(); - const int StackGrowth = -TD->getPointerSize(0); - unsigned FramePtr = RegInfo->getFrameRegister(MF); - - // An example of the prologue: - // - // .globl __foo - // .align 2 - // __foo: - // Ltmp0: - // .cfi_startproc - // .cfi_personality 155, ___gxx_personality_v0 - // Leh_func_begin: - // .cfi_lsda 16, Lexception33 - // - // stp xa,bx, [sp, -#offset]! - // ... - // stp x28, x27, [sp, #offset-32] - // stp fp, lr, [sp, #offset-16] - // add fp, sp, #offset - 16 - // sub sp, sp, #1360 - // - // The Stack: - // +-------------------------------------------+ - // 10000 | ........ | ........ | ........ | ........ | - // 10004 | ........ | ........ | ........ | ........ | - // +-------------------------------------------+ - // 10008 | ........ | ........ | ........ | ........ | - // 1000c | ........ | ........ | ........ | ........ | - // +===========================================+ - // 10010 | X28 Register | - // 10014 | X28 Register | - // +-------------------------------------------+ - // 10018 | X27 Register | - // 1001c | X27 Register | - // +===========================================+ - // 10020 | Frame Pointer | - // 10024 | Frame Pointer | - // +-------------------------------------------+ - // 10028 | Link Register | - // 1002c | Link Register | - // +===========================================+ - // 10030 | ........ | ........ | ........ | ........ | - // 10034 | ........ | ........ | ........ | ........ | - // +-------------------------------------------+ - // 10038 | ........ | ........ | ........ | ........ | - // 1003c | ........ | ........ | ........ | ........ | - // +-------------------------------------------+ - // - // [sp] = 10030 :: >>initial value<< - // sp = 10020 :: stp fp, lr, [sp, #-16]! - // fp = sp == 10020 :: mov fp, sp - // [sp] == 10020 :: stp x28, x27, [sp, #-16]! - // sp == 10010 :: >>final value<< - // - // The frame pointer (w29) points to address 10020. 
If we use an offset of - // '16' from 'w29', we get the CFI offsets of -8 for w30, -16 for w29, -24 - // for w27, and -32 for w28: - // - // Ltmp1: - // .cfi_def_cfa w29, 16 - // Ltmp2: - // .cfi_offset w30, -8 - // Ltmp3: - // .cfi_offset w29, -16 - // Ltmp4: - // .cfi_offset w27, -24 - // Ltmp5: - // .cfi_offset w28, -32 - - if (HasFP) { - // Define the current CFA rule to use the provided FP. - unsigned Reg = RegInfo->getDwarfRegNum(FramePtr, true); - unsigned CFIIndex = MMI.addFrameInst( - MCCFIInstruction::createDefCfa(nullptr, Reg, 2 * StackGrowth)); - BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); - - // Record the location of the stored LR - unsigned LR = RegInfo->getDwarfRegNum(ARM64::LR, true); - CFIIndex = MMI.addFrameInst( - MCCFIInstruction::createOffset(nullptr, LR, StackGrowth)); - BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); - - // Record the location of the stored FP - CFIIndex = MMI.addFrameInst( - MCCFIInstruction::createOffset(nullptr, Reg, 2 * StackGrowth)); - BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); - } else { - // Encode the stack size of the leaf function. - unsigned CFIIndex = MMI.addFrameInst( - MCCFIInstruction::createDefCfaOffset(nullptr, -MFI->getStackSize())); - BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); - } - - // Now emit the moves for whatever callee saved regs we have. - emitCalleeSavedFrameMoves(MBB, MBBI, FramePtr); - } -} - -static bool isCalleeSavedRegister(unsigned Reg, const MCPhysReg *CSRegs) { - for (unsigned i = 0; CSRegs[i]; ++i) - if (Reg == CSRegs[i]) - return true; - return false; -} - -static bool isCSRestore(MachineInstr *MI, const MCPhysReg *CSRegs) { - unsigned RtIdx = 0; - if (MI->getOpcode() == ARM64::LDPXpost || MI->getOpcode() == ARM64::LDPDpost) - RtIdx = 1; - - if (MI->getOpcode() == ARM64::LDPXpost || - MI->getOpcode() == ARM64::LDPDpost || MI->getOpcode() == ARM64::LDPXi || - MI->getOpcode() == ARM64::LDPDi) { - if (!isCalleeSavedRegister(MI->getOperand(RtIdx).getReg(), CSRegs) || - !isCalleeSavedRegister(MI->getOperand(RtIdx + 1).getReg(), CSRegs) || - MI->getOperand(RtIdx + 2).getReg() != ARM64::SP) - return false; - return true; - } - - return false; -} - -void ARM64FrameLowering::emitEpilogue(MachineFunction &MF, - MachineBasicBlock &MBB) const { - MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); - assert(MBBI->isReturn() && "Can only insert epilog into returning blocks"); - MachineFrameInfo *MFI = MF.getFrameInfo(); - const ARM64InstrInfo *TII = - static_cast(MF.getTarget().getInstrInfo()); - const ARM64RegisterInfo *RegInfo = - static_cast(MF.getTarget().getRegisterInfo()); - DebugLoc DL = MBBI->getDebugLoc(); - unsigned RetOpcode = MBBI->getOpcode(); - - int NumBytes = MFI->getStackSize(); - const ARM64FunctionInfo *AFI = MF.getInfo(); - - // Initial and residual are named for consitency with the prologue. Note that - // in the epilogue, the residual adjustment is executed first. - uint64_t ArgumentPopSize = 0; - if (RetOpcode == ARM64::TCRETURNdi || RetOpcode == ARM64::TCRETURNri) { - MachineOperand &StackAdjust = MBBI->getOperand(1); - - // For a tail-call in a callee-pops-arguments environment, some or all of - // the stack may actually be in use for the call's arguments, this is - // calculated during LowerCall and consumed here... - ArgumentPopSize = StackAdjust.getImm(); - } else { - // ... 
otherwise the amount to pop is *all* of the argument space, - // conveniently stored in the MachineFunctionInfo by - // LowerFormalArguments. This will, of course, be zero for the C calling - // convention. - ArgumentPopSize = AFI->getArgumentStackToRestore(); - } - - // The stack frame should be like below, - // - // ---------------------- --- - // | | | - // | BytesInStackArgArea| CalleeArgStackSize - // | (NumReusableBytes) | (of tail call) - // | | --- - // | | | - // ---------------------| --- | - // | | | | - // | CalleeSavedReg | | | - // | (NumRestores * 16) | | | - // | | | | - // ---------------------| | NumBytes - // | | StackSize (StackAdjustUp) - // | LocalStackSize | | | - // | (covering callee | | | - // | args) | | | - // | | | | - // ---------------------- --- --- - // - // So NumBytes = StackSize + BytesInStackArgArea - CalleeArgStackSize - // = StackSize + ArgumentPopSize - // - // ARM64TargetLowering::LowerCall figures out ArgumentPopSize and keeps - // it as the 2nd argument of ARM64ISD::TC_RETURN. - NumBytes += ArgumentPopSize; - - unsigned NumRestores = 0; - // Move past the restores of the callee-saved registers. - MachineBasicBlock::iterator LastPopI = MBBI; - const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF); - if (LastPopI != MBB.begin()) { - do { - ++NumRestores; - --LastPopI; - } while (LastPopI != MBB.begin() && isCSRestore(LastPopI, CSRegs)); - if (!isCSRestore(LastPopI, CSRegs)) { - ++LastPopI; - --NumRestores; - } - } - NumBytes -= NumRestores * 16; - assert(NumBytes >= 0 && "Negative stack allocation size!?"); - - if (!hasFP(MF)) { - // If this was a redzone leaf function, we don't need to restore the - // stack pointer. - if (!canUseRedZone(MF)) - emitFrameOffset(MBB, LastPopI, DL, ARM64::SP, ARM64::SP, NumBytes, TII); - return; - } - - // Restore the original stack pointer. - // FIXME: Rather than doing the math here, we should instead just use - // non-post-indexed loads for the restores if we aren't actually going to - // be able to save any instructions. - if (NumBytes || MFI->hasVarSizedObjects()) - emitFrameOffset(MBB, LastPopI, DL, ARM64::SP, ARM64::FP, - -(NumRestores - 1) * 16, TII, MachineInstr::NoFlags); -} - -/// getFrameIndexOffset - Returns the displacement from the frame register to -/// the stack frame of the specified index. -int ARM64FrameLowering::getFrameIndexOffset(const MachineFunction &MF, - int FI) const { - unsigned FrameReg; - return getFrameIndexReference(MF, FI, FrameReg); -} - -/// getFrameIndexReference - Provide a base+offset reference to an FI slot for -/// debug info. It's the same as what we use for resolving the code-gen -/// references for now. FIXME: This can go wrong when references are -/// SP-relative and simple call frames aren't used. -int ARM64FrameLowering::getFrameIndexReference(const MachineFunction &MF, - int FI, - unsigned &FrameReg) const { - return resolveFrameIndexReference(MF, FI, FrameReg); -} - -int ARM64FrameLowering::resolveFrameIndexReference(const MachineFunction &MF, - int FI, unsigned &FrameReg, - bool PreferFP) const { - const MachineFrameInfo *MFI = MF.getFrameInfo(); - const ARM64RegisterInfo *RegInfo = - static_cast(MF.getTarget().getRegisterInfo()); - const ARM64FunctionInfo *AFI = MF.getInfo(); - int FPOffset = MFI->getObjectOffset(FI) + 16; - int Offset = MFI->getObjectOffset(FI) + MFI->getStackSize(); - bool isFixed = MFI->isFixedObjectIndex(FI); - - // Use frame pointer to reference fixed objects. 
Use it for locals if - // there are VLAs (and thus the SP isn't reliable as a base). - // Make sure useFPForScavengingIndex() does the right thing for the emergency - // spill slot. - bool UseFP = false; - if (AFI->hasStackFrame()) { - // Note: Keeping the following as multiple 'if' statements rather than - // merging to a single expression for readability. - // - // Argument access should always use the FP. - if (isFixed) { - UseFP = hasFP(MF); - } else if (hasFP(MF) && !RegInfo->hasBasePointer(MF)) { - // Use SP or FP, whichever gives us the best chance of the offset - // being in range for direct access. If the FPOffset is positive, - // that'll always be best, as the SP will be even further away. - // If the FPOffset is negative, we have to keep in mind that the - // available offset range for negative offsets is smaller than for - // positive ones. If we have variable sized objects, we're stuck with - // using the FP regardless, though, as the SP offset is unknown - // and we don't have a base pointer available. If an offset is - // available via the FP and the SP, use whichever is closest. - if (PreferFP || MFI->hasVarSizedObjects() || FPOffset >= 0 || - (FPOffset >= -256 && Offset > -FPOffset)) - UseFP = true; - } - } - - if (UseFP) { - FrameReg = RegInfo->getFrameRegister(MF); - return FPOffset; - } - - // Use the base pointer if we have one. - if (RegInfo->hasBasePointer(MF)) - FrameReg = RegInfo->getBaseRegister(); - else { - FrameReg = ARM64::SP; - // If we're using the red zone for this function, the SP won't actually - // be adjusted, so the offsets will be negative. They're also all - // within range of the signed 9-bit immediate instructions. - if (canUseRedZone(MF)) - Offset -= AFI->getLocalStackSize(); - } - - return Offset; -} - -static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) { - if (Reg != ARM64::LR) - return getKillRegState(true); - - // LR maybe referred to later by an @llvm.returnaddress intrinsic. - bool LRLiveIn = MF.getRegInfo().isLiveIn(ARM64::LR); - bool LRKill = !(LRLiveIn && MF.getFrameInfo()->isReturnAddressTaken()); - return getKillRegState(LRKill); -} - -bool ARM64FrameLowering::spillCalleeSavedRegisters( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - const std::vector &CSI, - const TargetRegisterInfo *TRI) const { - MachineFunction &MF = *MBB.getParent(); - const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); - unsigned Count = CSI.size(); - DebugLoc DL; - assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!"); - - if (MI != MBB.end()) - DL = MI->getDebugLoc(); - - for (unsigned i = 0; i < Count; i += 2) { - unsigned idx = Count - i - 2; - unsigned Reg1 = CSI[idx].getReg(); - unsigned Reg2 = CSI[idx + 1].getReg(); - // GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI - // list to come in sorted by frame index so that we can issue the store - // pair instructions directly. Assert if we see anything otherwise. - // - // The order of the registers in the list is controlled by - // getCalleeSavedRegs(), so they will always be in-order, as well. - assert(CSI[idx].getFrameIdx() + 1 == CSI[idx + 1].getFrameIdx() && - "Out of order callee saved regs!"); - unsigned StrOpc; - assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!"); - assert((i & 1) == 0 && "Odd index for callee-saved reg spill!"); - // Issue sequence of non-sp increment and pi sp spills for cs regs. The - // first spill is a pre-increment that allocates the stack. 
- // For example: - // stp x22, x21, [sp, #-48]! // addImm(-6) - // stp x20, x19, [sp, #16] // addImm(+2) - // stp fp, lr, [sp, #32] // addImm(+4) - // Rationale: This sequence saves uop updates compared to a sequence of - // pre-increment spills like stp xi,xj,[sp,#-16]! - // Note: Similar rational and sequence for restores in epilog. - if (ARM64::GPR64RegClass.contains(Reg1)) { - assert(ARM64::GPR64RegClass.contains(Reg2) && - "Expected GPR64 callee-saved register pair!"); - // For first spill use pre-increment store. - if (i == 0) - StrOpc = ARM64::STPXpre; - else - StrOpc = ARM64::STPXi; - } else if (ARM64::FPR64RegClass.contains(Reg1)) { - assert(ARM64::FPR64RegClass.contains(Reg2) && - "Expected FPR64 callee-saved register pair!"); - // For first spill use pre-increment store. - if (i == 0) - StrOpc = ARM64::STPDpre; - else - StrOpc = ARM64::STPDi; - } else - llvm_unreachable("Unexpected callee saved register!"); - DEBUG(dbgs() << "CSR spill: (" << TRI->getName(Reg1) << ", " - << TRI->getName(Reg2) << ") -> fi#(" << CSI[idx].getFrameIdx() - << ", " << CSI[idx + 1].getFrameIdx() << ")\n"); - // Compute offset: i = 0 => offset = -Count; - // i = 2 => offset = -(Count - 2) + Count = 2 = i; etc. - const int Offset = (i == 0) ? -Count : i; - assert((Offset >= -64 && Offset <= 63) && - "Offset out of bounds for STP immediate"); - MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc)); - if (StrOpc == ARM64::STPDpre || StrOpc == ARM64::STPXpre) - MIB.addReg(ARM64::SP, RegState::Define); - - MIB.addReg(Reg2, getPrologueDeath(MF, Reg2)) - .addReg(Reg1, getPrologueDeath(MF, Reg1)) - .addReg(ARM64::SP) - .addImm(Offset) // [sp, #offset * 8], where factor * 8 is implicit - .setMIFlag(MachineInstr::FrameSetup); - } - return true; -} - -bool ARM64FrameLowering::restoreCalleeSavedRegisters( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - const std::vector &CSI, - const TargetRegisterInfo *TRI) const { - MachineFunction &MF = *MBB.getParent(); - const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); - unsigned Count = CSI.size(); - DebugLoc DL; - assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!"); - - if (MI != MBB.end()) - DL = MI->getDebugLoc(); - - for (unsigned i = 0; i < Count; i += 2) { - unsigned Reg1 = CSI[i].getReg(); - unsigned Reg2 = CSI[i + 1].getReg(); - // GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI - // list to come in sorted by frame index so that we can issue the store - // pair instructions directly. Assert if we see anything otherwise. - assert(CSI[i].getFrameIdx() + 1 == CSI[i + 1].getFrameIdx() && - "Out of order callee saved regs!"); - // Issue sequence of non-sp increment and sp-pi restores for cs regs. 
Only - // the last load is sp-pi post-increment and de-allocates the stack: - // For example: - // ldp fp, lr, [sp, #32] // addImm(+4) - // ldp x20, x19, [sp, #16] // addImm(+2) - // ldp x22, x21, [sp], #48 // addImm(+6) - // Note: see comment in spillCalleeSavedRegisters() - unsigned LdrOpc; - - assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!"); - assert((i & 1) == 0 && "Odd index for callee-saved reg spill!"); - if (ARM64::GPR64RegClass.contains(Reg1)) { - assert(ARM64::GPR64RegClass.contains(Reg2) && - "Expected GPR64 callee-saved register pair!"); - if (i == Count - 2) - LdrOpc = ARM64::LDPXpost; - else - LdrOpc = ARM64::LDPXi; - } else if (ARM64::FPR64RegClass.contains(Reg1)) { - assert(ARM64::FPR64RegClass.contains(Reg2) && - "Expected FPR64 callee-saved register pair!"); - if (i == Count - 2) - LdrOpc = ARM64::LDPDpost; - else - LdrOpc = ARM64::LDPDi; - } else - llvm_unreachable("Unexpected callee saved register!"); - DEBUG(dbgs() << "CSR restore: (" << TRI->getName(Reg1) << ", " - << TRI->getName(Reg2) << ") -> fi#(" << CSI[i].getFrameIdx() - << ", " << CSI[i + 1].getFrameIdx() << ")\n"); - - // Compute offset: i = 0 => offset = Count - 2; i = 2 => offset = Count - 4; - // etc. - const int Offset = (i == Count - 2) ? Count : Count - i - 2; - assert((Offset >= -64 && Offset <= 63) && - "Offset out of bounds for LDP immediate"); - MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(LdrOpc)); - if (LdrOpc == ARM64::LDPXpost || LdrOpc == ARM64::LDPDpost) - MIB.addReg(ARM64::SP, RegState::Define); - - MIB.addReg(Reg2, getDefRegState(true)) - .addReg(Reg1, getDefRegState(true)) - .addReg(ARM64::SP) - .addImm(Offset); // [sp], #offset * 8 or [sp, #offset * 8] - // where the factor * 8 is implicit - } - return true; -} - -void ARM64FrameLowering::processFunctionBeforeCalleeSavedScan( - MachineFunction &MF, RegScavenger *RS) const { - const ARM64RegisterInfo *RegInfo = - static_cast(MF.getTarget().getRegisterInfo()); - ARM64FunctionInfo *AFI = MF.getInfo(); - MachineRegisterInfo *MRI = &MF.getRegInfo(); - SmallVector UnspilledCSGPRs; - SmallVector UnspilledCSFPRs; - - // The frame record needs to be created by saving the appropriate registers - if (hasFP(MF)) { - MRI->setPhysRegUsed(ARM64::FP); - MRI->setPhysRegUsed(ARM64::LR); - } - - // Spill the BasePtr if it's used. Do this first thing so that the - // getCalleeSavedRegs() below will get the right answer. - if (RegInfo->hasBasePointer(MF)) - MRI->setPhysRegUsed(RegInfo->getBaseRegister()); - - // If any callee-saved registers are used, the frame cannot be eliminated. - unsigned NumGPRSpilled = 0; - unsigned NumFPRSpilled = 0; - bool ExtraCSSpill = false; - bool CanEliminateFrame = true; - DEBUG(dbgs() << "*** processFunctionBeforeCalleeSavedScan\nUsed CSRs:"); - const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF); - - // Check pairs of consecutive callee-saved registers. - for (unsigned i = 0; CSRegs[i]; i += 2) { - assert(CSRegs[i + 1] && "Odd number of callee-saved registers!"); - - const unsigned OddReg = CSRegs[i]; - const unsigned EvenReg = CSRegs[i + 1]; - assert((ARM64::GPR64RegClass.contains(OddReg) && - ARM64::GPR64RegClass.contains(EvenReg)) ^ - (ARM64::FPR64RegClass.contains(OddReg) && - ARM64::FPR64RegClass.contains(EvenReg)) && - "Register class mismatch!"); - - const bool OddRegUsed = MRI->isPhysRegUsed(OddReg); - const bool EvenRegUsed = MRI->isPhysRegUsed(EvenReg); - - // Early exit if none of the registers in the register pair is actually - // used. 
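The offset arithmetic in spillCalleeSavedRegisters and restoreCalleeSavedRegisters above is compact enough to misread; the following standalone helpers only restate it so the worked numbers can be checked (illustrative, not backend code):

#include <cassert>

// Scaled (x8) STP/LDP immediates for callee-save index I out of Count regs.
static int spillOffset(unsigned I, unsigned Count) {
  assert((Count & 1) == 0 && (I & 1) == 0 && I < Count);
  return (I == 0) ? -static_cast<int>(Count) : static_cast<int>(I);
}

static int restoreOffset(unsigned I, unsigned Count) {
  assert((Count & 1) == 0 && (I & 1) == 0 && I < Count);
  return (I == Count - 2) ? static_cast<int>(Count)
                          : static_cast<int>(Count - I - 2);
}

// With Count == 6 (three pairs) this reproduces the sequences quoted above:
//   spills:   -6, 2, 4  ->  [sp, #-48]!, [sp, #16], [sp, #32]
//   restores:  4, 2, 6  ->  [sp, #32],   [sp, #16], [sp], #48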
- if (!OddRegUsed && !EvenRegUsed) { - if (ARM64::GPR64RegClass.contains(OddReg)) { - UnspilledCSGPRs.push_back(OddReg); - UnspilledCSGPRs.push_back(EvenReg); - } else { - UnspilledCSFPRs.push_back(OddReg); - UnspilledCSFPRs.push_back(EvenReg); - } - continue; - } - - unsigned Reg = ARM64::NoRegister; - // If only one of the registers of the register pair is used, make sure to - // mark the other one as used as well. - if (OddRegUsed ^ EvenRegUsed) { - // Find out which register is the additional spill. - Reg = OddRegUsed ? EvenReg : OddReg; - MRI->setPhysRegUsed(Reg); - } - - DEBUG(dbgs() << ' ' << PrintReg(OddReg, RegInfo)); - DEBUG(dbgs() << ' ' << PrintReg(EvenReg, RegInfo)); - - assert(((OddReg == ARM64::LR && EvenReg == ARM64::FP) || - (RegInfo->getEncodingValue(OddReg) + 1 == - RegInfo->getEncodingValue(EvenReg))) && - "Register pair of non-adjacent registers!"); - if (ARM64::GPR64RegClass.contains(OddReg)) { - NumGPRSpilled += 2; - // If it's not a reserved register, we can use it in lieu of an - // emergency spill slot for the register scavenger. - // FIXME: It would be better to instead keep looking and choose another - // unspilled register that isn't reserved, if there is one. - if (Reg != ARM64::NoRegister && !RegInfo->isReservedReg(MF, Reg)) - ExtraCSSpill = true; - } else - NumFPRSpilled += 2; - - CanEliminateFrame = false; - } - - // FIXME: Set BigStack if any stack slot references may be out of range. - // For now, just conservatively guestimate based on unscaled indexing - // range. We'll end up allocating an unnecessary spill slot a lot, but - // realistically that's not a big deal at this stage of the game. - // The CSR spill slots have not been allocated yet, so estimateStackSize - // won't include them. - MachineFrameInfo *MFI = MF.getFrameInfo(); - unsigned CFSize = estimateStackSize(MF) + 8 * (NumGPRSpilled + NumFPRSpilled); - DEBUG(dbgs() << "Estimated stack frame size: " << CFSize << " bytes.\n"); - bool BigStack = (CFSize >= 256); - if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF)) - AFI->setHasStackFrame(true); - - // Estimate if we might need to scavenge a register at some point in order - // to materialize a stack offset. If so, either spill one additional - // callee-saved register or reserve a special spill slot to facilitate - // register scavenging. If we already spilled an extra callee-saved register - // above to keep the number of spills even, we don't need to do anything else - // here. - if (BigStack && !ExtraCSSpill) { - - // If we're adding a register to spill here, we have to add two of them - // to keep the number of regs to spill even. - assert(((UnspilledCSGPRs.size() & 1) == 0) && "Odd number of registers!"); - unsigned Count = 0; - while (!UnspilledCSGPRs.empty() && Count < 2) { - unsigned Reg = UnspilledCSGPRs.back(); - UnspilledCSGPRs.pop_back(); - DEBUG(dbgs() << "Spilling " << PrintReg(Reg, RegInfo) - << " to get a scratch register.\n"); - MRI->setPhysRegUsed(Reg); - ExtraCSSpill = true; - ++Count; - } - - // If we didn't find an extra callee-saved register to spill, create - // an emergency spill slot. 
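The 256-byte threshold in the BigStack estimate above corresponds to the reach of the unscaled 9-bit signed addressing mode; a rough standalone sketch of the estimate, with hypothetical locals size and spill counts:

#include <cstdio>

int main() {
  // Hypothetical function: 240 bytes of locals, 4 GPR + 2 FPR callee saves.
  const unsigned EstimatedLocals = 240;
  const unsigned NumGPRSpilled = 4, NumFPRSpilled = 2;
  const unsigned CFSize = EstimatedLocals + 8 * (NumGPRSpilled + NumFPRSpilled);
  const bool BigStack = CFSize >= 256;   // beyond the +/-256 unscaled range
  std::printf("estimated frame = %u bytes, BigStack = %d\n", CFSize, BigStack);
  // Here CFSize is 288, so BigStack is set and the code above either spills an
  // extra callee-saved pair or reserves an emergency scavenging slot.
  return 0;
}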
- if (!ExtraCSSpill) { - const TargetRegisterClass *RC = &ARM64::GPR64RegClass; - int FI = MFI->CreateStackObject(RC->getSize(), RC->getAlignment(), false); - RS->addScavengingFrameIndex(FI); - DEBUG(dbgs() << "No available CS registers, allocated fi#" << FI - << " as the emergency spill slot.\n"); - } - } -} diff --git a/lib/Target/ARM64/ARM64FrameLowering.h b/lib/Target/ARM64/ARM64FrameLowering.h deleted file mode 100644 index 1991a0a18dd..00000000000 --- a/lib/Target/ARM64/ARM64FrameLowering.h +++ /dev/null @@ -1,75 +0,0 @@ -//===-- ARM64FrameLowering.h - TargetFrameLowering for ARM64 ----*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// -// -//===----------------------------------------------------------------------===// - -#ifndef ARM64_FRAMELOWERING_H -#define ARM64_FRAMELOWERING_H - -#include "llvm/Target/TargetFrameLowering.h" - -namespace llvm { - -class ARM64Subtarget; -class ARM64TargetMachine; - -class ARM64FrameLowering : public TargetFrameLowering { - const ARM64TargetMachine &TM; - -public: - explicit ARM64FrameLowering(const ARM64TargetMachine &TM, - const ARM64Subtarget &STI) - : TargetFrameLowering(StackGrowsDown, 16, 0, 16, - false /*StackRealignable*/), - TM(TM) {} - - void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - unsigned FramePtr) const; - - void eliminateCallFramePseudoInstr(MachineFunction &MF, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) const override; - - /// emitProlog/emitEpilog - These methods insert prolog and epilog code into - /// the function. - void emitPrologue(MachineFunction &MF) const override; - void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; - - int getFrameIndexOffset(const MachineFunction &MF, int FI) const override; - int getFrameIndexReference(const MachineFunction &MF, int FI, - unsigned &FrameReg) const override; - int resolveFrameIndexReference(const MachineFunction &MF, int FI, - unsigned &FrameReg, - bool PreferFP = false) const; - bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - const std::vector &CSI, - const TargetRegisterInfo *TRI) const override; - - bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - const std::vector &CSI, - const TargetRegisterInfo *TRI) const override; - - /// \brief Can this function use the red zone for local allocations. - bool canUseRedZone(const MachineFunction &MF) const; - - bool hasFP(const MachineFunction &MF) const override; - bool hasReservedCallFrame(const MachineFunction &MF) const override; - - void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, - RegScavenger *RS) const override; -}; - -} // End llvm namespace - -#endif diff --git a/lib/Target/ARM64/ARM64ISelDAGToDAG.cpp b/lib/Target/ARM64/ARM64ISelDAGToDAG.cpp deleted file mode 100644 index 23c45d414e2..00000000000 --- a/lib/Target/ARM64/ARM64ISelDAGToDAG.cpp +++ /dev/null @@ -1,3030 +0,0 @@ -//===-- ARM64ISelDAGToDAG.cpp - A dag to dag inst selector for ARM64 ------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// -// -// This file defines an instruction selector for the ARM64 target. -// -//===----------------------------------------------------------------------===// - -#include "ARM64TargetMachine.h" -#include "MCTargetDesc/ARM64AddressingModes.h" -#include "llvm/ADT/APSInt.h" -#include "llvm/CodeGen/SelectionDAGISel.h" -#include "llvm/IR/Function.h" // To access function attributes. -#include "llvm/IR/GlobalValue.h" -#include "llvm/IR/Intrinsics.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/MathExtras.h" -#include "llvm/Support/raw_ostream.h" - -using namespace llvm; - -#define DEBUG_TYPE "arm64-isel" - -//===--------------------------------------------------------------------===// -/// ARM64DAGToDAGISel - ARM64 specific code to select ARM64 machine -/// instructions for SelectionDAG operations. -/// -namespace { - -class ARM64DAGToDAGISel : public SelectionDAGISel { - ARM64TargetMachine &TM; - - /// Subtarget - Keep a pointer to the ARM64Subtarget around so that we can - /// make the right decision when generating code for different targets. - const ARM64Subtarget *Subtarget; - - bool ForCodeSize; - -public: - explicit ARM64DAGToDAGISel(ARM64TargetMachine &tm, CodeGenOpt::Level OptLevel) - : SelectionDAGISel(tm, OptLevel), TM(tm), - Subtarget(nullptr), ForCodeSize(false) {} - - const char *getPassName() const override { - return "ARM64 Instruction Selection"; - } - - bool runOnMachineFunction(MachineFunction &MF) override { - AttributeSet FnAttrs = MF.getFunction()->getAttributes(); - ForCodeSize = - FnAttrs.hasAttribute(AttributeSet::FunctionIndex, - Attribute::OptimizeForSize) || - FnAttrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize); - Subtarget = &TM.getSubtarget(); - return SelectionDAGISel::runOnMachineFunction(MF); - } - - SDNode *Select(SDNode *Node) override; - - /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for - /// inline asm expressions. 
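For reference, a minimal sketch of how the ForCodeSize flag above is derived from function attributes, written against the IR-level API (attribute interfaces vary between LLVM releases, so treat this as an approximation rather than the exact calls used in the selector):

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include <cstdio>

using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("demo", Ctx);
  FunctionType *FTy = FunctionType::get(Type::getVoidTy(Ctx), false);
  Function *F = Function::Create(FTy, Function::ExternalLinkage, "f", &M);
  F->addFnAttr(Attribute::OptimizeForSize);

  // ForCodeSize is set when either optsize or minsize is present.
  bool ForCodeSize = F->hasFnAttribute(Attribute::OptimizeForSize) ||
                     F->hasFnAttribute(Attribute::MinSize);
  std::printf("ForCodeSize = %d\n", ForCodeSize);
  return 0;
}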
- bool SelectInlineAsmMemoryOperand(const SDValue &Op, - char ConstraintCode, - std::vector &OutOps) override; - - SDNode *SelectMLAV64LaneV128(SDNode *N); - SDNode *SelectMULLV64LaneV128(unsigned IntNo, SDNode *N); - bool SelectArithExtendedRegister(SDValue N, SDValue &Reg, SDValue &Shift); - bool SelectArithImmed(SDValue N, SDValue &Val, SDValue &Shift); - bool SelectNegArithImmed(SDValue N, SDValue &Val, SDValue &Shift); - bool SelectArithShiftedRegister(SDValue N, SDValue &Reg, SDValue &Shift) { - return SelectShiftedRegister(N, false, Reg, Shift); - } - bool SelectLogicalShiftedRegister(SDValue N, SDValue &Reg, SDValue &Shift) { - return SelectShiftedRegister(N, true, Reg, Shift); - } - bool SelectAddrModeIndexed8(SDValue N, SDValue &Base, SDValue &OffImm) { - return SelectAddrModeIndexed(N, 1, Base, OffImm); - } - bool SelectAddrModeIndexed16(SDValue N, SDValue &Base, SDValue &OffImm) { - return SelectAddrModeIndexed(N, 2, Base, OffImm); - } - bool SelectAddrModeIndexed32(SDValue N, SDValue &Base, SDValue &OffImm) { - return SelectAddrModeIndexed(N, 4, Base, OffImm); - } - bool SelectAddrModeIndexed64(SDValue N, SDValue &Base, SDValue &OffImm) { - return SelectAddrModeIndexed(N, 8, Base, OffImm); - } - bool SelectAddrModeIndexed128(SDValue N, SDValue &Base, SDValue &OffImm) { - return SelectAddrModeIndexed(N, 16, Base, OffImm); - } - bool SelectAddrModeUnscaled8(SDValue N, SDValue &Base, SDValue &OffImm) { - return SelectAddrModeUnscaled(N, 1, Base, OffImm); - } - bool SelectAddrModeUnscaled16(SDValue N, SDValue &Base, SDValue &OffImm) { - return SelectAddrModeUnscaled(N, 2, Base, OffImm); - } - bool SelectAddrModeUnscaled32(SDValue N, SDValue &Base, SDValue &OffImm) { - return SelectAddrModeUnscaled(N, 4, Base, OffImm); - } - bool SelectAddrModeUnscaled64(SDValue N, SDValue &Base, SDValue &OffImm) { - return SelectAddrModeUnscaled(N, 8, Base, OffImm); - } - bool SelectAddrModeUnscaled128(SDValue N, SDValue &Base, SDValue &OffImm) { - return SelectAddrModeUnscaled(N, 16, Base, OffImm); - } - - template - bool SelectAddrModeWRO(SDValue N, SDValue &Base, SDValue &Offset, - SDValue &SignExtend, SDValue &DoShift) { - return SelectAddrModeWRO(N, Width / 8, Base, Offset, SignExtend, DoShift); - } - - template - bool SelectAddrModeXRO(SDValue N, SDValue &Base, SDValue &Offset, - SDValue &SignExtend, SDValue &DoShift) { - return SelectAddrModeXRO(N, Width / 8, Base, Offset, SignExtend, DoShift); - } - - - /// Form sequences of consecutive 64/128-bit registers for use in NEON - /// instructions making use of a vector-list (e.g. ldN, tbl). Vecs must have - /// between 1 and 4 elements. If it contains a single element that is returned - /// unchanged; otherwise a REG_SEQUENCE value is returned. - SDValue createDTuple(ArrayRef Vecs); - SDValue createQTuple(ArrayRef Vecs); - - /// Generic helper for the createDTuple/createQTuple - /// functions. Those should almost always be called instead. 
- SDValue createTuple(ArrayRef Vecs, unsigned RegClassIDs[], - unsigned SubRegs[]); - - SDNode *SelectTable(SDNode *N, unsigned NumVecs, unsigned Opc, bool isExt); - - SDNode *SelectIndexedLoad(SDNode *N, bool &Done); - - SDNode *SelectLoad(SDNode *N, unsigned NumVecs, unsigned Opc, - unsigned SubRegIdx); - SDNode *SelectPostLoad(SDNode *N, unsigned NumVecs, unsigned Opc, - unsigned SubRegIdx); - SDNode *SelectLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc); - SDNode *SelectPostLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc); - - SDNode *SelectStore(SDNode *N, unsigned NumVecs, unsigned Opc); - SDNode *SelectPostStore(SDNode *N, unsigned NumVecs, unsigned Opc); - SDNode *SelectStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc); - SDNode *SelectPostStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc); - - SDNode *SelectSIMDAddSubNarrowing(unsigned IntNo, SDNode *Node); - SDNode *SelectSIMDXtnNarrowing(unsigned IntNo, SDNode *Node); - - SDNode *SelectBitfieldExtractOp(SDNode *N); - SDNode *SelectBitfieldInsertOp(SDNode *N); - - SDNode *SelectLIBM(SDNode *N); - -// Include the pieces autogenerated from the target description. -#include "ARM64GenDAGISel.inc" - -private: - bool SelectShiftedRegister(SDValue N, bool AllowROR, SDValue &Reg, - SDValue &Shift); - bool SelectAddrModeIndexed(SDValue N, unsigned Size, SDValue &Base, - SDValue &OffImm); - bool SelectAddrModeUnscaled(SDValue N, unsigned Size, SDValue &Base, - SDValue &OffImm); - bool SelectAddrModeWRO(SDValue N, unsigned Size, SDValue &Base, - SDValue &Offset, SDValue &SignExtend, - SDValue &DoShift); - bool SelectAddrModeXRO(SDValue N, unsigned Size, SDValue &Base, - SDValue &Offset, SDValue &SignExtend, - SDValue &DoShift); - bool isWorthFolding(SDValue V) const; - bool SelectExtendedSHL(SDValue N, unsigned Size, bool WantExtend, - SDValue &Offset, SDValue &SignExtend); - - template - bool SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos) { - return SelectCVTFixedPosOperand(N, FixedPos, RegWidth); - } - - bool SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos, unsigned Width); -}; -} // end anonymous namespace - -/// isIntImmediate - This method tests to see if the node is a constant -/// operand. If so Imm will receive the 32-bit value. -static bool isIntImmediate(const SDNode *N, uint64_t &Imm) { - if (const ConstantSDNode *C = dyn_cast(N)) { - Imm = C->getZExtValue(); - return true; - } - return false; -} - -// isIntImmediate - This method tests to see if a constant operand. -// If so Imm will receive the value. -static bool isIntImmediate(SDValue N, uint64_t &Imm) { - return isIntImmediate(N.getNode(), Imm); -} - -// isOpcWithIntImmediate - This method tests to see if the node is a specific -// opcode and that it has a immediate integer right operand. -// If so Imm will receive the 32 bit value. -static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc, - uint64_t &Imm) { - return N->getOpcode() == Opc && - isIntImmediate(N->getOperand(1).getNode(), Imm); -} - -bool ARM64DAGToDAGISel::SelectInlineAsmMemoryOperand( - const SDValue &Op, char ConstraintCode, std::vector &OutOps) { - assert(ConstraintCode == 'm' && "unexpected asm memory constraint"); - // Require the address to be in a register. That is safe for all ARM64 - // variants and it is hard to do anything much smarter without knowing - // how the operand is used. - OutOps.push_back(Op); - return false; -} - -/// SelectArithImmed - Select an immediate value that can be represented as -/// a 12-bit value shifted left by either 0 or 12. 
If so, return true with -/// Val set to the 12-bit value and Shift set to the shifter operand. -bool ARM64DAGToDAGISel::SelectArithImmed(SDValue N, SDValue &Val, - SDValue &Shift) { - // This function is called from the addsub_shifted_imm ComplexPattern, - // which lists [imm] as the list of opcode it's interested in, however - // we still need to check whether the operand is actually an immediate - // here because the ComplexPattern opcode list is only used in - // root-level opcode matching. - if (!isa(N.getNode())) - return false; - - uint64_t Immed = cast(N.getNode())->getZExtValue(); - unsigned ShiftAmt; - - if (Immed >> 12 == 0) { - ShiftAmt = 0; - } else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) { - ShiftAmt = 12; - Immed = Immed >> 12; - } else - return false; - - unsigned ShVal = ARM64_AM::getShifterImm(ARM64_AM::LSL, ShiftAmt); - Val = CurDAG->getTargetConstant(Immed, MVT::i32); - Shift = CurDAG->getTargetConstant(ShVal, MVT::i32); - return true; -} - -/// SelectNegArithImmed - As above, but negates the value before trying to -/// select it. -bool ARM64DAGToDAGISel::SelectNegArithImmed(SDValue N, SDValue &Val, - SDValue &Shift) { - // This function is called from the addsub_shifted_imm ComplexPattern, - // which lists [imm] as the list of opcode it's interested in, however - // we still need to check whether the operand is actually an immediate - // here because the ComplexPattern opcode list is only used in - // root-level opcode matching. - if (!isa(N.getNode())) - return false; - - // The immediate operand must be a 24-bit zero-extended immediate. - uint64_t Immed = cast(N.getNode())->getZExtValue(); - - // This negation is almost always valid, but "cmp wN, #0" and "cmn wN, #0" - // have the opposite effect on the C flag, so this pattern mustn't match under - // those circumstances. - if (Immed == 0) - return false; - - if (N.getValueType() == MVT::i32) - Immed = ~((uint32_t)Immed) + 1; - else - Immed = ~Immed + 1ULL; - if (Immed & 0xFFFFFFFFFF000000ULL) - return false; - - Immed &= 0xFFFFFFULL; - return SelectArithImmed(CurDAG->getConstant(Immed, MVT::i32), Val, Shift); -} - -/// getShiftTypeForNode - Translate a shift node to the corresponding -/// ShiftType value. -static ARM64_AM::ShiftExtendType getShiftTypeForNode(SDValue N) { - switch (N.getOpcode()) { - default: - return ARM64_AM::InvalidShiftExtend; - case ISD::SHL: - return ARM64_AM::LSL; - case ISD::SRL: - return ARM64_AM::LSR; - case ISD::SRA: - return ARM64_AM::ASR; - case ISD::ROTR: - return ARM64_AM::ROR; - } -} - -/// \brief Determine wether it is worth to fold V into an extended register. -bool ARM64DAGToDAGISel::isWorthFolding(SDValue V) const { - // it hurts if the a value is used at least twice, unless we are optimizing - // for code size. - if (ForCodeSize || V.hasOneUse()) - return true; - return false; -} - -/// SelectShiftedRegister - Select a "shifted register" operand. If the value -/// is not shifted, set the Shift operand to default of "LSL 0". The logical -/// instructions allow the shifted register to be rotated, but the arithmetic -/// instructions do not. The AllowROR parameter specifies whether ROR is -/// supported. 
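A standalone restatement of the SelectArithImmed logic above, with a few example immediates (the helper name here is just for illustration):

#include <cstdint>
#include <cstdio>

// A usable ADD/SUB immediate is a 12-bit value optionally shifted left by 12.
static bool selectArithImmed(uint64_t Imm, unsigned &Val, unsigned &Shift) {
  if (Imm >> 12 == 0) {
    Val = Imm; Shift = 0; return true;
  }
  if ((Imm & 0xfff) == 0 && Imm >> 24 == 0) {
    Val = Imm >> 12; Shift = 12; return true;
  }
  return false;
}

int main() {
  uint64_t Tests[] = { 0xfff, 0x567000, 0x1001 };
  for (uint64_t Imm : Tests) {
    unsigned Val, Shift;
    if (selectArithImmed(Imm, Val, Shift))
      std::printf("%#llx -> #%#x, lsl #%u\n", (unsigned long long)Imm, Val, Shift);
    else
      std::printf("%#llx -> not encodable as an add/sub immediate\n",
                  (unsigned long long)Imm);
  }
  // SelectNegArithImmed applies the same test to the negated value, which is
  // how a cmp with an awkward immediate can become a cmn, and vice versa.
  return 0;
}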
-bool ARM64DAGToDAGISel::SelectShiftedRegister(SDValue N, bool AllowROR, - SDValue &Reg, SDValue &Shift) { - ARM64_AM::ShiftExtendType ShType = getShiftTypeForNode(N); - if (ShType == ARM64_AM::InvalidShiftExtend) - return false; - if (!AllowROR && ShType == ARM64_AM::ROR) - return false; - - if (ConstantSDNode *RHS = dyn_cast(N.getOperand(1))) { - unsigned BitSize = N.getValueType().getSizeInBits(); - unsigned Val = RHS->getZExtValue() & (BitSize - 1); - unsigned ShVal = ARM64_AM::getShifterImm(ShType, Val); - - Reg = N.getOperand(0); - Shift = CurDAG->getTargetConstant(ShVal, MVT::i32); - return isWorthFolding(N); - } - - return false; -} - -/// getExtendTypeForNode - Translate an extend node to the corresponding -/// ExtendType value. -static ARM64_AM::ShiftExtendType -getExtendTypeForNode(SDValue N, bool IsLoadStore = false) { - if (N.getOpcode() == ISD::SIGN_EXTEND || - N.getOpcode() == ISD::SIGN_EXTEND_INREG) { - EVT SrcVT; - if (N.getOpcode() == ISD::SIGN_EXTEND_INREG) - SrcVT = cast(N.getOperand(1))->getVT(); - else - SrcVT = N.getOperand(0).getValueType(); - - if (!IsLoadStore && SrcVT == MVT::i8) - return ARM64_AM::SXTB; - else if (!IsLoadStore && SrcVT == MVT::i16) - return ARM64_AM::SXTH; - else if (SrcVT == MVT::i32) - return ARM64_AM::SXTW; - assert(SrcVT != MVT::i64 && "extend from 64-bits?"); - - return ARM64_AM::InvalidShiftExtend; - } else if (N.getOpcode() == ISD::ZERO_EXTEND || - N.getOpcode() == ISD::ANY_EXTEND) { - EVT SrcVT = N.getOperand(0).getValueType(); - if (!IsLoadStore && SrcVT == MVT::i8) - return ARM64_AM::UXTB; - else if (!IsLoadStore && SrcVT == MVT::i16) - return ARM64_AM::UXTH; - else if (SrcVT == MVT::i32) - return ARM64_AM::UXTW; - assert(SrcVT != MVT::i64 && "extend from 64-bits?"); - - return ARM64_AM::InvalidShiftExtend; - } else if (N.getOpcode() == ISD::AND) { - ConstantSDNode *CSD = dyn_cast(N.getOperand(1)); - if (!CSD) - return ARM64_AM::InvalidShiftExtend; - uint64_t AndMask = CSD->getZExtValue(); - - switch (AndMask) { - default: - return ARM64_AM::InvalidShiftExtend; - case 0xFF: - return !IsLoadStore ? ARM64_AM::UXTB : ARM64_AM::InvalidShiftExtend; - case 0xFFFF: - return !IsLoadStore ? ARM64_AM::UXTH : ARM64_AM::InvalidShiftExtend; - case 0xFFFFFFFF: - return ARM64_AM::UXTW; - } - } - - return ARM64_AM::InvalidShiftExtend; -} - -// Helper for SelectMLAV64LaneV128 - Recognize high lane extracts. -static bool checkHighLaneIndex(SDNode *DL, SDValue &LaneOp, int &LaneIdx) { - if (DL->getOpcode() != ARM64ISD::DUPLANE16 && - DL->getOpcode() != ARM64ISD::DUPLANE32) - return false; - - SDValue SV = DL->getOperand(0); - if (SV.getOpcode() != ISD::INSERT_SUBVECTOR) - return false; - - SDValue EV = SV.getOperand(1); - if (EV.getOpcode() != ISD::EXTRACT_SUBVECTOR) - return false; - - ConstantSDNode *DLidx = cast(DL->getOperand(1).getNode()); - ConstantSDNode *EVidx = cast(EV.getOperand(1).getNode()); - LaneIdx = DLidx->getSExtValue() + EVidx->getSExtValue(); - LaneOp = EV.getOperand(0); - - return true; -} - -// Helper for SelectOpcV64LaneV128 - Recogzine operatinos where one operand is a -// high lane extract. -static bool checkV64LaneV128(SDValue Op0, SDValue Op1, SDValue &StdOp, - SDValue &LaneOp, int &LaneIdx) { - - if (!checkHighLaneIndex(Op0.getNode(), LaneOp, LaneIdx)) { - std::swap(Op0, Op1); - if (!checkHighLaneIndex(Op0.getNode(), LaneOp, LaneIdx)) - return false; - } - StdOp = Op1; - return true; -} - -/// SelectMLAV64LaneV128 - ARM64 supports vector MLAs where one multiplicand is -/// a lane in the upper half of a 128-bit vector. 
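The AND-mask cases in getExtendTypeForNode above can be summarised very compactly; this sketch restates the arithmetic (non load/store) variant with illustrative values:

#include <cstdint>
#include <cstdio>

// A zero-extending AND can be folded into an extended-register operand when
// its mask is exactly one of these.
static const char *extendForMask(uint64_t Mask) {
  switch (Mask) {
  case 0xFF:        return "uxtb";
  case 0xFFFF:      return "uxth";
  case 0xFFFFFFFF:  return "uxtw";
  default:          return nullptr;   // not an extend pattern
  }
}

int main() {
  uint64_t Masks[] = { 0xFF, 0xFFFF, 0xFFFFFFFF, 0x7F };
  for (uint64_t M : Masks) {
    const char *Ext = extendForMask(M);
    std::printf("and with %#llx -> %s\n", (unsigned long long)M,
                Ext ? Ext : "no extend folding");
  }
  return 0;
}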
Recognize and select this so -/// that we don't emit unnecessary lane extracts. -SDNode *ARM64DAGToDAGISel::SelectMLAV64LaneV128(SDNode *N) { - SDValue Op0 = N->getOperand(0); - SDValue Op1 = N->getOperand(1); - SDValue MLAOp1; // Will hold ordinary multiplicand for MLA. - SDValue MLAOp2; // Will hold lane-accessed multiplicand for MLA. - int LaneIdx = -1; // Will hold the lane index. - - if (Op1.getOpcode() != ISD::MUL || - !checkV64LaneV128(Op1.getOperand(0), Op1.getOperand(1), MLAOp1, MLAOp2, - LaneIdx)) { - std::swap(Op0, Op1); - if (Op1.getOpcode() != ISD::MUL || - !checkV64LaneV128(Op1.getOperand(0), Op1.getOperand(1), MLAOp1, MLAOp2, - LaneIdx)) - return nullptr; - } - - SDValue LaneIdxVal = CurDAG->getTargetConstant(LaneIdx, MVT::i64); - - SDValue Ops[] = { Op0, MLAOp1, MLAOp2, LaneIdxVal }; - - unsigned MLAOpc = ~0U; - - switch (N->getSimpleValueType(0).SimpleTy) { - default: - llvm_unreachable("Unrecognized MLA."); - case MVT::v4i16: - MLAOpc = ARM64::MLAv4i16_indexed; - break; - case MVT::v8i16: - MLAOpc = ARM64::MLAv8i16_indexed; - break; - case MVT::v2i32: - MLAOpc = ARM64::MLAv2i32_indexed; - break; - case MVT::v4i32: - MLAOpc = ARM64::MLAv4i32_indexed; - break; - } - - return CurDAG->getMachineNode(MLAOpc, SDLoc(N), N->getValueType(0), Ops); -} - -SDNode *ARM64DAGToDAGISel::SelectMULLV64LaneV128(unsigned IntNo, SDNode *N) { - SDValue SMULLOp0; - SDValue SMULLOp1; - int LaneIdx; - - if (!checkV64LaneV128(N->getOperand(1), N->getOperand(2), SMULLOp0, SMULLOp1, - LaneIdx)) - return nullptr; - - SDValue LaneIdxVal = CurDAG->getTargetConstant(LaneIdx, MVT::i64); - - SDValue Ops[] = { SMULLOp0, SMULLOp1, LaneIdxVal }; - - unsigned SMULLOpc = ~0U; - - if (IntNo == Intrinsic::arm64_neon_smull) { - switch (N->getSimpleValueType(0).SimpleTy) { - default: - llvm_unreachable("Unrecognized SMULL."); - case MVT::v4i32: - SMULLOpc = ARM64::SMULLv4i16_indexed; - break; - case MVT::v2i64: - SMULLOpc = ARM64::SMULLv2i32_indexed; - break; - } - } else if (IntNo == Intrinsic::arm64_neon_umull) { - switch (N->getSimpleValueType(0).SimpleTy) { - default: - llvm_unreachable("Unrecognized SMULL."); - case MVT::v4i32: - SMULLOpc = ARM64::UMULLv4i16_indexed; - break; - case MVT::v2i64: - SMULLOpc = ARM64::UMULLv2i32_indexed; - break; - } - } else - llvm_unreachable("Unrecognized intrinsic."); - - return CurDAG->getMachineNode(SMULLOpc, SDLoc(N), N->getValueType(0), Ops); -} - -/// Instructions that accept extend modifiers like UXTW expect the register -/// being extended to be a GPR32, but the incoming DAG might be acting on a -/// GPR64 (either via SEXT_INREG or AND). Extract the appropriate low bits if -/// this is the case. -static SDValue narrowIfNeeded(SelectionDAG *CurDAG, SDValue N) { - if (N.getValueType() == MVT::i32) - return N; - - SDValue SubReg = CurDAG->getTargetConstant(ARM64::sub_32, MVT::i32); - MachineSDNode *Node = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, - SDLoc(N), MVT::i32, N, SubReg); - return SDValue(Node, 0); -} - - -/// SelectArithExtendedRegister - Select a "extended register" operand. This -/// operand folds in an extend followed by an optional left shift. 
-bool ARM64DAGToDAGISel::SelectArithExtendedRegister(SDValue N, SDValue &Reg, - SDValue &Shift) { - unsigned ShiftVal = 0; - ARM64_AM::ShiftExtendType Ext; - - if (N.getOpcode() == ISD::SHL) { - ConstantSDNode *CSD = dyn_cast(N.getOperand(1)); - if (!CSD) - return false; - ShiftVal = CSD->getZExtValue(); - if (ShiftVal > 4) - return false; - - Ext = getExtendTypeForNode(N.getOperand(0)); - if (Ext == ARM64_AM::InvalidShiftExtend) - return false; - - Reg = N.getOperand(0).getOperand(0); - } else { - Ext = getExtendTypeForNode(N); - if (Ext == ARM64_AM::InvalidShiftExtend) - return false; - - Reg = N.getOperand(0); - } - - // ARM64 mandates that the RHS of the operation must use the smallest - // register classs that could contain the size being extended from. Thus, - // if we're folding a (sext i8), we need the RHS to be a GPR32, even though - // there might not be an actual 32-bit value in the program. We can - // (harmlessly) synthesize one by injected an EXTRACT_SUBREG here. - assert(Ext != ARM64_AM::UXTX && Ext != ARM64_AM::SXTX); - Reg = narrowIfNeeded(CurDAG, Reg); - Shift = CurDAG->getTargetConstant(getArithExtendImm(Ext, ShiftVal), MVT::i32); - return isWorthFolding(N); -} - -/// SelectAddrModeIndexed - Select a "register plus scaled unsigned 12-bit -/// immediate" address. The "Size" argument is the size in bytes of the memory -/// reference, which determines the scale. -bool ARM64DAGToDAGISel::SelectAddrModeIndexed(SDValue N, unsigned Size, - SDValue &Base, SDValue &OffImm) { - const TargetLowering *TLI = getTargetLowering(); - if (N.getOpcode() == ISD::FrameIndex) { - int FI = cast(N)->getIndex(); - Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy()); - OffImm = CurDAG->getTargetConstant(0, MVT::i64); - return true; - } - - if (N.getOpcode() == ARM64ISD::ADDlow) { - GlobalAddressSDNode *GAN = - dyn_cast(N.getOperand(1).getNode()); - Base = N.getOperand(0); - OffImm = N.getOperand(1); - if (!GAN) - return true; - - const GlobalValue *GV = GAN->getGlobal(); - unsigned Alignment = GV->getAlignment(); - const DataLayout *DL = TLI->getDataLayout(); - if (Alignment == 0 && !Subtarget->isTargetDarwin()) - Alignment = DL->getABITypeAlignment(GV->getType()->getElementType()); - - if (Alignment >= Size) - return true; - } - - if (CurDAG->isBaseWithConstantOffset(N)) { - if (ConstantSDNode *RHS = dyn_cast(N.getOperand(1))) { - int64_t RHSC = (int64_t)RHS->getZExtValue(); - unsigned Scale = Log2_32(Size); - if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Scale)) { - Base = N.getOperand(0); - if (Base.getOpcode() == ISD::FrameIndex) { - int FI = cast(Base)->getIndex(); - Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy()); - } - OffImm = CurDAG->getTargetConstant(RHSC >> Scale, MVT::i64); - return true; - } - } - } - - // Before falling back to our general case, check if the unscaled - // instructions can handle this. If so, that's preferable. - if (SelectAddrModeUnscaled(N, Size, Base, OffImm)) - return false; - - // Base only. The address will be materialized into a register before - // the memory is accessed. - // add x0, Xbase, #offset - // ldr x0, [x0] - Base = N; - OffImm = CurDAG->getTargetConstant(0, MVT::i64); - return true; -} - -/// SelectAddrModeUnscaled - Select a "register plus unscaled signed 9-bit -/// immediate" address. This should only match when there is an offset that -/// is not valid for a scaled immediate addressing mode. 
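A standalone restatement of the scaled-immediate test in SelectAddrModeIndexed above, worked through for an 8-byte access (function and variable names here are illustrative):

#include <cstdint>
#include <cstdio>

// The offset must be a non-negative multiple of the access size, below
// 4096 * Size; the encoded immediate is the offset divided by the size.
static bool fitsScaledImmediate(int64_t Off, unsigned Size, int64_t &Imm) {
  const unsigned Scale = __builtin_ctz(Size);      // log2(Size), Size a power of two
  if ((Off & (Size - 1)) != 0 || Off < 0 || Off >= (0x1000LL << Scale))
    return false;
  Imm = Off >> Scale;                              // encoded, scaled immediate
  return true;
}

int main() {
  int64_t Offsets[] = { 0, 4088, 32760, 17, 32768 };
  for (int64_t Off : Offsets) {
    int64_t Imm;
    if (fitsScaledImmediate(Off, 8, Imm))
      std::printf("[x0, #%lld] -> scaled imm %lld\n", (long long)Off, (long long)Imm);
    else
      std::printf("[x0, #%lld] -> needs unscaled form or a materialized address\n",
                  (long long)Off);
  }
  return 0;
}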
The "Size" argument -/// is the size in bytes of the memory reference, which is needed here to know -/// what is valid for a scaled immediate. -bool ARM64DAGToDAGISel::SelectAddrModeUnscaled(SDValue N, unsigned Size, - SDValue &Base, SDValue &OffImm) { - if (!CurDAG->isBaseWithConstantOffset(N)) - return false; - if (ConstantSDNode *RHS = dyn_cast(N.getOperand(1))) { - int64_t RHSC = RHS->getSExtValue(); - // If the offset is valid as a scaled immediate, don't match here. - if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && - RHSC < (0x1000 << Log2_32(Size))) - return false; - if (RHSC >= -256 && RHSC < 256) { - Base = N.getOperand(0); - if (Base.getOpcode() == ISD::FrameIndex) { - int FI = cast(Base)->getIndex(); - const TargetLowering *TLI = getTargetLowering(); - Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy()); - } - OffImm = CurDAG->getTargetConstant(RHSC, MVT::i64); - return true; - } - } - return false; -} - -static SDValue Widen(SelectionDAG *CurDAG, SDValue N) { - SDValue SubReg = CurDAG->getTargetConstant(ARM64::sub_32, MVT::i32); - SDValue ImpDef = SDValue( - CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SDLoc(N), MVT::i64), - 0); - MachineSDNode *Node = CurDAG->getMachineNode( - TargetOpcode::INSERT_SUBREG, SDLoc(N), MVT::i64, ImpDef, N, SubReg); - return SDValue(Node, 0); -} - -/// \brief Check if the given SHL node (\p N), can be used to form an -/// extended register for an addressing mode. -bool ARM64DAGToDAGISel::SelectExtendedSHL(SDValue N, unsigned Size, - bool WantExtend, SDValue &Offset, - SDValue &SignExtend) { - assert(N.getOpcode() == ISD::SHL && "Invalid opcode."); - ConstantSDNode *CSD = dyn_cast(N.getOperand(1)); - if (!CSD || (CSD->getZExtValue() & 0x7) != CSD->getZExtValue()) - return false; - - if (WantExtend) { - ARM64_AM::ShiftExtendType Ext = getExtendTypeForNode(N.getOperand(0), true); - if (Ext == ARM64_AM::InvalidShiftExtend) - return false; - - Offset = narrowIfNeeded(CurDAG, N.getOperand(0).getOperand(0)); - SignExtend = CurDAG->getTargetConstant(Ext == ARM64_AM::SXTW, MVT::i32); - } else { - Offset = N.getOperand(0); - SignExtend = CurDAG->getTargetConstant(0, MVT::i32); - } - - unsigned LegalShiftVal = Log2_32(Size); - unsigned ShiftVal = CSD->getZExtValue(); - - if (ShiftVal != 0 && ShiftVal != LegalShiftVal) - return false; - - if (isWorthFolding(N)) - return true; - - return false; -} - -bool ARM64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size, - SDValue &Base, SDValue &Offset, - SDValue &SignExtend, - SDValue &DoShift) { - if (N.getOpcode() != ISD::ADD) - return false; - SDValue LHS = N.getOperand(0); - SDValue RHS = N.getOperand(1); - - // We don't want to match immediate adds here, because they are better lowered - // to the register-immediate addressing modes. - if (isa(LHS) || isa(RHS)) - return false; - - // Check if this particular node is reused in any non-memory related - // operation. If yes, do not try to fold this node into the address - // computation, since the computation will be kept. - const SDNode *Node = N.getNode(); - for (SDNode *UI : Node->uses()) { - if (!isa(*UI)) - return false; - } - - // Remember if it is worth folding N when it produces extended register. - bool IsExtendedRegisterWorthFolding = isWorthFolding(N); - - // Try to match a shifted extend on the RHS. 
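And the unscaled fallback above in the same style: offsets that miss the scaled form but sit in [-256, 256) still get a single load or store (a rough sketch, 8-byte access assumed):

#include <cstdio>

int main() {
  // Byte-granular 9-bit signed offsets for an 8-byte access: either not a
  // multiple of 8 or negative, but within [-256, 256).
  int Offsets[] = { 17, -8, -256, 260 };
  for (int Off : Offsets) {
    bool Unscaled = Off >= -256 && Off < 256;
    std::printf("#%d -> %s\n", Off,
                Unscaled ? "ldur/stur-style unscaled offset"
                         : "materialize the address separately");
  }
  return 0;
}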
- if (IsExtendedRegisterWorthFolding && RHS.getOpcode() == ISD::SHL && - SelectExtendedSHL(RHS, Size, true, Offset, SignExtend)) { - Base = LHS; - DoShift = CurDAG->getTargetConstant(true, MVT::i32); - return true; - } - - // Try to match a shifted extend on the LHS. - if (IsExtendedRegisterWorthFolding && LHS.getOpcode() == ISD::SHL && - SelectExtendedSHL(LHS, Size, true, Offset, SignExtend)) { - Base = RHS; - DoShift = CurDAG->getTargetConstant(true, MVT::i32); - return true; - } - - // There was no shift, whatever else we find. - DoShift = CurDAG->getTargetConstant(false, MVT::i32); - - ARM64_AM::ShiftExtendType Ext = ARM64_AM::InvalidShiftExtend; - // Try to match an unshifted extend on the LHS. - if (IsExtendedRegisterWorthFolding && - (Ext = getExtendTypeForNode(LHS, true)) != ARM64_AM::InvalidShiftExtend) { - Base = RHS; - Offset = narrowIfNeeded(CurDAG, LHS.getOperand(0)); - SignExtend = CurDAG->getTargetConstant(Ext == ARM64_AM::SXTW, MVT::i32); - if (isWorthFolding(LHS)) - return true; - } - - // Try to match an unshifted extend on the RHS. - if (IsExtendedRegisterWorthFolding && - (Ext = getExtendTypeForNode(RHS, true)) != ARM64_AM::InvalidShiftExtend) { - Base = LHS; - Offset = narrowIfNeeded(CurDAG, RHS.getOperand(0)); - SignExtend = CurDAG->getTargetConstant(Ext == ARM64_AM::SXTW, MVT::i32); - if (isWorthFolding(RHS)) - return true; - } - - return false; -} - -bool ARM64DAGToDAGISel::SelectAddrModeXRO(SDValue N, unsigned Size, - SDValue &Base, SDValue &Offset, - SDValue &SignExtend, - SDValue &DoShift) { - if (N.getOpcode() != ISD::ADD) - return false; - SDValue LHS = N.getOperand(0); - SDValue RHS = N.getOperand(1); - - // We don't want to match immediate adds here, because they are better lowered - // to the register-immediate addressing modes. - if (isa(LHS) || isa(RHS)) - return false; - - // Check if this particular node is reused in any non-memory related - // operation. If yes, do not try to fold this node into the address - // computation, since the computation will be kept. - const SDNode *Node = N.getNode(); - for (SDNode *UI : Node->uses()) { - if (!isa(*UI)) - return false; - } - - // Remember if it is worth folding N when it produces extended register. - bool IsExtendedRegisterWorthFolding = isWorthFolding(N); - - // Try to match a shifted extend on the RHS. - if (IsExtendedRegisterWorthFolding && RHS.getOpcode() == ISD::SHL && - SelectExtendedSHL(RHS, Size, false, Offset, SignExtend)) { - Base = LHS; - DoShift = CurDAG->getTargetConstant(true, MVT::i32); - return true; - } - - // Try to match a shifted extend on the LHS. - if (IsExtendedRegisterWorthFolding && LHS.getOpcode() == ISD::SHL && - SelectExtendedSHL(LHS, Size, false, Offset, SignExtend)) { - Base = RHS; - DoShift = CurDAG->getTargetConstant(true, MVT::i32); - return true; - } - - // Match any non-shifted, non-extend, non-immediate add expression. - Base = LHS; - Offset = RHS; - SignExtend = CurDAG->getTargetConstant(false, MVT::i32); - DoShift = CurDAG->getTargetConstant(false, MVT::i32); - // Reg1 + Reg2 is free: no check needed. 
- return true; -} - -SDValue ARM64DAGToDAGISel::createDTuple(ArrayRef Regs) { - static unsigned RegClassIDs[] = { ARM64::DDRegClassID, ARM64::DDDRegClassID, - ARM64::DDDDRegClassID }; - static unsigned SubRegs[] = { ARM64::dsub0, ARM64::dsub1, - ARM64::dsub2, ARM64::dsub3 }; - - return createTuple(Regs, RegClassIDs, SubRegs); -} - -SDValue ARM64DAGToDAGISel::createQTuple(ArrayRef Regs) { - static unsigned RegClassIDs[] = { ARM64::QQRegClassID, ARM64::QQQRegClassID, - ARM64::QQQQRegClassID }; - static unsigned SubRegs[] = { ARM64::qsub0, ARM64::qsub1, - ARM64::qsub2, ARM64::qsub3 }; - - return createTuple(Regs, RegClassIDs, SubRegs); -} - -SDValue ARM64DAGToDAGISel::createTuple(ArrayRef Regs, - unsigned RegClassIDs[], - unsigned SubRegs[]) { - // There's no special register-class for a vector-list of 1 element: it's just - // a vector. - if (Regs.size() == 1) - return Regs[0]; - - assert(Regs.size() >= 2 && Regs.size() <= 4); - - SDLoc DL(Regs[0].getNode()); - - SmallVector Ops; - - // First operand of REG_SEQUENCE is the desired RegClass. - Ops.push_back( - CurDAG->getTargetConstant(RegClassIDs[Regs.size() - 2], MVT::i32)); - - // Then we get pairs of source & subregister-position for the components. - for (unsigned i = 0; i < Regs.size(); ++i) { - Ops.push_back(Regs[i]); - Ops.push_back(CurDAG->getTargetConstant(SubRegs[i], MVT::i32)); - } - - SDNode *N = - CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, MVT::Untyped, Ops); - return SDValue(N, 0); -} - -SDNode *ARM64DAGToDAGISel::SelectTable(SDNode *N, unsigned NumVecs, - unsigned Opc, bool isExt) { - SDLoc dl(N); - EVT VT = N->getValueType(0); - - unsigned ExtOff = isExt; - - // Form a REG_SEQUENCE to force register allocation. - unsigned Vec0Off = ExtOff + 1; - SmallVector Regs(N->op_begin() + Vec0Off, - N->op_begin() + Vec0Off + NumVecs); - SDValue RegSeq = createQTuple(Regs); - - SmallVector Ops; - if (isExt) - Ops.push_back(N->getOperand(1)); - Ops.push_back(RegSeq); - Ops.push_back(N->getOperand(NumVecs + ExtOff + 1)); - return CurDAG->getMachineNode(Opc, dl, VT, Ops); -} - -SDNode *ARM64DAGToDAGISel::SelectIndexedLoad(SDNode *N, bool &Done) { - LoadSDNode *LD = cast(N); - if (LD->isUnindexed()) - return nullptr; - EVT VT = LD->getMemoryVT(); - EVT DstVT = N->getValueType(0); - ISD::MemIndexedMode AM = LD->getAddressingMode(); - bool IsPre = AM == ISD::PRE_INC || AM == ISD::PRE_DEC; - - // We're not doing validity checking here. That was done when checking - // if we should mark the load as indexed or not. We're just selecting - // the right instruction. - unsigned Opcode = 0; - - ISD::LoadExtType ExtType = LD->getExtensionType(); - bool InsertTo64 = false; - if (VT == MVT::i64) - Opcode = IsPre ? ARM64::LDRXpre : ARM64::LDRXpost; - else if (VT == MVT::i32) { - if (ExtType == ISD::NON_EXTLOAD) - Opcode = IsPre ? ARM64::LDRWpre : ARM64::LDRWpost; - else if (ExtType == ISD::SEXTLOAD) - Opcode = IsPre ? ARM64::LDRSWpre : ARM64::LDRSWpost; - else { - Opcode = IsPre ? ARM64::LDRWpre : ARM64::LDRWpost; - InsertTo64 = true; - // The result of the load is only i32. It's the subreg_to_reg that makes - // it into an i64. - DstVT = MVT::i32; - } - } else if (VT == MVT::i16) { - if (ExtType == ISD::SEXTLOAD) { - if (DstVT == MVT::i64) - Opcode = IsPre ? ARM64::LDRSHXpre : ARM64::LDRSHXpost; - else - Opcode = IsPre ? ARM64::LDRSHWpre : ARM64::LDRSHWpost; - } else { - Opcode = IsPre ? ARM64::LDRHHpre : ARM64::LDRHHpost; - InsertTo64 = DstVT == MVT::i64; - // The result of the load is only i32. 
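The operand layout that createTuple above builds for a REG_SEQUENCE can be spelled out concretely; this sketch just prints the shape for a three-vector D-register list (register names are illustrative):

#include <cstdio>

int main() {
  const char *Regs[] = { "D0", "D1", "D2" };
  const unsigned NumRegs = 3;
  const char *RegClassIDs[] = { "DDRegClassID", "DDDRegClassID", "DDDDRegClassID" };
  const char *SubRegs[] = { "dsub0", "dsub1", "dsub2", "dsub3" };

  // Class ID first, then alternating (register, sub-register index) pairs.
  std::printf("REG_SEQUENCE %s", RegClassIDs[NumRegs - 2]);
  for (unsigned i = 0; i < NumRegs; ++i)
    std::printf(", %s, %s", Regs[i], SubRegs[i]);
  std::printf("\n");
  return 0;
}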
It's the subreg_to_reg that makes - // it into an i64. - DstVT = MVT::i32; - } - } else if (VT == MVT::i8) { - if (ExtType == ISD::SEXTLOAD) { - if (DstVT == MVT::i64) - Opcode = IsPre ? ARM64::LDRSBXpre : ARM64::LDRSBXpost; - else - Opcode = IsPre ? ARM64::LDRSBWpre : ARM64::LDRSBWpost; - } else { - Opcode = IsPre ? ARM64::LDRBBpre : ARM64::LDRBBpost; - InsertTo64 = DstVT == MVT::i64; - // The result of the load is only i32. It's the subreg_to_reg that makes - // it into an i64. - DstVT = MVT::i32; - } - } else if (VT == MVT::f32) { - Opcode = IsPre ? ARM64::LDRSpre : ARM64::LDRSpost; - } else if (VT == MVT::f64 || VT.is64BitVector()) { - Opcode = IsPre ? ARM64::LDRDpre : ARM64::LDRDpost; - } else if (VT.is128BitVector()) { - Opcode = IsPre ? ARM64::LDRQpre : ARM64::LDRQpost; - } else - return nullptr; - SDValue Chain = LD->getChain(); - SDValue Base = LD->getBasePtr(); - ConstantSDNode *OffsetOp = cast(LD->getOffset()); - int OffsetVal = (int)OffsetOp->getZExtValue(); - SDValue Offset = CurDAG->getTargetConstant(OffsetVal, MVT::i64); - SDValue Ops[] = { Base, Offset, Chain }; - SDNode *Res = CurDAG->getMachineNode(Opcode, SDLoc(N), MVT::i64, DstVT, - MVT::Other, Ops); - // Either way, we're replacing the node, so tell the caller that. - Done = true; - SDValue LoadedVal = SDValue(Res, 1); - if (InsertTo64) { - SDValue SubReg = CurDAG->getTargetConstant(ARM64::sub_32, MVT::i32); - LoadedVal = - SDValue(CurDAG->getMachineNode(ARM64::SUBREG_TO_REG, SDLoc(N), MVT::i64, - CurDAG->getTargetConstant(0, MVT::i64), - LoadedVal, SubReg), - 0); - } - - ReplaceUses(SDValue(N, 0), LoadedVal); - ReplaceUses(SDValue(N, 1), SDValue(Res, 0)); - ReplaceUses(SDValue(N, 2), SDValue(Res, 2)); - - return nullptr; -} - -SDNode *ARM64DAGToDAGISel::SelectLoad(SDNode *N, unsigned NumVecs, unsigned Opc, - unsigned SubRegIdx) { - SDLoc dl(N); - EVT VT = N->getValueType(0); - SDValue Chain = N->getOperand(0); - - SmallVector Ops; - Ops.push_back(N->getOperand(2)); // Mem operand; - Ops.push_back(Chain); - - std::vector ResTys; - ResTys.push_back(MVT::Untyped); - ResTys.push_back(MVT::Other); - - SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); - SDValue SuperReg = SDValue(Ld, 0); - for (unsigned i = 0; i < NumVecs; ++i) - ReplaceUses(SDValue(N, i), - CurDAG->getTargetExtractSubreg(SubRegIdx + i, dl, VT, SuperReg)); - - ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 1)); - return nullptr; -} - -SDNode *ARM64DAGToDAGISel::SelectPostLoad(SDNode *N, unsigned NumVecs, - unsigned Opc, unsigned SubRegIdx) { - SDLoc dl(N); - EVT VT = N->getValueType(0); - SDValue Chain = N->getOperand(0); - - SmallVector Ops; - Ops.push_back(N->getOperand(1)); // Mem operand - Ops.push_back(N->getOperand(2)); // Incremental - Ops.push_back(Chain); - - std::vector ResTys; - ResTys.push_back(MVT::i64); // Type of the write back register - ResTys.push_back(MVT::Untyped); - ResTys.push_back(MVT::Other); - - SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); - - // Update uses of write back register - ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 0)); - - // Update uses of vector list - SDValue SuperReg = SDValue(Ld, 1); - if (NumVecs == 1) - ReplaceUses(SDValue(N, 0), SuperReg); - else - for (unsigned i = 0; i < NumVecs; ++i) - ReplaceUses(SDValue(N, i), - CurDAG->getTargetExtractSubreg(SubRegIdx + i, dl, VT, SuperReg)); - - // Update the chain - ReplaceUses(SDValue(N, NumVecs + 1), SDValue(Ld, 2)); - return nullptr; -} - -SDNode *ARM64DAGToDAGISel::SelectStore(SDNode *N, unsigned NumVecs, - unsigned Opc) { - SDLoc dl(N); - EVT VT 
= N->getOperand(2)->getValueType(0); - - // Form a REG_SEQUENCE to force register allocation. - bool Is128Bit = VT.getSizeInBits() == 128; - SmallVector Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs); - SDValue RegSeq = Is128Bit ? createQTuple(Regs) : createDTuple(Regs); - - SmallVector Ops; - Ops.push_back(RegSeq); - Ops.push_back(N->getOperand(NumVecs + 2)); - Ops.push_back(N->getOperand(0)); - SDNode *St = CurDAG->getMachineNode(Opc, dl, N->getValueType(0), Ops); - - return St; -} - -SDNode *ARM64DAGToDAGISel::SelectPostStore(SDNode *N, unsigned NumVecs, - unsigned Opc) { - SDLoc dl(N); - EVT VT = N->getOperand(2)->getValueType(0); - SmallVector ResTys; - ResTys.push_back(MVT::i64); // Type of the write back register - ResTys.push_back(MVT::Other); // Type for the Chain - - // Form a REG_SEQUENCE to force register allocation. - bool Is128Bit = VT.getSizeInBits() == 128; - SmallVector Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs); - SDValue RegSeq = Is128Bit ? createQTuple(Regs) : createDTuple(Regs); - - SmallVector Ops; - Ops.push_back(RegSeq); - Ops.push_back(N->getOperand(NumVecs + 1)); // base register - Ops.push_back(N->getOperand(NumVecs + 2)); // Incremental - Ops.push_back(N->getOperand(0)); // Chain - SDNode *St = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); - - return St; -} - -/// WidenVector - Given a value in the V64 register class, produce the -/// equivalent value in the V128 register class. -class WidenVector { - SelectionDAG &DAG; - -public: - WidenVector(SelectionDAG &DAG) : DAG(DAG) {} - - SDValue operator()(SDValue V64Reg) { - EVT VT = V64Reg.getValueType(); - unsigned NarrowSize = VT.getVectorNumElements(); - MVT EltTy = VT.getVectorElementType().getSimpleVT(); - MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize); - SDLoc DL(V64Reg); - - SDValue Undef = - SDValue(DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, WideTy), 0); - return DAG.getTargetInsertSubreg(ARM64::dsub, DL, WideTy, Undef, V64Reg); - } -}; - -/// NarrowVector - Given a value in the V128 register class, produce the -/// equivalent value in the V64 register class. -static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) { - EVT VT = V128Reg.getValueType(); - unsigned WideSize = VT.getVectorNumElements(); - MVT EltTy = VT.getVectorElementType().getSimpleVT(); - MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2); - - return DAG.getTargetExtractSubreg(ARM64::dsub, SDLoc(V128Reg), NarrowTy, - V128Reg); -} - -SDNode *ARM64DAGToDAGISel::SelectLoadLane(SDNode *N, unsigned NumVecs, - unsigned Opc) { - SDLoc dl(N); - EVT VT = N->getValueType(0); - bool Narrow = VT.getSizeInBits() == 64; - - // Form a REG_SEQUENCE to force register allocation. 
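The WidenVector / NarrowVector helpers above only change the lane count, never the element type (the D register is simply the low half of the Q register, reached via dsub); a trivial sketch of that correspondence:

#include <cstdio>

int main() {
  struct { const char *Narrow, *Wide; unsigned NarrowLanes; } Pairs[] = {
    { "v8i8",  "v16i8", 8 },
    { "v4i16", "v8i16", 4 },
    { "v2i32", "v4i32", 2 },
  };
  for (auto &P : Pairs)
    std::printf("%s (%u lanes) <-> %s (%u lanes)\n",
                P.Narrow, P.NarrowLanes, P.Wide, 2 * P.NarrowLanes);
  return 0;
}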
- SmallVector Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs); - - if (Narrow) - std::transform(Regs.begin(), Regs.end(), Regs.begin(), - WidenVector(*CurDAG)); - - SDValue RegSeq = createQTuple(Regs); - - std::vector ResTys; - ResTys.push_back(MVT::Untyped); - ResTys.push_back(MVT::Other); - - unsigned LaneNo = - cast(N->getOperand(NumVecs + 2))->getZExtValue(); - - SmallVector Ops; - Ops.push_back(RegSeq); - Ops.push_back(CurDAG->getTargetConstant(LaneNo, MVT::i64)); - Ops.push_back(N->getOperand(NumVecs + 3)); - Ops.push_back(N->getOperand(0)); - SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); - SDValue SuperReg = SDValue(Ld, 0); - - EVT WideVT = RegSeq.getOperand(1)->getValueType(0); - static unsigned QSubs[] = { ARM64::qsub0, ARM64::qsub1, ARM64::qsub2, - ARM64::qsub3 }; - for (unsigned i = 0; i < NumVecs; ++i) { - SDValue NV = CurDAG->getTargetExtractSubreg(QSubs[i], dl, WideVT, SuperReg); - if (Narrow) - NV = NarrowVector(NV, *CurDAG); - ReplaceUses(SDValue(N, i), NV); - } - - ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 1)); - - return Ld; -} - -SDNode *ARM64DAGToDAGISel::SelectPostLoadLane(SDNode *N, unsigned NumVecs, - unsigned Opc) { - SDLoc dl(N); - EVT VT = N->getValueType(0); - bool Narrow = VT.getSizeInBits() == 64; - - // Form a REG_SEQUENCE to force register allocation. - SmallVector Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs); - - if (Narrow) - std::transform(Regs.begin(), Regs.end(), Regs.begin(), - WidenVector(*CurDAG)); - - SDValue RegSeq = createQTuple(Regs); - - std::vector ResTys; - ResTys.push_back(MVT::i64); // Type of the write back register - ResTys.push_back(MVT::Untyped); - ResTys.push_back(MVT::Other); - - unsigned LaneNo = - cast(N->getOperand(NumVecs + 1))->getZExtValue(); - - SmallVector Ops; - Ops.push_back(RegSeq); - Ops.push_back(CurDAG->getTargetConstant(LaneNo, MVT::i64)); // Lane Number - Ops.push_back(N->getOperand(NumVecs + 2)); // Base register - Ops.push_back(N->getOperand(NumVecs + 3)); // Incremental - Ops.push_back(N->getOperand(0)); - SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); - - // Update uses of the write back register - ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 0)); - - // Update uses of the vector list - SDValue SuperReg = SDValue(Ld, 1); - if (NumVecs == 1) { - ReplaceUses(SDValue(N, 0), - Narrow ? NarrowVector(SuperReg, *CurDAG) : SuperReg); - } else { - EVT WideVT = RegSeq.getOperand(1)->getValueType(0); - static unsigned QSubs[] = { ARM64::qsub0, ARM64::qsub1, ARM64::qsub2, - ARM64::qsub3 }; - for (unsigned i = 0; i < NumVecs; ++i) { - SDValue NV = CurDAG->getTargetExtractSubreg(QSubs[i], dl, WideVT, - SuperReg); - if (Narrow) - NV = NarrowVector(NV, *CurDAG); - ReplaceUses(SDValue(N, i), NV); - } - } - - // Update the Chain - ReplaceUses(SDValue(N, NumVecs + 1), SDValue(Ld, 2)); - - return Ld; -} - -SDNode *ARM64DAGToDAGISel::SelectStoreLane(SDNode *N, unsigned NumVecs, - unsigned Opc) { - SDLoc dl(N); - EVT VT = N->getOperand(2)->getValueType(0); - bool Narrow = VT.getSizeInBits() == 64; - - // Form a REG_SEQUENCE to force register allocation. 
- SmallVector Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs); - - if (Narrow) - std::transform(Regs.begin(), Regs.end(), Regs.begin(), - WidenVector(*CurDAG)); - - SDValue RegSeq = createQTuple(Regs); - - unsigned LaneNo = - cast(N->getOperand(NumVecs + 2))->getZExtValue(); - - SmallVector Ops; - Ops.push_back(RegSeq); - Ops.push_back(CurDAG->getTargetConstant(LaneNo, MVT::i64)); - Ops.push_back(N->getOperand(NumVecs + 3)); - Ops.push_back(N->getOperand(0)); - SDNode *St = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops); - - // Transfer memoperands. - MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); - MemOp[0] = cast(N)->getMemOperand(); - cast(St)->setMemRefs(MemOp, MemOp + 1); - - return St; -} - -SDNode *ARM64DAGToDAGISel::SelectPostStoreLane(SDNode *N, unsigned NumVecs, - unsigned Opc) { - SDLoc dl(N); - EVT VT = N->getOperand(2)->getValueType(0); - bool Narrow = VT.getSizeInBits() == 64; - - // Form a REG_SEQUENCE to force register allocation. - SmallVector Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs); - - if (Narrow) - std::transform(Regs.begin(), Regs.end(), Regs.begin(), - WidenVector(*CurDAG)); - - SDValue RegSeq = createQTuple(Regs); - - SmallVector ResTys; - ResTys.push_back(MVT::i64); // Type of the write back register - ResTys.push_back(MVT::Other); - - unsigned LaneNo = - cast(N->getOperand(NumVecs + 1))->getZExtValue(); - - SmallVector Ops; - Ops.push_back(RegSeq); - Ops.push_back(CurDAG->getTargetConstant(LaneNo, MVT::i64)); - Ops.push_back(N->getOperand(NumVecs + 2)); // Base Register - Ops.push_back(N->getOperand(NumVecs + 3)); // Incremental - Ops.push_back(N->getOperand(0)); - SDNode *St = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); - - // Transfer memoperands. - MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); - MemOp[0] = cast(N)->getMemOperand(); - cast(St)->setMemRefs(MemOp, MemOp + 1); - - return St; -} - -static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N, - unsigned &Opc, SDValue &Opd0, - unsigned &LSB, unsigned &MSB, - unsigned NumberOfIgnoredLowBits, - bool BiggerPattern) { - assert(N->getOpcode() == ISD::AND && - "N must be a AND operation to call this function"); - - EVT VT = N->getValueType(0); - - // Here we can test the type of VT and return false when the type does not - // match, but since it is done prior to that call in the current context - // we turned that into an assert to avoid redundant code. - assert((VT == MVT::i32 || VT == MVT::i64) && - "Type checking must have been done before calling this function"); - - // FIXME: simplify-demanded-bits in DAGCombine will probably have - // changed the AND node to a 32-bit mask operation. We'll have to - // undo that as part of the transform here if we want to catch all - // the opportunities. - // Currently the NumberOfIgnoredLowBits argument helps to recover - // form these situations when matching bigger pattern (bitfield insert). - - // For unsigned extracts, check for a shift right and mask - uint64_t And_imm = 0; - if (!isOpcWithIntImmediate(N, ISD::AND, And_imm)) - return false; - - const SDNode *Op0 = N->getOperand(0).getNode(); - - // Because of simplify-demanded-bits in DAGCombine, the mask may have been - // simplified. Try to undo that - And_imm |= (1 << NumberOfIgnoredLowBits) - 1; - - // The immediate is a mask of the low bits iff imm & (imm+1) == 0 - if (And_imm & (And_imm + 1)) - return false; - - bool ClampMSB = false; - uint64_t Srl_imm = 0; - // Handle the SRL + ANY_EXTEND case. 
- if (VT == MVT::i64 && Op0->getOpcode() == ISD::ANY_EXTEND && - isOpcWithIntImmediate(Op0->getOperand(0).getNode(), ISD::SRL, Srl_imm)) { - // Extend the incoming operand of the SRL to 64-bit. - Opd0 = Widen(CurDAG, Op0->getOperand(0).getOperand(0)); - // Make sure to clamp the MSB so that we preserve the semantics of the - // original operations. - ClampMSB = true; - } else if (VT == MVT::i32 && Op0->getOpcode() == ISD::TRUNCATE && - isOpcWithIntImmediate(Op0->getOperand(0).getNode(), ISD::SRL, - Srl_imm)) { - // If the shift result was truncated, we can still combine them. - Opd0 = Op0->getOperand(0).getOperand(0); - - // Use the type of SRL node. - VT = Opd0->getValueType(0); - } else if (isOpcWithIntImmediate(Op0, ISD::SRL, Srl_imm)) { - Opd0 = Op0->getOperand(0); - } else if (BiggerPattern) { - // Let's pretend a 0 shift right has been performed. - // The resulting code will be at least as good as the original one - // plus it may expose more opportunities for bitfield insert pattern. - // FIXME: Currently we limit this to the bigger pattern, because - // some optimizations expect AND and not UBFM - Opd0 = N->getOperand(0); - } else - return false; - - assert((BiggerPattern || (Srl_imm > 0 && Srl_imm < VT.getSizeInBits())) && - "bad amount in shift node!"); - - LSB = Srl_imm; - MSB = Srl_imm + (VT == MVT::i32 ? CountTrailingOnes_32(And_imm) - : CountTrailingOnes_64(And_imm)) - - 1; - if (ClampMSB) - // Since we're moving the extend before the right shift operation, we need - // to clamp the MSB to make sure we don't shift in undefined bits instead of - // the zeros which would get shifted in with the original right shift - // operation. - MSB = MSB > 31 ? 31 : MSB; - - Opc = VT == MVT::i32 ? ARM64::UBFMWri : ARM64::UBFMXri; - return true; -} - -static bool isOneBitExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0, - unsigned &LSB, unsigned &MSB) { - // We are looking for the following pattern which basically extracts a single - // bit from the source value and places it in the LSB of the destination - // value, all other bits of the destination value or set to zero: - // - // Value2 = AND Value, MaskImm - // SRL Value2, ShiftImm - // - // with MaskImm >> ShiftImm == 1. - // - // This gets selected into a single UBFM: - // - // UBFM Value, ShiftImm, ShiftImm - // - - if (N->getOpcode() != ISD::SRL) - return false; - - uint64_t And_mask = 0; - if (!isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::AND, And_mask)) - return false; - - Opd0 = N->getOperand(0).getOperand(0); - - uint64_t Srl_imm = 0; - if (!isIntImmediate(N->getOperand(1), Srl_imm)) - return false; - - // Check whether we really have a one bit extract here. - if (And_mask >> Srl_imm == 0x1) { - if (N->getValueType(0) == MVT::i32) - Opc = ARM64::UBFMWri; - else - Opc = ARM64::UBFMXri; - - LSB = MSB = Srl_imm; - - return true; - } - - return false; -} - -static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0, - unsigned &LSB, unsigned &MSB, - bool BiggerPattern) { - assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) && - "N must be a SHR/SRA operation to call this function"); - - EVT VT = N->getValueType(0); - - // Here we can test the type of VT and return false when the type does not - // match, but since it is done prior to that call in the current context - // we turned that into an assert to avoid redundant code. 
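A standalone restatement of the AND+SRL extract check above, with the UBFM operands it would produce; the helper name is illustrative, and the one-bit pattern handled by isOneBitExtractOpFromShr lands on the same single-bit UBFM:

#include <cstdint>
#include <cstdio>

// (x >> ShiftImm) & MaskImm becomes UBFM x, LSB, MSB when MaskImm is a mask
// of contiguous low bits.
static bool matchUbfm(uint64_t Mask, unsigned Shift, unsigned &LSB, unsigned &MSB) {
  if (Mask & (Mask + 1))        // not a mask of the low bits
    return false;
  LSB = Shift;
  MSB = Shift + __builtin_popcountll(Mask) - 1;
  return true;
}

int main() {
  unsigned LSB, MSB;
  // (x >> 7) & 0x1f: a five-bit field starting at bit 7.
  if (matchUbfm(0x1f, 7, LSB, MSB))
    std::printf("ubfm w0, w1, #%u, #%u\n", LSB, MSB);   // #7, #11
  // (x >> 3) & 1: a single-bit extract, so LSB == MSB.
  if (matchUbfm(0x1, 3, LSB, MSB))
    std::printf("ubfm w0, w1, #%u, #%u\n", LSB, MSB);   // #3, #3
  return 0;
}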
- assert((VT == MVT::i32 || VT == MVT::i64) && - "Type checking must have been done before calling this function"); - - // Check for AND + SRL doing a one bit extract. - if (isOneBitExtractOpFromShr(N, Opc, Opd0, LSB, MSB)) - return true; - - // we're looking for a shift of a shift - uint64_t Shl_imm = 0; - uint64_t Trunc_bits = 0; - if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SHL, Shl_imm)) { - Opd0 = N->getOperand(0).getOperand(0); - } else if (VT == MVT::i32 && N->getOpcode() == ISD::SRL && - N->getOperand(0).getNode()->getOpcode() == ISD::TRUNCATE) { - // We are looking for a shift of truncate. Truncate from i64 to i32 could - // be considered as setting high 32 bits as zero. Our strategy here is to - // always generate 64bit UBFM. This consistency will help the CSE pass - // later find more redundancy. - Opd0 = N->getOperand(0).getOperand(0); - Trunc_bits = Opd0->getValueType(0).getSizeInBits() - VT.getSizeInBits(); - VT = Opd0->getValueType(0); - assert(VT == MVT::i64 && "the promoted type should be i64"); - } else if (BiggerPattern) { - // Let's pretend a 0 shift left has been performed. - // FIXME: Currently we limit this to the bigger pattern case, - // because some optimizations expect AND and not UBFM - Opd0 = N->getOperand(0); - } else - return false; - - assert(Shl_imm < VT.getSizeInBits() && "bad amount in shift node!"); - uint64_t Srl_imm = 0; - if (!isIntImmediate(N->getOperand(1), Srl_imm)) - return false; - - assert(Srl_imm > 0 && Srl_imm < VT.getSizeInBits() && - "bad amount in shift node!"); - // Note: The width operand is encoded as width-1. - unsigned Width = VT.getSizeInBits() - Trunc_bits - Srl_imm - 1; - int sLSB = Srl_imm - Shl_imm; - if (sLSB < 0) - return false; - LSB = sLSB; - MSB = LSB + Width; - // SRA requires a signed extraction - if (VT == MVT::i32) - Opc = N->getOpcode() == ISD::SRA ? ARM64::SBFMWri : ARM64::UBFMWri; - else - Opc = N->getOpcode() == ISD::SRA ? ARM64::SBFMXri : ARM64::UBFMXri; - return true; -} - -static bool isBitfieldExtractOp(SelectionDAG *CurDAG, SDNode *N, unsigned &Opc, - SDValue &Opd0, unsigned &LSB, unsigned &MSB, - unsigned NumberOfIgnoredLowBits = 0, - bool BiggerPattern = false) { - if (N->getValueType(0) != MVT::i32 && N->getValueType(0) != MVT::i64) - return false; - - switch (N->getOpcode()) { - default: - if (!N->isMachineOpcode()) - return false; - break; - case ISD::AND: - return isBitfieldExtractOpFromAnd(CurDAG, N, Opc, Opd0, LSB, MSB, - NumberOfIgnoredLowBits, BiggerPattern); - case ISD::SRL: - case ISD::SRA: - return isBitfieldExtractOpFromShr(N, Opc, Opd0, LSB, MSB, BiggerPattern); - } - - unsigned NOpc = N->getMachineOpcode(); - switch (NOpc) { - default: - return false; - case ARM64::SBFMWri: - case ARM64::UBFMWri: - case ARM64::SBFMXri: - case ARM64::UBFMXri: - Opc = NOpc; - Opd0 = N->getOperand(0); - LSB = cast(N->getOperand(1).getNode())->getZExtValue(); - MSB = cast(N->getOperand(2).getNode())->getZExtValue(); - return true; - } - // Unreachable - return false; -} - -SDNode *ARM64DAGToDAGISel::SelectBitfieldExtractOp(SDNode *N) { - unsigned Opc, LSB, MSB; - SDValue Opd0; - if (!isBitfieldExtractOp(CurDAG, N, Opc, Opd0, LSB, MSB)) - return nullptr; - - EVT VT = N->getValueType(0); - - // If the bit extract operation is 64bit but the original type is 32bit, we - // need to add one EXTRACT_SUBREG. 
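A worked instance of the SHL+SRA case above (ignoring the i32-truncate variant), showing how LSB and MSB fall out for a 32-bit value:

#include <cstdio>

int main() {
  // y = (x << 8) >> 20, with an arithmetic shift right, extracts a
  // sign-extended 12-bit field starting at bit 12 of x.
  const unsigned BitSize = 32, Shl = 8, Srl = 20;
  const unsigned Width = BitSize - Srl - 1;     // width - 1, as encoded
  const unsigned LSB = Srl - Shl;
  const unsigned MSB = LSB + Width;
  std::printf("sbfm w0, w1, #%u, #%u   // field x[%u:%u], sign-extended\n",
              LSB, MSB, MSB, LSB);              // #12, #23
  return 0;
}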
- if ((Opc == ARM64::SBFMXri || Opc == ARM64::UBFMXri) && VT == MVT::i32) { - SDValue Ops64[] = {Opd0, CurDAG->getTargetConstant(LSB, MVT::i64), - CurDAG->getTargetConstant(MSB, MVT::i64)}; - - SDNode *BFM = CurDAG->getMachineNode(Opc, SDLoc(N), MVT::i64, Ops64); - SDValue SubReg = CurDAG->getTargetConstant(ARM64::sub_32, MVT::i32); - MachineSDNode *Node = - CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SDLoc(N), MVT::i32, - SDValue(BFM, 0), SubReg); - return Node; - } - - SDValue Ops[] = {Opd0, CurDAG->getTargetConstant(LSB, VT), - CurDAG->getTargetConstant(MSB, VT)}; - return CurDAG->SelectNodeTo(N, Opc, VT, Ops); -} - -/// Does DstMask form a complementary pair with the mask provided by -/// BitsToBeInserted, suitable for use in a BFI instruction. Roughly speaking, -/// this asks whether DstMask zeroes precisely those bits that will be set by -/// the other half. -static bool isBitfieldDstMask(uint64_t DstMask, APInt BitsToBeInserted, - unsigned NumberOfIgnoredHighBits, EVT VT) { - assert((VT == MVT::i32 || VT == MVT::i64) && - "i32 or i64 mask type expected!"); - unsigned BitWidth = VT.getSizeInBits() - NumberOfIgnoredHighBits; - - APInt SignificantDstMask = APInt(BitWidth, DstMask); - APInt SignificantBitsToBeInserted = BitsToBeInserted.zextOrTrunc(BitWidth); - - return (SignificantDstMask & SignificantBitsToBeInserted) == 0 && - (SignificantDstMask | SignificantBitsToBeInserted).isAllOnesValue(); -} - -// Look for bits that will be useful for later uses. -// A bit is consider useless as soon as it is dropped and never used -// before it as been dropped. -// E.g., looking for useful bit of x -// 1. y = x & 0x7 -// 2. z = y >> 2 -// After #1, x useful bits are 0x7, then the useful bits of x, live through -// y. -// After #2, the useful bits of x are 0x4. -// However, if x is used on an unpredicatable instruction, then all its bits -// are useful. -// E.g. -// 1. y = x & 0x7 -// 2. z = y >> 2 -// 3. 
str x, [@x] -static void getUsefulBits(SDValue Op, APInt &UsefulBits, unsigned Depth = 0); - -static void getUsefulBitsFromAndWithImmediate(SDValue Op, APInt &UsefulBits, - unsigned Depth) { - uint64_t Imm = - cast(Op.getOperand(1).getNode())->getZExtValue(); - Imm = ARM64_AM::decodeLogicalImmediate(Imm, UsefulBits.getBitWidth()); - UsefulBits &= APInt(UsefulBits.getBitWidth(), Imm); - getUsefulBits(Op, UsefulBits, Depth + 1); -} - -static void getUsefulBitsFromBitfieldMoveOpd(SDValue Op, APInt &UsefulBits, - uint64_t Imm, uint64_t MSB, - unsigned Depth) { - // inherit the bitwidth value - APInt OpUsefulBits(UsefulBits); - OpUsefulBits = 1; - - if (MSB >= Imm) { - OpUsefulBits = OpUsefulBits.shl(MSB - Imm + 1); - --OpUsefulBits; - // The interesting part will be in the lower part of the result - getUsefulBits(Op, OpUsefulBits, Depth + 1); - // The interesting part was starting at Imm in the argument - OpUsefulBits = OpUsefulBits.shl(Imm); - } else { - OpUsefulBits = OpUsefulBits.shl(MSB + 1); - --OpUsefulBits; - // The interesting part will be shifted in the result - OpUsefulBits = OpUsefulBits.shl(OpUsefulBits.getBitWidth() - Imm); - getUsefulBits(Op, OpUsefulBits, Depth + 1); - // The interesting part was at zero in the argument - OpUsefulBits = OpUsefulBits.lshr(OpUsefulBits.getBitWidth() - Imm); - } - - UsefulBits &= OpUsefulBits; -} - -static void getUsefulBitsFromUBFM(SDValue Op, APInt &UsefulBits, - unsigned Depth) { - uint64_t Imm = - cast(Op.getOperand(1).getNode())->getZExtValue(); - uint64_t MSB = - cast(Op.getOperand(2).getNode())->getZExtValue(); - - getUsefulBitsFromBitfieldMoveOpd(Op, UsefulBits, Imm, MSB, Depth); -} - -static void getUsefulBitsFromOrWithShiftedReg(SDValue Op, APInt &UsefulBits, - unsigned Depth) { - uint64_t ShiftTypeAndValue = - cast(Op.getOperand(2).getNode())->getZExtValue(); - APInt Mask(UsefulBits); - Mask.clearAllBits(); - Mask.flipAllBits(); - - if (ARM64_AM::getShiftType(ShiftTypeAndValue) == ARM64_AM::LSL) { - // Shift Left - uint64_t ShiftAmt = ARM64_AM::getShiftValue(ShiftTypeAndValue); - Mask = Mask.shl(ShiftAmt); - getUsefulBits(Op, Mask, Depth + 1); - Mask = Mask.lshr(ShiftAmt); - } else if (ARM64_AM::getShiftType(ShiftTypeAndValue) == ARM64_AM::LSR) { - // Shift Right - // We do not handle ARM64_AM::ASR, because the sign will change the - // number of useful bits - uint64_t ShiftAmt = ARM64_AM::getShiftValue(ShiftTypeAndValue); - Mask = Mask.lshr(ShiftAmt); - getUsefulBits(Op, Mask, Depth + 1); - Mask = Mask.shl(ShiftAmt); - } else - return; - - UsefulBits &= Mask; -} - -static void getUsefulBitsFromBFM(SDValue Op, SDValue Orig, APInt &UsefulBits, - unsigned Depth) { - uint64_t Imm = - cast(Op.getOperand(2).getNode())->getZExtValue(); - uint64_t MSB = - cast(Op.getOperand(3).getNode())->getZExtValue(); - - if (Op.getOperand(1) == Orig) - return getUsefulBitsFromBitfieldMoveOpd(Op, UsefulBits, Imm, MSB, Depth); - - APInt OpUsefulBits(UsefulBits); - OpUsefulBits = 1; - - if (MSB >= Imm) { - OpUsefulBits = OpUsefulBits.shl(MSB - Imm + 1); - --OpUsefulBits; - UsefulBits &= ~OpUsefulBits; - getUsefulBits(Op, UsefulBits, Depth + 1); - } else { - OpUsefulBits = OpUsefulBits.shl(MSB + 1); - --OpUsefulBits; - UsefulBits = ~(OpUsefulBits.shl(OpUsefulBits.getBitWidth() - Imm)); - getUsefulBits(Op, UsefulBits, Depth + 1); - } -} - -static void getUsefulBitsForUse(SDNode *UserNode, APInt &UsefulBits, - SDValue Orig, unsigned Depth) { - - // Users of this node should have already been instruction selected - // FIXME: Can we turn that into an assert? 
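  // Editor's note: each helper below narrows the per-use copy of UsefulBits
  // to the bits of Orig that the recognized user (AND, UBFM, shifted ORR or
  // BFM) actually reads; for any other user the copy is left untouched, so
  // that use is conservatively assumed to need every bit.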
- if (!UserNode->isMachineOpcode()) - return; - - switch (UserNode->getMachineOpcode()) { - default: - return; - case ARM64::ANDSWri: - case ARM64::ANDSXri: - case ARM64::ANDWri: - case ARM64::ANDXri: - // We increment Depth only when we call the getUsefulBits - return getUsefulBitsFromAndWithImmediate(SDValue(UserNode, 0), UsefulBits, - Depth); - case ARM64::UBFMWri: - case ARM64::UBFMXri: - return getUsefulBitsFromUBFM(SDValue(UserNode, 0), UsefulBits, Depth); - - case ARM64::ORRWrs: - case ARM64::ORRXrs: - if (UserNode->getOperand(1) != Orig) - return; - return getUsefulBitsFromOrWithShiftedReg(SDValue(UserNode, 0), UsefulBits, - Depth); - case ARM64::BFMWri: - case ARM64::BFMXri: - return getUsefulBitsFromBFM(SDValue(UserNode, 0), Orig, UsefulBits, Depth); - } -} - -static void getUsefulBits(SDValue Op, APInt &UsefulBits, unsigned Depth) { - if (Depth >= 6) - return; - // Initialize UsefulBits - if (!Depth) { - unsigned Bitwidth = Op.getValueType().getScalarType().getSizeInBits(); - // At the beginning, assume every produced bits is useful - UsefulBits = APInt(Bitwidth, 0); - UsefulBits.flipAllBits(); - } - APInt UsersUsefulBits(UsefulBits.getBitWidth(), 0); - - for (SDNode *Node : Op.getNode()->uses()) { - // A use cannot produce useful bits - APInt UsefulBitsForUse = APInt(UsefulBits); - getUsefulBitsForUse(Node, UsefulBitsForUse, Op, Depth); - UsersUsefulBits |= UsefulBitsForUse; - } - // UsefulBits contains the produced bits that are meaningful for the - // current definition, thus a user cannot make a bit meaningful at - // this point - UsefulBits &= UsersUsefulBits; -} - -/// Create a machine node performing a notional SHL of Op by ShlAmount. If -/// ShlAmount is negative, do a (logical) right-shift instead. If ShlAmount is -/// 0, return Op unchanged. -static SDValue getLeftShift(SelectionDAG *CurDAG, SDValue Op, int ShlAmount) { - if (ShlAmount == 0) - return Op; - - EVT VT = Op.getValueType(); - unsigned BitWidth = VT.getSizeInBits(); - unsigned UBFMOpc = BitWidth == 32 ? ARM64::UBFMWri : ARM64::UBFMXri; - - SDNode *ShiftNode; - if (ShlAmount > 0) { - // LSL wD, wN, #Amt == UBFM wD, wN, #32-Amt, #31-Amt - ShiftNode = CurDAG->getMachineNode( - UBFMOpc, SDLoc(Op), VT, Op, - CurDAG->getTargetConstant(BitWidth - ShlAmount, VT), - CurDAG->getTargetConstant(BitWidth - 1 - ShlAmount, VT)); - } else { - // LSR wD, wN, #Amt == UBFM wD, wN, #Amt, #32-1 - assert(ShlAmount < 0 && "expected right shift"); - int ShrAmount = -ShlAmount; - ShiftNode = CurDAG->getMachineNode( - UBFMOpc, SDLoc(Op), VT, Op, CurDAG->getTargetConstant(ShrAmount, VT), - CurDAG->getTargetConstant(BitWidth - 1, VT)); - } - - return SDValue(ShiftNode, 0); -} - -/// Does this tree qualify as an attempt to move a bitfield into position, -/// essentially "(and (shl VAL, N), Mask)". -static bool isBitfieldPositioningOp(SelectionDAG *CurDAG, SDValue Op, - SDValue &Src, int &ShiftAmount, - int &MaskWidth) { - EVT VT = Op.getValueType(); - unsigned BitWidth = VT.getSizeInBits(); - (void)BitWidth; - assert(BitWidth == 32 || BitWidth == 64); - - APInt KnownZero, KnownOne; - CurDAG->computeKnownBits(Op, KnownZero, KnownOne); - - // Non-zero in the sense that they're not provably zero, which is the key - // point if we want to use this value - uint64_t NonZeroBits = (~KnownZero).getZExtValue(); - - // Discard a constant AND mask if present. It's safe because the node will - // already have been factored into the computeKnownBits calculation above. 
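  // Editor's example (not from the original sources) of the overall pattern
  // handled here:
  //   %t = shl i32 %val, 4
  //   %r = and i32 %t, 0xFF0
  // gives NonZeroBits == 0xFF0, hence ShiftAmount == 4 and MaskWidth == 8,
  // and Src is %val itself: no extra LSL/LSR is needed because the SHL
  // amount already matches the position of the mask.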
- uint64_t AndImm; - if (isOpcWithIntImmediate(Op.getNode(), ISD::AND, AndImm)) { - assert((~APInt(BitWidth, AndImm) & ~KnownZero) == 0); - Op = Op.getOperand(0); - } - - uint64_t ShlImm; - if (!isOpcWithIntImmediate(Op.getNode(), ISD::SHL, ShlImm)) - return false; - Op = Op.getOperand(0); - - if (!isShiftedMask_64(NonZeroBits)) - return false; - - ShiftAmount = countTrailingZeros(NonZeroBits); - MaskWidth = CountTrailingOnes_64(NonZeroBits >> ShiftAmount); - - // BFI encompasses sufficiently many nodes that it's worth inserting an extra - // LSL/LSR if the mask in NonZeroBits doesn't quite match up with the ISD::SHL - // amount. - Src = getLeftShift(CurDAG, Op, ShlImm - ShiftAmount); - - return true; -} - -// Given a OR operation, check if we have the following pattern -// ubfm c, b, imm, imm2 (or something that does the same jobs, see -// isBitfieldExtractOp) -// d = e & mask2 ; where mask is a binary sequence of 1..10..0 and -// countTrailingZeros(mask2) == imm2 - imm + 1 -// f = d | c -// if yes, given reference arguments will be update so that one can replace -// the OR instruction with: -// f = Opc Opd0, Opd1, LSB, MSB ; where Opc is a BFM, LSB = imm, and MSB = imm2 -static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Dst, - SDValue &Src, unsigned &ImmR, - unsigned &ImmS, SelectionDAG *CurDAG) { - assert(N->getOpcode() == ISD::OR && "Expect a OR operation"); - - // Set Opc - EVT VT = N->getValueType(0); - if (VT == MVT::i32) - Opc = ARM64::BFMWri; - else if (VT == MVT::i64) - Opc = ARM64::BFMXri; - else - return false; - - // Because of simplify-demanded-bits in DAGCombine, involved masks may not - // have the expected shape. Try to undo that. - APInt UsefulBits; - getUsefulBits(SDValue(N, 0), UsefulBits); - - unsigned NumberOfIgnoredLowBits = UsefulBits.countTrailingZeros(); - unsigned NumberOfIgnoredHighBits = UsefulBits.countLeadingZeros(); - - // OR is commutative, check both possibilities (does llvm provide a - // way to do that directely, e.g., via code matcher?) - SDValue OrOpd1Val = N->getOperand(1); - SDNode *OrOpd0 = N->getOperand(0).getNode(); - SDNode *OrOpd1 = N->getOperand(1).getNode(); - for (int i = 0; i < 2; - ++i, std::swap(OrOpd0, OrOpd1), OrOpd1Val = N->getOperand(0)) { - unsigned BFXOpc; - int DstLSB, Width; - if (isBitfieldExtractOp(CurDAG, OrOpd0, BFXOpc, Src, ImmR, ImmS, - NumberOfIgnoredLowBits, true)) { - // Check that the returned opcode is compatible with the pattern, - // i.e., same type and zero extended (U and not S) - if ((BFXOpc != ARM64::UBFMXri && VT == MVT::i64) || - (BFXOpc != ARM64::UBFMWri && VT == MVT::i32)) - continue; - - // Compute the width of the bitfield insertion - DstLSB = 0; - Width = ImmS - ImmR + 1; - // FIXME: This constraint is to catch bitfield insertion we may - // want to widen the pattern if we want to grab general bitfied - // move case - if (Width <= 0) - continue; - - // If the mask on the insertee is correct, we have a BFXIL operation. We - // can share the ImmR and ImmS values from the already-computed UBFM. - } else if (isBitfieldPositioningOp(CurDAG, SDValue(OrOpd0, 0), Src, - DstLSB, Width)) { - ImmR = (VT.getSizeInBits() - DstLSB) % VT.getSizeInBits(); - ImmS = Width - 1; - } else - continue; - - // Check the second part of the pattern - EVT VT = OrOpd1->getValueType(0); - assert((VT == MVT::i32 || VT == MVT::i64) && "unexpected OR operand"); - - // Compute the Known Zero for the candidate of the first operand. - // This allows to catch more general case than just looking for - // AND with imm. 
Indeed, simplify-demanded-bits may have removed - // the AND instruction because it proves it was useless. - APInt KnownZero, KnownOne; - CurDAG->computeKnownBits(OrOpd1Val, KnownZero, KnownOne); - - // Check if there is enough room for the second operand to appear - // in the first one - APInt BitsToBeInserted = - APInt::getBitsSet(KnownZero.getBitWidth(), DstLSB, DstLSB + Width); - - if ((BitsToBeInserted & ~KnownZero) != 0) - continue; - - // Set the first operand - uint64_t Imm; - if (isOpcWithIntImmediate(OrOpd1, ISD::AND, Imm) && - isBitfieldDstMask(Imm, BitsToBeInserted, NumberOfIgnoredHighBits, VT)) - // In that case, we can eliminate the AND - Dst = OrOpd1->getOperand(0); - else - // Maybe the AND has been removed by simplify-demanded-bits - // or is useful because it discards more bits - Dst = OrOpd1Val; - - // both parts match - return true; - } - - return false; -} - -SDNode *ARM64DAGToDAGISel::SelectBitfieldInsertOp(SDNode *N) { - if (N->getOpcode() != ISD::OR) - return nullptr; - - unsigned Opc; - unsigned LSB, MSB; - SDValue Opd0, Opd1; - - if (!isBitfieldInsertOpFromOr(N, Opc, Opd0, Opd1, LSB, MSB, CurDAG)) - return nullptr; - - EVT VT = N->getValueType(0); - SDValue Ops[] = { Opd0, - Opd1, - CurDAG->getTargetConstant(LSB, VT), - CurDAG->getTargetConstant(MSB, VT) }; - return CurDAG->SelectNodeTo(N, Opc, VT, Ops); -} - -SDNode *ARM64DAGToDAGISel::SelectLIBM(SDNode *N) { - EVT VT = N->getValueType(0); - unsigned Variant; - unsigned Opc; - unsigned FRINTXOpcs[] = { ARM64::FRINTXSr, ARM64::FRINTXDr }; - - if (VT == MVT::f32) { - Variant = 0; - } else if (VT == MVT::f64) { - Variant = 1; - } else - return nullptr; // Unrecognized argument type. Fall back on default codegen. - - // Pick the FRINTX variant needed to set the flags. - unsigned FRINTXOpc = FRINTXOpcs[Variant]; - - switch (N->getOpcode()) { - default: - return nullptr; // Unrecognized libm ISD node. Fall back on default codegen. - case ISD::FCEIL: { - unsigned FRINTPOpcs[] = { ARM64::FRINTPSr, ARM64::FRINTPDr }; - Opc = FRINTPOpcs[Variant]; - break; - } - case ISD::FFLOOR: { - unsigned FRINTMOpcs[] = { ARM64::FRINTMSr, ARM64::FRINTMDr }; - Opc = FRINTMOpcs[Variant]; - break; - } - case ISD::FTRUNC: { - unsigned FRINTZOpcs[] = { ARM64::FRINTZSr, ARM64::FRINTZDr }; - Opc = FRINTZOpcs[Variant]; - break; - } - case ISD::FROUND: { - unsigned FRINTAOpcs[] = { ARM64::FRINTASr, ARM64::FRINTADr }; - Opc = FRINTAOpcs[Variant]; - break; - } - } - - SDLoc dl(N); - SDValue In = N->getOperand(0); - SmallVector Ops; - Ops.push_back(In); - - if (!TM.Options.UnsafeFPMath) { - SDNode *FRINTX = CurDAG->getMachineNode(FRINTXOpc, dl, VT, MVT::Glue, In); - Ops.push_back(SDValue(FRINTX, 1)); - } - - return CurDAG->getMachineNode(Opc, dl, VT, Ops); -} - -bool -ARM64DAGToDAGISel::SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos, - unsigned RegWidth) { - APFloat FVal(0.0); - if (ConstantFPSDNode *CN = dyn_cast(N)) - FVal = CN->getValueAPF(); - else if (LoadSDNode *LN = dyn_cast(N)) { - // Some otherwise illegal constants are allowed in this case. - if (LN->getOperand(1).getOpcode() != ARM64ISD::ADDlow || - !isa(LN->getOperand(1)->getOperand(1))) - return false; - - ConstantPoolSDNode *CN = - dyn_cast(LN->getOperand(1)->getOperand(1)); - FVal = cast(CN->getConstVal())->getValueAPF(); - } else - return false; - - // An FCVT[SU] instruction performs: convertToInt(Val * 2^fbits) where fbits - // is between 1 and 32 for a destination w-register, or 1 and 64 for an - // x-register. 
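  // Editor's example (values chosen for illustration): if the constant being
  // matched here is 256.0 == 2^8, the conversion below is exact, IntVal is a
  // power of two, and FBits == 8, so the multiply can be folded into the
  // #8-fractional-bits form of the conversion instruction.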
- // - // By this stage, we've detected (fp_to_[su]int (fmul Val, THIS_NODE)) so we - // want THIS_NODE to be 2^fbits. This is much easier to deal with using - // integers. - bool IsExact; - - // fbits is between 1 and 64 in the worst-case, which means the fmul - // could have 2^64 as an actual operand. Need 65 bits of precision. - APSInt IntVal(65, true); - FVal.convertToInteger(IntVal, APFloat::rmTowardZero, &IsExact); - - // N.b. isPowerOf2 also checks for > 0. - if (!IsExact || !IntVal.isPowerOf2()) return false; - unsigned FBits = IntVal.logBase2(); - - // Checks above should have guaranteed that we haven't lost information in - // finding FBits, but it must still be in range. - if (FBits == 0 || FBits > RegWidth) return false; - - FixedPos = CurDAG->getTargetConstant(FBits, MVT::i32); - return true; -} - -SDNode *ARM64DAGToDAGISel::Select(SDNode *Node) { - // Dump information about the Node being selected - DEBUG(errs() << "Selecting: "); - DEBUG(Node->dump(CurDAG)); - DEBUG(errs() << "\n"); - - // If we have a custom node, we already have selected! - if (Node->isMachineOpcode()) { - DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n"); - Node->setNodeId(-1); - return nullptr; - } - - // Few custom selection stuff. - SDNode *ResNode = nullptr; - EVT VT = Node->getValueType(0); - - switch (Node->getOpcode()) { - default: - break; - - case ISD::ADD: - if (SDNode *I = SelectMLAV64LaneV128(Node)) - return I; - break; - - case ISD::LOAD: { - // Try to select as an indexed load. Fall through to normal processing - // if we can't. - bool Done = false; - SDNode *I = SelectIndexedLoad(Node, Done); - if (Done) - return I; - break; - } - - case ISD::SRL: - case ISD::AND: - case ISD::SRA: - if (SDNode *I = SelectBitfieldExtractOp(Node)) - return I; - break; - - case ISD::OR: - if (SDNode *I = SelectBitfieldInsertOp(Node)) - return I; - break; - - case ISD::EXTRACT_VECTOR_ELT: { - // Extracting lane zero is a special case where we can just use a plain - // EXTRACT_SUBREG instruction, which will become FMOV. This is easier for - // the rest of the compiler, especially the register allocator and copyi - // propagation, to reason about, so is preferred when it's possible to - // use it. - ConstantSDNode *LaneNode = cast(Node->getOperand(1)); - // Bail and use the default Select() for non-zero lanes. - if (LaneNode->getZExtValue() != 0) - break; - // If the element type is not the same as the result type, likewise - // bail and use the default Select(), as there's more to do than just - // a cross-class COPY. This catches extracts of i8 and i16 elements - // since they will need an explicit zext. - if (VT != Node->getOperand(0).getValueType().getVectorElementType()) - break; - unsigned SubReg; - switch (Node->getOperand(0) - .getValueType() - .getVectorElementType() - .getSizeInBits()) { - default: - assert(0 && "Unexpected vector element type!"); - case 64: - SubReg = ARM64::dsub; - break; - case 32: - SubReg = ARM64::ssub; - break; - case 16: // FALLTHROUGH - case 8: - llvm_unreachable("unexpected zext-requiring extract element!"); - } - SDValue Extract = CurDAG->getTargetExtractSubreg(SubReg, SDLoc(Node), VT, - Node->getOperand(0)); - DEBUG(dbgs() << "ISEL: Custom selection!\n=> "); - DEBUG(Extract->dumpr(CurDAG)); - DEBUG(dbgs() << "\n"); - return Extract.getNode(); - } - case ISD::Constant: { - // Materialize zero constants as copies from WZR/XZR. This allows - // the coalescer to propagate these into other instructions. 
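    // Editor's note: concretely, an i32 zero becomes a CopyFromReg of WZR and
    // an i64 zero a CopyFromReg of XZR (see the code below), so later users
    // can read the zero register directly rather than a materialized constant.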
- ConstantSDNode *ConstNode = cast(Node); - if (ConstNode->isNullValue()) { - if (VT == MVT::i32) - return CurDAG->getCopyFromReg(CurDAG->getEntryNode(), SDLoc(Node), - ARM64::WZR, MVT::i32).getNode(); - else if (VT == MVT::i64) - return CurDAG->getCopyFromReg(CurDAG->getEntryNode(), SDLoc(Node), - ARM64::XZR, MVT::i64).getNode(); - } - break; - } - - case ISD::FrameIndex: { - // Selects to ADDXri FI, 0 which in turn will become ADDXri SP, imm. - int FI = cast(Node)->getIndex(); - unsigned Shifter = ARM64_AM::getShifterImm(ARM64_AM::LSL, 0); - const TargetLowering *TLI = getTargetLowering(); - SDValue TFI = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy()); - SDValue Ops[] = { TFI, CurDAG->getTargetConstant(0, MVT::i32), - CurDAG->getTargetConstant(Shifter, MVT::i32) }; - return CurDAG->SelectNodeTo(Node, ARM64::ADDXri, MVT::i64, Ops); - } - case ISD::INTRINSIC_W_CHAIN: { - unsigned IntNo = cast(Node->getOperand(1))->getZExtValue(); - switch (IntNo) { - default: - break; - case Intrinsic::arm64_ldaxp: - case Intrinsic::arm64_ldxp: { - unsigned Op = - IntNo == Intrinsic::arm64_ldaxp ? ARM64::LDAXPX : ARM64::LDXPX; - SDValue MemAddr = Node->getOperand(2); - SDLoc DL(Node); - SDValue Chain = Node->getOperand(0); - - SDNode *Ld = CurDAG->getMachineNode(Op, DL, MVT::i64, MVT::i64, - MVT::Other, MemAddr, Chain); - - // Transfer memoperands. - MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); - MemOp[0] = cast(Node)->getMemOperand(); - cast(Ld)->setMemRefs(MemOp, MemOp + 1); - return Ld; - } - case Intrinsic::arm64_stlxp: - case Intrinsic::arm64_stxp: { - unsigned Op = - IntNo == Intrinsic::arm64_stlxp ? ARM64::STLXPX : ARM64::STXPX; - SDLoc DL(Node); - SDValue Chain = Node->getOperand(0); - SDValue ValLo = Node->getOperand(2); - SDValue ValHi = Node->getOperand(3); - SDValue MemAddr = Node->getOperand(4); - - // Place arguments in the right order. - SmallVector Ops; - Ops.push_back(ValLo); - Ops.push_back(ValHi); - Ops.push_back(MemAddr); - Ops.push_back(Chain); - - SDNode *St = CurDAG->getMachineNode(Op, DL, MVT::i32, MVT::Other, Ops); - // Transfer memoperands. 
- MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); - MemOp[0] = cast(Node)->getMemOperand(); - cast(St)->setMemRefs(MemOp, MemOp + 1); - - return St; - } - case Intrinsic::arm64_neon_ld1x2: - if (VT == MVT::v8i8) - return SelectLoad(Node, 2, ARM64::LD1Twov8b, ARM64::dsub0); - else if (VT == MVT::v16i8) - return SelectLoad(Node, 2, ARM64::LD1Twov16b, ARM64::qsub0); - else if (VT == MVT::v4i16) - return SelectLoad(Node, 2, ARM64::LD1Twov4h, ARM64::dsub0); - else if (VT == MVT::v8i16) - return SelectLoad(Node, 2, ARM64::LD1Twov8h, ARM64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectLoad(Node, 2, ARM64::LD1Twov2s, ARM64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectLoad(Node, 2, ARM64::LD1Twov4s, ARM64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectLoad(Node, 2, ARM64::LD1Twov1d, ARM64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectLoad(Node, 2, ARM64::LD1Twov2d, ARM64::qsub0); - break; - case Intrinsic::arm64_neon_ld1x3: - if (VT == MVT::v8i8) - return SelectLoad(Node, 3, ARM64::LD1Threev8b, ARM64::dsub0); - else if (VT == MVT::v16i8) - return SelectLoad(Node, 3, ARM64::LD1Threev16b, ARM64::qsub0); - else if (VT == MVT::v4i16) - return SelectLoad(Node, 3, ARM64::LD1Threev4h, ARM64::dsub0); - else if (VT == MVT::v8i16) - return SelectLoad(Node, 3, ARM64::LD1Threev8h, ARM64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectLoad(Node, 3, ARM64::LD1Threev2s, ARM64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectLoad(Node, 3, ARM64::LD1Threev4s, ARM64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectLoad(Node, 3, ARM64::LD1Threev1d, ARM64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectLoad(Node, 3, ARM64::LD1Threev2d, ARM64::qsub0); - break; - case Intrinsic::arm64_neon_ld1x4: - if (VT == MVT::v8i8) - return SelectLoad(Node, 4, ARM64::LD1Fourv8b, ARM64::dsub0); - else if (VT == MVT::v16i8) - return SelectLoad(Node, 4, ARM64::LD1Fourv16b, ARM64::qsub0); - else if (VT == MVT::v4i16) - return SelectLoad(Node, 4, ARM64::LD1Fourv4h, ARM64::dsub0); - else if (VT == MVT::v8i16) - return SelectLoad(Node, 4, ARM64::LD1Fourv8h, ARM64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectLoad(Node, 4, ARM64::LD1Fourv2s, ARM64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectLoad(Node, 4, ARM64::LD1Fourv4s, ARM64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectLoad(Node, 4, ARM64::LD1Fourv1d, ARM64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectLoad(Node, 4, ARM64::LD1Fourv2d, ARM64::qsub0); - break; - case Intrinsic::arm64_neon_ld2: - if (VT == MVT::v8i8) - return SelectLoad(Node, 2, ARM64::LD2Twov8b, ARM64::dsub0); - else if (VT == MVT::v16i8) - return SelectLoad(Node, 2, ARM64::LD2Twov16b, ARM64::qsub0); - else if (VT == MVT::v4i16) - return SelectLoad(Node, 2, ARM64::LD2Twov4h, ARM64::dsub0); - else if (VT == MVT::v8i16) - return SelectLoad(Node, 2, ARM64::LD2Twov8h, ARM64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectLoad(Node, 2, ARM64::LD2Twov2s, ARM64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectLoad(Node, 2, ARM64::LD2Twov4s, ARM64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectLoad(Node, 2, ARM64::LD1Twov1d, ARM64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectLoad(Node, 2, 
ARM64::LD2Twov2d, ARM64::qsub0); - break; - case Intrinsic::arm64_neon_ld3: - if (VT == MVT::v8i8) - return SelectLoad(Node, 3, ARM64::LD3Threev8b, ARM64::dsub0); - else if (VT == MVT::v16i8) - return SelectLoad(Node, 3, ARM64::LD3Threev16b, ARM64::qsub0); - else if (VT == MVT::v4i16) - return SelectLoad(Node, 3, ARM64::LD3Threev4h, ARM64::dsub0); - else if (VT == MVT::v8i16) - return SelectLoad(Node, 3, ARM64::LD3Threev8h, ARM64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectLoad(Node, 3, ARM64::LD3Threev2s, ARM64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectLoad(Node, 3, ARM64::LD3Threev4s, ARM64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectLoad(Node, 3, ARM64::LD1Threev1d, ARM64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectLoad(Node, 3, ARM64::LD3Threev2d, ARM64::qsub0); - break; - case Intrinsic::arm64_neon_ld4: - if (VT == MVT::v8i8) - return SelectLoad(Node, 4, ARM64::LD4Fourv8b, ARM64::dsub0); - else if (VT == MVT::v16i8) - return SelectLoad(Node, 4, ARM64::LD4Fourv16b, ARM64::qsub0); - else if (VT == MVT::v4i16) - return SelectLoad(Node, 4, ARM64::LD4Fourv4h, ARM64::dsub0); - else if (VT == MVT::v8i16) - return SelectLoad(Node, 4, ARM64::LD4Fourv8h, ARM64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectLoad(Node, 4, ARM64::LD4Fourv2s, ARM64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectLoad(Node, 4, ARM64::LD4Fourv4s, ARM64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectLoad(Node, 4, ARM64::LD1Fourv1d, ARM64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectLoad(Node, 4, ARM64::LD4Fourv2d, ARM64::qsub0); - break; - case Intrinsic::arm64_neon_ld2r: - if (VT == MVT::v8i8) - return SelectLoad(Node, 2, ARM64::LD2Rv8b, ARM64::dsub0); - else if (VT == MVT::v16i8) - return SelectLoad(Node, 2, ARM64::LD2Rv16b, ARM64::qsub0); - else if (VT == MVT::v4i16) - return SelectLoad(Node, 2, ARM64::LD2Rv4h, ARM64::dsub0); - else if (VT == MVT::v8i16) - return SelectLoad(Node, 2, ARM64::LD2Rv8h, ARM64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectLoad(Node, 2, ARM64::LD2Rv2s, ARM64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectLoad(Node, 2, ARM64::LD2Rv4s, ARM64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectLoad(Node, 2, ARM64::LD2Rv1d, ARM64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectLoad(Node, 2, ARM64::LD2Rv2d, ARM64::qsub0); - break; - case Intrinsic::arm64_neon_ld3r: - if (VT == MVT::v8i8) - return SelectLoad(Node, 3, ARM64::LD3Rv8b, ARM64::dsub0); - else if (VT == MVT::v16i8) - return SelectLoad(Node, 3, ARM64::LD3Rv16b, ARM64::qsub0); - else if (VT == MVT::v4i16) - return SelectLoad(Node, 3, ARM64::LD3Rv4h, ARM64::dsub0); - else if (VT == MVT::v8i16) - return SelectLoad(Node, 3, ARM64::LD3Rv8h, ARM64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectLoad(Node, 3, ARM64::LD3Rv2s, ARM64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectLoad(Node, 3, ARM64::LD3Rv4s, ARM64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectLoad(Node, 3, ARM64::LD3Rv1d, ARM64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectLoad(Node, 3, ARM64::LD3Rv2d, ARM64::qsub0); - break; - case Intrinsic::arm64_neon_ld4r: - if (VT == MVT::v8i8) - return SelectLoad(Node, 4, ARM64::LD4Rv8b, ARM64::dsub0); - else if (VT == 
MVT::v16i8) - return SelectLoad(Node, 4, ARM64::LD4Rv16b, ARM64::qsub0); - else if (VT == MVT::v4i16) - return SelectLoad(Node, 4, ARM64::LD4Rv4h, ARM64::dsub0); - else if (VT == MVT::v8i16) - return SelectLoad(Node, 4, ARM64::LD4Rv8h, ARM64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectLoad(Node, 4, ARM64::LD4Rv2s, ARM64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectLoad(Node, 4, ARM64::LD4Rv4s, ARM64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectLoad(Node, 4, ARM64::LD4Rv1d, ARM64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectLoad(Node, 4, ARM64::LD4Rv2d, ARM64::qsub0); - break; - case Intrinsic::arm64_neon_ld2lane: - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectLoadLane(Node, 2, ARM64::LD2i8); - else if (VT == MVT::v8i16 || VT == MVT::v4i16) - return SelectLoadLane(Node, 2, ARM64::LD2i16); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectLoadLane(Node, 2, ARM64::LD2i32); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - return SelectLoadLane(Node, 2, ARM64::LD2i64); - break; - case Intrinsic::arm64_neon_ld3lane: - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectLoadLane(Node, 3, ARM64::LD3i8); - else if (VT == MVT::v8i16 || VT == MVT::v4i16) - return SelectLoadLane(Node, 3, ARM64::LD3i16); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectLoadLane(Node, 3, ARM64::LD3i32); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - return SelectLoadLane(Node, 3, ARM64::LD3i64); - break; - case Intrinsic::arm64_neon_ld4lane: - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectLoadLane(Node, 4, ARM64::LD4i8); - else if (VT == MVT::v8i16 || VT == MVT::v4i16) - return SelectLoadLane(Node, 4, ARM64::LD4i16); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectLoadLane(Node, 4, ARM64::LD4i32); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - return SelectLoadLane(Node, 4, ARM64::LD4i64); - break; - } - } break; - case ISD::INTRINSIC_WO_CHAIN: { - unsigned IntNo = cast(Node->getOperand(0))->getZExtValue(); - switch (IntNo) { - default: - break; - case Intrinsic::arm64_neon_tbl2: - return SelectTable(Node, 2, VT == MVT::v8i8 ? ARM64::TBLv8i8Two - : ARM64::TBLv16i8Two, - false); - case Intrinsic::arm64_neon_tbl3: - return SelectTable(Node, 3, VT == MVT::v8i8 ? ARM64::TBLv8i8Three - : ARM64::TBLv16i8Three, - false); - case Intrinsic::arm64_neon_tbl4: - return SelectTable(Node, 4, VT == MVT::v8i8 ? ARM64::TBLv8i8Four - : ARM64::TBLv16i8Four, - false); - case Intrinsic::arm64_neon_tbx2: - return SelectTable(Node, 2, VT == MVT::v8i8 ? ARM64::TBXv8i8Two - : ARM64::TBXv16i8Two, - true); - case Intrinsic::arm64_neon_tbx3: - return SelectTable(Node, 3, VT == MVT::v8i8 ? ARM64::TBXv8i8Three - : ARM64::TBXv16i8Three, - true); - case Intrinsic::arm64_neon_tbx4: - return SelectTable(Node, 4, VT == MVT::v8i8 ? 
ARM64::TBXv8i8Four - : ARM64::TBXv16i8Four, - true); - case Intrinsic::arm64_neon_smull: - case Intrinsic::arm64_neon_umull: - if (SDNode *N = SelectMULLV64LaneV128(IntNo, Node)) - return N; - break; - } - break; - } - case ISD::INTRINSIC_VOID: { - unsigned IntNo = cast(Node->getOperand(1))->getZExtValue(); - if (Node->getNumOperands() >= 3) - VT = Node->getOperand(2)->getValueType(0); - switch (IntNo) { - default: - break; - case Intrinsic::arm64_neon_st1x2: { - if (VT == MVT::v8i8) - return SelectStore(Node, 2, ARM64::ST1Twov8b); - else if (VT == MVT::v16i8) - return SelectStore(Node, 2, ARM64::ST1Twov16b); - else if (VT == MVT::v4i16) - return SelectStore(Node, 2, ARM64::ST1Twov4h); - else if (VT == MVT::v8i16) - return SelectStore(Node, 2, ARM64::ST1Twov8h); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectStore(Node, 2, ARM64::ST1Twov2s); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectStore(Node, 2, ARM64::ST1Twov4s); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectStore(Node, 2, ARM64::ST1Twov2d); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectStore(Node, 2, ARM64::ST1Twov1d); - break; - } - case Intrinsic::arm64_neon_st1x3: { - if (VT == MVT::v8i8) - return SelectStore(Node, 3, ARM64::ST1Threev8b); - else if (VT == MVT::v16i8) - return SelectStore(Node, 3, ARM64::ST1Threev16b); - else if (VT == MVT::v4i16) - return SelectStore(Node, 3, ARM64::ST1Threev4h); - else if (VT == MVT::v8i16) - return SelectStore(Node, 3, ARM64::ST1Threev8h); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectStore(Node, 3, ARM64::ST1Threev2s); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectStore(Node, 3, ARM64::ST1Threev4s); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectStore(Node, 3, ARM64::ST1Threev2d); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectStore(Node, 3, ARM64::ST1Threev1d); - break; - } - case Intrinsic::arm64_neon_st1x4: { - if (VT == MVT::v8i8) - return SelectStore(Node, 4, ARM64::ST1Fourv8b); - else if (VT == MVT::v16i8) - return SelectStore(Node, 4, ARM64::ST1Fourv16b); - else if (VT == MVT::v4i16) - return SelectStore(Node, 4, ARM64::ST1Fourv4h); - else if (VT == MVT::v8i16) - return SelectStore(Node, 4, ARM64::ST1Fourv8h); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectStore(Node, 4, ARM64::ST1Fourv2s); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectStore(Node, 4, ARM64::ST1Fourv4s); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectStore(Node, 4, ARM64::ST1Fourv2d); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectStore(Node, 4, ARM64::ST1Fourv1d); - break; - } - case Intrinsic::arm64_neon_st2: { - if (VT == MVT::v8i8) - return SelectStore(Node, 2, ARM64::ST2Twov8b); - else if (VT == MVT::v16i8) - return SelectStore(Node, 2, ARM64::ST2Twov16b); - else if (VT == MVT::v4i16) - return SelectStore(Node, 2, ARM64::ST2Twov4h); - else if (VT == MVT::v8i16) - return SelectStore(Node, 2, ARM64::ST2Twov8h); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectStore(Node, 2, ARM64::ST2Twov2s); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectStore(Node, 2, ARM64::ST2Twov4s); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectStore(Node, 2, ARM64::ST2Twov2d); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectStore(Node, 2, ARM64::ST1Twov1d); - break; - } - case Intrinsic::arm64_neon_st3: { - if (VT == MVT::v8i8) - return SelectStore(Node, 3, 
ARM64::ST3Threev8b); - else if (VT == MVT::v16i8) - return SelectStore(Node, 3, ARM64::ST3Threev16b); - else if (VT == MVT::v4i16) - return SelectStore(Node, 3, ARM64::ST3Threev4h); - else if (VT == MVT::v8i16) - return SelectStore(Node, 3, ARM64::ST3Threev8h); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectStore(Node, 3, ARM64::ST3Threev2s); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectStore(Node, 3, ARM64::ST3Threev4s); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectStore(Node, 3, ARM64::ST3Threev2d); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectStore(Node, 3, ARM64::ST1Threev1d); - break; - } - case Intrinsic::arm64_neon_st4: { - if (VT == MVT::v8i8) - return SelectStore(Node, 4, ARM64::ST4Fourv8b); - else if (VT == MVT::v16i8) - return SelectStore(Node, 4, ARM64::ST4Fourv16b); - else if (VT == MVT::v4i16) - return SelectStore(Node, 4, ARM64::ST4Fourv4h); - else if (VT == MVT::v8i16) - return SelectStore(Node, 4, ARM64::ST4Fourv8h); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectStore(Node, 4, ARM64::ST4Fourv2s); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectStore(Node, 4, ARM64::ST4Fourv4s); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectStore(Node, 4, ARM64::ST4Fourv2d); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectStore(Node, 4, ARM64::ST1Fourv1d); - break; - } - case Intrinsic::arm64_neon_st2lane: { - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectStoreLane(Node, 2, ARM64::ST2i8); - else if (VT == MVT::v8i16 || VT == MVT::v4i16) - return SelectStoreLane(Node, 2, ARM64::ST2i16); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectStoreLane(Node, 2, ARM64::ST2i32); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - return SelectStoreLane(Node, 2, ARM64::ST2i64); - break; - } - case Intrinsic::arm64_neon_st3lane: { - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectStoreLane(Node, 3, ARM64::ST3i8); - else if (VT == MVT::v8i16 || VT == MVT::v4i16) - return SelectStoreLane(Node, 3, ARM64::ST3i16); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectStoreLane(Node, 3, ARM64::ST3i32); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - return SelectStoreLane(Node, 3, ARM64::ST3i64); - break; - } - case Intrinsic::arm64_neon_st4lane: { - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectStoreLane(Node, 4, ARM64::ST4i8); - else if (VT == MVT::v8i16 || VT == MVT::v4i16) - return SelectStoreLane(Node, 4, ARM64::ST4i16); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectStoreLane(Node, 4, ARM64::ST4i32); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - return SelectStoreLane(Node, 4, ARM64::ST4i64); - break; - } - } - } - case ARM64ISD::LD2post: { - if (VT == MVT::v8i8) - return SelectPostLoad(Node, 2, ARM64::LD2Twov8b_POST, ARM64::dsub0); - else if (VT == MVT::v16i8) - return SelectPostLoad(Node, 2, ARM64::LD2Twov16b_POST, ARM64::qsub0); - else if (VT == MVT::v4i16) - return SelectPostLoad(Node, 2, ARM64::LD2Twov4h_POST, ARM64::dsub0); - else if (VT == MVT::v8i16) - return SelectPostLoad(Node, 2, ARM64::LD2Twov8h_POST, ARM64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostLoad(Node, 2, ARM64::LD2Twov2s_POST, 
ARM64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostLoad(Node, 2, ARM64::LD2Twov4s_POST, ARM64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostLoad(Node, 2, ARM64::LD1Twov1d_POST, ARM64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostLoad(Node, 2, ARM64::LD2Twov2d_POST, ARM64::qsub0); - break; - } - case ARM64ISD::LD3post: { - if (VT == MVT::v8i8) - return SelectPostLoad(Node, 3, ARM64::LD3Threev8b_POST, ARM64::dsub0); - else if (VT == MVT::v16i8) - return SelectPostLoad(Node, 3, ARM64::LD3Threev16b_POST, ARM64::qsub0); - else if (VT == MVT::v4i16) - return SelectPostLoad(Node, 3, ARM64::LD3Threev4h_POST, ARM64::dsub0); - else if (VT == MVT::v8i16) - return SelectPostLoad(Node, 3, ARM64::LD3Threev8h_POST, ARM64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostLoad(Node, 3, ARM64::LD3Threev2s_POST, ARM64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostLoad(Node, 3, ARM64::LD3Threev4s_POST, ARM64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostLoad(Node, 3, ARM64::LD1Threev1d_POST, ARM64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostLoad(Node, 3, ARM64::LD3Threev2d_POST, ARM64::qsub0); - break; - } - case ARM64ISD::LD4post: { - if (VT == MVT::v8i8) - return SelectPostLoad(Node, 4, ARM64::LD4Fourv8b_POST, ARM64::dsub0); - else if (VT == MVT::v16i8) - return SelectPostLoad(Node, 4, ARM64::LD4Fourv16b_POST, ARM64::qsub0); - else if (VT == MVT::v4i16) - return SelectPostLoad(Node, 4, ARM64::LD4Fourv4h_POST, ARM64::dsub0); - else if (VT == MVT::v8i16) - return SelectPostLoad(Node, 4, ARM64::LD4Fourv8h_POST, ARM64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostLoad(Node, 4, ARM64::LD4Fourv2s_POST, ARM64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostLoad(Node, 4, ARM64::LD4Fourv4s_POST, ARM64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostLoad(Node, 4, ARM64::LD1Fourv1d_POST, ARM64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostLoad(Node, 4, ARM64::LD4Fourv2d_POST, ARM64::qsub0); - break; - } - case ARM64ISD::LD1x2post: { - if (VT == MVT::v8i8) - return SelectPostLoad(Node, 2, ARM64::LD1Twov8b_POST, ARM64::dsub0); - else if (VT == MVT::v16i8) - return SelectPostLoad(Node, 2, ARM64::LD1Twov16b_POST, ARM64::qsub0); - else if (VT == MVT::v4i16) - return SelectPostLoad(Node, 2, ARM64::LD1Twov4h_POST, ARM64::dsub0); - else if (VT == MVT::v8i16) - return SelectPostLoad(Node, 2, ARM64::LD1Twov8h_POST, ARM64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostLoad(Node, 2, ARM64::LD1Twov2s_POST, ARM64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostLoad(Node, 2, ARM64::LD1Twov4s_POST, ARM64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostLoad(Node, 2, ARM64::LD1Twov1d_POST, ARM64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostLoad(Node, 2, ARM64::LD1Twov2d_POST, ARM64::qsub0); - break; - } - case ARM64ISD::LD1x3post: { - if (VT == MVT::v8i8) - return SelectPostLoad(Node, 3, ARM64::LD1Threev8b_POST, ARM64::dsub0); - else if (VT == MVT::v16i8) - return SelectPostLoad(Node, 3, ARM64::LD1Threev16b_POST, ARM64::qsub0); - else if (VT == MVT::v4i16) - return SelectPostLoad(Node, 3, ARM64::LD1Threev4h_POST, ARM64::dsub0); - else if (VT == MVT::v8i16) - return SelectPostLoad(Node, 3, 
ARM64::LD1Threev8h_POST, ARM64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostLoad(Node, 3, ARM64::LD1Threev2s_POST, ARM64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostLoad(Node, 3, ARM64::LD1Threev4s_POST, ARM64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostLoad(Node, 3, ARM64::LD1Threev1d_POST, ARM64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostLoad(Node, 3, ARM64::LD1Threev2d_POST, ARM64::qsub0); - break; - } - case ARM64ISD::LD1x4post: { - if (VT == MVT::v8i8) - return SelectPostLoad(Node, 4, ARM64::LD1Fourv8b_POST, ARM64::dsub0); - else if (VT == MVT::v16i8) - return SelectPostLoad(Node, 4, ARM64::LD1Fourv16b_POST, ARM64::qsub0); - else if (VT == MVT::v4i16) - return SelectPostLoad(Node, 4, ARM64::LD1Fourv4h_POST, ARM64::dsub0); - else if (VT == MVT::v8i16) - return SelectPostLoad(Node, 4, ARM64::LD1Fourv8h_POST, ARM64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostLoad(Node, 4, ARM64::LD1Fourv2s_POST, ARM64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostLoad(Node, 4, ARM64::LD1Fourv4s_POST, ARM64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostLoad(Node, 4, ARM64::LD1Fourv1d_POST, ARM64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostLoad(Node, 4, ARM64::LD1Fourv2d_POST, ARM64::qsub0); - break; - } - case ARM64ISD::LD1DUPpost: { - if (VT == MVT::v8i8) - return SelectPostLoad(Node, 1, ARM64::LD1Rv8b_POST, ARM64::dsub0); - else if (VT == MVT::v16i8) - return SelectPostLoad(Node, 1, ARM64::LD1Rv16b_POST, ARM64::qsub0); - else if (VT == MVT::v4i16) - return SelectPostLoad(Node, 1, ARM64::LD1Rv4h_POST, ARM64::dsub0); - else if (VT == MVT::v8i16) - return SelectPostLoad(Node, 1, ARM64::LD1Rv8h_POST, ARM64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostLoad(Node, 1, ARM64::LD1Rv2s_POST, ARM64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostLoad(Node, 1, ARM64::LD1Rv4s_POST, ARM64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostLoad(Node, 1, ARM64::LD1Rv1d_POST, ARM64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostLoad(Node, 1, ARM64::LD1Rv2d_POST, ARM64::qsub0); - break; - } - case ARM64ISD::LD2DUPpost: { - if (VT == MVT::v8i8) - return SelectPostLoad(Node, 2, ARM64::LD2Rv8b_POST, ARM64::dsub0); - else if (VT == MVT::v16i8) - return SelectPostLoad(Node, 2, ARM64::LD2Rv16b_POST, ARM64::qsub0); - else if (VT == MVT::v4i16) - return SelectPostLoad(Node, 2, ARM64::LD2Rv4h_POST, ARM64::dsub0); - else if (VT == MVT::v8i16) - return SelectPostLoad(Node, 2, ARM64::LD2Rv8h_POST, ARM64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostLoad(Node, 2, ARM64::LD2Rv2s_POST, ARM64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostLoad(Node, 2, ARM64::LD2Rv4s_POST, ARM64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostLoad(Node, 2, ARM64::LD2Rv1d_POST, ARM64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostLoad(Node, 2, ARM64::LD2Rv2d_POST, ARM64::qsub0); - break; - } - case ARM64ISD::LD3DUPpost: { - if (VT == MVT::v8i8) - return SelectPostLoad(Node, 3, ARM64::LD3Rv8b_POST, ARM64::dsub0); - else if (VT == MVT::v16i8) - return SelectPostLoad(Node, 3, ARM64::LD3Rv16b_POST, ARM64::qsub0); - else if (VT == MVT::v4i16) - return SelectPostLoad(Node, 
3, ARM64::LD3Rv4h_POST, ARM64::dsub0); - else if (VT == MVT::v8i16) - return SelectPostLoad(Node, 3, ARM64::LD3Rv8h_POST, ARM64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostLoad(Node, 3, ARM64::LD3Rv2s_POST, ARM64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostLoad(Node, 3, ARM64::LD3Rv4s_POST, ARM64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostLoad(Node, 3, ARM64::LD3Rv1d_POST, ARM64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostLoad(Node, 3, ARM64::LD3Rv2d_POST, ARM64::qsub0); - break; - } - case ARM64ISD::LD4DUPpost: { - if (VT == MVT::v8i8) - return SelectPostLoad(Node, 4, ARM64::LD4Rv8b_POST, ARM64::dsub0); - else if (VT == MVT::v16i8) - return SelectPostLoad(Node, 4, ARM64::LD4Rv16b_POST, ARM64::qsub0); - else if (VT == MVT::v4i16) - return SelectPostLoad(Node, 4, ARM64::LD4Rv4h_POST, ARM64::dsub0); - else if (VT == MVT::v8i16) - return SelectPostLoad(Node, 4, ARM64::LD4Rv8h_POST, ARM64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostLoad(Node, 4, ARM64::LD4Rv2s_POST, ARM64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostLoad(Node, 4, ARM64::LD4Rv4s_POST, ARM64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostLoad(Node, 4, ARM64::LD4Rv1d_POST, ARM64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostLoad(Node, 4, ARM64::LD4Rv2d_POST, ARM64::qsub0); - break; - } - case ARM64ISD::LD1LANEpost: { - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectPostLoadLane(Node, 1, ARM64::LD1i8_POST); - else if (VT == MVT::v8i16 || VT == MVT::v4i16) - return SelectPostLoadLane(Node, 1, ARM64::LD1i16_POST); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectPostLoadLane(Node, 1, ARM64::LD1i32_POST); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - return SelectPostLoadLane(Node, 1, ARM64::LD1i64_POST); - break; - } - case ARM64ISD::LD2LANEpost: { - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectPostLoadLane(Node, 2, ARM64::LD2i8_POST); - else if (VT == MVT::v8i16 || VT == MVT::v4i16) - return SelectPostLoadLane(Node, 2, ARM64::LD2i16_POST); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectPostLoadLane(Node, 2, ARM64::LD2i32_POST); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - return SelectPostLoadLane(Node, 2, ARM64::LD2i64_POST); - break; - } - case ARM64ISD::LD3LANEpost: { - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectPostLoadLane(Node, 3, ARM64::LD3i8_POST); - else if (VT == MVT::v8i16 || VT == MVT::v4i16) - return SelectPostLoadLane(Node, 3, ARM64::LD3i16_POST); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectPostLoadLane(Node, 3, ARM64::LD3i32_POST); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - return SelectPostLoadLane(Node, 3, ARM64::LD3i64_POST); - break; - } - case ARM64ISD::LD4LANEpost: { - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectPostLoadLane(Node, 4, ARM64::LD4i8_POST); - else if (VT == MVT::v8i16 || VT == MVT::v4i16) - return SelectPostLoadLane(Node, 4, ARM64::LD4i16_POST); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectPostLoadLane(Node, 4, 
ARM64::LD4i32_POST); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - return SelectPostLoadLane(Node, 4, ARM64::LD4i64_POST); - break; - } - case ARM64ISD::ST2post: { - VT = Node->getOperand(1).getValueType(); - if (VT == MVT::v8i8) - return SelectPostStore(Node, 2, ARM64::ST2Twov8b_POST); - else if (VT == MVT::v16i8) - return SelectPostStore(Node, 2, ARM64::ST2Twov16b_POST); - else if (VT == MVT::v4i16) - return SelectPostStore(Node, 2, ARM64::ST2Twov4h_POST); - else if (VT == MVT::v8i16) - return SelectPostStore(Node, 2, ARM64::ST2Twov8h_POST); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostStore(Node, 2, ARM64::ST2Twov2s_POST); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostStore(Node, 2, ARM64::ST2Twov4s_POST); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostStore(Node, 2, ARM64::ST2Twov2d_POST); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostStore(Node, 2, ARM64::ST1Twov1d_POST); - break; - } - case ARM64ISD::ST3post: { - VT = Node->getOperand(1).getValueType(); - if (VT == MVT::v8i8) - return SelectPostStore(Node, 3, ARM64::ST3Threev8b_POST); - else if (VT == MVT::v16i8) - return SelectPostStore(Node, 3, ARM64::ST3Threev16b_POST); - else if (VT == MVT::v4i16) - return SelectPostStore(Node, 3, ARM64::ST3Threev4h_POST); - else if (VT == MVT::v8i16) - return SelectPostStore(Node, 3, ARM64::ST3Threev8h_POST); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostStore(Node, 3, ARM64::ST3Threev2s_POST); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostStore(Node, 3, ARM64::ST3Threev4s_POST); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostStore(Node, 3, ARM64::ST3Threev2d_POST); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostStore(Node, 3, ARM64::ST1Threev1d_POST); - break; - } - case ARM64ISD::ST4post: { - VT = Node->getOperand(1).getValueType(); - if (VT == MVT::v8i8) - return SelectPostStore(Node, 4, ARM64::ST4Fourv8b_POST); - else if (VT == MVT::v16i8) - return SelectPostStore(Node, 4, ARM64::ST4Fourv16b_POST); - else if (VT == MVT::v4i16) - return SelectPostStore(Node, 4, ARM64::ST4Fourv4h_POST); - else if (VT == MVT::v8i16) - return SelectPostStore(Node, 4, ARM64::ST4Fourv8h_POST); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostStore(Node, 4, ARM64::ST4Fourv2s_POST); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostStore(Node, 4, ARM64::ST4Fourv4s_POST); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostStore(Node, 4, ARM64::ST4Fourv2d_POST); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostStore(Node, 4, ARM64::ST1Fourv1d_POST); - break; - } - case ARM64ISD::ST1x2post: { - VT = Node->getOperand(1).getValueType(); - if (VT == MVT::v8i8) - return SelectPostStore(Node, 2, ARM64::ST1Twov8b_POST); - else if (VT == MVT::v16i8) - return SelectPostStore(Node, 2, ARM64::ST1Twov16b_POST); - else if (VT == MVT::v4i16) - return SelectPostStore(Node, 2, ARM64::ST1Twov4h_POST); - else if (VT == MVT::v8i16) - return SelectPostStore(Node, 2, ARM64::ST1Twov8h_POST); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostStore(Node, 2, ARM64::ST1Twov2s_POST); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostStore(Node, 2, ARM64::ST1Twov4s_POST); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostStore(Node, 2, ARM64::ST1Twov1d_POST); - else if (VT == MVT::v2i64 || VT == 
MVT::v2f64) - return SelectPostStore(Node, 2, ARM64::ST1Twov2d_POST); - break; - } - case ARM64ISD::ST1x3post: { - VT = Node->getOperand(1).getValueType(); - if (VT == MVT::v8i8) - return SelectPostStore(Node, 3, ARM64::ST1Threev8b_POST); - else if (VT == MVT::v16i8) - return SelectPostStore(Node, 3, ARM64::ST1Threev16b_POST); - else if (VT == MVT::v4i16) - return SelectPostStore(Node, 3, ARM64::ST1Threev4h_POST); - else if (VT == MVT::v8i16) - return SelectPostStore(Node, 3, ARM64::ST1Threev8h_POST); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostStore(Node, 3, ARM64::ST1Threev2s_POST); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostStore(Node, 3, ARM64::ST1Threev4s_POST); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostStore(Node, 3, ARM64::ST1Threev1d_POST); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostStore(Node, 3, ARM64::ST1Threev2d_POST); - break; - } - case ARM64ISD::ST1x4post: { - VT = Node->getOperand(1).getValueType(); - if (VT == MVT::v8i8) - return SelectPostStore(Node, 4, ARM64::ST1Fourv8b_POST); - else if (VT == MVT::v16i8) - return SelectPostStore(Node, 4, ARM64::ST1Fourv16b_POST); - else if (VT == MVT::v4i16) - return SelectPostStore(Node, 4, ARM64::ST1Fourv4h_POST); - else if (VT == MVT::v8i16) - return SelectPostStore(Node, 4, ARM64::ST1Fourv8h_POST); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostStore(Node, 4, ARM64::ST1Fourv2s_POST); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostStore(Node, 4, ARM64::ST1Fourv4s_POST); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostStore(Node, 4, ARM64::ST1Fourv1d_POST); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostStore(Node, 4, ARM64::ST1Fourv2d_POST); - break; - } - case ARM64ISD::ST2LANEpost: { - VT = Node->getOperand(1).getValueType(); - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectPostStoreLane(Node, 2, ARM64::ST2i8_POST); - else if (VT == MVT::v8i16 || VT == MVT::v4i16) - return SelectPostStoreLane(Node, 2, ARM64::ST2i16_POST); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectPostStoreLane(Node, 2, ARM64::ST2i32_POST); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - return SelectPostStoreLane(Node, 2, ARM64::ST2i64_POST); - break; - } - case ARM64ISD::ST3LANEpost: { - VT = Node->getOperand(1).getValueType(); - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectPostStoreLane(Node, 3, ARM64::ST3i8_POST); - else if (VT == MVT::v8i16 || VT == MVT::v4i16) - return SelectPostStoreLane(Node, 3, ARM64::ST3i16_POST); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectPostStoreLane(Node, 3, ARM64::ST3i32_POST); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - return SelectPostStoreLane(Node, 3, ARM64::ST3i64_POST); - break; - } - case ARM64ISD::ST4LANEpost: { - VT = Node->getOperand(1).getValueType(); - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectPostStoreLane(Node, 4, ARM64::ST4i8_POST); - else if (VT == MVT::v8i16 || VT == MVT::v4i16) - return SelectPostStoreLane(Node, 4, ARM64::ST4i16_POST); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectPostStoreLane(Node, 4, ARM64::ST4i32_POST); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - return 
SelectPostStoreLane(Node, 4, ARM64::ST4i64_POST); - break; - } - - case ISD::FCEIL: - case ISD::FFLOOR: - case ISD::FTRUNC: - case ISD::FROUND: - if (SDNode *I = SelectLIBM(Node)) - return I; - break; - } - - // Select the default instruction - ResNode = SelectCode(Node); - - DEBUG(errs() << "=> "); - if (ResNode == nullptr || ResNode == Node) - DEBUG(Node->dump(CurDAG)); - else - DEBUG(ResNode->dump(CurDAG)); - DEBUG(errs() << "\n"); - - return ResNode; -} - -/// createARM64ISelDag - This pass converts a legalized DAG into a -/// ARM64-specific DAG, ready for instruction scheduling. -FunctionPass *llvm::createARM64ISelDag(ARM64TargetMachine &TM, - CodeGenOpt::Level OptLevel) { - return new ARM64DAGToDAGISel(TM, OptLevel); -} diff --git a/lib/Target/ARM64/ARM64ISelLowering.cpp b/lib/Target/ARM64/ARM64ISelLowering.cpp deleted file mode 100644 index c24b7deea94..00000000000 --- a/lib/Target/ARM64/ARM64ISelLowering.cpp +++ /dev/null @@ -1,7895 +0,0 @@ -//===-- ARM64ISelLowering.cpp - ARM64 DAG Lowering Implementation --------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements the ARM64TargetLowering class. -// -//===----------------------------------------------------------------------===// - -#include "ARM64ISelLowering.h" -#include "ARM64PerfectShuffle.h" -#include "ARM64Subtarget.h" -#include "ARM64CallingConv.h" -#include "ARM64MachineFunctionInfo.h" -#include "ARM64TargetMachine.h" -#include "ARM64TargetObjectFile.h" -#include "MCTargetDesc/ARM64AddressingModes.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/CodeGen/CallingConvLower.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/Intrinsics.h" -#include "llvm/IR/Type.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetOptions.h" -using namespace llvm; - -#define DEBUG_TYPE "arm64-lower" - -STATISTIC(NumTailCalls, "Number of tail calls"); -STATISTIC(NumShiftInserts, "Number of vector shift inserts"); - -enum AlignMode { - StrictAlign, - NoStrictAlign -}; - -static cl::opt -Align(cl::desc("Load/store alignment support"), - cl::Hidden, cl::init(NoStrictAlign), - cl::values( - clEnumValN(StrictAlign, "arm64-strict-align", - "Disallow all unaligned memory accesses"), - clEnumValN(NoStrictAlign, "arm64-no-strict-align", - "Allow unaligned memory accesses"), - clEnumValEnd)); - -// Place holder until extr generation is tested fully. -static cl::opt -EnableARM64ExtrGeneration("arm64-extr-generation", cl::Hidden, - cl::desc("Allow ARM64 (or (shift)(shift))->extract"), - cl::init(true)); - -static cl::opt -EnableARM64SlrGeneration("arm64-shift-insert-generation", cl::Hidden, - cl::desc("Allow ARM64 SLI/SRI formation"), - cl::init(false)); - -//===----------------------------------------------------------------------===// -// ARM64 Lowering public interface. 
-//===----------------------------------------------------------------------===// -static TargetLoweringObjectFile *createTLOF(TargetMachine &TM) { - if (TM.getSubtarget().isTargetDarwin()) - return new ARM64_MachoTargetObjectFile(); - - return new ARM64_ELFTargetObjectFile(); -} - -ARM64TargetLowering::ARM64TargetLowering(ARM64TargetMachine &TM) - : TargetLowering(TM, createTLOF(TM)) { - Subtarget = &TM.getSubtarget(); - - // ARM64 doesn't have comparisons which set GPRs or setcc instructions, so - // we have to make something up. Arbitrarily, choose ZeroOrOne. - setBooleanContents(ZeroOrOneBooleanContent); - // When comparing vectors the result sets the different elements in the - // vector to all-one or all-zero. - setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); - - // Set up the register classes. - addRegisterClass(MVT::i32, &ARM64::GPR32allRegClass); - addRegisterClass(MVT::i64, &ARM64::GPR64allRegClass); - - if (Subtarget->hasFPARMv8()) { - addRegisterClass(MVT::f16, &ARM64::FPR16RegClass); - addRegisterClass(MVT::f32, &ARM64::FPR32RegClass); - addRegisterClass(MVT::f64, &ARM64::FPR64RegClass); - addRegisterClass(MVT::f128, &ARM64::FPR128RegClass); - } - - if (Subtarget->hasNEON()) { - addRegisterClass(MVT::v16i8, &ARM64::FPR8RegClass); - addRegisterClass(MVT::v8i16, &ARM64::FPR16RegClass); - // Someone set us up the NEON. - addDRTypeForNEON(MVT::v2f32); - addDRTypeForNEON(MVT::v8i8); - addDRTypeForNEON(MVT::v4i16); - addDRTypeForNEON(MVT::v2i32); - addDRTypeForNEON(MVT::v1i64); - addDRTypeForNEON(MVT::v1f64); - - addQRTypeForNEON(MVT::v4f32); - addQRTypeForNEON(MVT::v2f64); - addQRTypeForNEON(MVT::v16i8); - addQRTypeForNEON(MVT::v8i16); - addQRTypeForNEON(MVT::v4i32); - addQRTypeForNEON(MVT::v2i64); - } - - // Compute derived properties from the register classes - computeRegisterProperties(); - - // Provide all sorts of operation actions - setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); - setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom); - setOperationAction(ISD::SETCC, MVT::i32, Custom); - setOperationAction(ISD::SETCC, MVT::i64, Custom); - setOperationAction(ISD::SETCC, MVT::f32, Custom); - setOperationAction(ISD::SETCC, MVT::f64, Custom); - setOperationAction(ISD::BRCOND, MVT::Other, Expand); - setOperationAction(ISD::BR_CC, MVT::i32, Custom); - setOperationAction(ISD::BR_CC, MVT::i64, Custom); - setOperationAction(ISD::BR_CC, MVT::f32, Custom); - setOperationAction(ISD::BR_CC, MVT::f64, Custom); - setOperationAction(ISD::SELECT, MVT::i32, Custom); - setOperationAction(ISD::SELECT, MVT::i64, Custom); - setOperationAction(ISD::SELECT, MVT::f32, Custom); - setOperationAction(ISD::SELECT, MVT::f64, Custom); - setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); - setOperationAction(ISD::SELECT_CC, MVT::i64, Custom); - setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); - setOperationAction(ISD::SELECT_CC, MVT::f64, Custom); - setOperationAction(ISD::BR_JT, MVT::Other, Expand); - setOperationAction(ISD::JumpTable, MVT::i64, Custom); - - setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom); - setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom); - setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom); - - setOperationAction(ISD::FREM, MVT::f32, Expand); - setOperationAction(ISD::FREM, MVT::f64, Expand); - setOperationAction(ISD::FREM, MVT::f80, Expand); - - // Custom lowering hooks are needed for XOR - // to fold it into CSINC/CSINV. 
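To make the intent of that custom hook concrete: XOR with a 0/-1 select is the same as choosing between a value and its bitwise NOT, which is what a single CSINV computes. A minimal scalar sketch (hypothetical helpers, not backend code) of the equivalence the LowerXOR hook later in this file relies on:

#include <cstdint>

// Pattern before the fold: (xor x, (select_cc a, b, cc, 0, -1))
static uint64_t xorOfSelect(bool Cond, uint64_t X) {
  uint64_t Mask = Cond ? 0 : ~0ULL;
  return X ^ Mask;
}

// Folded form: (csel x, (xor x, -1), cc), i.e. a single CSINV
static uint64_t foldedCSINV(bool Cond, uint64_t X) {
  return Cond ? X : ~X;
}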
- setOperationAction(ISD::XOR, MVT::i32, Custom); - setOperationAction(ISD::XOR, MVT::i64, Custom); - - // Virtually no operation on f128 is legal, but LLVM can't expand them when - // there's a valid register class, so we need custom operations in most cases. - setOperationAction(ISD::FABS, MVT::f128, Expand); - setOperationAction(ISD::FADD, MVT::f128, Custom); - setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand); - setOperationAction(ISD::FCOS, MVT::f128, Expand); - setOperationAction(ISD::FDIV, MVT::f128, Custom); - setOperationAction(ISD::FMA, MVT::f128, Expand); - setOperationAction(ISD::FMUL, MVT::f128, Custom); - setOperationAction(ISD::FNEG, MVT::f128, Expand); - setOperationAction(ISD::FPOW, MVT::f128, Expand); - setOperationAction(ISD::FREM, MVT::f128, Expand); - setOperationAction(ISD::FRINT, MVT::f128, Expand); - setOperationAction(ISD::FSIN, MVT::f128, Expand); - setOperationAction(ISD::FSINCOS, MVT::f128, Expand); - setOperationAction(ISD::FSQRT, MVT::f128, Expand); - setOperationAction(ISD::FSUB, MVT::f128, Custom); - setOperationAction(ISD::FTRUNC, MVT::f128, Expand); - setOperationAction(ISD::SETCC, MVT::f128, Custom); - setOperationAction(ISD::BR_CC, MVT::f128, Custom); - setOperationAction(ISD::SELECT, MVT::f128, Custom); - setOperationAction(ISD::SELECT_CC, MVT::f128, Custom); - setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom); - - // Lowering for many of the conversions is actually specified by the non-f128 - // type. The LowerXXX function will be trivial when f128 isn't involved. - setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); - setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); - setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom); - setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); - setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); - setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom); - setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom); - setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); - setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom); - setOperationAction(ISD::FP_ROUND, MVT::f32, Custom); - setOperationAction(ISD::FP_ROUND, MVT::f64, Custom); - - // Variable arguments. - setOperationAction(ISD::VASTART, MVT::Other, Custom); - setOperationAction(ISD::VAARG, MVT::Other, Custom); - setOperationAction(ISD::VACOPY, MVT::Other, Custom); - setOperationAction(ISD::VAEND, MVT::Other, Expand); - - // Variable-sized objects. - setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); - setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); - setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand); - - // Exception handling. - // FIXME: These are guesses. Has this been defined yet? - setExceptionPointerRegister(ARM64::X0); - setExceptionSelectorRegister(ARM64::X1); - - // Constant pool entries - setOperationAction(ISD::ConstantPool, MVT::i64, Custom); - - // BlockAddress - setOperationAction(ISD::BlockAddress, MVT::i64, Custom); - - // Add/Sub overflow ops with MVT::Glues are lowered to NZCV dependences. 
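As a rough scalar model of the carry chain those custom ADDC/ADDE/SUBC/SUBE lowerings feed (hypothetical helper, illustration only; in the real lowering the carry lives in NZCV rather than in a register), a 128-bit add splits into a low half that produces the carry and a high half that consumes it:

#include <cstdint>

static void add128(uint64_t AL, uint64_t AH, uint64_t BL, uint64_t BH,
                   uint64_t &ResLo, uint64_t &ResHi) {
  ResLo = AL + BL;                        // ADDC, lowered to ADDS: sets the carry
  uint64_t Carry = ResLo < AL ? 1 : 0;    // modelled as a value here
  ResHi = AH + BH + Carry;                // ADDE, lowered to ADCS: consumes it
}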
- setOperationAction(ISD::ADDC, MVT::i32, Custom); - setOperationAction(ISD::ADDE, MVT::i32, Custom); - setOperationAction(ISD::SUBC, MVT::i32, Custom); - setOperationAction(ISD::SUBE, MVT::i32, Custom); - setOperationAction(ISD::ADDC, MVT::i64, Custom); - setOperationAction(ISD::ADDE, MVT::i64, Custom); - setOperationAction(ISD::SUBC, MVT::i64, Custom); - setOperationAction(ISD::SUBE, MVT::i64, Custom); - - // ARM64 lacks both left-rotate and popcount instructions. - setOperationAction(ISD::ROTL, MVT::i32, Expand); - setOperationAction(ISD::ROTL, MVT::i64, Expand); - - // ARM64 doesn't have {U|S}MUL_LOHI. - setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand); - setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand); - - - // Expand the undefined-at-zero variants to cttz/ctlz to their defined-at-zero - // counterparts, which ARM64 supports directly. - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand); - - setOperationAction(ISD::CTPOP, MVT::i32, Custom); - setOperationAction(ISD::CTPOP, MVT::i64, Custom); - - setOperationAction(ISD::SDIVREM, MVT::i32, Expand); - setOperationAction(ISD::SDIVREM, MVT::i64, Expand); - setOperationAction(ISD::SREM, MVT::i32, Expand); - setOperationAction(ISD::SREM, MVT::i64, Expand); - setOperationAction(ISD::UDIVREM, MVT::i32, Expand); - setOperationAction(ISD::UDIVREM, MVT::i64, Expand); - setOperationAction(ISD::UREM, MVT::i32, Expand); - setOperationAction(ISD::UREM, MVT::i64, Expand); - - // Custom lower Add/Sub/Mul with overflow. - setOperationAction(ISD::SADDO, MVT::i32, Custom); - setOperationAction(ISD::SADDO, MVT::i64, Custom); - setOperationAction(ISD::UADDO, MVT::i32, Custom); - setOperationAction(ISD::UADDO, MVT::i64, Custom); - setOperationAction(ISD::SSUBO, MVT::i32, Custom); - setOperationAction(ISD::SSUBO, MVT::i64, Custom); - setOperationAction(ISD::USUBO, MVT::i32, Custom); - setOperationAction(ISD::USUBO, MVT::i64, Custom); - setOperationAction(ISD::SMULO, MVT::i32, Custom); - setOperationAction(ISD::SMULO, MVT::i64, Custom); - setOperationAction(ISD::UMULO, MVT::i32, Custom); - setOperationAction(ISD::UMULO, MVT::i64, Custom); - - setOperationAction(ISD::FSIN, MVT::f32, Expand); - setOperationAction(ISD::FSIN, MVT::f64, Expand); - setOperationAction(ISD::FCOS, MVT::f32, Expand); - setOperationAction(ISD::FCOS, MVT::f64, Expand); - setOperationAction(ISD::FPOW, MVT::f32, Expand); - setOperationAction(ISD::FPOW, MVT::f64, Expand); - setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); - setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); - - // ARM64 has implementations of a lot of rounding-like FP operations. - static MVT RoundingTypes[] = { MVT::f32, MVT::f64}; - for (unsigned I = 0; I < array_lengthof(RoundingTypes); ++I) { - MVT Ty = RoundingTypes[I]; - setOperationAction(ISD::FFLOOR, Ty, Legal); - setOperationAction(ISD::FNEARBYINT, Ty, Legal); - setOperationAction(ISD::FCEIL, Ty, Legal); - setOperationAction(ISD::FRINT, Ty, Legal); - setOperationAction(ISD::FTRUNC, Ty, Legal); - setOperationAction(ISD::FROUND, Ty, Legal); - } - - setOperationAction(ISD::PREFETCH, MVT::Other, Custom); - - if (Subtarget->isTargetMachO()) { - // For iOS, we don't want to the normal expansion of a libcall to - // sincos. We want to issue a libcall to __sincos_stret to avoid memory - // traffic. 
- setOperationAction(ISD::FSINCOS, MVT::f64, Custom); - setOperationAction(ISD::FSINCOS, MVT::f32, Custom); - } else { - setOperationAction(ISD::FSINCOS, MVT::f64, Expand); - setOperationAction(ISD::FSINCOS, MVT::f32, Expand); - } - - // ARM64 does not have floating-point extending loads, i1 sign-extending load, - // floating-point truncating stores, or v2i32->v2i16 truncating store. - setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::f64, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::f80, Expand); - setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Expand); - setTruncStoreAction(MVT::f32, MVT::f16, Expand); - setTruncStoreAction(MVT::f64, MVT::f32, Expand); - setTruncStoreAction(MVT::f64, MVT::f16, Expand); - setTruncStoreAction(MVT::f128, MVT::f80, Expand); - setTruncStoreAction(MVT::f128, MVT::f64, Expand); - setTruncStoreAction(MVT::f128, MVT::f32, Expand); - setTruncStoreAction(MVT::f128, MVT::f16, Expand); - // Indexed loads and stores are supported. - for (unsigned im = (unsigned)ISD::PRE_INC; - im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { - setIndexedLoadAction(im, MVT::i8, Legal); - setIndexedLoadAction(im, MVT::i16, Legal); - setIndexedLoadAction(im, MVT::i32, Legal); - setIndexedLoadAction(im, MVT::i64, Legal); - setIndexedLoadAction(im, MVT::f64, Legal); - setIndexedLoadAction(im, MVT::f32, Legal); - setIndexedStoreAction(im, MVT::i8, Legal); - setIndexedStoreAction(im, MVT::i16, Legal); - setIndexedStoreAction(im, MVT::i32, Legal); - setIndexedStoreAction(im, MVT::i64, Legal); - setIndexedStoreAction(im, MVT::f64, Legal); - setIndexedStoreAction(im, MVT::f32, Legal); - } - - // Trap. - setOperationAction(ISD::TRAP, MVT::Other, Legal); - - // We combine OR nodes for bitfield operations. - setTargetDAGCombine(ISD::OR); - - // Vector add and sub nodes may conceal a high-half opportunity. - // Also, try to fold ADD into CSINC/CSINV.. 
- setTargetDAGCombine(ISD::ADD); - setTargetDAGCombine(ISD::SUB); - - setTargetDAGCombine(ISD::XOR); - setTargetDAGCombine(ISD::SINT_TO_FP); - setTargetDAGCombine(ISD::UINT_TO_FP); - - setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); - - setTargetDAGCombine(ISD::ANY_EXTEND); - setTargetDAGCombine(ISD::ZERO_EXTEND); - setTargetDAGCombine(ISD::SIGN_EXTEND); - setTargetDAGCombine(ISD::BITCAST); - setTargetDAGCombine(ISD::CONCAT_VECTORS); - setTargetDAGCombine(ISD::STORE); - - setTargetDAGCombine(ISD::MUL); - - setTargetDAGCombine(ISD::SELECT); - setTargetDAGCombine(ISD::VSELECT); - - setTargetDAGCombine(ISD::INTRINSIC_VOID); - setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); - setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); - - MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8; - MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4; - MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 4; - - setStackPointerRegisterToSaveRestore(ARM64::SP); - - setSchedulingPreference(Sched::Hybrid); - - // Enable TBZ/TBNZ - MaskAndBranchFoldingIsLegal = true; - - setMinFunctionAlignment(2); - - RequireStrictAlign = (Align == StrictAlign); - - setHasExtractBitsInsn(true); - - if (Subtarget->hasNEON()) { - // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to - // silliness like this: - setOperationAction(ISD::FABS, MVT::v1f64, Expand); - setOperationAction(ISD::FADD, MVT::v1f64, Expand); - setOperationAction(ISD::FCEIL, MVT::v1f64, Expand); - setOperationAction(ISD::FCOPYSIGN, MVT::v1f64, Expand); - setOperationAction(ISD::FCOS, MVT::v1f64, Expand); - setOperationAction(ISD::FDIV, MVT::v1f64, Expand); - setOperationAction(ISD::FFLOOR, MVT::v1f64, Expand); - setOperationAction(ISD::FMA, MVT::v1f64, Expand); - setOperationAction(ISD::FMUL, MVT::v1f64, Expand); - setOperationAction(ISD::FNEARBYINT, MVT::v1f64, Expand); - setOperationAction(ISD::FNEG, MVT::v1f64, Expand); - setOperationAction(ISD::FPOW, MVT::v1f64, Expand); - setOperationAction(ISD::FREM, MVT::v1f64, Expand); - setOperationAction(ISD::FROUND, MVT::v1f64, Expand); - setOperationAction(ISD::FRINT, MVT::v1f64, Expand); - setOperationAction(ISD::FSIN, MVT::v1f64, Expand); - setOperationAction(ISD::FSINCOS, MVT::v1f64, Expand); - setOperationAction(ISD::FSQRT, MVT::v1f64, Expand); - setOperationAction(ISD::FSUB, MVT::v1f64, Expand); - setOperationAction(ISD::FTRUNC, MVT::v1f64, Expand); - setOperationAction(ISD::SETCC, MVT::v1f64, Expand); - setOperationAction(ISD::BR_CC, MVT::v1f64, Expand); - setOperationAction(ISD::SELECT, MVT::v1f64, Expand); - setOperationAction(ISD::SELECT_CC, MVT::v1f64, Expand); - setOperationAction(ISD::FP_EXTEND, MVT::v1f64, Expand); - - setOperationAction(ISD::FP_TO_SINT, MVT::v1i64, Expand); - setOperationAction(ISD::FP_TO_UINT, MVT::v1i64, Expand); - setOperationAction(ISD::SINT_TO_FP, MVT::v1i64, Expand); - setOperationAction(ISD::UINT_TO_FP, MVT::v1i64, Expand); - setOperationAction(ISD::FP_ROUND, MVT::v1f64, Expand); - - setOperationAction(ISD::MUL, MVT::v1i64, Expand); - - // ARM64 doesn't have a direct vector ->f32 conversion instructions for - // elements smaller than i32, so promote the input to i32 first. - setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Promote); - setOperationAction(ISD::SINT_TO_FP, MVT::v4i8, Promote); - setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Promote); - setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Promote); - // Similarly, there is no direct i32 -> f64 vector conversion instruction. 
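A scalar model of why the Promote action above is lossless (hypothetical helper, illustration only): widening a small lane to i32 first and then converting produces the same float that a direct conversion would.

#include <cstdint>

static float convertU16Lane(uint16_t Lane) {
  uint32_t Widened = Lane;                // the Promote step: i16 lane -> i32
  return static_cast<float>(Widened);     // then the legal i32 -> f32 conversion
}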
- setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom); - setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom); - setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Custom); - setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Custom); - - // ARM64 doesn't have MUL.2d: - setOperationAction(ISD::MUL, MVT::v2i64, Expand); - setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal); - setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand); - // Likewise, narrowing and extending vector loads/stores aren't handled - // directly. - for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; - VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) { - - setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT, - Expand); - - setOperationAction(ISD::MULHS, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::MULHU, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand); - - setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand); - - for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; - InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT) - setTruncStoreAction((MVT::SimpleValueType)VT, - (MVT::SimpleValueType)InnerVT, Expand); - setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand); - setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand); - setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand); - } - - // ARM64 has implementations of a lot of rounding-like FP operations. - static MVT RoundingVecTypes[] = {MVT::v2f32, MVT::v4f32, MVT::v2f64 }; - for (unsigned I = 0; I < array_lengthof(RoundingVecTypes); ++I) { - MVT Ty = RoundingVecTypes[I]; - setOperationAction(ISD::FFLOOR, Ty, Legal); - setOperationAction(ISD::FNEARBYINT, Ty, Legal); - setOperationAction(ISD::FCEIL, Ty, Legal); - setOperationAction(ISD::FRINT, Ty, Legal); - setOperationAction(ISD::FTRUNC, Ty, Legal); - setOperationAction(ISD::FROUND, Ty, Legal); - } - } -} - -void ARM64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) { - if (VT == MVT::v2f32) { - setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote); - AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i32); - - setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote); - AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i32); - } else if (VT == MVT::v2f64 || VT == MVT::v4f32) { - setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote); - AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i64); - - setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote); - AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i64); - } - - // Mark vector float intrinsics as expand. 
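Roughly speaking, Expand for the vector FP intrinsics below means the node is scalarized and each lane becomes an ordinary scalar operation or libcall; a minimal sketch (hypothetical helper, illustration only) for a two-lane sine:

#include <cmath>

static void sinV2F32(const float In[2], float Out[2]) {
  for (int I = 0; I != 2; ++I)
    Out[I] = std::sin(In[I]);   // one scalar sine per lane
}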
- if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) { - setOperationAction(ISD::FSIN, VT.getSimpleVT(), Expand); - setOperationAction(ISD::FCOS, VT.getSimpleVT(), Expand); - setOperationAction(ISD::FPOWI, VT.getSimpleVT(), Expand); - setOperationAction(ISD::FPOW, VT.getSimpleVT(), Expand); - setOperationAction(ISD::FLOG, VT.getSimpleVT(), Expand); - setOperationAction(ISD::FLOG2, VT.getSimpleVT(), Expand); - setOperationAction(ISD::FLOG10, VT.getSimpleVT(), Expand); - setOperationAction(ISD::FEXP, VT.getSimpleVT(), Expand); - setOperationAction(ISD::FEXP2, VT.getSimpleVT(), Expand); - } - - setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT.getSimpleVT(), Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, VT.getSimpleVT(), Custom); - setOperationAction(ISD::BUILD_VECTOR, VT.getSimpleVT(), Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, VT.getSimpleVT(), Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, VT.getSimpleVT(), Custom); - setOperationAction(ISD::SRA, VT.getSimpleVT(), Custom); - setOperationAction(ISD::SRL, VT.getSimpleVT(), Custom); - setOperationAction(ISD::SHL, VT.getSimpleVT(), Custom); - setOperationAction(ISD::AND, VT.getSimpleVT(), Custom); - setOperationAction(ISD::OR, VT.getSimpleVT(), Custom); - setOperationAction(ISD::SETCC, VT.getSimpleVT(), Custom); - setOperationAction(ISD::CONCAT_VECTORS, VT.getSimpleVT(), Legal); - - setOperationAction(ISD::SELECT, VT.getSimpleVT(), Expand); - setOperationAction(ISD::SELECT_CC, VT.getSimpleVT(), Expand); - setOperationAction(ISD::VSELECT, VT.getSimpleVT(), Expand); - setLoadExtAction(ISD::EXTLOAD, VT.getSimpleVT(), Expand); - - // CNT supports only B element sizes. - if (VT != MVT::v8i8 && VT != MVT::v16i8) - setOperationAction(ISD::CTPOP, VT.getSimpleVT(), Expand); - - setOperationAction(ISD::UDIV, VT.getSimpleVT(), Expand); - setOperationAction(ISD::SDIV, VT.getSimpleVT(), Expand); - setOperationAction(ISD::UREM, VT.getSimpleVT(), Expand); - setOperationAction(ISD::SREM, VT.getSimpleVT(), Expand); - setOperationAction(ISD::FREM, VT.getSimpleVT(), Expand); - - setOperationAction(ISD::FP_TO_SINT, VT.getSimpleVT(), Custom); - setOperationAction(ISD::FP_TO_UINT, VT.getSimpleVT(), Custom); - - if (Subtarget->isLittleEndian()) { - for (unsigned im = (unsigned)ISD::PRE_INC; - im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { - setIndexedLoadAction(im, VT.getSimpleVT(), Legal); - setIndexedStoreAction(im, VT.getSimpleVT(), Legal); - } - } -} - -void ARM64TargetLowering::addDRTypeForNEON(MVT VT) { - addRegisterClass(VT, &ARM64::FPR64RegClass); - addTypeForNEON(VT, MVT::v2i32); -} - -void ARM64TargetLowering::addQRTypeForNEON(MVT VT) { - addRegisterClass(VT, &ARM64::FPR128RegClass); - addTypeForNEON(VT, MVT::v4i32); -} - -EVT ARM64TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const { - if (!VT.isVector()) - return MVT::i32; - return VT.changeVectorElementTypeToInteger(); -} - -/// computeKnownBitsForTargetNode - Determine which of the bits specified in -/// Mask are known to be either zero or one and return them in the -/// KnownZero/KnownOne bitsets. 
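A worked example for the UMAXV/UMINV case handled below (hypothetical helper, illustration only): the across-lanes unsigned maximum of a v8i8 vector always fits in 8 bits, so bits [31:8] of the i32 result can safely be reported as known zero.

#include <cstdint>

static uint32_t umaxvV8i8(const uint8_t Lanes[8]) {
  uint32_t Max = 0;
  for (int I = 0; I != 8; ++I)
    if (Lanes[I] > Max)
      Max = Lanes[I];
  return Max;                   // always <= 0xFF: the upper 24 bits are zero
}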
-void ARM64TargetLowering::computeKnownBitsForTargetNode( - const SDValue Op, APInt &KnownZero, APInt &KnownOne, - const SelectionDAG &DAG, unsigned Depth) const { - switch (Op.getOpcode()) { - default: - break; - case ARM64ISD::CSEL: { - APInt KnownZero2, KnownOne2; - DAG.computeKnownBits(Op->getOperand(0), KnownZero, KnownOne, Depth + 1); - DAG.computeKnownBits(Op->getOperand(1), KnownZero2, KnownOne2, Depth + 1); - KnownZero &= KnownZero2; - KnownOne &= KnownOne2; - break; - } - case ISD::INTRINSIC_W_CHAIN: { - ConstantSDNode *CN = cast(Op->getOperand(1)); - Intrinsic::ID IntID = static_cast(CN->getZExtValue()); - switch (IntID) { - default: return; - case Intrinsic::arm64_ldaxr: - case Intrinsic::arm64_ldxr: { - unsigned BitWidth = KnownOne.getBitWidth(); - EVT VT = cast(Op)->getMemoryVT(); - unsigned MemBits = VT.getScalarType().getSizeInBits(); - KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits); - return; - } - } - break; - } - case ISD::INTRINSIC_WO_CHAIN: - case ISD::INTRINSIC_VOID: { - unsigned IntNo = cast(Op.getOperand(0))->getZExtValue(); - switch (IntNo) { - default: - break; - case Intrinsic::arm64_neon_umaxv: - case Intrinsic::arm64_neon_uminv: { - // Figure out the datatype of the vector operand. The UMINV instruction - // will zero extend the result, so we can mark as known zero all the - // bits larger than the element datatype. 32-bit or larget doesn't need - // this as those are legal types and will be handled by isel directly. - MVT VT = Op.getOperand(1).getValueType().getSimpleVT(); - unsigned BitWidth = KnownZero.getBitWidth(); - if (VT == MVT::v8i8 || VT == MVT::v16i8) { - assert(BitWidth >= 8 && "Unexpected width!"); - APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8); - KnownZero |= Mask; - } else if (VT == MVT::v4i16 || VT == MVT::v8i16) { - assert(BitWidth >= 16 && "Unexpected width!"); - APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16); - KnownZero |= Mask; - } - break; - } break; - } - } - } -} - -MVT ARM64TargetLowering::getScalarShiftAmountTy(EVT LHSTy) const { - return MVT::i64; -} - -unsigned ARM64TargetLowering::getMaximalGlobalOffset() const { - // FIXME: On ARM64, this depends on the type. - // Basically, the addressable offsets are o to 4095 * Ty.getSizeInBytes(). - // and the offset has to be a multiple of the related size in bytes. 
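A small model of the addressing rule that comment describes (hypothetical helper, illustration only): the scaled unsigned 12-bit form reaches up to 4095 times the access size, provided the offset is a multiple of that size.

#include <cstdint>

static bool fitsScaledUImm12(uint64_t Offset, uint64_t AccessBytes) {
  if (AccessBytes == 0 || Offset % AccessBytes != 0)
    return false;                          // must be a multiple of the access size
  return Offset / AccessBytes <= 4095;     // e.g. up to 4095 * 8 = 32760 for i64
}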
- return 4095; -} - -FastISel * -ARM64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, - const TargetLibraryInfo *libInfo) const { - return ARM64::createFastISel(funcInfo, libInfo); -} - -const char *ARM64TargetLowering::getTargetNodeName(unsigned Opcode) const { - switch (Opcode) { - default: - return nullptr; - case ARM64ISD::CALL: return "ARM64ISD::CALL"; - case ARM64ISD::ADRP: return "ARM64ISD::ADRP"; - case ARM64ISD::ADDlow: return "ARM64ISD::ADDlow"; - case ARM64ISD::LOADgot: return "ARM64ISD::LOADgot"; - case ARM64ISD::RET_FLAG: return "ARM64ISD::RET_FLAG"; - case ARM64ISD::BRCOND: return "ARM64ISD::BRCOND"; - case ARM64ISD::CSEL: return "ARM64ISD::CSEL"; - case ARM64ISD::FCSEL: return "ARM64ISD::FCSEL"; - case ARM64ISD::CSINV: return "ARM64ISD::CSINV"; - case ARM64ISD::CSNEG: return "ARM64ISD::CSNEG"; - case ARM64ISD::CSINC: return "ARM64ISD::CSINC"; - case ARM64ISD::THREAD_POINTER: return "ARM64ISD::THREAD_POINTER"; - case ARM64ISD::TLSDESC_CALL: return "ARM64ISD::TLSDESC_CALL"; - case ARM64ISD::ADC: return "ARM64ISD::ADC"; - case ARM64ISD::SBC: return "ARM64ISD::SBC"; - case ARM64ISD::ADDS: return "ARM64ISD::ADDS"; - case ARM64ISD::SUBS: return "ARM64ISD::SUBS"; - case ARM64ISD::ADCS: return "ARM64ISD::ADCS"; - case ARM64ISD::SBCS: return "ARM64ISD::SBCS"; - case ARM64ISD::ANDS: return "ARM64ISD::ANDS"; - case ARM64ISD::FCMP: return "ARM64ISD::FCMP"; - case ARM64ISD::FMIN: return "ARM64ISD::FMIN"; - case ARM64ISD::FMAX: return "ARM64ISD::FMAX"; - case ARM64ISD::DUP: return "ARM64ISD::DUP"; - case ARM64ISD::DUPLANE8: return "ARM64ISD::DUPLANE8"; - case ARM64ISD::DUPLANE16: return "ARM64ISD::DUPLANE16"; - case ARM64ISD::DUPLANE32: return "ARM64ISD::DUPLANE32"; - case ARM64ISD::DUPLANE64: return "ARM64ISD::DUPLANE64"; - case ARM64ISD::MOVI: return "ARM64ISD::MOVI"; - case ARM64ISD::MOVIshift: return "ARM64ISD::MOVIshift"; - case ARM64ISD::MOVIedit: return "ARM64ISD::MOVIedit"; - case ARM64ISD::MOVImsl: return "ARM64ISD::MOVImsl"; - case ARM64ISD::FMOV: return "ARM64ISD::FMOV"; - case ARM64ISD::MVNIshift: return "ARM64ISD::MVNIshift"; - case ARM64ISD::MVNImsl: return "ARM64ISD::MVNImsl"; - case ARM64ISD::BICi: return "ARM64ISD::BICi"; - case ARM64ISD::ORRi: return "ARM64ISD::ORRi"; - case ARM64ISD::BSL: return "ARM64ISD::BSL"; - case ARM64ISD::NEG: return "ARM64ISD::NEG"; - case ARM64ISD::EXTR: return "ARM64ISD::EXTR"; - case ARM64ISD::ZIP1: return "ARM64ISD::ZIP1"; - case ARM64ISD::ZIP2: return "ARM64ISD::ZIP2"; - case ARM64ISD::UZP1: return "ARM64ISD::UZP1"; - case ARM64ISD::UZP2: return "ARM64ISD::UZP2"; - case ARM64ISD::TRN1: return "ARM64ISD::TRN1"; - case ARM64ISD::TRN2: return "ARM64ISD::TRN2"; - case ARM64ISD::REV16: return "ARM64ISD::REV16"; - case ARM64ISD::REV32: return "ARM64ISD::REV32"; - case ARM64ISD::REV64: return "ARM64ISD::REV64"; - case ARM64ISD::EXT: return "ARM64ISD::EXT"; - case ARM64ISD::VSHL: return "ARM64ISD::VSHL"; - case ARM64ISD::VLSHR: return "ARM64ISD::VLSHR"; - case ARM64ISD::VASHR: return "ARM64ISD::VASHR"; - case ARM64ISD::CMEQ: return "ARM64ISD::CMEQ"; - case ARM64ISD::CMGE: return "ARM64ISD::CMGE"; - case ARM64ISD::CMGT: return "ARM64ISD::CMGT"; - case ARM64ISD::CMHI: return "ARM64ISD::CMHI"; - case ARM64ISD::CMHS: return "ARM64ISD::CMHS"; - case ARM64ISD::FCMEQ: return "ARM64ISD::FCMEQ"; - case ARM64ISD::FCMGE: return "ARM64ISD::FCMGE"; - case ARM64ISD::FCMGT: return "ARM64ISD::FCMGT"; - case ARM64ISD::CMEQz: return "ARM64ISD::CMEQz"; - case ARM64ISD::CMGEz: return "ARM64ISD::CMGEz"; - case ARM64ISD::CMGTz: return "ARM64ISD::CMGTz"; - 
case ARM64ISD::CMLEz: return "ARM64ISD::CMLEz"; - case ARM64ISD::CMLTz: return "ARM64ISD::CMLTz"; - case ARM64ISD::FCMEQz: return "ARM64ISD::FCMEQz"; - case ARM64ISD::FCMGEz: return "ARM64ISD::FCMGEz"; - case ARM64ISD::FCMGTz: return "ARM64ISD::FCMGTz"; - case ARM64ISD::FCMLEz: return "ARM64ISD::FCMLEz"; - case ARM64ISD::FCMLTz: return "ARM64ISD::FCMLTz"; - case ARM64ISD::NOT: return "ARM64ISD::NOT"; - case ARM64ISD::BIT: return "ARM64ISD::BIT"; - case ARM64ISD::CBZ: return "ARM64ISD::CBZ"; - case ARM64ISD::CBNZ: return "ARM64ISD::CBNZ"; - case ARM64ISD::TBZ: return "ARM64ISD::TBZ"; - case ARM64ISD::TBNZ: return "ARM64ISD::TBNZ"; - case ARM64ISD::TC_RETURN: return "ARM64ISD::TC_RETURN"; - case ARM64ISD::SITOF: return "ARM64ISD::SITOF"; - case ARM64ISD::UITOF: return "ARM64ISD::UITOF"; - case ARM64ISD::SQSHL_I: return "ARM64ISD::SQSHL_I"; - case ARM64ISD::UQSHL_I: return "ARM64ISD::UQSHL_I"; - case ARM64ISD::SRSHR_I: return "ARM64ISD::SRSHR_I"; - case ARM64ISD::URSHR_I: return "ARM64ISD::URSHR_I"; - case ARM64ISD::SQSHLU_I: return "ARM64ISD::SQSHLU_I"; - case ARM64ISD::WrapperLarge: return "ARM64ISD::WrapperLarge"; - case ARM64ISD::LD2post: return "ARM64ISD::LD2post"; - case ARM64ISD::LD3post: return "ARM64ISD::LD3post"; - case ARM64ISD::LD4post: return "ARM64ISD::LD4post"; - case ARM64ISD::ST2post: return "ARM64ISD::ST2post"; - case ARM64ISD::ST3post: return "ARM64ISD::ST3post"; - case ARM64ISD::ST4post: return "ARM64ISD::ST4post"; - case ARM64ISD::LD1x2post: return "ARM64ISD::LD1x2post"; - case ARM64ISD::LD1x3post: return "ARM64ISD::LD1x3post"; - case ARM64ISD::LD1x4post: return "ARM64ISD::LD1x4post"; - case ARM64ISD::ST1x2post: return "ARM64ISD::ST1x2post"; - case ARM64ISD::ST1x3post: return "ARM64ISD::ST1x3post"; - case ARM64ISD::ST1x4post: return "ARM64ISD::ST1x4post"; - case ARM64ISD::LD1DUPpost: return "ARM64ISD::LD1DUPpost"; - case ARM64ISD::LD2DUPpost: return "ARM64ISD::LD2DUPpost"; - case ARM64ISD::LD3DUPpost: return "ARM64ISD::LD3DUPpost"; - case ARM64ISD::LD4DUPpost: return "ARM64ISD::LD4DUPpost"; - case ARM64ISD::LD1LANEpost: return "ARM64ISD::LD1LANEpost"; - case ARM64ISD::LD2LANEpost: return "ARM64ISD::LD2LANEpost"; - case ARM64ISD::LD3LANEpost: return "ARM64ISD::LD3LANEpost"; - case ARM64ISD::LD4LANEpost: return "ARM64ISD::LD4LANEpost"; - case ARM64ISD::ST2LANEpost: return "ARM64ISD::ST2LANEpost"; - case ARM64ISD::ST3LANEpost: return "ARM64ISD::ST3LANEpost"; - case ARM64ISD::ST4LANEpost: return "ARM64ISD::ST4LANEpost"; - } -} - -MachineBasicBlock * -ARM64TargetLowering::EmitF128CSEL(MachineInstr *MI, - MachineBasicBlock *MBB) const { - // We materialise the F128CSEL pseudo-instruction as some control flow and a - // phi node: - - // OrigBB: - // [... previous instrs leading to comparison ...] 
- // b.ne TrueBB - // b EndBB - // TrueBB: - // ; Fallthrough - // EndBB: - // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB] - - const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); - MachineFunction *MF = MBB->getParent(); - const BasicBlock *LLVM_BB = MBB->getBasicBlock(); - DebugLoc DL = MI->getDebugLoc(); - MachineFunction::iterator It = MBB; - ++It; - - unsigned DestReg = MI->getOperand(0).getReg(); - unsigned IfTrueReg = MI->getOperand(1).getReg(); - unsigned IfFalseReg = MI->getOperand(2).getReg(); - unsigned CondCode = MI->getOperand(3).getImm(); - bool NZCVKilled = MI->getOperand(4).isKill(); - - MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB); - MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB); - MF->insert(It, TrueBB); - MF->insert(It, EndBB); - - // Transfer rest of current basic-block to EndBB - EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)), - MBB->end()); - EndBB->transferSuccessorsAndUpdatePHIs(MBB); - - BuildMI(MBB, DL, TII->get(ARM64::Bcc)).addImm(CondCode).addMBB(TrueBB); - BuildMI(MBB, DL, TII->get(ARM64::B)).addMBB(EndBB); - MBB->addSuccessor(TrueBB); - MBB->addSuccessor(EndBB); - - // TrueBB falls through to the end. - TrueBB->addSuccessor(EndBB); - - if (!NZCVKilled) { - TrueBB->addLiveIn(ARM64::NZCV); - EndBB->addLiveIn(ARM64::NZCV); - } - - BuildMI(*EndBB, EndBB->begin(), DL, TII->get(ARM64::PHI), DestReg) - .addReg(IfTrueReg) - .addMBB(TrueBB) - .addReg(IfFalseReg) - .addMBB(MBB); - - MI->eraseFromParent(); - return EndBB; -} - -MachineBasicBlock * -ARM64TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, - MachineBasicBlock *BB) const { - switch (MI->getOpcode()) { - default: -#ifndef NDEBUG - MI->dump(); -#endif - assert(0 && "Unexpected instruction for custom inserter!"); - break; - - case ARM64::F128CSEL: - return EmitF128CSEL(MI, BB); - - case TargetOpcode::STACKMAP: - case TargetOpcode::PATCHPOINT: - return emitPatchPoint(MI, BB); - } - llvm_unreachable("Unexpected instruction for custom inserter!"); -} - -//===----------------------------------------------------------------------===// -// ARM64 Lowering private implementation. -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -// Lowering Code -//===----------------------------------------------------------------------===// - -/// changeIntCCToARM64CC - Convert a DAG integer condition code to an ARM64 CC -static ARM64CC::CondCode changeIntCCToARM64CC(ISD::CondCode CC) { - switch (CC) { - default: - llvm_unreachable("Unknown condition code!"); - case ISD::SETNE: - return ARM64CC::NE; - case ISD::SETEQ: - return ARM64CC::EQ; - case ISD::SETGT: - return ARM64CC::GT; - case ISD::SETGE: - return ARM64CC::GE; - case ISD::SETLT: - return ARM64CC::LT; - case ISD::SETLE: - return ARM64CC::LE; - case ISD::SETUGT: - return ARM64CC::HI; - case ISD::SETUGE: - return ARM64CC::HS; - case ISD::SETULT: - return ARM64CC::LO; - case ISD::SETULE: - return ARM64CC::LS; - } -} - -/// changeFPCCToARM64CC - Convert a DAG fp condition code to an ARM64 CC. 
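One mapping in the function below deserves a worked example (hypothetical helper, illustration only): SETONE, ordered and not equal, has no single AArch64 condition, so it is split into MI and GT and the caller treats the result as their OR.

static bool setONEHolds(double LHS, double RHS) {
  bool MI = LHS < RHS;   // first condition after FCMP (false when unordered)
  bool GT = LHS > RHS;   // second condition (also false when unordered)
  return MI || GT;       // equivalent to "ordered && LHS != RHS"
}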
-static void changeFPCCToARM64CC(ISD::CondCode CC, ARM64CC::CondCode &CondCode, - ARM64CC::CondCode &CondCode2) { - CondCode2 = ARM64CC::AL; - switch (CC) { - default: - llvm_unreachable("Unknown FP condition!"); - case ISD::SETEQ: - case ISD::SETOEQ: - CondCode = ARM64CC::EQ; - break; - case ISD::SETGT: - case ISD::SETOGT: - CondCode = ARM64CC::GT; - break; - case ISD::SETGE: - case ISD::SETOGE: - CondCode = ARM64CC::GE; - break; - case ISD::SETOLT: - CondCode = ARM64CC::MI; - break; - case ISD::SETOLE: - CondCode = ARM64CC::LS; - break; - case ISD::SETONE: - CondCode = ARM64CC::MI; - CondCode2 = ARM64CC::GT; - break; - case ISD::SETO: - CondCode = ARM64CC::VC; - break; - case ISD::SETUO: - CondCode = ARM64CC::VS; - break; - case ISD::SETUEQ: - CondCode = ARM64CC::EQ; - CondCode2 = ARM64CC::VS; - break; - case ISD::SETUGT: - CondCode = ARM64CC::HI; - break; - case ISD::SETUGE: - CondCode = ARM64CC::PL; - break; - case ISD::SETLT: - case ISD::SETULT: - CondCode = ARM64CC::LT; - break; - case ISD::SETLE: - case ISD::SETULE: - CondCode = ARM64CC::LE; - break; - case ISD::SETNE: - case ISD::SETUNE: - CondCode = ARM64CC::NE; - break; - } -} - -/// changeVectorFPCCToARM64CC - Convert a DAG fp condition code to an ARM64 CC -/// usable with the vector instructions. Fewer operations are available without -/// a real NZCV register, so we have to use less efficient combinations to get -/// the same effect. -static void changeVectorFPCCToARM64CC(ISD::CondCode CC, - ARM64CC::CondCode &CondCode, - ARM64CC::CondCode &CondCode2, - bool &Invert) { - Invert = false; - switch (CC) { - default: - // Mostly the scalar mappings work fine. - changeFPCCToARM64CC(CC, CondCode, CondCode2); - break; - case ISD::SETUO: - Invert = true; // Fallthrough - case ISD::SETO: - CondCode = ARM64CC::MI; - CondCode2 = ARM64CC::GE; - break; - case ISD::SETUEQ: - case ISD::SETULT: - case ISD::SETULE: - case ISD::SETUGT: - case ISD::SETUGE: - // All of the compare-mask comparisons are ordered, but we can switch - // between the two by a double inversion. E.g. ULE == !OGT. - Invert = true; - changeFPCCToARM64CC(getSetCCInverse(CC, false), CondCode, CondCode2); - break; - } -} - -static bool isLegalArithImmed(uint64_t C) { - // Matches ARM64DAGToDAGISel::SelectArithImmed(). - return (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0); -} - -static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, - SDLoc dl, SelectionDAG &DAG) { - EVT VT = LHS.getValueType(); - - if (VT.isFloatingPoint()) - return DAG.getNode(ARM64ISD::FCMP, dl, VT, LHS, RHS); - - // The CMP instruction is just an alias for SUBS, and representing it as - // SUBS means that it's possible to get CSE with subtract operations. - // A later phase can perform the optimization of setting the destination - // register to WZR/XZR if it ends up being unused. - unsigned Opcode = ARM64ISD::SUBS; - - if (RHS.getOpcode() == ISD::SUB && isa(RHS.getOperand(0)) && - cast(RHS.getOperand(0))->getZExtValue() == 0 && - (CC == ISD::SETEQ || CC == ISD::SETNE)) { - // We'd like to combine a (CMP op1, (sub 0, op2) into a CMN instruction on - // the grounds that "op1 - (-op2) == op1 + op2". However, the C and V flags - // can be set differently by this operation. It comes down to whether - // "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then - // everything is fine. If not then the optimization is wrong. Thus general - // comparisons are only valid if op2 != 0. 
- - // So, finally, the only LLVM-native comparisons that don't mention C and V - // are SETEQ and SETNE. They're the only ones we can safely use CMN for in - // the absence of information about op2. - Opcode = ARM64ISD::ADDS; - RHS = RHS.getOperand(1); - } else if (LHS.getOpcode() == ISD::AND && isa(RHS) && - cast(RHS)->getZExtValue() == 0 && - !isUnsignedIntSetCC(CC)) { - // Similarly, (CMP (and X, Y), 0) can be implemented with a TST - // (a.k.a. ANDS) except that the flags are only guaranteed to work for one - // of the signed comparisons. - Opcode = ARM64ISD::ANDS; - RHS = LHS.getOperand(1); - LHS = LHS.getOperand(0); - } - - return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS) - .getValue(1); -} - -static SDValue getARM64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, - SDValue &ARM64cc, SelectionDAG &DAG, SDLoc dl) { - if (ConstantSDNode *RHSC = dyn_cast(RHS.getNode())) { - EVT VT = RHS.getValueType(); - uint64_t C = RHSC->getZExtValue(); - if (!isLegalArithImmed(C)) { - // Constant does not fit, try adjusting it by one? - switch (CC) { - default: - break; - case ISD::SETLT: - case ISD::SETGE: - if ((VT == MVT::i32 && C != 0x80000000 && - isLegalArithImmed((uint32_t)(C - 1))) || - (VT == MVT::i64 && C != 0x80000000ULL && - isLegalArithImmed(C - 1ULL))) { - CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT; - C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1; - RHS = DAG.getConstant(C, VT); - } - break; - case ISD::SETULT: - case ISD::SETUGE: - if ((VT == MVT::i32 && C != 0 && - isLegalArithImmed((uint32_t)(C - 1))) || - (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) { - CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT; - C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1; - RHS = DAG.getConstant(C, VT); - } - break; - case ISD::SETLE: - case ISD::SETGT: - if ((VT == MVT::i32 && C != 0x7fffffff && - isLegalArithImmed((uint32_t)(C + 1))) || - (VT == MVT::i64 && C != 0x7ffffffffffffffULL && - isLegalArithImmed(C + 1ULL))) { - CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE; - C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1; - RHS = DAG.getConstant(C, VT); - } - break; - case ISD::SETULE: - case ISD::SETUGT: - if ((VT == MVT::i32 && C != 0xffffffff && - isLegalArithImmed((uint32_t)(C + 1))) || - (VT == MVT::i64 && C != 0xfffffffffffffffULL && - isLegalArithImmed(C + 1ULL))) { - CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE; - C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1; - RHS = DAG.getConstant(C, VT); - } - break; - } - } - } - - SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); - ARM64CC::CondCode ARM64CC = changeIntCCToARM64CC(CC); - ARM64cc = DAG.getConstant(ARM64CC, MVT::i32); - return Cmp; -} - -static std::pair -getARM64XALUOOp(ARM64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) { - assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) && - "Unsupported value type"); - SDValue Value, Overflow; - SDLoc DL(Op); - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); - unsigned Opc = 0; - switch (Op.getOpcode()) { - default: - llvm_unreachable("Unknown overflow instruction!"); - case ISD::SADDO: - Opc = ARM64ISD::ADDS; - CC = ARM64CC::VS; - break; - case ISD::UADDO: - Opc = ARM64ISD::ADDS; - CC = ARM64CC::HS; - break; - case ISD::SSUBO: - Opc = ARM64ISD::SUBS; - CC = ARM64CC::VS; - break; - case ISD::USUBO: - Opc = ARM64ISD::SUBS; - CC = ARM64CC::LO; - break; - // Multiply needs a little bit extra work. 
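Before the DAG form below, a plain scalar model of that extra work for the i32 case (hypothetical helpers, illustration only): do the multiply at 64 bits, then decide overflow from the upper half.

#include <cstdint>

static bool smulo32(int32_t A, int32_t B, int32_t &Res) {
  int64_t Wide = int64_t(A) * int64_t(B);   // widening multiply (SMULL/SMADDL)
  Res = int32_t(Wide);                      // low 32 bits are the result
  return Wide != int64_t(Res);              // high half != sign bits => overflow
}

static bool umulo32(uint32_t A, uint32_t B, uint32_t &Res) {
  uint64_t Wide = uint64_t(A) * uint64_t(B);
  Res = uint32_t(Wide);
  return (Wide >> 32) != 0;                 // any high bit set => overflow
}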
- case ISD::SMULO: - case ISD::UMULO: { - CC = ARM64CC::NE; - bool IsSigned = (Op.getOpcode() == ISD::SMULO) ? true : false; - if (Op.getValueType() == MVT::i32) { - unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; - // For a 32 bit multiply with overflow check we want the instruction - // selector to generate a widening multiply (SMADDL/UMADDL). For that we - // need to generate the following pattern: - // (i64 add 0, (i64 mul (i64 sext|zext i32 %a), (i64 sext|zext i32 %b)) - LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS); - RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS); - SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS); - SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Mul, - DAG.getConstant(0, MVT::i64)); - // On ARM64 the upper 32 bits are always zero extended for a 32 bit - // operation. We need to clear out the upper 32 bits, because we used a - // widening multiply that wrote all 64 bits. In the end this should be a - // noop. - Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Add); - if (IsSigned) { - // The signed overflow check requires more than just a simple check for - // any bit set in the upper 32 bits of the result. These bits could be - // just the sign bits of a negative number. To perform the overflow - // check we have to arithmetic shift right the 32nd bit of the result by - // 31 bits. Then we compare the result to the upper 32 bits. - SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Add, - DAG.getConstant(32, MVT::i64)); - UpperBits = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, UpperBits); - SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i32, Value, - DAG.getConstant(31, MVT::i64)); - // It is important that LowerBits is last, otherwise the arithmetic - // shift will not be folded into the compare (SUBS). - SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32); - Overflow = DAG.getNode(ARM64ISD::SUBS, DL, VTs, UpperBits, LowerBits) - .getValue(1); - } else { - // The overflow check for unsigned multiply is easy. We only need to - // check if any of the upper 32 bits are set. This can be done with a - // CMP (shifted register). For that we need to generate the following - // pattern: - // (i64 ARM64ISD::SUBS i64 0, (i64 srl i64 %Mul, i64 32) - SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Mul, - DAG.getConstant(32, MVT::i64)); - SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); - Overflow = - DAG.getNode(ARM64ISD::SUBS, DL, VTs, DAG.getConstant(0, MVT::i64), - UpperBits).getValue(1); - } - break; - } - assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type"); - // For the 64 bit multiply - Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS); - if (IsSigned) { - SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS); - SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value, - DAG.getConstant(63, MVT::i64)); - // It is important that LowerBits is last, otherwise the arithmetic - // shift will not be folded into the compare (SUBS). - SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); - Overflow = DAG.getNode(ARM64ISD::SUBS, DL, VTs, UpperBits, LowerBits) - .getValue(1); - } else { - SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS); - SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); - Overflow = - DAG.getNode(ARM64ISD::SUBS, DL, VTs, DAG.getConstant(0, MVT::i64), - UpperBits).getValue(1); - } - break; - } - } // switch (...) - - if (Opc) { - SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32); - - // Emit the ARM64 operation with overflow check. 
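The flag choices above for the plain add/sub cases (VS for signed overflow, HS for carry, LO for borrow) correspond to the usual scalar tests; a minimal model (hypothetical helpers, illustration only):

#include <cstdint>

static bool saddOverflows(int32_t A, int32_t B) {   // SADDO: ADDS then VS
  int32_t R = int32_t(uint32_t(A) + uint32_t(B));
  return ((A ^ R) & (B ^ R)) < 0;                   // operands agree, result differs
}

static bool uaddCarries(uint32_t A, uint32_t B) {   // UADDO: ADDS then HS
  return A + B < A;                                 // carry out of the top bit
}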
- Value = DAG.getNode(Opc, DL, VTs, LHS, RHS); - Overflow = Value.getValue(1); - } - return std::make_pair(Value, Overflow); -} - -SDValue ARM64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG, - RTLIB::Libcall Call) const { - SmallVector Ops; - for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) - Ops.push_back(Op.getOperand(i)); - - return makeLibCall(DAG, Call, MVT::f128, &Ops[0], Ops.size(), false, - SDLoc(Op)).first; -} - -static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) { - SDValue Sel = Op.getOperand(0); - SDValue Other = Op.getOperand(1); - - // If neither operand is a SELECT_CC, give up. - if (Sel.getOpcode() != ISD::SELECT_CC) - std::swap(Sel, Other); - if (Sel.getOpcode() != ISD::SELECT_CC) - return Op; - - // The folding we want to perform is: - // (xor x, (select_cc a, b, cc, 0, -1) ) - // --> - // (csel x, (xor x, -1), cc ...) - // - // The latter will get matched to a CSINV instruction. - - ISD::CondCode CC = cast(Sel.getOperand(4))->get(); - SDValue LHS = Sel.getOperand(0); - SDValue RHS = Sel.getOperand(1); - SDValue TVal = Sel.getOperand(2); - SDValue FVal = Sel.getOperand(3); - SDLoc dl(Sel); - - // FIXME: This could be generalized to non-integer comparisons. - if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64) - return Op; - - ConstantSDNode *CFVal = dyn_cast(FVal); - ConstantSDNode *CTVal = dyn_cast(TVal); - - // The the values aren't constants, this isn't the pattern we're looking for. - if (!CFVal || !CTVal) - return Op; - - // We can commute the SELECT_CC by inverting the condition. This - // might be needed to make this fit into a CSINV pattern. - if (CTVal->isAllOnesValue() && CFVal->isNullValue()) { - std::swap(TVal, FVal); - std::swap(CTVal, CFVal); - CC = ISD::getSetCCInverse(CC, true); - } - - // If the constants line up, perform the transform! - if (CTVal->isNullValue() && CFVal->isAllOnesValue()) { - SDValue CCVal; - SDValue Cmp = getARM64Cmp(LHS, RHS, CC, CCVal, DAG, dl); - - FVal = Other; - TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other, - DAG.getConstant(-1ULL, Other.getValueType())); - - return DAG.getNode(ARM64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal, - CCVal, Cmp); - } - - return Op; -} - -static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { - EVT VT = Op.getValueType(); - - // Let legalize expand this if it isn't a legal type yet. - if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) - return SDValue(); - - SDVTList VTs = DAG.getVTList(VT, MVT::i32); - - unsigned Opc; - bool ExtraOp = false; - switch (Op.getOpcode()) { - default: - assert(0 && "Invalid code"); - case ISD::ADDC: - Opc = ARM64ISD::ADDS; - break; - case ISD::SUBC: - Opc = ARM64ISD::SUBS; - break; - case ISD::ADDE: - Opc = ARM64ISD::ADCS; - ExtraOp = true; - break; - case ISD::SUBE: - Opc = ARM64ISD::SBCS; - ExtraOp = true; - break; - } - - if (!ExtraOp) - return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1)); - return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1), - Op.getOperand(2)); -} - -static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { - // Let legalize expand this if it isn't a legal type yet. - if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType())) - return SDValue(); - - ARM64CC::CondCode CC; - // The actual operation that sets the overflow or carry flag. - SDValue Value, Overflow; - std::tie(Value, Overflow) = getARM64XALUOOp(CC, Op, DAG); - - // We use 0 and 1 as false and true values. 
- SDValue TVal = DAG.getConstant(1, MVT::i32); - SDValue FVal = DAG.getConstant(0, MVT::i32); - - // We use an inverted condition, because the conditional select is inverted - // too. This will allow it to be selected to a single instruction: - // CSINC Wd, WZR, WZR, invert(cond). - SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), MVT::i32); - Overflow = DAG.getNode(ARM64ISD::CSEL, SDLoc(Op), MVT::i32, FVal, TVal, CCVal, - Overflow); - - SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); - return DAG.getNode(ISD::MERGE_VALUES, SDLoc(Op), VTs, Value, Overflow); -} - -// Prefetch operands are: -// 1: Address to prefetch -// 2: bool isWrite -// 3: int locality (0 = no locality ... 3 = extreme locality) -// 4: bool isDataCache -static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) { - SDLoc DL(Op); - unsigned IsWrite = cast(Op.getOperand(2))->getZExtValue(); - unsigned Locality = cast(Op.getOperand(3))->getZExtValue(); - // The data thing is not used. - // unsigned isData = cast(Op.getOperand(4))->getZExtValue(); - - bool IsStream = !Locality; - // When the locality number is set - if (Locality) { - // The front-end should have filtered out the out-of-range values - assert(Locality <= 3 && "Prefetch locality out-of-range"); - // The locality degree is the opposite of the cache speed. - // Put the number the other way around. - // The encoding starts at 0 for level 1 - Locality = 3 - Locality; - } - - // built the mask value encoding the expected behavior. - unsigned PrfOp = (IsWrite << 4) | // Load/Store bit - (Locality << 1) | // Cache level bits - (unsigned)IsStream; // Stream bit - return DAG.getNode(ARM64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0), - DAG.getConstant(PrfOp, MVT::i32), Op.getOperand(1)); -} - -SDValue ARM64TargetLowering::LowerFP_EXTEND(SDValue Op, - SelectionDAG &DAG) const { - assert(Op.getValueType() == MVT::f128 && "Unexpected lowering"); - - RTLIB::Libcall LC; - LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType()); - - return LowerF128Call(Op, DAG, LC); -} - -SDValue ARM64TargetLowering::LowerFP_ROUND(SDValue Op, - SelectionDAG &DAG) const { - if (Op.getOperand(0).getValueType() != MVT::f128) { - // It's legal except when f128 is involved - return Op; - } - - RTLIB::Libcall LC; - LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType()); - - // FP_ROUND node has a second operand indicating whether it is known to be - // precise. That doesn't take part in the LibCall so we can't directly use - // LowerF128Call. - SDValue SrcVal = Op.getOperand(0); - return makeLibCall(DAG, LC, Op.getValueType(), &SrcVal, 1, - /*isSigned*/ false, SDLoc(Op)).first; -} - -static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) { - // Warning: We maintain cost tables in ARM64TargetTransformInfo.cpp. - // Any additional optimization in this function should be recorded - // in the cost tables. - EVT InVT = Op.getOperand(0).getValueType(); - EVT VT = Op.getValueType(); - - // FP_TO_XINT conversion from the same type are legal. 
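When the source and destination widths differ, the code below converts at the source width first; a scalar model of that narrowing path (hypothetical helper, illustration only, for values that fit in the destination):

#include <cstdint>

static int32_t fpToS32ViaS64(double D) {
  int64_t Wide = int64_t(D);   // convert at the source width (v2f64 -> v2i64)
  return int32_t(Wide);        // then truncate each lane (v2i64 -> v2i32)
}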
- if (VT.getSizeInBits() == InVT.getSizeInBits()) - return Op; - - if (InVT == MVT::v2f64 || InVT == MVT::v4f32) { - SDLoc dl(Op); - SDValue Cv = - DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(), - Op.getOperand(0)); - return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv); - } else if (InVT == MVT::v2f32) { - SDLoc dl(Op); - SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v2f64, Op.getOperand(0)); - return DAG.getNode(Op.getOpcode(), dl, VT, Ext); - } - - // Type changing conversions are illegal. - return SDValue(); -} - -SDValue ARM64TargetLowering::LowerFP_TO_INT(SDValue Op, - SelectionDAG &DAG) const { - if (Op.getOperand(0).getValueType().isVector()) - return LowerVectorFP_TO_INT(Op, DAG); - - if (Op.getOperand(0).getValueType() != MVT::f128) { - // It's legal except when f128 is involved - return Op; - } - - RTLIB::Libcall LC; - if (Op.getOpcode() == ISD::FP_TO_SINT) - LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), Op.getValueType()); - else - LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType()); - - SmallVector Ops; - for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) - Ops.push_back(Op.getOperand(i)); - - return makeLibCall(DAG, LC, Op.getValueType(), &Ops[0], Ops.size(), false, - SDLoc(Op)).first; -} - -static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) { - // Warning: We maintain cost tables in ARM64TargetTransformInfo.cpp. - // Any additional optimization in this function should be recorded - // in the cost tables. - EVT VT = Op.getValueType(); - SDLoc dl(Op); - SDValue In = Op.getOperand(0); - EVT InVT = In.getValueType(); - - // v2i32 to v2f32 is legal. - if (VT == MVT::v2f32 && InVT == MVT::v2i32) - return Op; - - // This function only handles v2f64 outputs. - if (VT == MVT::v2f64) { - // Extend the input argument to a v2i64 that we can feed into the - // floating point conversion. Zero or sign extend based on whether - // we're doing a signed or unsigned float conversion. - unsigned Opc = - Op.getOpcode() == ISD::UINT_TO_FP ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND; - assert(Op.getNumOperands() == 1 && "FP conversions take one argument"); - SDValue Promoted = DAG.getNode(Opc, dl, MVT::v2i64, Op.getOperand(0)); - return DAG.getNode(Op.getOpcode(), dl, Op.getValueType(), Promoted); - } - - // Scalarize v2i64 to v2f32 conversions. - std::vector BuildVectorOps; - for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) { - SDValue Sclr = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, In, - DAG.getConstant(i, MVT::i64)); - Sclr = DAG.getNode(Op->getOpcode(), dl, MVT::f32, Sclr); - BuildVectorOps.push_back(Sclr); - } - - return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, BuildVectorOps); -} - -SDValue ARM64TargetLowering::LowerINT_TO_FP(SDValue Op, - SelectionDAG &DAG) const { - if (Op.getValueType().isVector()) - return LowerVectorINT_TO_FP(Op, DAG); - - // i128 conversions are libcalls. - if (Op.getOperand(0).getValueType() == MVT::i128) - return SDValue(); - - // Other conversions are legal, unless it's to the completely software-based - // fp128. 
- if (Op.getValueType() != MVT::f128) - return Op; - - RTLIB::Libcall LC; - if (Op.getOpcode() == ISD::SINT_TO_FP) - LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); - else - LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); - - return LowerF128Call(Op, DAG, LC); -} - -SDValue ARM64TargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { - // For iOS, we want to call an alternative entry point: __sincos_stret, - // which returns the values in two S / D registers. - SDLoc dl(Op); - SDValue Arg = Op.getOperand(0); - EVT ArgVT = Arg.getValueType(); - Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); - - ArgListTy Args; - ArgListEntry Entry; - - Entry.Node = Arg; - Entry.Ty = ArgTy; - Entry.isSExt = false; - Entry.isZExt = false; - Args.push_back(Entry); - - const char *LibcallName = - (ArgVT == MVT::f64) ? "__sincos_stret" : "__sincosf_stret"; - SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy()); - - StructType *RetTy = StructType::get(ArgTy, ArgTy, NULL); - TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()) - .setCallee(CallingConv::Fast, RetTy, Callee, &Args, 0); - - std::pair CallResult = LowerCallTo(CLI); - return CallResult.first; -} - -SDValue ARM64TargetLowering::LowerOperation(SDValue Op, - SelectionDAG &DAG) const { - switch (Op.getOpcode()) { - default: - llvm_unreachable("unimplemented operand"); - return SDValue(); - case ISD::GlobalAddress: - return LowerGlobalAddress(Op, DAG); - case ISD::GlobalTLSAddress: - return LowerGlobalTLSAddress(Op, DAG); - case ISD::SETCC: - return LowerSETCC(Op, DAG); - case ISD::BR_CC: - return LowerBR_CC(Op, DAG); - case ISD::SELECT: - return LowerSELECT(Op, DAG); - case ISD::SELECT_CC: - return LowerSELECT_CC(Op, DAG); - case ISD::JumpTable: - return LowerJumpTable(Op, DAG); - case ISD::ConstantPool: - return LowerConstantPool(Op, DAG); - case ISD::BlockAddress: - return LowerBlockAddress(Op, DAG); - case ISD::VASTART: - return LowerVASTART(Op, DAG); - case ISD::VACOPY: - return LowerVACOPY(Op, DAG); - case ISD::VAARG: - return LowerVAARG(Op, DAG); - case ISD::ADDC: - case ISD::ADDE: - case ISD::SUBC: - case ISD::SUBE: - return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); - case ISD::SADDO: - case ISD::UADDO: - case ISD::SSUBO: - case ISD::USUBO: - case ISD::SMULO: - case ISD::UMULO: - return LowerXALUO(Op, DAG); - case ISD::FADD: - return LowerF128Call(Op, DAG, RTLIB::ADD_F128); - case ISD::FSUB: - return LowerF128Call(Op, DAG, RTLIB::SUB_F128); - case ISD::FMUL: - return LowerF128Call(Op, DAG, RTLIB::MUL_F128); - case ISD::FDIV: - return LowerF128Call(Op, DAG, RTLIB::DIV_F128); - case ISD::FP_ROUND: - return LowerFP_ROUND(Op, DAG); - case ISD::FP_EXTEND: - return LowerFP_EXTEND(Op, DAG); - case ISD::FRAMEADDR: - return LowerFRAMEADDR(Op, DAG); - case ISD::RETURNADDR: - return LowerRETURNADDR(Op, DAG); - case ISD::INSERT_VECTOR_ELT: - return LowerINSERT_VECTOR_ELT(Op, DAG); - case ISD::EXTRACT_VECTOR_ELT: - return LowerEXTRACT_VECTOR_ELT(Op, DAG); - case ISD::BUILD_VECTOR: - return LowerBUILD_VECTOR(Op, DAG); - case ISD::VECTOR_SHUFFLE: - return LowerVECTOR_SHUFFLE(Op, DAG); - case ISD::EXTRACT_SUBVECTOR: - return LowerEXTRACT_SUBVECTOR(Op, DAG); - case ISD::SRA: - case ISD::SRL: - case ISD::SHL: - return LowerVectorSRA_SRL_SHL(Op, DAG); - case ISD::SHL_PARTS: - return LowerShiftLeftParts(Op, DAG); - case ISD::SRL_PARTS: - case ISD::SRA_PARTS: - return LowerShiftRightParts(Op, DAG); - case ISD::CTPOP: - return LowerCTPOP(Op, DAG); 
- case ISD::FCOPYSIGN: - return LowerFCOPYSIGN(Op, DAG); - case ISD::AND: - return LowerVectorAND(Op, DAG); - case ISD::OR: - return LowerVectorOR(Op, DAG); - case ISD::XOR: - return LowerXOR(Op, DAG); - case ISD::PREFETCH: - return LowerPREFETCH(Op, DAG); - case ISD::SINT_TO_FP: - case ISD::UINT_TO_FP: - return LowerINT_TO_FP(Op, DAG); - case ISD::FP_TO_SINT: - case ISD::FP_TO_UINT: - return LowerFP_TO_INT(Op, DAG); - case ISD::FSINCOS: - return LowerFSINCOS(Op, DAG); - } -} - -/// getFunctionAlignment - Return the Log2 alignment of this function. -unsigned ARM64TargetLowering::getFunctionAlignment(const Function *F) const { - return 2; -} - -//===----------------------------------------------------------------------===// -// Calling Convention Implementation -//===----------------------------------------------------------------------===// - -#include "ARM64GenCallingConv.inc" - -/// Selects the correct CCAssignFn for a the given CallingConvention -/// value. -CCAssignFn *ARM64TargetLowering::CCAssignFnForCall(CallingConv::ID CC, - bool IsVarArg) const { - switch (CC) { - default: - llvm_unreachable("Unsupported calling convention."); - case CallingConv::WebKit_JS: - return CC_ARM64_WebKit_JS; - case CallingConv::C: - case CallingConv::Fast: - if (!Subtarget->isTargetDarwin()) - return CC_ARM64_AAPCS; - return IsVarArg ? CC_ARM64_DarwinPCS_VarArg : CC_ARM64_DarwinPCS; - } -} - -SDValue ARM64TargetLowering::LowerFormalArguments( - SDValue Chain, CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl &Ins, SDLoc DL, SelectionDAG &DAG, - SmallVectorImpl &InVals) const { - MachineFunction &MF = DAG.getMachineFunction(); - MachineFrameInfo *MFI = MF.getFrameInfo(); - - // Assign locations to all of the incoming arguments. - SmallVector ArgLocs; - CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ArgLocs, *DAG.getContext()); - - // At this point, Ins[].VT may already be promoted to i32. To correctly - // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and - // i8 to CC_ARM64_AAPCS with i32 being ValVT and i8 being LocVT. - // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here - // we use a special version of AnalyzeFormalArguments to pass in ValVT and - // LocVT. - unsigned NumArgs = Ins.size(); - Function::const_arg_iterator CurOrigArg = MF.getFunction()->arg_begin(); - unsigned CurArgIdx = 0; - for (unsigned i = 0; i != NumArgs; ++i) { - MVT ValVT = Ins[i].VT; - std::advance(CurOrigArg, Ins[i].OrigArgIndex - CurArgIdx); - CurArgIdx = Ins[i].OrigArgIndex; - - // Get type of the original argument. - EVT ActualVT = getValueType(CurOrigArg->getType(), /*AllowUnknown*/ true); - MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other; - // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16. - MVT LocVT = ValVT; - if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8) - LocVT = MVT::i8; - else if (ActualMVT == MVT::i16) - LocVT = MVT::i16; - - CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false); - bool Res = - AssignFn(i, ValVT, LocVT, CCValAssign::Full, Ins[i].Flags, CCInfo); - assert(!Res && "Call operand has unhandled type"); - (void)Res; - } - assert(ArgLocs.size() == Ins.size()); - SmallVector ArgValues; - for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { - CCValAssign &VA = ArgLocs[i]; - - if (Ins[i].Flags.isByVal()) { - // Byval is used for HFAs in the PCS, but the system should work in a - // non-compliant manner for larger structs. 
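The byval path below sizes its frame object in whole 8-byte slots; a worked example (hypothetical helper, illustration only): a 20-byte struct rounds up to three slots, i.e. 24 bytes.

static unsigned byvalFrameBytes(unsigned SizeInBytes) {
  unsigned NumRegs = (SizeInBytes + 7) / 8;   // round up to 8-byte slots
  return 8 * NumRegs;                         // 20 bytes -> 3 slots -> 24 bytes
}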
- EVT PtrTy = getPointerTy(); - int Size = Ins[i].Flags.getByValSize(); - unsigned NumRegs = (Size + 7) / 8; - - // FIXME: This works on big-endian for composite byvals, which are the common - // case. It should also work for fundamental types too. - unsigned FrameIdx = - MFI->CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false); - SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrTy); - InVals.push_back(FrameIdxN); - - continue; - } if (VA.isRegLoc()) { - // Arguments stored in registers. - EVT RegVT = VA.getLocVT(); - - SDValue ArgValue; - const TargetRegisterClass *RC; - - if (RegVT == MVT::i32) - RC = &ARM64::GPR32RegClass; - else if (RegVT == MVT::i64) - RC = &ARM64::GPR64RegClass; - else if (RegVT == MVT::f32) - RC = &ARM64::FPR32RegClass; - else if (RegVT == MVT::f64 || RegVT.is64BitVector()) - RC = &ARM64::FPR64RegClass; - else if (RegVT == MVT::f128 || RegVT.is128BitVector()) - RC = &ARM64::FPR128RegClass; - else - llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering"); - - // Transform the arguments in physical registers into virtual ones. - unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); - ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT); - - // If this is an 8, 16 or 32-bit value, it is really passed promoted - // to 64 bits. Insert an assert[sz]ext to capture this, then - // truncate to the right size. - switch (VA.getLocInfo()) { - default: - llvm_unreachable("Unknown loc info!"); - case CCValAssign::Full: - break; - case CCValAssign::BCvt: - ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue); - break; - case CCValAssign::SExt: - ArgValue = DAG.getNode(ISD::AssertSext, DL, RegVT, ArgValue, - DAG.getValueType(VA.getValVT())); - ArgValue = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), ArgValue); - break; - case CCValAssign::ZExt: - ArgValue = DAG.getNode(ISD::AssertZext, DL, RegVT, ArgValue, - DAG.getValueType(VA.getValVT())); - ArgValue = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), ArgValue); - break; - } - - InVals.push_back(ArgValue); - - } else { // VA.isRegLoc() - assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem"); - unsigned ArgOffset = VA.getLocMemOffset(); - unsigned ArgSize = VA.getLocVT().getSizeInBits() / 8; - - uint32_t BEAlign = 0; - if (ArgSize < 8 && !Subtarget->isLittleEndian()) - BEAlign = 8 - ArgSize; - - int FI = MFI->CreateFixedObject(ArgSize, ArgOffset + BEAlign, true); - - // Create load nodes to retrieve arguments from the stack. - SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); - SDValue ArgValue; - - // If the loc type and val type are not the same, create an anyext load. - if (VA.getLocVT().getSizeInBits() != VA.getValVT().getSizeInBits()) { - // We should only get here if this is a pure integer. - assert(!VA.getValVT().isVector() && VA.getValVT().isInteger() && - "Only integer extension supported!"); - ArgValue = DAG.getExtLoad(ISD::EXTLOAD, DL, VA.getValVT(), Chain, FIN, - MachinePointerInfo::getFixedStack(FI), - VA.getLocVT(), - false, false, false, 0); - } else { - ArgValue = DAG.getLoad(VA.getValVT(), DL, Chain, FIN, - MachinePointerInfo::getFixedStack(FI), false, - false, false, 0); - } - - InVals.push_back(ArgValue); - } - } - - // varargs - if (isVarArg) { - if (!Subtarget->isTargetDarwin()) { - // The AAPCS variadic function ABI is identical to the non-variadic - // one. As a result there may be more arguments in registers and we should - // save them for future reference. 
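Two pieces of arithmetic from the argument lowering above, shown in isolation: a byval aggregate is given whole 8-byte stack slots, and on big-endian targets a stack argument smaller than 8 bytes is offset so it sits at the high end of its slot (the BEAlign adjustment). A sketch under those assumptions, with hypothetical helper names:

    #include <cassert>
    #include <cstdio>

    // Round a byval aggregate up to whole 8-byte slots, as the byval branch does.
    unsigned byvalFrameBytes(unsigned sizeInBytes) {
      unsigned numSlots = (sizeInBytes + 7) / 8;
      return 8 * numSlots;
    }

    // Big-endian adjustment for a small stack argument: pad so the value sits at
    // the most-significant end of its 8-byte slot (BEAlign in the code above).
    unsigned beAlign(unsigned argSizeInBytes, bool isLittleEndian) {
      if (argSizeInBytes < 8 && !isLittleEndian)
        return 8 - argSizeInBytes;
      return 0;
    }

    int main() {
      assert(byvalFrameBytes(12) == 16);                   // 12-byte struct spans two slots
      assert(beAlign(2, /*isLittleEndian=*/false) == 6);   // i16 is offset 6 bytes into its slot
      std::printf("ok\n");
    }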
- saveVarArgRegisters(CCInfo, DAG, DL, Chain); - } - - ARM64FunctionInfo *AFI = MF.getInfo(); - // This will point to the next argument passed via stack. - unsigned StackOffset = CCInfo.getNextStackOffset(); - // We currently pass all varargs at 8-byte alignment. - StackOffset = ((StackOffset + 7) & ~7); - AFI->setVarArgsStackIndex(MFI->CreateFixedObject(4, StackOffset, true)); - } - - ARM64FunctionInfo *FuncInfo = MF.getInfo(); - unsigned StackArgSize = CCInfo.getNextStackOffset(); - bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; - if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) { - // This is a non-standard ABI so by fiat I say we're allowed to make full - // use of the stack area to be popped, which must be aligned to 16 bytes in - // any case: - StackArgSize = RoundUpToAlignment(StackArgSize, 16); - - // If we're expected to restore the stack (e.g. fastcc) then we'll be adding - // a multiple of 16. - FuncInfo->setArgumentStackToRestore(StackArgSize); - - // This realignment carries over to the available bytes below. Our own - // callers will guarantee the space is free by giving an aligned value to - // CALLSEQ_START. - } - // Even if we're not expected to free up the space, it's useful to know how - // much is there while considering tail calls (because we can reuse it). - FuncInfo->setBytesInStackArgArea(StackArgSize); - - return Chain; -} - -void ARM64TargetLowering::saveVarArgRegisters(CCState &CCInfo, - SelectionDAG &DAG, SDLoc DL, - SDValue &Chain) const { - MachineFunction &MF = DAG.getMachineFunction(); - MachineFrameInfo *MFI = MF.getFrameInfo(); - ARM64FunctionInfo *FuncInfo = MF.getInfo(); - - SmallVector MemOps; - - static const MCPhysReg GPRArgRegs[] = { ARM64::X0, ARM64::X1, ARM64::X2, - ARM64::X3, ARM64::X4, ARM64::X5, - ARM64::X6, ARM64::X7 }; - static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs); - unsigned FirstVariadicGPR = - CCInfo.getFirstUnallocated(GPRArgRegs, NumGPRArgRegs); - - unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR); - int GPRIdx = 0; - if (GPRSaveSize != 0) { - GPRIdx = MFI->CreateStackObject(GPRSaveSize, 8, false); - - SDValue FIN = DAG.getFrameIndex(GPRIdx, getPointerTy()); - - for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) { - unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &ARM64::GPR64RegClass); - SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64); - SDValue Store = - DAG.getStore(Val.getValue(1), DL, Val, FIN, - MachinePointerInfo::getStack(i * 8), false, false, 0); - MemOps.push_back(Store); - FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN, - DAG.getConstant(8, getPointerTy())); - } - } - FuncInfo->setVarArgsGPRIndex(GPRIdx); - FuncInfo->setVarArgsGPRSize(GPRSaveSize); - - if (Subtarget->hasFPARMv8()) { - static const MCPhysReg FPRArgRegs[] = { ARM64::Q0, ARM64::Q1, ARM64::Q2, - ARM64::Q3, ARM64::Q4, ARM64::Q5, - ARM64::Q6, ARM64::Q7 }; - static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs); - unsigned FirstVariadicFPR = - CCInfo.getFirstUnallocated(FPRArgRegs, NumFPRArgRegs); - - unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR); - int FPRIdx = 0; - if (FPRSaveSize != 0) { - FPRIdx = MFI->CreateStackObject(FPRSaveSize, 16, false); - - SDValue FIN = DAG.getFrameIndex(FPRIdx, getPointerTy()); - - for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) { - unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &ARM64::FPR128RegClass); - SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128); - - SDValue Store = - DAG.getStore(Val.getValue(1), DL, Val, 
FIN, - MachinePointerInfo::getStack(i * 16), false, false, 0); - MemOps.push_back(Store); - FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN, - DAG.getConstant(16, getPointerTy())); - } - } - FuncInfo->setVarArgsFPRIndex(FPRIdx); - FuncInfo->setVarArgsFPRSize(FPRSaveSize); - } - - if (!MemOps.empty()) { - Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); - } -} - -/// LowerCallResult - Lower the result values of a call into the -/// appropriate copies out of appropriate physical registers. -SDValue ARM64TargetLowering::LowerCallResult( - SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl &Ins, SDLoc DL, SelectionDAG &DAG, - SmallVectorImpl &InVals, bool isThisReturn, - SDValue ThisVal) const { - CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS ? RetCC_ARM64_WebKit_JS - : RetCC_ARM64_AAPCS; - // Assign locations to each value returned by this call. - SmallVector RVLocs; - CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), RVLocs, *DAG.getContext()); - CCInfo.AnalyzeCallResult(Ins, RetCC); - - // Copy all of the result registers out of their specified physreg. - for (unsigned i = 0; i != RVLocs.size(); ++i) { - CCValAssign VA = RVLocs[i]; - - // Pass 'this' value directly from the argument to return value, to avoid - // reg unit interference - if (i == 0 && isThisReturn) { - assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 && - "unexpected return calling convention register assignment"); - InVals.push_back(ThisVal); - continue; - } - - SDValue Val = - DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag); - Chain = Val.getValue(1); - InFlag = Val.getValue(2); - - switch (VA.getLocInfo()) { - default: - llvm_unreachable("Unknown loc info!"); - case CCValAssign::Full: - break; - case CCValAssign::BCvt: - Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val); - break; - } - - InVals.push_back(Val); - } - - return Chain; -} - -bool ARM64TargetLowering::isEligibleForTailCallOptimization( - SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, - bool isCalleeStructRet, bool isCallerStructRet, - const SmallVectorImpl &Outs, - const SmallVectorImpl &OutVals, - const SmallVectorImpl &Ins, SelectionDAG &DAG) const { - // For CallingConv::C this function knows whether the ABI needs - // changing. That's not true for other conventions so they will have to opt in - // manually. - if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C) - return false; - - const MachineFunction &MF = DAG.getMachineFunction(); - const Function *CallerF = MF.getFunction(); - CallingConv::ID CallerCC = CallerF->getCallingConv(); - bool CCMatch = CallerCC == CalleeCC; - - // Byval parameters hand the function a pointer directly into the stack area - // we want to reuse during a tail call. Working around this *is* possible (see - // X86) but less efficient and uglier in LowerCall. - for (Function::const_arg_iterator i = CallerF->arg_begin(), - e = CallerF->arg_end(); - i != e; ++i) - if (i->hasByValAttr()) - return false; - - if (getTargetMachine().Options.GuaranteedTailCallOpt) { - if (IsTailCallConvention(CalleeCC) && CCMatch) - return true; - return false; - } - - // Now we search for cases where we can use a tail call without changing the - // ABI. Sibcall is used in some places (particularly gcc) to refer to this - // concept. - - // I want anyone implementing a new calling convention to think long and hard - // about this assert. 
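For reference on the saveVarArgRegisters logic above: only the argument registers not consumed by fixed arguments are spilled, at 8 bytes per remaining X register and 16 bytes per remaining Q register. A standalone sketch of that sizing (hypothetical names; assumes the X0-X7 and Q0-Q7 argument registers used above):

    #include <cassert>
    #include <cstdio>

    struct VarArgSaveAreas {
      unsigned gprSaveBytes;
      unsigned fprSaveBytes;
    };

    // 8 X registers and 8 Q registers carry arguments; only the ones left
    // unallocated after the fixed arguments need to be saved for va_arg.
    VarArgSaveAreas varargSaveAreas(unsigned firstVariadicGPR, unsigned firstVariadicFPR) {
      const unsigned NumGPRArgRegs = 8; // X0..X7
      const unsigned NumFPRArgRegs = 8; // Q0..Q7
      return {8 * (NumGPRArgRegs - firstVariadicGPR),
              16 * (NumFPRArgRegs - firstVariadicFPR)};
    }

    int main() {
      // e.g. printf(const char *fmt, ...): X0 holds fmt, so X1..X7 and Q0..Q7 are saved.
      VarArgSaveAreas a = varargSaveAreas(1, 0);
      assert(a.gprSaveBytes == 56 && a.fprSaveBytes == 128);
      std::printf("ok\n");
    }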
- assert((!isVarArg || CalleeCC == CallingConv::C) && - "Unexpected variadic calling convention"); - - if (isVarArg && !Outs.empty()) { - // At least two cases here: if caller is fastcc then we can't have any - // memory arguments (we'd be expected to clean up the stack afterwards). If - // caller is C then we could potentially use its argument area. - - // FIXME: for now we take the most conservative of these in both cases: - // disallow all variadic memory operands. - SmallVector ArgLocs; - CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ArgLocs, *DAG.getContext()); - - CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true)); - for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) - if (!ArgLocs[i].isRegLoc()) - return false; - } - - // If the calling conventions do not match, then we'd better make sure the - // results are returned in the same way as what the caller expects. - if (!CCMatch) { - SmallVector RVLocs1; - CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), - getTargetMachine(), RVLocs1, *DAG.getContext()); - CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForCall(CalleeCC, isVarArg)); - - SmallVector RVLocs2; - CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), - getTargetMachine(), RVLocs2, *DAG.getContext()); - CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForCall(CallerCC, isVarArg)); - - if (RVLocs1.size() != RVLocs2.size()) - return false; - for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { - if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) - return false; - if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) - return false; - if (RVLocs1[i].isRegLoc()) { - if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) - return false; - } else { - if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) - return false; - } - } - } - - // Nothing more to check if the callee is taking no arguments - if (Outs.empty()) - return true; - - SmallVector ArgLocs; - CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ArgLocs, *DAG.getContext()); - - CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg)); - - const ARM64FunctionInfo *FuncInfo = MF.getInfo(); - - // If the stack arguments for this call would fit into our own save area then - // the call can be made tail. - return CCInfo.getNextStackOffset() <= FuncInfo->getBytesInStackArgArea(); -} - -SDValue ARM64TargetLowering::addTokenForArgument(SDValue Chain, - SelectionDAG &DAG, - MachineFrameInfo *MFI, - int ClobberedFI) const { - SmallVector ArgChains; - int64_t FirstByte = MFI->getObjectOffset(ClobberedFI); - int64_t LastByte = FirstByte + MFI->getObjectSize(ClobberedFI) - 1; - - // Include the original chain at the beginning of the list. When this is - // used by target LowerCall hooks, this helps legalize find the - // CALLSEQ_BEGIN node. 
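The eligibility test above boils down to two regimes; ignoring the byval, variadic and result-location checks it also performs, a condensed sketch looks like this (hypothetical names):

    #include <cassert>
    #include <cstdio>

    // Under GuaranteedTailCallOpt a matching tail-call convention is enough;
    // otherwise ("sibcall" style) the callee's stack-passed arguments must fit in
    // the caller's own incoming stack argument area so nothing new is pushed.
    bool eligibleForTailCall(bool guaranteedTailCallOpt, bool isTailCallConvention,
                             bool conventionsMatch, unsigned calleeStackBytes,
                             unsigned bytesInStackArgArea) {
      if (guaranteedTailCallOpt)
        return isTailCallConvention && conventionsMatch;
      return calleeStackBytes <= bytesInStackArgArea;
    }

    int main() {
      assert(eligibleForTailCall(false, false, true, 0, 0));    // register-only call
      assert(!eligibleForTailCall(false, false, true, 16, 8));  // needs new stack space
      std::printf("ok\n");
    }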
- ArgChains.push_back(Chain); - - // Add a chain value for each stack argument corresponding - for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(), - UE = DAG.getEntryNode().getNode()->use_end(); - U != UE; ++U) - if (LoadSDNode *L = dyn_cast(*U)) - if (FrameIndexSDNode *FI = dyn_cast(L->getBasePtr())) - if (FI->getIndex() < 0) { - int64_t InFirstByte = MFI->getObjectOffset(FI->getIndex()); - int64_t InLastByte = InFirstByte; - InLastByte += MFI->getObjectSize(FI->getIndex()) - 1; - - if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) || - (FirstByte <= InFirstByte && InFirstByte <= LastByte)) - ArgChains.push_back(SDValue(L, 1)); - } - - // Build a tokenfactor for all the chains. - return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains); -} - -bool ARM64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC, - bool TailCallOpt) const { - return CallCC == CallingConv::Fast && TailCallOpt; -} - -bool ARM64TargetLowering::IsTailCallConvention(CallingConv::ID CallCC) const { - return CallCC == CallingConv::Fast; -} - -/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain, -/// and add input and output parameter nodes. -SDValue ARM64TargetLowering::LowerCall(CallLoweringInfo &CLI, - SmallVectorImpl &InVals) const { - SelectionDAG &DAG = CLI.DAG; - SDLoc &DL = CLI.DL; - SmallVector &Outs = CLI.Outs; - SmallVector &OutVals = CLI.OutVals; - SmallVector &Ins = CLI.Ins; - SDValue Chain = CLI.Chain; - SDValue Callee = CLI.Callee; - bool &IsTailCall = CLI.IsTailCall; - CallingConv::ID CallConv = CLI.CallConv; - bool IsVarArg = CLI.IsVarArg; - - MachineFunction &MF = DAG.getMachineFunction(); - bool IsStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet(); - bool IsThisReturn = false; - - ARM64FunctionInfo *FuncInfo = MF.getInfo(); - bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; - bool IsSibCall = false; - - if (IsTailCall) { - // Check if it's really possible to do a tail call. - IsTailCall = isEligibleForTailCallOptimization( - Callee, CallConv, IsVarArg, IsStructRet, - MF.getFunction()->hasStructRetAttr(), Outs, OutVals, Ins, DAG); - if (!IsTailCall && CLI.CS && CLI.CS->isMustTailCall()) - report_fatal_error("failed to perform tail call elimination on a call " - "site marked musttail"); - - // A sibling call is one where we're under the usual C ABI and not planning - // to change that but can still do a tail call: - if (!TailCallOpt && IsTailCall) - IsSibCall = true; - - if (IsTailCall) - ++NumTailCalls; - } - - // Analyze operands of the call, assigning locations to each operand. - SmallVector ArgLocs; - CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), - getTargetMachine(), ArgLocs, *DAG.getContext()); - - if (IsVarArg) { - // Handle fixed and variable vector arguments differently. - // Variable vector arguments always go into memory. - unsigned NumArgs = Outs.size(); - - for (unsigned i = 0; i != NumArgs; ++i) { - MVT ArgVT = Outs[i].VT; - ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; - CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, - /*IsVarArg=*/ !Outs[i].IsFixed); - bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo); - assert(!Res && "Call operand has unhandled type"); - (void)Res; - } - } else { - // At this point, Outs[].VT may already be promoted to i32. To correctly - // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and - // i8 to CC_ARM64_AAPCS with i32 being ValVT and i8 being LocVT. 
- // Since AnalyzeCallOperands uses Ins[].VT for both ValVT and LocVT, here - // we use a special version of AnalyzeCallOperands to pass in ValVT and - // LocVT. - unsigned NumArgs = Outs.size(); - for (unsigned i = 0; i != NumArgs; ++i) { - MVT ValVT = Outs[i].VT; - // Get type of the original argument. - EVT ActualVT = getValueType(CLI.getArgs()[Outs[i].OrigArgIndex].Ty, - /*AllowUnknown*/ true); - MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ValVT; - ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; - // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16. - MVT LocVT = ValVT; - if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8) - LocVT = MVT::i8; - else if (ActualMVT == MVT::i16) - LocVT = MVT::i16; - - CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false); - bool Res = AssignFn(i, ValVT, LocVT, CCValAssign::Full, ArgFlags, CCInfo); - assert(!Res && "Call operand has unhandled type"); - (void)Res; - } - } - - // Get a count of how many bytes are to be pushed on the stack. - unsigned NumBytes = CCInfo.getNextStackOffset(); - - if (IsSibCall) { - // Since we're not changing the ABI to make this a tail call, the memory - // operands are already available in the caller's incoming argument space. - NumBytes = 0; - } - - // FPDiff is the byte offset of the call's argument area from the callee's. - // Stores to callee stack arguments will be placed in FixedStackSlots offset - // by this amount for a tail call. In a sibling call it must be 0 because the - // caller will deallocate the entire stack and the callee still expects its - // arguments to begin at SP+0. Completely unused for non-tail calls. - int FPDiff = 0; - - if (IsTailCall && !IsSibCall) { - unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea(); - - // Since callee will pop argument stack as a tail call, we must keep the - // popped size 16-byte aligned. - NumBytes = RoundUpToAlignment(NumBytes, 16); - - // FPDiff will be negative if this tail call requires more space than we - // would automatically have in our incoming argument space. Positive if we - // can actually shrink the stack. - FPDiff = NumReusableBytes - NumBytes; - - // The stack pointer must be 16-byte aligned at all times it's used for a - // memory operation, which in practice means at *all* times and in - // particular across call boundaries. Therefore our own arguments started at - // a 16-byte aligned SP and the delta applied for the tail call should - // satisfy the same constraint. - assert(FPDiff % 16 == 0 && "unaligned stack on tail call"); - } - - // Adjust the stack pointer for the new arguments... - // These operations are automatically eliminated by the prolog/epilog pass - if (!IsSibCall) - Chain = - DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true), DL); - - SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, ARM64::SP, getPointerTy()); - - SmallVector, 8> RegsToPass; - SmallVector MemOpChains; - - // Walk the register/memloc assignments, inserting copies/loads. - for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e; - ++i, ++realArgIdx) { - CCValAssign &VA = ArgLocs[i]; - SDValue Arg = OutVals[realArgIdx]; - ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; - - // Promote the value if needed. 
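The FPDiff computation above, as a standalone sketch: the popped argument area is kept 16-byte aligned, and FPDiff measures how far the tail call's argument area sits from the one this function received, negative when the tail call needs more space than is being reused (hypothetical names; assumes the 16-byte SP alignment described above):

    #include <cassert>
    #include <cstdio>

    unsigned roundUpTo16(unsigned bytes) { return (bytes + 15) & ~15u; }

    int tailCallFPDiff(unsigned numReusableBytes, unsigned numArgBytes) {
      unsigned numBytes = roundUpTo16(numArgBytes); // callee pops a 16-byte-aligned area
      int fpDiff = static_cast<int>(numReusableBytes) - static_cast<int>(numBytes);
      assert(fpDiff % 16 == 0 && "unaligned stack on tail call");
      return fpDiff;
    }

    int main() {
      assert(tailCallFPDiff(32, 20) == 0);   // 20 rounds up to 32, exactly reusable
      assert(tailCallFPDiff(16, 33) == -32); // needs 48 bytes, only 16 available
      std::printf("ok\n");
    }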
- switch (VA.getLocInfo()) { - default: - llvm_unreachable("Unknown loc info!"); - case CCValAssign::Full: - break; - case CCValAssign::SExt: - Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg); - break; - case CCValAssign::ZExt: - Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg); - break; - case CCValAssign::AExt: - Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg); - break; - case CCValAssign::BCvt: - Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); - break; - case CCValAssign::FPExt: - Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg); - break; - } - - if (VA.isRegLoc()) { - if (realArgIdx == 0 && Flags.isReturned() && Outs[0].VT == MVT::i64) { - assert(VA.getLocVT() == MVT::i64 && - "unexpected calling convention register assignment"); - assert(!Ins.empty() && Ins[0].VT == MVT::i64 && - "unexpected use of 'returned'"); - IsThisReturn = true; - } - RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); - } else { - assert(VA.isMemLoc()); - - SDValue DstAddr; - MachinePointerInfo DstInfo; - - // FIXME: This works on big-endian for composite byvals, which are the - // common case. It should also work for fundamental types too. - uint32_t BEAlign = 0; - unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8 - : VA.getLocVT().getSizeInBits(); - OpSize = (OpSize + 7) / 8; - if (!Subtarget->isLittleEndian() && !Flags.isByVal()) { - if (OpSize < 8) - BEAlign = 8 - OpSize; - } - unsigned LocMemOffset = VA.getLocMemOffset(); - int32_t Offset = LocMemOffset + BEAlign; - SDValue PtrOff = DAG.getIntPtrConstant(Offset); - PtrOff = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, PtrOff); - - if (IsTailCall) { - Offset = Offset + FPDiff; - int FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true); - - DstAddr = DAG.getFrameIndex(FI, getPointerTy()); - DstInfo = MachinePointerInfo::getFixedStack(FI); - - // Make sure any stack arguments overlapping with where we're storing - // are loaded before this eventual operation. Otherwise they'll be - // clobbered. - Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI); - } else { - SDValue PtrOff = DAG.getIntPtrConstant(Offset); - - DstAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, PtrOff); - DstInfo = MachinePointerInfo::getStack(LocMemOffset); - } - - if (Outs[i].Flags.isByVal()) { - SDValue SizeNode = - DAG.getConstant(Outs[i].Flags.getByValSize(), MVT::i64); - SDValue Cpy = DAG.getMemcpy( - Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(), - /*isVolatile = */ false, - /*alwaysInline = */ false, DstInfo, MachinePointerInfo()); - - MemOpChains.push_back(Cpy); - } else { - // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already - // promoted to a legal register type i32, we should truncate Arg back to - // i1/i8/i16. - if (Arg.getValueType().isSimple() && - Arg.getValueType().getSimpleVT() == MVT::i32 && - (VA.getLocVT() == MVT::i1 || VA.getLocVT() == MVT::i8 || - VA.getLocVT() == MVT::i16)) - Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getLocVT(), Arg); - - SDValue Store = - DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, false, false, 0); - MemOpChains.push_back(Store); - } - } - } - - if (!MemOpChains.empty()) - Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains); - - // Build a sequence of copy-to-reg nodes chained together with token chain - // and flag operands which copy the outgoing args into the appropriate regs. 
- SDValue InFlag; - for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { - Chain = DAG.getCopyToReg(Chain, DL, RegsToPass[i].first, - RegsToPass[i].second, InFlag); - InFlag = Chain.getValue(1); - } - - // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every - // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol - // node so that legalize doesn't hack it. - if (getTargetMachine().getCodeModel() == CodeModel::Large && - Subtarget->isTargetMachO()) { - if (GlobalAddressSDNode *G = dyn_cast(Callee)) { - const GlobalValue *GV = G->getGlobal(); - bool InternalLinkage = GV->hasInternalLinkage(); - if (InternalLinkage) - Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0, 0); - else { - Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0, - ARM64II::MO_GOT); - Callee = DAG.getNode(ARM64ISD::LOADgot, DL, getPointerTy(), Callee); - } - } else if (ExternalSymbolSDNode *S = - dyn_cast(Callee)) { - const char *Sym = S->getSymbol(); - Callee = - DAG.getTargetExternalSymbol(Sym, getPointerTy(), ARM64II::MO_GOT); - Callee = DAG.getNode(ARM64ISD::LOADgot, DL, getPointerTy(), Callee); - } - } else if (GlobalAddressSDNode *G = dyn_cast(Callee)) { - const GlobalValue *GV = G->getGlobal(); - Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0, 0); - } else if (ExternalSymbolSDNode *S = dyn_cast(Callee)) { - const char *Sym = S->getSymbol(); - Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy(), 0); - } - - // We don't usually want to end the call-sequence here because we would tidy - // the frame up *after* the call, however in the ABI-changing tail-call case - // we've carefully laid out the parameters so that when sp is reset they'll be - // in the correct location. - if (IsTailCall && !IsSibCall) { - Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), - DAG.getIntPtrConstant(0, true), InFlag, DL); - InFlag = Chain.getValue(1); - } - - std::vector Ops; - Ops.push_back(Chain); - Ops.push_back(Callee); - - if (IsTailCall) { - // Each tail call may have to adjust the stack by a different amount, so - // this information must travel along with the operation for eventual - // consumption by emitEpilogue. - Ops.push_back(DAG.getTargetConstant(FPDiff, MVT::i32)); - } - - // Add argument registers to the end of the list so that they are known live - // into the call. - for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) - Ops.push_back(DAG.getRegister(RegsToPass[i].first, - RegsToPass[i].second.getValueType())); - - // Add a register mask operand representing the call-preserved registers. - const uint32_t *Mask; - const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); - const ARM64RegisterInfo *ARI = static_cast(TRI); - if (IsThisReturn) { - // For 'this' returns, use the X0-preserving mask if applicable - Mask = ARI->getThisReturnPreservedMask(CallConv); - if (!Mask) { - IsThisReturn = false; - Mask = ARI->getCallPreservedMask(CallConv); - } - } else - Mask = ARI->getCallPreservedMask(CallConv); - - assert(Mask && "Missing call preserved mask for calling convention"); - Ops.push_back(DAG.getRegisterMask(Mask)); - - if (InFlag.getNode()) - Ops.push_back(InFlag); - - SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); - - // If we're doing a tall call, use a TC_RETURN here rather than an - // actual call instruction. - if (IsTailCall) - return DAG.getNode(ARM64ISD::TC_RETURN, DL, NodeTys, Ops); - - // Returns a chain and a flag for retval copy to use. 
- Chain = DAG.getNode(ARM64ISD::CALL, DL, NodeTys, Ops); - InFlag = Chain.getValue(1); - - uint64_t CalleePopBytes = DoesCalleeRestoreStack(CallConv, TailCallOpt) - ? RoundUpToAlignment(NumBytes, 16) - : 0; - - Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), - DAG.getIntPtrConstant(CalleePopBytes, true), - InFlag, DL); - if (!Ins.empty()) - InFlag = Chain.getValue(1); - - // Handle result values, copying them out of physregs into vregs that we - // return. - return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG, - InVals, IsThisReturn, - IsThisReturn ? OutVals[0] : SDValue()); -} - -bool ARM64TargetLowering::CanLowerReturn( - CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, - const SmallVectorImpl &Outs, LLVMContext &Context) const { - CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS ? RetCC_ARM64_WebKit_JS - : RetCC_ARM64_AAPCS; - SmallVector RVLocs; - CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), RVLocs, Context); - return CCInfo.CheckReturn(Outs, RetCC); -} - -SDValue -ARM64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, - bool isVarArg, - const SmallVectorImpl &Outs, - const SmallVectorImpl &OutVals, - SDLoc DL, SelectionDAG &DAG) const { - CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS ? RetCC_ARM64_WebKit_JS - : RetCC_ARM64_AAPCS; - SmallVector RVLocs; - CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), RVLocs, *DAG.getContext()); - CCInfo.AnalyzeReturn(Outs, RetCC); - - // Copy the result values into the output registers. - SDValue Flag; - SmallVector RetOps(1, Chain); - for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size(); - ++i, ++realRVLocIdx) { - CCValAssign &VA = RVLocs[i]; - assert(VA.isRegLoc() && "Can only return in registers!"); - SDValue Arg = OutVals[realRVLocIdx]; - - switch (VA.getLocInfo()) { - default: - llvm_unreachable("Unknown loc info!"); - case CCValAssign::Full: - break; - case CCValAssign::BCvt: - Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); - break; - } - - Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag); - Flag = Chain.getValue(1); - RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); - } - - RetOps[0] = Chain; // Update chain. - - // Add the flag if we have it. - if (Flag.getNode()) - RetOps.push_back(Flag); - - return DAG.getNode(ARM64ISD::RET_FLAG, DL, MVT::Other, RetOps); -} - -//===----------------------------------------------------------------------===// -// Other Lowering Code -//===----------------------------------------------------------------------===// - -SDValue ARM64TargetLowering::LowerGlobalAddress(SDValue Op, - SelectionDAG &DAG) const { - EVT PtrVT = getPointerTy(); - SDLoc DL(Op); - const GlobalValue *GV = cast(Op)->getGlobal(); - unsigned char OpFlags = - Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); - - assert(cast(Op)->getOffset() == 0 && - "unexpected offset in global node"); - - // This also catched the large code model case for Darwin. - if ((OpFlags & ARM64II::MO_GOT) != 0) { - SDValue GotAddr = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags); - // FIXME: Once remat is capable of dealing with instructions with register - // operands, expand this into two nodes instead of using a wrapper node. 
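The CalleePopBytes value above only becomes non-zero for the callee-restores-stack case (fastcc under GuaranteedTailCallOpt), and is then rounded to the 16-byte stack alignment. A standalone sketch of that computation (hypothetical names):

    #include <cassert>
    #include <cstdio>

    // Round up to a 16-byte boundary (RoundUpToAlignment in the code above).
    unsigned roundUpTo16(unsigned bytes) { return (bytes + 15) & ~15u; }

    // Bytes the callee pops on return: only fastcc with GuaranteedTailCallOpt uses
    // callee-pop, and the popped area stays 16-byte aligned.
    unsigned calleePopBytes(bool isFastCC, bool guaranteedTailCallOpt, unsigned numBytes) {
      bool calleeRestoresStack = isFastCC && guaranteedTailCallOpt;
      return calleeRestoresStack ? roundUpTo16(numBytes) : 0;
    }

    int main() {
      assert(calleePopBytes(true, true, 20) == 32);
      assert(calleePopBytes(false, true, 20) == 0);
      std::printf("ok\n");
    }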
- return DAG.getNode(ARM64ISD::LOADgot, DL, PtrVT, GotAddr); - } - - if (getTargetMachine().getCodeModel() == CodeModel::Large) { - const unsigned char MO_NC = ARM64II::MO_NC; - return DAG.getNode( - ARM64ISD::WrapperLarge, DL, PtrVT, - DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, ARM64II::MO_G3), - DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, ARM64II::MO_G2 | MO_NC), - DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, ARM64II::MO_G1 | MO_NC), - DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, ARM64II::MO_G0 | MO_NC)); - } else { - // Use ADRP/ADD or ADRP/LDR for everything else: the small model on ELF and - // the only correct model on Darwin. - SDValue Hi = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, - OpFlags | ARM64II::MO_PAGE); - unsigned char LoFlags = OpFlags | ARM64II::MO_PAGEOFF | ARM64II::MO_NC; - SDValue Lo = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, LoFlags); - - SDValue ADRP = DAG.getNode(ARM64ISD::ADRP, DL, PtrVT, Hi); - return DAG.getNode(ARM64ISD::ADDlow, DL, PtrVT, ADRP, Lo); - } -} - -/// \brief Convert a TLS address reference into the correct sequence of loads -/// and calls to compute the variable's address (for Darwin, currently) and -/// return an SDValue containing the final node. - -/// Darwin only has one TLS scheme which must be capable of dealing with the -/// fully general situation, in the worst case. This means: -/// + "extern __thread" declaration. -/// + Defined in a possibly unknown dynamic library. -/// -/// The general system is that each __thread variable has a [3 x i64] descriptor -/// which contains information used by the runtime to calculate the address. The -/// only part of this the compiler needs to know about is the first xword, which -/// contains a function pointer that must be called with the address of the -/// entire descriptor in "x0". -/// -/// Since this descriptor may be in a different unit, in general even the -/// descriptor must be accessed via an indirect load. The "ideal" code sequence -/// is: -/// adrp x0, _var@TLVPPAGE -/// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor -/// ldr x1, [x0] ; x1 contains 1st entry of descriptor, -/// ; the function pointer -/// blr x1 ; Uses descriptor address in x0 -/// ; Address of _var is now in x0. -/// -/// If the address of _var's descriptor *is* known to the linker, then it can -/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for -/// a slight efficiency gain. -SDValue -ARM64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op, - SelectionDAG &DAG) const { - assert(Subtarget->isTargetDarwin() && "TLS only supported on Darwin"); - - SDLoc DL(Op); - MVT PtrVT = getPointerTy(); - const GlobalValue *GV = cast(Op)->getGlobal(); - - SDValue TLVPAddr = - DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, ARM64II::MO_TLS); - SDValue DescAddr = DAG.getNode(ARM64ISD::LOADgot, DL, PtrVT, TLVPAddr); - - // The first entry in the descriptor is a function pointer that we must call - // to obtain the address of the variable. - SDValue Chain = DAG.getEntryNode(); - SDValue FuncTLVGet = - DAG.getLoad(MVT::i64, DL, Chain, DescAddr, MachinePointerInfo::getGOT(), - false, true, true, 8); - Chain = FuncTLVGet.getValue(1); - - MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); - MFI->setAdjustsStack(true); - - // TLS calls preserve all registers except those that absolutely must be - // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be - // silly). 
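The ADRP/ADDlow pattern used above splits a symbol's address into a 4 KiB page (MO_PAGE) and a 12-bit offset within it (MO_PAGEOFF). A standalone sketch of that split (hypothetical names; the example address is arbitrary):

    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    // ADRP materializes the page of the symbol, the following ADD supplies the
    // low 12 bits.
    struct PageAddress {
      uint64_t page; // symbol address with the low 12 bits cleared
      uint64_t lo12; // offset within the 4 KiB page
    };

    PageAddress splitPageAddress(uint64_t symbolAddr) {
      return {symbolAddr & ~UINT64_C(0xfff), symbolAddr & UINT64_C(0xfff)};
    }

    int main() {
      PageAddress pa = splitPageAddress(0x100003f84);
      assert(pa.page == 0x100003000 && pa.lo12 == 0xf84);
      assert(pa.page + pa.lo12 == 0x100003f84);
      std::printf("ok\n");
    }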
- const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); - const ARM64RegisterInfo *ARI = static_cast(TRI); - const uint32_t *Mask = ARI->getTLSCallPreservedMask(); - - // Finally, we can make the call. This is just a degenerate version of a - // normal ARM64 call node: x0 takes the address of the descriptor, and returns - // the address of the variable in this thread. - Chain = DAG.getCopyToReg(Chain, DL, ARM64::X0, DescAddr, SDValue()); - Chain = DAG.getNode(ARM64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue), - Chain, FuncTLVGet, DAG.getRegister(ARM64::X0, MVT::i64), - DAG.getRegisterMask(Mask), Chain.getValue(1)); - return DAG.getCopyFromReg(Chain, DL, ARM64::X0, PtrVT, Chain.getValue(1)); -} - -/// When accessing thread-local variables under either the general-dynamic or -/// local-dynamic system, we make a "TLS-descriptor" call. The variable will -/// have a descriptor, accessible via a PC-relative ADRP, and whose first entry -/// is a function pointer to carry out the resolution. This function takes the -/// address of the descriptor in X0 and returns the TPIDR_EL0 offset in X0. All -/// other registers (except LR, NZCV) are preserved. -/// -/// Thus, the ideal call sequence on AArch64 is: -/// -/// adrp x0, :tlsdesc:thread_var -/// ldr x8, [x0, :tlsdesc_lo12:thread_var] -/// add x0, x0, :tlsdesc_lo12:thread_var -/// .tlsdesccall thread_var -/// blr x8 -/// (TPIDR_EL0 offset now in x0). -/// -/// The ".tlsdesccall" directive instructs the assembler to insert a particular -/// relocation to help the linker relax this sequence if it turns out to be too -/// conservative. -/// -/// FIXME: we currently produce an extra, duplicated, ADRP instruction, but this -/// is harmless. -SDValue ARM64TargetLowering::LowerELFTLSDescCall(SDValue SymAddr, - SDValue DescAddr, SDLoc DL, - SelectionDAG &DAG) const { - EVT PtrVT = getPointerTy(); - - // The function we need to call is simply the first entry in the GOT for this - // descriptor, load it in preparation. - SDValue Func = DAG.getNode(ARM64ISD::LOADgot, DL, PtrVT, SymAddr); - - // TLS calls preserve all registers except those that absolutely must be - // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be - // silly). - const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); - const ARM64RegisterInfo *ARI = static_cast(TRI); - const uint32_t *Mask = ARI->getTLSCallPreservedMask(); - - // The function takes only one argument: the address of the descriptor itself - // in X0. 
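The descriptor contract described above, modelled as plain data: the first of the three xwords is a resolver that is handed the descriptor's own address and returns what the runtime computed for the variable. This is an illustrative layout only, not the runtime's actual definition (all names hypothetical):

    #include <cstdint>
    #include <cstdio>

    // Shape of a TLS descriptor as described in the comments above: the first
    // entry is a resolver called with the descriptor's address (in x0).
    struct TLSDescriptor {
      uint64_t (*resolver)(const TLSDescriptor *self);
      uint64_t payload[2];
    };

    uint64_t resolveTLS(const TLSDescriptor *desc) {
      // Corresponds to the "blr" through the first entry in the sequences above.
      return desc->resolver(desc);
    }

    static uint64_t staticResolver(const TLSDescriptor *self) { return self->payload[0]; }

    int main() {
      TLSDescriptor d{staticResolver, {0x30, 0}};
      std::printf("resolved = %llu\n", (unsigned long long)resolveTLS(&d));
    }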
- SDValue Glue, Chain; - Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, ARM64::X0, DescAddr, Glue); - Glue = Chain.getValue(1); - - // We're now ready to populate the argument list, as with a normal call: - SmallVector Ops; - Ops.push_back(Chain); - Ops.push_back(Func); - Ops.push_back(SymAddr); - Ops.push_back(DAG.getRegister(ARM64::X0, PtrVT)); - Ops.push_back(DAG.getRegisterMask(Mask)); - Ops.push_back(Glue); - - SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); - Chain = DAG.getNode(ARM64ISD::TLSDESC_CALL, DL, NodeTys, Ops); - Glue = Chain.getValue(1); - - return DAG.getCopyFromReg(Chain, DL, ARM64::X0, PtrVT, Glue); -} - -SDValue ARM64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op, - SelectionDAG &DAG) const { - assert(Subtarget->isTargetELF() && "This function expects an ELF target"); - assert(getTargetMachine().getCodeModel() == CodeModel::Small && - "ELF TLS only supported in small memory model"); - const GlobalAddressSDNode *GA = cast(Op); - - TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal()); - - SDValue TPOff; - EVT PtrVT = getPointerTy(); - SDLoc DL(Op); - const GlobalValue *GV = GA->getGlobal(); - - SDValue ThreadBase = DAG.getNode(ARM64ISD::THREAD_POINTER, DL, PtrVT); - - if (Model == TLSModel::LocalExec) { - SDValue HiVar = DAG.getTargetGlobalAddress( - GV, DL, PtrVT, 0, ARM64II::MO_TLS | ARM64II::MO_G1); - SDValue LoVar = DAG.getTargetGlobalAddress( - GV, DL, PtrVT, 0, ARM64II::MO_TLS | ARM64II::MO_G0 | ARM64II::MO_NC); - - TPOff = SDValue(DAG.getMachineNode(ARM64::MOVZXi, DL, PtrVT, HiVar, - DAG.getTargetConstant(16, MVT::i32)), - 0); - TPOff = SDValue(DAG.getMachineNode(ARM64::MOVKXi, DL, PtrVT, TPOff, LoVar, - DAG.getTargetConstant(0, MVT::i32)), - 0); - } else if (Model == TLSModel::InitialExec) { - TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, ARM64II::MO_TLS); - TPOff = DAG.getNode(ARM64ISD::LOADgot, DL, PtrVT, TPOff); - } else if (Model == TLSModel::LocalDynamic) { - // Local-dynamic accesses proceed in two phases. A general-dynamic TLS - // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate - // the beginning of the module's TLS region, followed by a DTPREL offset - // calculation. - - // These accesses will need deduplicating if there's more than one. - ARM64FunctionInfo *MFI = - DAG.getMachineFunction().getInfo(); - MFI->incNumLocalDynamicTLSAccesses(); - - // Accesses used in this sequence go via the TLS descriptor which lives in - // the GOT. Prepare an address we can use to handle this. - SDValue HiDesc = DAG.getTargetExternalSymbol( - "_TLS_MODULE_BASE_", PtrVT, ARM64II::MO_TLS | ARM64II::MO_PAGE); - SDValue LoDesc = DAG.getTargetExternalSymbol( - "_TLS_MODULE_BASE_", PtrVT, - ARM64II::MO_TLS | ARM64II::MO_PAGEOFF | ARM64II::MO_NC); - - // First argument to the descriptor call is the address of the descriptor - // itself. - SDValue DescAddr = DAG.getNode(ARM64ISD::ADRP, DL, PtrVT, HiDesc); - DescAddr = DAG.getNode(ARM64ISD::ADDlow, DL, PtrVT, DescAddr, LoDesc); - - // The call needs a relocation too for linker relaxation. It doesn't make - // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of - // the address. - SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, - ARM64II::MO_TLS); - - // Now we can calculate the offset from TPIDR_EL0 to this module's - // thread-local area. - TPOff = LowerELFTLSDescCall(SymAddr, DescAddr, DL, DAG); - - // Now use :dtprel_whatever: operations to calculate this variable's offset - // in its thread-storage area. 
- SDValue HiVar = DAG.getTargetGlobalAddress( - GV, DL, MVT::i64, 0, ARM64II::MO_TLS | ARM64II::MO_G1); - SDValue LoVar = DAG.getTargetGlobalAddress( - GV, DL, MVT::i64, 0, ARM64II::MO_TLS | ARM64II::MO_G0 | ARM64II::MO_NC); - - SDValue DTPOff = - SDValue(DAG.getMachineNode(ARM64::MOVZXi, DL, PtrVT, HiVar, - DAG.getTargetConstant(16, MVT::i32)), - 0); - DTPOff = SDValue(DAG.getMachineNode(ARM64::MOVKXi, DL, PtrVT, DTPOff, LoVar, - DAG.getTargetConstant(0, MVT::i32)), - 0); - - TPOff = DAG.getNode(ISD::ADD, DL, PtrVT, TPOff, DTPOff); - } else if (Model == TLSModel::GeneralDynamic) { - // Accesses used in this sequence go via the TLS descriptor which lives in - // the GOT. Prepare an address we can use to handle this. - SDValue HiDesc = DAG.getTargetGlobalAddress( - GV, DL, PtrVT, 0, ARM64II::MO_TLS | ARM64II::MO_PAGE); - SDValue LoDesc = DAG.getTargetGlobalAddress( - GV, DL, PtrVT, 0, - ARM64II::MO_TLS | ARM64II::MO_PAGEOFF | ARM64II::MO_NC); - - // First argument to the descriptor call is the address of the descriptor - // itself. - SDValue DescAddr = DAG.getNode(ARM64ISD::ADRP, DL, PtrVT, HiDesc); - DescAddr = DAG.getNode(ARM64ISD::ADDlow, DL, PtrVT, DescAddr, LoDesc); - - // The call needs a relocation too for linker relaxation. It doesn't make - // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of - // the address. - SDValue SymAddr = - DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, ARM64II::MO_TLS); - - // Finally we can make a call to calculate the offset from tpidr_el0. - TPOff = LowerELFTLSDescCall(SymAddr, DescAddr, DL, DAG); - } else - llvm_unreachable("Unsupported ELF TLS access model"); - - return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff); -} - -SDValue ARM64TargetLowering::LowerGlobalTLSAddress(SDValue Op, - SelectionDAG &DAG) const { - if (Subtarget->isTargetDarwin()) - return LowerDarwinGlobalTLSAddress(Op, DAG); - else if (Subtarget->isTargetELF()) - return LowerELFGlobalTLSAddress(Op, DAG); - - llvm_unreachable("Unexpected platform trying to use TLS"); -} -SDValue ARM64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { - SDValue Chain = Op.getOperand(0); - ISD::CondCode CC = cast(Op.getOperand(1))->get(); - SDValue LHS = Op.getOperand(2); - SDValue RHS = Op.getOperand(3); - SDValue Dest = Op.getOperand(4); - SDLoc dl(Op); - - // Handle f128 first, since lowering it will result in comparing the return - // value of a libcall against zero, which is just what the rest of LowerBR_CC - // is expecting to deal with. - if (LHS.getValueType() == MVT::f128) { - softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); - - // If softenSetCCOperands returned a scalar, we need to compare the result - // against zero to select between true and false values. - if (!RHS.getNode()) { - RHS = DAG.getConstant(0, LHS.getValueType()); - CC = ISD::SETNE; - } - } - - // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch - // instruction. - unsigned Opc = LHS.getOpcode(); - if (LHS.getResNo() == 1 && isa(RHS) && - cast(RHS)->isOne() && - (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || - Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) { - assert((CC == ISD::SETEQ || CC == ISD::SETNE) && - "Unexpected condition code."); - // Only lower legal XALUO ops. - if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0))) - return SDValue(); - - // The actual operation with overflow check. 
- ARM64CC::CondCode OFCC; - SDValue Value, Overflow; - std::tie(Value, Overflow) = getARM64XALUOOp(OFCC, LHS.getValue(0), DAG); - - if (CC == ISD::SETNE) - OFCC = getInvertedCondCode(OFCC); - SDValue CCVal = DAG.getConstant(OFCC, MVT::i32); - - return DAG.getNode(ARM64ISD::BRCOND, SDLoc(LHS), MVT::Other, Chain, Dest, - CCVal, Overflow); - } - - if (LHS.getValueType().isInteger()) { - assert((LHS.getValueType() == RHS.getValueType()) && - (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64)); - - // If the RHS of the comparison is zero, we can potentially fold this - // to a specialized branch. - const ConstantSDNode *RHSC = dyn_cast(RHS); - if (RHSC && RHSC->getZExtValue() == 0) { - if (CC == ISD::SETEQ) { - // See if we can use a TBZ to fold in an AND as well. - // TBZ has a smaller branch displacement than CBZ. If the offset is - // out of bounds, a late MI-layer pass rewrites branches. - // 403.gcc is an example that hits this case. - if (LHS.getOpcode() == ISD::AND && - isa(LHS.getOperand(1)) && - isPowerOf2_64(LHS.getConstantOperandVal(1))) { - SDValue Test = LHS.getOperand(0); - uint64_t Mask = LHS.getConstantOperandVal(1); - - // TBZ only operates on i64's, but the ext should be free. - if (Test.getValueType() == MVT::i32) - Test = DAG.getAnyExtOrTrunc(Test, dl, MVT::i64); - - return DAG.getNode(ARM64ISD::TBZ, dl, MVT::Other, Chain, Test, - DAG.getConstant(Log2_64(Mask), MVT::i64), Dest); - } - - return DAG.getNode(ARM64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest); - } else if (CC == ISD::SETNE) { - // See if we can use a TBZ to fold in an AND as well. - // TBZ has a smaller branch displacement than CBZ. If the offset is - // out of bounds, a late MI-layer pass rewrites branches. - // 403.gcc is an example that hits this case. - if (LHS.getOpcode() == ISD::AND && - isa(LHS.getOperand(1)) && - isPowerOf2_64(LHS.getConstantOperandVal(1))) { - SDValue Test = LHS.getOperand(0); - uint64_t Mask = LHS.getConstantOperandVal(1); - - // TBNZ only operates on i64's, but the ext should be free. - if (Test.getValueType() == MVT::i32) - Test = DAG.getAnyExtOrTrunc(Test, dl, MVT::i64); - - return DAG.getNode(ARM64ISD::TBNZ, dl, MVT::Other, Chain, Test, - DAG.getConstant(Log2_64(Mask), MVT::i64), Dest); - } - - return DAG.getNode(ARM64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest); - } - } - - SDValue CCVal; - SDValue Cmp = getARM64Cmp(LHS, RHS, CC, CCVal, DAG, dl); - return DAG.getNode(ARM64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, - Cmp); - } - - assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); - - // Unfortunately, the mapping of LLVM FP CC's onto ARM64 CC's isn't totally - // clean. Some of them require two branches to implement. 
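The TBZ/TBNZ fold above fires when the branch condition is "(x & mask) == 0" (or != 0) with a single-bit mask, in which case the branch can test bit log2(mask) directly. A standalone sketch of that eligibility check (hypothetical names; isPowerOf2_64 and log2_64 are reimplemented here rather than taken from LLVM's MathExtras):

    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    bool isPowerOf2_64(uint64_t x) { return x && (x & (x - 1)) == 0; }
    unsigned log2_64(uint64_t x) { unsigned n = 0; while (x >>= 1) ++n; return n; }

    // "(x & mask) == 0" with a single-bit mask branches on bit log2(mask)
    // directly instead of materializing the AND and using CBZ/CBNZ.
    bool canUseTBZ(uint64_t mask, unsigned *bitOut) {
      if (!isPowerOf2_64(mask))
        return false;
      *bitOut = log2_64(mask);
      return true;
    }

    int main() {
      unsigned bit;
      assert(canUseTBZ(0x40, &bit) && bit == 6);
      assert(!canUseTBZ(0x41, &bit));
      std::printf("ok\n");
    }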
- SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); - ARM64CC::CondCode CC1, CC2; - changeFPCCToARM64CC(CC, CC1, CC2); - SDValue CC1Val = DAG.getConstant(CC1, MVT::i32); - SDValue BR1 = - DAG.getNode(ARM64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp); - if (CC2 != ARM64CC::AL) { - SDValue CC2Val = DAG.getConstant(CC2, MVT::i32); - return DAG.getNode(ARM64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val, - Cmp); - } - - return BR1; -} - -SDValue ARM64TargetLowering::LowerFCOPYSIGN(SDValue Op, - SelectionDAG &DAG) const { - EVT VT = Op.getValueType(); - SDLoc DL(Op); - - SDValue In1 = Op.getOperand(0); - SDValue In2 = Op.getOperand(1); - EVT SrcVT = In2.getValueType(); - if (SrcVT != VT) { - if (SrcVT == MVT::f32 && VT == MVT::f64) - In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2); - else if (SrcVT == MVT::f64 && VT == MVT::f32) - In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0)); - else - // FIXME: Src type is different, bail out for now. Can VT really be a - // vector type? - return SDValue(); - } - - EVT VecVT; - EVT EltVT; - SDValue EltMask, VecVal1, VecVal2; - if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) { - EltVT = MVT::i32; - VecVT = MVT::v4i32; - EltMask = DAG.getConstant(0x80000000ULL, EltVT); - - if (!VT.isVector()) { - VecVal1 = DAG.getTargetInsertSubreg(ARM64::ssub, DL, VecVT, - DAG.getUNDEF(VecVT), In1); - VecVal2 = DAG.getTargetInsertSubreg(ARM64::ssub, DL, VecVT, - DAG.getUNDEF(VecVT), In2); - } else { - VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1); - VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2); - } - } else if (VT == MVT::f64 || VT == MVT::v2f64) { - EltVT = MVT::i64; - VecVT = MVT::v2i64; - - // We want to materialize a mask with the the high bit set, but the AdvSIMD - // immediate moves cannot materialize that in a single instruction for - // 64-bit elements. Instead, materialize zero and then negate it. - EltMask = DAG.getConstant(0, EltVT); - - if (!VT.isVector()) { - VecVal1 = DAG.getTargetInsertSubreg(ARM64::dsub, DL, VecVT, - DAG.getUNDEF(VecVT), In1); - VecVal2 = DAG.getTargetInsertSubreg(ARM64::dsub, DL, VecVT, - DAG.getUNDEF(VecVT), In2); - } else { - VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1); - VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2); - } - } else { - llvm_unreachable("Invalid type for copysign!"); - } - - std::vector BuildVectorOps; - for (unsigned i = 0; i < VecVT.getVectorNumElements(); ++i) - BuildVectorOps.push_back(EltMask); - - SDValue BuildVec = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, BuildVectorOps); - - // If we couldn't materialize the mask above, then the mask vector will be - // the zero vector, and we need to negate it here. 
- if (VT == MVT::f64 || VT == MVT::v2f64) { - BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, BuildVec); - BuildVec = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, BuildVec); - BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, BuildVec); - } - - SDValue Sel = - DAG.getNode(ARM64ISD::BIT, DL, VecVT, VecVal1, VecVal2, BuildVec); - - if (VT == MVT::f32) - return DAG.getTargetExtractSubreg(ARM64::ssub, DL, VT, Sel); - else if (VT == MVT::f64) - return DAG.getTargetExtractSubreg(ARM64::dsub, DL, VT, Sel); - else - return DAG.getNode(ISD::BITCAST, DL, VT, Sel); -} - -SDValue ARM64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const { - if (DAG.getMachineFunction().getFunction()->getAttributes().hasAttribute( - AttributeSet::FunctionIndex, Attribute::NoImplicitFloat)) - return SDValue(); - - // While there is no integer popcount instruction, it can - // be more efficiently lowered to the following sequence that uses - // AdvSIMD registers/instructions as long as the copies to/from - // the AdvSIMD registers are cheap. - // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd - // CNT V0.8B, V0.8B // 8xbyte pop-counts - // ADDV B0, V0.8B // sum 8xbyte pop-counts - // UMOV X0, V0.B[0] // copy byte result back to integer reg - SDValue Val = Op.getOperand(0); - SDLoc DL(Op); - EVT VT = Op.getValueType(); - SDValue ZeroVec = DAG.getUNDEF(MVT::v8i8); - - SDValue VecVal; - if (VT == MVT::i32) { - VecVal = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Val); - VecVal = - DAG.getTargetInsertSubreg(ARM64::ssub, DL, MVT::v8i8, ZeroVec, VecVal); - } else { - VecVal = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val); - } - - SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, VecVal); - SDValue UaddLV = DAG.getNode( - ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32, - DAG.getConstant(Intrinsic::arm64_neon_uaddlv, MVT::i32), CtPop); - - if (VT == MVT::i64) - UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV); - return UaddLV; -} - -SDValue ARM64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { - - if (Op.getValueType().isVector()) - return LowerVSETCC(Op, DAG); - - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); - ISD::CondCode CC = cast(Op.getOperand(2))->get(); - SDLoc dl(Op); - - // We chose ZeroOrOneBooleanContents, so use zero and one. - EVT VT = Op.getValueType(); - SDValue TVal = DAG.getConstant(1, VT); - SDValue FVal = DAG.getConstant(0, VT); - - // Handle f128 first, since one possible outcome is a normal integer - // comparison which gets picked up by the next if statement. - if (LHS.getValueType() == MVT::f128) { - softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); - - // If softenSetCCOperands returned a scalar, use it. - if (!RHS.getNode()) { - assert(LHS.getValueType() == Op.getValueType() && - "Unexpected setcc expansion!"); - return LHS; - } - } - - if (LHS.getValueType().isInteger()) { - SDValue CCVal; - SDValue Cmp = - getARM64Cmp(LHS, RHS, ISD::getSetCCInverse(CC, true), CCVal, DAG, dl); - - // Note that we inverted the condition above, so we reverse the order of - // the true and false operands here. This will allow the setcc to be - // matched to a single CSINC instruction. - return DAG.getNode(ARM64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp); - } - - // Now we know we're dealing with FP values. - assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); - - // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead - // and do the comparison. 
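A scalar model of the CTPOP lowering above: CNT produces a per-byte population count and UADDLV sums the eight byte counts; the sketch below does the same with plain loops (hypothetical name):

    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    unsigned popcountViaBytes(uint64_t v) {
      unsigned total = 0;
      for (int i = 0; i < 8; ++i) {
        uint8_t byte = static_cast<uint8_t>(v >> (8 * i));
        unsigned perByte = 0;                       // what CNT leaves in each byte lane
        while (byte) { perByte += byte & 1; byte >>= 1; }
        total += perByte;                           // what the horizontal add produces
      }
      return total;
    }

    int main() {
      assert(popcountViaBytes(0xFF00FF00FF00FF00ULL) == 32);
      assert(popcountViaBytes(0) == 0);
      std::printf("ok\n");
    }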
- SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); - - ARM64CC::CondCode CC1, CC2; - changeFPCCToARM64CC(CC, CC1, CC2); - if (CC2 == ARM64CC::AL) { - changeFPCCToARM64CC(ISD::getSetCCInverse(CC, false), CC1, CC2); - SDValue CC1Val = DAG.getConstant(CC1, MVT::i32); - - // Note that we inverted the condition above, so we reverse the order of - // the true and false operands here. This will allow the setcc to be - // matched to a single CSINC instruction. - return DAG.getNode(ARM64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp); - } else { - // Unfortunately, the mapping of LLVM FP CC's onto ARM64 CC's isn't totally - // clean. Some of them require two CSELs to implement. As is in this case, - // we emit the first CSEL and then emit a second using the output of the - // first as the RHS. We're effectively OR'ing the two CC's together. - - // FIXME: It would be nice if we could match the two CSELs to two CSINCs. - SDValue CC1Val = DAG.getConstant(CC1, MVT::i32); - SDValue CS1 = DAG.getNode(ARM64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp); - - SDValue CC2Val = DAG.getConstant(CC2, MVT::i32); - return DAG.getNode(ARM64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp); - } -} - -/// A SELECT_CC operation is really some kind of max or min if both values being -/// compared are, in some sense, equal to the results in either case. However, -/// it is permissible to compare f32 values and produce directly extended f64 -/// values. -/// -/// Extending the comparison operands would also be allowed, but is less likely -/// to happen in practice since their use is right here. Note that truncate -/// operations would *not* be semantically equivalent. -static bool selectCCOpsAreFMaxCompatible(SDValue Cmp, SDValue Result) { - if (Cmp == Result) - return true; - - ConstantFPSDNode *CCmp = dyn_cast(Cmp); - ConstantFPSDNode *CResult = dyn_cast(Result); - if (CCmp && CResult && Cmp.getValueType() == MVT::f32 && - Result.getValueType() == MVT::f64) { - bool Lossy; - APFloat CmpVal = CCmp->getValueAPF(); - CmpVal.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, &Lossy); - return CResult->getValueAPF().bitwiseIsEqual(CmpVal); - } - - return Result->getOpcode() == ISD::FP_EXTEND && Result->getOperand(0) == Cmp; -} - -SDValue ARM64TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { - SDValue CC = Op->getOperand(0); - SDValue TVal = Op->getOperand(1); - SDValue FVal = Op->getOperand(2); - SDLoc DL(Op); - - unsigned Opc = CC.getOpcode(); - // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select - // instruction. - if (CC.getResNo() == 1 && - (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || - Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) { - // Only lower legal XALUO ops. 
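The "single CSINC" remark above relies on CSINC's semantics (cond ? rn : rm + 1): with the condition inverted and the true/false operands swapped, "cc ? 1 : 0" needs no materialized constant. A standalone sketch of that equivalence (hypothetical names):

    #include <cassert>
    #include <cstdio>

    // Hardware semantics of CSINC: cond ? rn : rm + 1.
    unsigned csinc(unsigned rn, unsigned rm, bool cond) { return cond ? rn : rm + 1; }

    // Build "inverted-cc ? 0 : 1", which the selector can match as
    // "CSINC wd, wzr, wzr, inverted-cc" instead of CSEL plus a constant 1.
    unsigned setccAsCSINC(bool cc) {
      bool inverted = !cc;
      return csinc(/*rn=wzr*/ 0, /*rm=wzr*/ 0, inverted);
    }

    int main() {
      assert(setccAsCSINC(true) == 1 && setccAsCSINC(false) == 0);
      std::printf("ok\n");
    }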
- if (!DAG.getTargetLoweringInfo().isTypeLegal(CC->getValueType(0))) - return SDValue(); - - ARM64CC::CondCode OFCC; - SDValue Value, Overflow; - std::tie(Value, Overflow) = getARM64XALUOOp(OFCC, CC.getValue(0), DAG); - SDValue CCVal = DAG.getConstant(OFCC, MVT::i32); - - return DAG.getNode(ARM64ISD::CSEL, DL, Op.getValueType(), TVal, FVal, CCVal, - Overflow); - } - - if (CC.getOpcode() == ISD::SETCC) - return DAG.getSelectCC(DL, CC.getOperand(0), CC.getOperand(1), TVal, FVal, - cast(CC.getOperand(2))->get()); - else - return DAG.getSelectCC(DL, CC, DAG.getConstant(0, CC.getValueType()), TVal, - FVal, ISD::SETNE); -} - -SDValue ARM64TargetLowering::LowerSELECT_CC(SDValue Op, - SelectionDAG &DAG) const { - ISD::CondCode CC = cast(Op.getOperand(4))->get(); - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); - SDValue TVal = Op.getOperand(2); - SDValue FVal = Op.getOperand(3); - SDLoc dl(Op); - - // Handle f128 first, because it will result in a comparison of some RTLIB - // call result against zero. - if (LHS.getValueType() == MVT::f128) { - softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); - - // If softenSetCCOperands returned a scalar, we need to compare the result - // against zero to select between true and false values. - if (!RHS.getNode()) { - RHS = DAG.getConstant(0, LHS.getValueType()); - CC = ISD::SETNE; - } - } - - // Handle integers first. - if (LHS.getValueType().isInteger()) { - assert((LHS.getValueType() == RHS.getValueType()) && - (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64)); - - unsigned Opcode = ARM64ISD::CSEL; - - // If both the TVal and the FVal are constants, see if we can swap them in - // order to for a CSINV or CSINC out of them. - ConstantSDNode *CFVal = dyn_cast(FVal); - ConstantSDNode *CTVal = dyn_cast(TVal); - - if (CTVal && CFVal && CTVal->isAllOnesValue() && CFVal->isNullValue()) { - std::swap(TVal, FVal); - std::swap(CTVal, CFVal); - CC = ISD::getSetCCInverse(CC, true); - } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isNullValue()) { - std::swap(TVal, FVal); - std::swap(CTVal, CFVal); - CC = ISD::getSetCCInverse(CC, true); - } else if (TVal.getOpcode() == ISD::XOR) { - // If TVal is a NOT we want to swap TVal and FVal so that we can match - // with a CSINV rather than a CSEL. - ConstantSDNode *CVal = dyn_cast(TVal.getOperand(1)); - - if (CVal && CVal->isAllOnesValue()) { - std::swap(TVal, FVal); - std::swap(CTVal, CFVal); - CC = ISD::getSetCCInverse(CC, true); - } - } else if (TVal.getOpcode() == ISD::SUB) { - // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so - // that we can match with a CSNEG rather than a CSEL. - ConstantSDNode *CVal = dyn_cast(TVal.getOperand(0)); - - if (CVal && CVal->isNullValue()) { - std::swap(TVal, FVal); - std::swap(CTVal, CFVal); - CC = ISD::getSetCCInverse(CC, true); - } - } else if (CTVal && CFVal) { - const int64_t TrueVal = CTVal->getSExtValue(); - const int64_t FalseVal = CFVal->getSExtValue(); - bool Swap = false; - - // If both TVal and FVal are constants, see if FVal is the - // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC - // instead of a CSEL in that case. - if (TrueVal == ~FalseVal) { - Opcode = ARM64ISD::CSINV; - } else if (TrueVal == -FalseVal) { - Opcode = ARM64ISD::CSNEG; - } else if (TVal.getValueType() == MVT::i32) { - // If our operands are only 32-bit wide, make sure we use 32-bit - // arithmetic for the check whether we can use CSINC. 
This ensures that - // the addition in the check will wrap around properly in case there is - // an overflow (which would not be the case if we do the check with - // 64-bit arithmetic). - const uint32_t TrueVal32 = CTVal->getZExtValue(); - const uint32_t FalseVal32 = CFVal->getZExtValue(); - - if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) { - Opcode = ARM64ISD::CSINC; - - if (TrueVal32 > FalseVal32) { - Swap = true; - } - } - // 64-bit check whether we can use CSINC. - } else if ((TrueVal == FalseVal + 1) || (TrueVal + 1 == FalseVal)) { - Opcode = ARM64ISD::CSINC; - - if (TrueVal > FalseVal) { - Swap = true; - } - } - - // Swap TVal and FVal if necessary. - if (Swap) { - std::swap(TVal, FVal); - std::swap(CTVal, CFVal); - CC = ISD::getSetCCInverse(CC, true); - } - - if (Opcode != ARM64ISD::CSEL) { - // Drop FVal since we can get its value by simply inverting/negating - // TVal. - FVal = TVal; - } - } - - SDValue CCVal; - SDValue Cmp = getARM64Cmp(LHS, RHS, CC, CCVal, DAG, dl); - - EVT VT = Op.getValueType(); - return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp); - } - - // Now we know we're dealing with FP values. - assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); - assert(LHS.getValueType() == RHS.getValueType()); - EVT VT = Op.getValueType(); - - // Try to match this select into a max/min operation, which have dedicated - // opcode in the instruction set. - // FIXME: This is not correct in the presence of NaNs, so we only enable this - // in no-NaNs mode. - if (getTargetMachine().Options.NoNaNsFPMath) { - SDValue MinMaxLHS = TVal, MinMaxRHS = FVal; - if (selectCCOpsAreFMaxCompatible(LHS, MinMaxRHS) && - selectCCOpsAreFMaxCompatible(RHS, MinMaxLHS)) { - CC = ISD::getSetCCSwappedOperands(CC); - std::swap(MinMaxLHS, MinMaxRHS); - } - - if (selectCCOpsAreFMaxCompatible(LHS, MinMaxLHS) && - selectCCOpsAreFMaxCompatible(RHS, MinMaxRHS)) { - switch (CC) { - default: - break; - case ISD::SETGT: - case ISD::SETGE: - case ISD::SETUGT: - case ISD::SETUGE: - case ISD::SETOGT: - case ISD::SETOGE: - return DAG.getNode(ARM64ISD::FMAX, dl, VT, MinMaxLHS, MinMaxRHS); - break; - case ISD::SETLT: - case ISD::SETLE: - case ISD::SETULT: - case ISD::SETULE: - case ISD::SETOLT: - case ISD::SETOLE: - return DAG.getNode(ARM64ISD::FMIN, dl, VT, MinMaxLHS, MinMaxRHS); - break; - } - } - } - - // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead - // and do the comparison. - SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); - - // Unfortunately, the mapping of LLVM FP CC's onto ARM64 CC's isn't totally - // clean. Some of them require two CSELs to implement. - ARM64CC::CondCode CC1, CC2; - changeFPCCToARM64CC(CC, CC1, CC2); - SDValue CC1Val = DAG.getConstant(CC1, MVT::i32); - SDValue CS1 = DAG.getNode(ARM64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp); - - // If we need a second CSEL, emit it, using the output of the first as the - // RHS. We're effectively OR'ing the two CC's together. - if (CC2 != ARM64CC::AL) { - SDValue CC2Val = DAG.getConstant(CC2, MVT::i32); - return DAG.getNode(ARM64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp); - } - - // Otherwise, return the output of the first CSEL. - return CS1; -} - -SDValue ARM64TargetLowering::LowerJumpTable(SDValue Op, - SelectionDAG &DAG) const { - // Jump table entries as PC relative offsets. No additional tweaking - // is necessary here. Just get the address of the jump table. 
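The integer SELECT_CC path above picks CSINV, CSNEG or CSINC when the false constant is the inverse, negation or increment of the true constant, and deliberately performs the increment check in 32-bit arithmetic for i32 so the +1 wraps instead of being evaluated in 64 bits. A standalone sketch of just that classification (hypothetical names, no LLVM types):

#include <cassert>
#include <cstdint>

enum class CondSelKind { CSEL, CSINV, CSNEG, CSINC };

// Classify a (TrueVal, FalseVal) constant pair the way the code above does.
// Is32Bit selects 32-bit wrap-around semantics for the increment check.
static CondSelKind classifyCondSelect(int64_t TrueVal, int64_t FalseVal,
                                      bool Is32Bit) {
  if (TrueVal == ~FalseVal)
    return CondSelKind::CSINV;
  if (TrueVal == -FalseVal)
    return CondSelKind::CSNEG;
  if (Is32Bit) {
    uint32_t T = static_cast<uint32_t>(TrueVal);
    uint32_t F = static_cast<uint32_t>(FalseVal);
    if (T == F + 1 || T + 1 == F)
      return CondSelKind::CSINC;
  } else if (TrueVal == FalseVal + 1 || TrueVal + 1 == FalseVal) {
    return CondSelKind::CSINC;
  }
  return CondSelKind::CSEL;
}

int main() {
  assert(classifyCondSelect(5, ~5, false) == CondSelKind::CSINV);
  assert(classifyCondSelect(5, -5, false) == CondSelKind::CSNEG);
  assert(classifyCondSelect(7, 6, false) == CondSelKind::CSINC);
  assert(classifyCondSelect(7, 3, false) == CondSelKind::CSEL);
}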
-  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
-  EVT PtrVT = getPointerTy();
-  SDLoc DL(Op);
-
-  if (getTargetMachine().getCodeModel() == CodeModel::Large &&
-      !Subtarget->isTargetMachO()) {
-    const unsigned char MO_NC = ARM64II::MO_NC;
-    return DAG.getNode(
-        ARM64ISD::WrapperLarge, DL, PtrVT,
-        DAG.getTargetJumpTable(JT->getIndex(), PtrVT, ARM64II::MO_G3),
-        DAG.getTargetJumpTable(JT->getIndex(), PtrVT, ARM64II::MO_G2 | MO_NC),
-        DAG.getTargetJumpTable(JT->getIndex(), PtrVT, ARM64II::MO_G1 | MO_NC),
-        DAG.getTargetJumpTable(JT->getIndex(), PtrVT, ARM64II::MO_G0 | MO_NC));
-  }
-
-  SDValue Hi = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, ARM64II::MO_PAGE);
-  SDValue Lo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT,
-                                      ARM64II::MO_PAGEOFF | ARM64II::MO_NC);
-  SDValue ADRP = DAG.getNode(ARM64ISD::ADRP, DL, PtrVT, Hi);
-  return DAG.getNode(ARM64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
-}
-
-SDValue ARM64TargetLowering::LowerConstantPool(SDValue Op,
-                                               SelectionDAG &DAG) const {
-  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
-  EVT PtrVT = getPointerTy();
-  SDLoc DL(Op);
-
-  if (getTargetMachine().getCodeModel() == CodeModel::Large) {
-    // Use the GOT for the large code model on iOS.
-    if (Subtarget->isTargetMachO()) {
-      SDValue GotAddr = DAG.getTargetConstantPool(
-          CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(),
-          ARM64II::MO_GOT);
-      return DAG.getNode(ARM64ISD::LOADgot, DL, PtrVT, GotAddr);
-    }
-
-    const unsigned char MO_NC = ARM64II::MO_NC;
-    return DAG.getNode(
-        ARM64ISD::WrapperLarge, DL, PtrVT,
-        DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
-                                  CP->getOffset(), ARM64II::MO_G3),
-        DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
-                                  CP->getOffset(), ARM64II::MO_G2 | MO_NC),
-        DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
-                                  CP->getOffset(), ARM64II::MO_G1 | MO_NC),
-        DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
-                                  CP->getOffset(), ARM64II::MO_G0 | MO_NC));
-  } else {
-    // Use ADRP/ADD or ADRP/LDR for everything else: the small memory model on
-    // ELF, the only valid one on Darwin.
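Both addressing paths above split a symbol's address: the large-code-model path into four 16-bit chunks (MO_G3..MO_G0, consumed by a MOVZ/MOVK sequence), the default path into a 4KB page plus a 12-bit page offset (ADRP + ADD). A standalone sketch of the two splits on a plain 64-bit value (illustrative only; in reality the pieces are relocations resolved by the linker, and ADRP is PC-relative):

#include <cstdint>
#include <cstdio>

// Split an address into the 16-bit chunks a MOVZ/MOVK (G0..G3) sequence uses.
static void splitMovzMovk(uint64_t Addr, uint16_t Chunks[4]) {
  for (int i = 0; i < 4; ++i)
    Chunks[i] = static_cast<uint16_t>(Addr >> (16 * i)); // G0 is bits 0-15, ...
}

// Split an address into the ADRP page and the :lo12: page offset.
static void splitAdrpAdd(uint64_t Addr, uint64_t &Page, uint64_t &PageOff) {
  Page = Addr & ~UINT64_C(0xFFF);
  PageOff = Addr & 0xFFF;
}

int main() {
  uint64_t Addr = 0x0000123456789ABCULL;
  uint16_t C[4];
  splitMovzMovk(Addr, C);
  std::printf("G0=%04x G1=%04x G2=%04x G3=%04x\n", C[0], C[1], C[2], C[3]);
  uint64_t Page, Off;
  splitAdrpAdd(Addr, Page, Off);
  std::printf("page=%#llx pageoff=%#llx\n",
              (unsigned long long)Page, (unsigned long long)Off);
}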
- SDValue Hi = - DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), - CP->getOffset(), ARM64II::MO_PAGE); - SDValue Lo = DAG.getTargetConstantPool( - CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), - ARM64II::MO_PAGEOFF | ARM64II::MO_NC); - - SDValue ADRP = DAG.getNode(ARM64ISD::ADRP, DL, PtrVT, Hi); - return DAG.getNode(ARM64ISD::ADDlow, DL, PtrVT, ADRP, Lo); - } -} - -SDValue ARM64TargetLowering::LowerBlockAddress(SDValue Op, - SelectionDAG &DAG) const { - const BlockAddress *BA = cast(Op)->getBlockAddress(); - EVT PtrVT = getPointerTy(); - SDLoc DL(Op); - if (getTargetMachine().getCodeModel() == CodeModel::Large && - !Subtarget->isTargetMachO()) { - const unsigned char MO_NC = ARM64II::MO_NC; - return DAG.getNode( - ARM64ISD::WrapperLarge, DL, PtrVT, - DAG.getTargetBlockAddress(BA, PtrVT, 0, ARM64II::MO_G3), - DAG.getTargetBlockAddress(BA, PtrVT, 0, ARM64II::MO_G2 | MO_NC), - DAG.getTargetBlockAddress(BA, PtrVT, 0, ARM64II::MO_G1 | MO_NC), - DAG.getTargetBlockAddress(BA, PtrVT, 0, ARM64II::MO_G0 | MO_NC)); - } else { - SDValue Hi = DAG.getTargetBlockAddress(BA, PtrVT, 0, ARM64II::MO_PAGE); - SDValue Lo = DAG.getTargetBlockAddress(BA, PtrVT, 0, ARM64II::MO_PAGEOFF | - ARM64II::MO_NC); - SDValue ADRP = DAG.getNode(ARM64ISD::ADRP, DL, PtrVT, Hi); - return DAG.getNode(ARM64ISD::ADDlow, DL, PtrVT, ADRP, Lo); - } -} - -SDValue ARM64TargetLowering::LowerDarwin_VASTART(SDValue Op, - SelectionDAG &DAG) const { - ARM64FunctionInfo *FuncInfo = - DAG.getMachineFunction().getInfo(); - - SDLoc DL(Op); - SDValue FR = - DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), getPointerTy()); - const Value *SV = cast(Op.getOperand(2))->getValue(); - return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), - MachinePointerInfo(SV), false, false, 0); -} - -SDValue ARM64TargetLowering::LowerAAPCS_VASTART(SDValue Op, - SelectionDAG &DAG) const { - // The layout of the va_list struct is specified in the AArch64 Procedure Call - // Standard, section B.3. 
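For reference, the structure that LowerAAPCS_VASTART fills in just below is the AAPCS64 va_list; the field names follow the ABI document and the offsets match the stores emitted below (0, 8, 16, 24, 28), with the two offset fields initialised to the negated sizes of the saved register areas. A C-level sketch:

#include <cstddef>

// AAPCS64 (section B.3) va_list layout targeted by LowerAAPCS_VASTART below.
struct AAPCS64VaList {
  void *__stack;   // offset 0:  next stacked argument
  void *__gr_top;  // offset 8:  byte past the saved general-purpose registers
  void *__vr_top;  // offset 16: byte past the saved FP/SIMD registers
  int __gr_offs;   // offset 24: starts at -(size of saved GPR area)
  int __vr_offs;   // offset 28: starts at -(size of saved FP/SIMD area)
};

static_assert(sizeof(AAPCS64VaList) == 32, "matches the AAPCS VaListSize used later");
static_assert(offsetof(AAPCS64VaList, __vr_offs) == 28, "matches the last store offset");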
- MachineFunction &MF = DAG.getMachineFunction(); - ARM64FunctionInfo *FuncInfo = MF.getInfo(); - SDLoc DL(Op); - - SDValue Chain = Op.getOperand(0); - SDValue VAList = Op.getOperand(1); - const Value *SV = cast(Op.getOperand(2))->getValue(); - SmallVector MemOps; - - // void *__stack at offset 0 - SDValue Stack = - DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), getPointerTy()); - MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList, - MachinePointerInfo(SV), false, false, 8)); - - // void *__gr_top at offset 8 - int GPRSize = FuncInfo->getVarArgsGPRSize(); - if (GPRSize > 0) { - SDValue GRTop, GRTopAddr; - - GRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, - DAG.getConstant(8, getPointerTy())); - - GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), getPointerTy()); - GRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), GRTop, - DAG.getConstant(GPRSize, getPointerTy())); - - MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr, - MachinePointerInfo(SV, 8), false, false, 8)); - } - - // void *__vr_top at offset 16 - int FPRSize = FuncInfo->getVarArgsFPRSize(); - if (FPRSize > 0) { - SDValue VRTop, VRTopAddr; - VRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, - DAG.getConstant(16, getPointerTy())); - - VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), getPointerTy()); - VRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), VRTop, - DAG.getConstant(FPRSize, getPointerTy())); - - MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr, - MachinePointerInfo(SV, 16), false, false, 8)); - } - - // int __gr_offs at offset 24 - SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, - DAG.getConstant(24, getPointerTy())); - MemOps.push_back(DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, MVT::i32), - GROffsAddr, MachinePointerInfo(SV, 24), false, - false, 4)); - - // int __vr_offs at offset 28 - SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, - DAG.getConstant(28, getPointerTy())); - MemOps.push_back(DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, MVT::i32), - VROffsAddr, MachinePointerInfo(SV, 28), false, - false, 4)); - - return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); -} - -SDValue ARM64TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { - return Subtarget->isTargetDarwin() ? LowerDarwin_VASTART(Op, DAG) - : LowerAAPCS_VASTART(Op, DAG); -} - -SDValue ARM64TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const { - // AAPCS has three pointers and two ints (= 32 bytes), Darwin has single - // pointer. - unsigned VaListSize = Subtarget->isTargetDarwin() ? 
8 : 32; - const Value *DestSV = cast(Op.getOperand(3))->getValue(); - const Value *SrcSV = cast(Op.getOperand(4))->getValue(); - - return DAG.getMemcpy(Op.getOperand(0), SDLoc(Op), Op.getOperand(1), - Op.getOperand(2), DAG.getConstant(VaListSize, MVT::i32), - 8, false, false, MachinePointerInfo(DestSV), - MachinePointerInfo(SrcSV)); -} - -SDValue ARM64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { - assert(Subtarget->isTargetDarwin() && - "automatic va_arg instruction only works on Darwin"); - - const Value *V = cast(Op.getOperand(2))->getValue(); - EVT VT = Op.getValueType(); - SDLoc DL(Op); - SDValue Chain = Op.getOperand(0); - SDValue Addr = Op.getOperand(1); - unsigned Align = Op.getConstantOperandVal(3); - - SDValue VAList = DAG.getLoad(getPointerTy(), DL, Chain, Addr, - MachinePointerInfo(V), false, false, false, 0); - Chain = VAList.getValue(1); - - if (Align > 8) { - assert(((Align & (Align - 1)) == 0) && "Expected Align to be a power of 2"); - VAList = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, - DAG.getConstant(Align - 1, getPointerTy())); - VAList = DAG.getNode(ISD::AND, DL, getPointerTy(), VAList, - DAG.getConstant(-(int64_t)Align, getPointerTy())); - } - - Type *ArgTy = VT.getTypeForEVT(*DAG.getContext()); - uint64_t ArgSize = getDataLayout()->getTypeAllocSize(ArgTy); - - // Scalar integer and FP values smaller than 64 bits are implicitly extended - // up to 64 bits. At the very least, we have to increase the striding of the - // vaargs list to match this, and for FP values we need to introduce - // FP_ROUND nodes as well. - if (VT.isInteger() && !VT.isVector()) - ArgSize = 8; - bool NeedFPTrunc = false; - if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) { - ArgSize = 8; - NeedFPTrunc = true; - } - - // Increment the pointer, VAList, to the next vaarg - SDValue VANext = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, - DAG.getConstant(ArgSize, getPointerTy())); - // Store the incremented VAList to the legalized pointer - SDValue APStore = DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V), - false, false, 0); - - // Load the actual argument out of the pointer VAList - if (NeedFPTrunc) { - // Load the value as an f64. - SDValue WideFP = DAG.getLoad(MVT::f64, DL, APStore, VAList, - MachinePointerInfo(), false, false, false, 0); - // Round the value down to an f32. - SDValue NarrowFP = DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0), - DAG.getIntPtrConstant(1)); - SDValue Ops[] = { NarrowFP, WideFP.getValue(1) }; - // Merge the rounded value with the chain output of the load. - return DAG.getMergeValues(Ops, DL); - } - - return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo(), false, - false, false, 0); -} - -SDValue ARM64TargetLowering::LowerFRAMEADDR(SDValue Op, - SelectionDAG &DAG) const { - MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); - MFI->setFrameAddressIsTaken(true); - - EVT VT = Op.getValueType(); - SDLoc DL(Op); - unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); - SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), DL, ARM64::FP, VT); - while (Depth--) - FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr, - MachinePointerInfo(), false, false, false, 0); - return FrameAddr; -} - -// FIXME? Maybe this could be a TableGen attribute on some registers and -// this table could be generated automatically from RegInfo. 
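The Darwin va_arg lowering above rounds the va_list pointer up when the requested alignment exceeds 8, using the usual add-then-mask idiom, which is only valid for power-of-two alignments (hence the assert). A standalone sketch:

#include <cassert>
#include <cstdint>

// Round Ptr up to Align the same way the va_arg lowering above does:
// add Align-1, then clear the low bits. Align must be a power of two.
static uint64_t alignVaArgPtr(uint64_t Ptr, uint64_t Align) {
  assert((Align & (Align - 1)) == 0 && "Expected Align to be a power of 2");
  return (Ptr + Align - 1) & ~(Align - 1);
}

int main() {
  assert(alignVaArgPtr(0x1004, 16) == 0x1010);
  assert(alignVaArgPtr(0x1010, 16) == 0x1010); // already aligned: unchanged
}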
-unsigned ARM64TargetLowering::getRegisterByName(const char* RegName, - EVT VT) const { - unsigned Reg = StringSwitch(RegName) - .Case("sp", ARM64::SP) - .Default(0); - if (Reg) - return Reg; - report_fatal_error("Invalid register name global variable"); -} - -SDValue ARM64TargetLowering::LowerRETURNADDR(SDValue Op, - SelectionDAG &DAG) const { - MachineFunction &MF = DAG.getMachineFunction(); - MachineFrameInfo *MFI = MF.getFrameInfo(); - MFI->setReturnAddressIsTaken(true); - - EVT VT = Op.getValueType(); - SDLoc DL(Op); - unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); - if (Depth) { - SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); - SDValue Offset = DAG.getConstant(8, getPointerTy()); - return DAG.getLoad(VT, DL, DAG.getEntryNode(), - DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), - MachinePointerInfo(), false, false, false, 0); - } - - // Return LR, which contains the return address. Mark it an implicit live-in. - unsigned Reg = MF.addLiveIn(ARM64::LR, &ARM64::GPR64RegClass); - return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT); -} - -/// LowerShiftRightParts - Lower SRA_PARTS, which returns two -/// i64 values and take a 2 x i64 value to shift plus a shift amount. -SDValue ARM64TargetLowering::LowerShiftRightParts(SDValue Op, - SelectionDAG &DAG) const { - assert(Op.getNumOperands() == 3 && "Not a double-shift!"); - EVT VT = Op.getValueType(); - unsigned VTBits = VT.getSizeInBits(); - SDLoc dl(Op); - SDValue ShOpLo = Op.getOperand(0); - SDValue ShOpHi = Op.getOperand(1); - SDValue ShAmt = Op.getOperand(2); - SDValue ARMcc; - unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL; - - assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS); - - SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, - DAG.getConstant(VTBits, MVT::i64), ShAmt); - SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); - SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt, - DAG.getConstant(VTBits, MVT::i64)); - SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); - - SDValue Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, MVT::i64), - ISD::SETGE, dl, DAG); - SDValue CCVal = DAG.getConstant(ARM64CC::GE, MVT::i32); - - SDValue FalseValLo = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); - SDValue TrueValLo = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); - SDValue Lo = - DAG.getNode(ARM64ISD::CSEL, dl, VT, TrueValLo, FalseValLo, CCVal, Cmp); - - // ARM64 shifts larger than the register width are wrapped rather than - // clamped, so we can't just emit "hi >> x". - SDValue FalseValHi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); - SDValue TrueValHi = Opc == ISD::SRA - ? DAG.getNode(Opc, dl, VT, ShOpHi, - DAG.getConstant(VTBits - 1, MVT::i64)) - : DAG.getConstant(0, VT); - SDValue Hi = - DAG.getNode(ARM64ISD::CSEL, dl, VT, TrueValHi, FalseValHi, CCVal, Cmp); - - SDValue Ops[2] = { Lo, Hi }; - return DAG.getMergeValues(Ops, dl); -} - -/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two -/// i64 values and take a 2 x i64 value to shift plus a shift amount. 
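As the shift-right-parts lowering above notes, AArch64 shifts by amounts at or beyond the register width wrap rather than clamp, so the lowering selects with a CSEL between "hi >> (amt - 64)" and "(lo >> amt) | (hi << (64 - amt))" depending on whether amt >= 64. A standalone sketch of the same selection for a logical 128-bit right shift (plain C++, hypothetical helper, shift amount restricted to 1..127 just as the DAG expansion assumes a sensible amount):

#include <cassert>
#include <cstdint>
#include <utility>

// 128-bit logical shift right built from 64-bit halves; returns {lo, hi}.
static std::pair<uint64_t, uint64_t> lshr128(uint64_t Lo, uint64_t Hi,
                                             unsigned Amt) {
  assert(Amt > 0 && Amt < 128);
  if (Amt >= 64)                                   // the "ExtraShAmt >= 0" arm
    return {Hi >> (Amt - 64), 0};
  return {(Lo >> Amt) | (Hi << (64 - Amt)),        // combine both halves
          Hi >> Amt};
}

int main() {
  auto R = lshr128(0x0, 0x1, 4); // 2^64 >> 4 == 2^60
  assert(R.first == (UINT64_C(1) << 60) && R.second == 0);
}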
-SDValue ARM64TargetLowering::LowerShiftLeftParts(SDValue Op, - SelectionDAG &DAG) const { - assert(Op.getNumOperands() == 3 && "Not a double-shift!"); - EVT VT = Op.getValueType(); - unsigned VTBits = VT.getSizeInBits(); - SDLoc dl(Op); - SDValue ShOpLo = Op.getOperand(0); - SDValue ShOpHi = Op.getOperand(1); - SDValue ShAmt = Op.getOperand(2); - SDValue ARMcc; - - assert(Op.getOpcode() == ISD::SHL_PARTS); - SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, - DAG.getConstant(VTBits, MVT::i64), ShAmt); - SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt); - SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt, - DAG.getConstant(VTBits, MVT::i64)); - SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt); - SDValue Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt); - - SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); - - SDValue Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, MVT::i64), - ISD::SETGE, dl, DAG); - SDValue CCVal = DAG.getConstant(ARM64CC::GE, MVT::i32); - SDValue Hi = DAG.getNode(ARM64ISD::CSEL, dl, VT, Tmp3, FalseVal, CCVal, Cmp); - - // ARM64 shifts of larger than register sizes are wrapped rather than clamped, - // so we can't just emit "lo << a" if a is too big. - SDValue TrueValLo = DAG.getConstant(0, VT); - SDValue FalseValLo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); - SDValue Lo = - DAG.getNode(ARM64ISD::CSEL, dl, VT, TrueValLo, FalseValLo, CCVal, Cmp); - - SDValue Ops[2] = { Lo, Hi }; - return DAG.getMergeValues(Ops, dl); -} - -bool -ARM64TargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { - // The ARM64 target doesn't support folding offsets into global addresses. - return false; -} - -bool ARM64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { - // We can materialize #0.0 as fmov $Rd, XZR for 64-bit and 32-bit cases. - // FIXME: We should be able to handle f128 as well with a clever lowering. - if (Imm.isPosZero() && (VT == MVT::f64 || VT == MVT::f32)) - return true; - - if (VT == MVT::f64) - return ARM64_AM::getFP64Imm(Imm) != -1; - else if (VT == MVT::f32) - return ARM64_AM::getFP32Imm(Imm) != -1; - return false; -} - -//===----------------------------------------------------------------------===// -// ARM64 Optimization Hooks -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -// ARM64 Inline Assembly Support -//===----------------------------------------------------------------------===// - -// Table of Constraints -// TODO: This is the current set of constraints supported by ARM for the -// compiler, not all of them may make sense, e.g. S may be difficult to support. 
-// -// r - A general register -// w - An FP/SIMD register of some size in the range v0-v31 -// x - An FP/SIMD register of some size in the range v0-v15 -// I - Constant that can be used with an ADD instruction -// J - Constant that can be used with a SUB instruction -// K - Constant that can be used with a 32-bit logical instruction -// L - Constant that can be used with a 64-bit logical instruction -// M - Constant that can be used as a 32-bit MOV immediate -// N - Constant that can be used as a 64-bit MOV immediate -// Q - A memory reference with base register and no offset -// S - A symbolic address -// Y - Floating point constant zero -// Z - Integer constant zero -// -// Note that general register operands will be output using their 64-bit x -// register name, whatever the size of the variable, unless the asm operand -// is prefixed by the %w modifier. Floating-point and SIMD register operands -// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or -// %q modifier. - -/// getConstraintType - Given a constraint letter, return the type of -/// constraint it is for this target. -ARM64TargetLowering::ConstraintType -ARM64TargetLowering::getConstraintType(const std::string &Constraint) const { - if (Constraint.size() == 1) { - switch (Constraint[0]) { - default: - break; - case 'z': - return C_Other; - case 'x': - case 'w': - return C_RegisterClass; - // An address with a single base register. Due to the way we - // currently handle addresses it is the same as 'r'. - case 'Q': - return C_Memory; - } - } - return TargetLowering::getConstraintType(Constraint); -} - -/// Examine constraint type and operand type and determine a weight value. -/// This object must already have been set up with the operand type -/// and the current alternative constraint selected. -TargetLowering::ConstraintWeight -ARM64TargetLowering::getSingleConstraintMatchWeight( - AsmOperandInfo &info, const char *constraint) const { - ConstraintWeight weight = CW_Invalid; - Value *CallOperandVal = info.CallOperandVal; - // If we don't have a value, we can't do a match, - // but allow it at the lowest weight. - if (!CallOperandVal) - return CW_Default; - Type *type = CallOperandVal->getType(); - // Look at the constraint type. - switch (*constraint) { - default: - weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); - break; - case 'x': - case 'w': - if (type->isFloatingPointTy() || type->isVectorTy()) - weight = CW_Register; - break; - case 'z': - weight = CW_Constant; - break; - } - return weight; -} - -std::pair -ARM64TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, - MVT VT) const { - if (Constraint.size() == 1) { - switch (Constraint[0]) { - case 'r': - if (VT.getSizeInBits() == 64) - return std::make_pair(0U, &ARM64::GPR64commonRegClass); - return std::make_pair(0U, &ARM64::GPR32commonRegClass); - case 'w': - if (VT == MVT::f32) - return std::make_pair(0U, &ARM64::FPR32RegClass); - if (VT.getSizeInBits() == 64) - return std::make_pair(0U, &ARM64::FPR64RegClass); - if (VT.getSizeInBits() == 128) - return std::make_pair(0U, &ARM64::FPR128RegClass); - break; - // The instructions that this constraint is designed for can - // only take 128-bit registers so just use that regclass. 
- case 'x': - if (VT.getSizeInBits() == 128) - return std::make_pair(0U, &ARM64::FPR128_loRegClass); - break; - } - } - if (StringRef("{cc}").equals_lower(Constraint)) - return std::make_pair(unsigned(ARM64::NZCV), &ARM64::CCRRegClass); - - // Use the default implementation in TargetLowering to convert the register - // constraint into a member of a register class. - std::pair Res; - Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); - - // Not found as a standard register? - if (!Res.second) { - unsigned Size = Constraint.size(); - if ((Size == 4 || Size == 5) && Constraint[0] == '{' && - tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') { - const std::string Reg = - std::string(&Constraint[2], &Constraint[Size - 1]); - int RegNo = atoi(Reg.c_str()); - if (RegNo >= 0 && RegNo <= 31) { - // v0 - v31 are aliases of q0 - q31. - // By default we'll emit v0-v31 for this unless there's a modifier where - // we'll emit the correct register as well. - Res.first = ARM64::FPR128RegClass.getRegister(RegNo); - Res.second = &ARM64::FPR128RegClass; - } - } - } - - return Res; -} - -/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops -/// vector. If it is invalid, don't add anything to Ops. -void ARM64TargetLowering::LowerAsmOperandForConstraint( - SDValue Op, std::string &Constraint, std::vector &Ops, - SelectionDAG &DAG) const { - SDValue Result; - - // Currently only support length 1 constraints. - if (Constraint.length() != 1) - return; - - char ConstraintLetter = Constraint[0]; - switch (ConstraintLetter) { - default: - break; - - // This set of constraints deal with valid constants for various instructions. - // Validate and return a target constant for them if we can. - case 'z': { - // 'z' maps to xzr or wzr so it needs an input of 0. - ConstantSDNode *C = dyn_cast(Op); - if (!C || C->getZExtValue() != 0) - return; - - if (Op.getValueType() == MVT::i64) - Result = DAG.getRegister(ARM64::XZR, MVT::i64); - else - Result = DAG.getRegister(ARM64::WZR, MVT::i32); - break; - } - - case 'I': - case 'J': - case 'K': - case 'L': - case 'M': - case 'N': - ConstantSDNode *C = dyn_cast(Op); - if (!C) - return; - - // Grab the value and do some validation. - uint64_t CVal = C->getZExtValue(); - switch (ConstraintLetter) { - // The I constraint applies only to simple ADD or SUB immediate operands: - // i.e. 0 to 4095 with optional shift by 12 - // The J constraint applies only to ADD or SUB immediates that would be - // valid when negated, i.e. if [an add pattern] were to be output as a SUB - // instruction [or vice versa], in other words -1 to -4095 with optional - // left shift by 12. - case 'I': - if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal)) - break; - return; - case 'J': { - uint64_t NVal = -C->getSExtValue(); - if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) - break; - return; - } - // The K and L constraints apply *only* to logical immediates, including - // what used to be the MOVI alias for ORR (though the MOVI alias has now - // been removed and MOV should be used). So these constraints have to - // distinguish between bit patterns that are valid 32-bit or 64-bit - // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but - // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice - // versa. 
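The 'I' and 'J' constraint cases above accept exactly the AArch64 arithmetic immediates: a 12-bit value optionally shifted left by 12 (for 'J', tested after negating the constant). A standalone sketch of that predicate:

#include <cassert>
#include <cstdint>

// An AArch64 ADD/SUB immediate: 12 bits, optionally shifted left by 12
// (the same shape isUInt<12> / isShiftedUInt<12, 12> test above).
static bool isAddSubImm(uint64_t V) {
  return (V & ~UINT64_C(0xFFF)) == 0 ||     // 0..4095
         (V & ~UINT64_C(0xFFF000)) == 0;    // 0..4095, shifted left by 12
}

int main() {
  assert(isAddSubImm(4095));
  assert(isAddSubImm(0x123000));
  assert(!isAddSubImm(0x1001000)); // more than 12 significant bits
  assert(!isAddSubImm(0x123456));  // low and shifted bits set together
}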
- case 'K': - if (ARM64_AM::isLogicalImmediate(CVal, 32)) - break; - return; - case 'L': - if (ARM64_AM::isLogicalImmediate(CVal, 64)) - break; - return; - // The M and N constraints are a superset of K and L respectively, for use - // with the MOV (immediate) alias. As well as the logical immediates they - // also match 32 or 64-bit immediates that can be loaded either using a - // *single* MOVZ or MOVN , such as 32-bit 0x12340000, 0x00001234, 0xffffedca - // (M) or 64-bit 0x1234000000000000 (N) etc. - // As a note some of this code is liberally stolen from the asm parser. - case 'M': { - if (!isUInt<32>(CVal)) - return; - if (ARM64_AM::isLogicalImmediate(CVal, 32)) - break; - if ((CVal & 0xFFFF) == CVal) - break; - if ((CVal & 0xFFFF0000ULL) == CVal) - break; - uint64_t NCVal = ~(uint32_t)CVal; - if ((NCVal & 0xFFFFULL) == NCVal) - break; - if ((NCVal & 0xFFFF0000ULL) == NCVal) - break; - return; - } - case 'N': { - if (ARM64_AM::isLogicalImmediate(CVal, 64)) - break; - if ((CVal & 0xFFFFULL) == CVal) - break; - if ((CVal & 0xFFFF0000ULL) == CVal) - break; - if ((CVal & 0xFFFF00000000ULL) == CVal) - break; - if ((CVal & 0xFFFF000000000000ULL) == CVal) - break; - uint64_t NCVal = ~CVal; - if ((NCVal & 0xFFFFULL) == NCVal) - break; - if ((NCVal & 0xFFFF0000ULL) == NCVal) - break; - if ((NCVal & 0xFFFF00000000ULL) == NCVal) - break; - if ((NCVal & 0xFFFF000000000000ULL) == NCVal) - break; - return; - } - default: - return; - } - - // All assembler immediates are 64-bit integers. - Result = DAG.getTargetConstant(CVal, MVT::i64); - break; - } - - if (Result.getNode()) { - Ops.push_back(Result); - return; - } - - return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); -} - -//===----------------------------------------------------------------------===// -// ARM64 Advanced SIMD Support -//===----------------------------------------------------------------------===// - -/// WidenVector - Given a value in the V64 register class, produce the -/// equivalent value in the V128 register class. -static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) { - EVT VT = V64Reg.getValueType(); - unsigned NarrowSize = VT.getVectorNumElements(); - MVT EltTy = VT.getVectorElementType().getSimpleVT(); - MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize); - SDLoc DL(V64Reg); - - return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy), - V64Reg, DAG.getConstant(0, MVT::i32)); -} - -/// getExtFactor - Determine the adjustment factor for the position when -/// generating an "extract from vector registers" instruction. -static unsigned getExtFactor(SDValue &V) { - EVT EltType = V.getValueType().getVectorElementType(); - return EltType.getSizeInBits() / 8; -} - -/// NarrowVector - Given a value in the V128 register class, produce the -/// equivalent value in the V64 register class. -static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) { - EVT VT = V128Reg.getValueType(); - unsigned WideSize = VT.getVectorNumElements(); - MVT EltTy = VT.getVectorElementType().getSimpleVT(); - MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2); - SDLoc DL(V128Reg); - - return DAG.getTargetExtractSubreg(ARM64::dsub, DL, NarrowTy, V128Reg); -} - -// Gather data to see if the operation can be modelled as a -// shuffle in combination with VEXTs. 
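Beyond the logical-immediate case, the 'M' constraint above accepts a 32-bit value that a single MOVZ or MOVN can produce: one 16-bit chunk in either halfword position, or the complement of such a value. A standalone sketch of just that movable-chunk test (the 0xffffedca example comes from the comment above):

#include <cassert>
#include <cstdint>

// True if a single MOVZ (one 16-bit chunk at bit 0 or 16) or a single MOVN
// (the complement of such a value) can materialise the 32-bit value V.
static bool isSingleMovzMovn32(uint32_t V) {
  auto OneChunk = [](uint32_t X) {
    return (X & 0xFFFFu) == X || (X & 0xFFFF0000u) == X;
  };
  return OneChunk(V) || OneChunk(~V);
}

int main() {
  assert(isSingleMovzMovn32(0x00001234));  // MOVZ w0, #0x1234
  assert(isSingleMovzMovn32(0x12340000));  // MOVZ w0, #0x1234, lsl #16
  assert(isSingleMovzMovn32(0xFFFFEDCA));  // MOVN of 0x1235
  assert(!isSingleMovzMovn32(0x12345678)); // would need MOVZ + MOVK
}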
-SDValue ARM64TargetLowering::ReconstructShuffle(SDValue Op, - SelectionDAG &DAG) const { - SDLoc dl(Op); - EVT VT = Op.getValueType(); - unsigned NumElts = VT.getVectorNumElements(); - - SmallVector SourceVecs; - SmallVector MinElts; - SmallVector MaxElts; - - for (unsigned i = 0; i < NumElts; ++i) { - SDValue V = Op.getOperand(i); - if (V.getOpcode() == ISD::UNDEF) - continue; - else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) { - // A shuffle can only come from building a vector from various - // elements of other vectors. - return SDValue(); - } - - // Record this extraction against the appropriate vector if possible... - SDValue SourceVec = V.getOperand(0); - unsigned EltNo = cast(V.getOperand(1))->getZExtValue(); - bool FoundSource = false; - for (unsigned j = 0; j < SourceVecs.size(); ++j) { - if (SourceVecs[j] == SourceVec) { - if (MinElts[j] > EltNo) - MinElts[j] = EltNo; - if (MaxElts[j] < EltNo) - MaxElts[j] = EltNo; - FoundSource = true; - break; - } - } - - // Or record a new source if not... - if (!FoundSource) { - SourceVecs.push_back(SourceVec); - MinElts.push_back(EltNo); - MaxElts.push_back(EltNo); - } - } - - // Currently only do something sane when at most two source vectors - // involved. - if (SourceVecs.size() > 2) - return SDValue(); - - SDValue ShuffleSrcs[2] = { DAG.getUNDEF(VT), DAG.getUNDEF(VT) }; - int VEXTOffsets[2] = { 0, 0 }; - - // This loop extracts the usage patterns of the source vectors - // and prepares appropriate SDValues for a shuffle if possible. - for (unsigned i = 0; i < SourceVecs.size(); ++i) { - if (SourceVecs[i].getValueType() == VT) { - // No VEXT necessary - ShuffleSrcs[i] = SourceVecs[i]; - VEXTOffsets[i] = 0; - continue; - } else if (SourceVecs[i].getValueType().getVectorNumElements() < NumElts) { - // We can pad out the smaller vector for free, so if it's part of a - // shuffle... - ShuffleSrcs[i] = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, SourceVecs[i], - DAG.getUNDEF(SourceVecs[i].getValueType())); - continue; - } - - // Don't attempt to extract subvectors from BUILD_VECTOR sources - // that expand or trunc the original value. - // TODO: We can try to bitcast and ANY_EXTEND the result but - // we need to consider the cost of vector ANY_EXTEND, and the - // legality of all the types. - if (SourceVecs[i].getValueType().getVectorElementType() != - VT.getVectorElementType()) - return SDValue(); - - // Since only 64-bit and 128-bit vectors are legal on ARM and - // we've eliminated the other cases... 
- assert(SourceVecs[i].getValueType().getVectorNumElements() == 2 * NumElts && - "unexpected vector sizes in ReconstructShuffle"); - - if (MaxElts[i] - MinElts[i] >= NumElts) { - // Span too large for a VEXT to cope - return SDValue(); - } - - if (MinElts[i] >= NumElts) { - // The extraction can just take the second half - VEXTOffsets[i] = NumElts; - ShuffleSrcs[i] = - DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SourceVecs[i], - DAG.getIntPtrConstant(NumElts)); - } else if (MaxElts[i] < NumElts) { - // The extraction can just take the first half - VEXTOffsets[i] = 0; - ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, - SourceVecs[i], DAG.getIntPtrConstant(0)); - } else { - // An actual VEXT is needed - VEXTOffsets[i] = MinElts[i]; - SDValue VEXTSrc1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, - SourceVecs[i], DAG.getIntPtrConstant(0)); - SDValue VEXTSrc2 = - DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SourceVecs[i], - DAG.getIntPtrConstant(NumElts)); - unsigned Imm = VEXTOffsets[i] * getExtFactor(VEXTSrc1); - ShuffleSrcs[i] = DAG.getNode(ARM64ISD::EXT, dl, VT, VEXTSrc1, VEXTSrc2, - DAG.getConstant(Imm, MVT::i32)); - } - } - - SmallVector Mask; - - for (unsigned i = 0; i < NumElts; ++i) { - SDValue Entry = Op.getOperand(i); - if (Entry.getOpcode() == ISD::UNDEF) { - Mask.push_back(-1); - continue; - } - - SDValue ExtractVec = Entry.getOperand(0); - int ExtractElt = - cast(Op.getOperand(i).getOperand(1))->getSExtValue(); - if (ExtractVec == SourceVecs[0]) { - Mask.push_back(ExtractElt - VEXTOffsets[0]); - } else { - Mask.push_back(ExtractElt + NumElts - VEXTOffsets[1]); - } - } - - // Final check before we try to produce nonsense... - if (isShuffleMaskLegal(Mask, VT)) - return DAG.getVectorShuffle(VT, dl, ShuffleSrcs[0], ShuffleSrcs[1], - &Mask[0]); - - return SDValue(); -} - -// check if an EXT instruction can handle the shuffle mask when the -// vector sources of the shuffle are the same. -static bool isSingletonEXTMask(ArrayRef M, EVT VT, unsigned &Imm) { - unsigned NumElts = VT.getVectorNumElements(); - - // Assume that the first shuffle index is not UNDEF. Fail if it is. - if (M[0] < 0) - return false; - - Imm = M[0]; - - // If this is a VEXT shuffle, the immediate value is the index of the first - // element. The other shuffle indices must be the successive elements after - // the first one. - unsigned ExpectedElt = Imm; - for (unsigned i = 1; i < NumElts; ++i) { - // Increment the expected index. If it wraps around, just follow it - // back to index zero and keep going. - ++ExpectedElt; - if (ExpectedElt == NumElts) - ExpectedElt = 0; - - if (M[i] < 0) - continue; // ignore UNDEF indices - if (ExpectedElt != static_cast(M[i])) - return false; - } - - return true; -} - -// check if an EXT instruction can handle the shuffle mask when the -// vector sources of the shuffle are different. -static bool isEXTMask(ArrayRef M, EVT VT, bool &ReverseEXT, - unsigned &Imm) { - // Look for the first non-undef element. - const int *FirstRealElt = std::find_if(M.begin(), M.end(), - [](int Elt) {return Elt >= 0;}); - - // Benefit form APInt to handle overflow when calculating expected element. - unsigned NumElts = VT.getVectorNumElements(); - unsigned MaskBits = APInt(32, NumElts * 2).logBase2(); - APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1); - // The following shuffle indices must be the successive elements after the - // first real element. 
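isSingletonEXTMask above accepts any mask that is a rotation of <0, 1, ..., N-1>, with the rotation amount becoming the EXT element index (it is scaled to a byte immediate later). A standalone sketch of the same check on a plain index vector (hypothetical helper):

#include <cassert>
#include <vector>

// True if M is <Imm, Imm+1, ...> wrapping modulo M.size(), ignoring -1
// (undef) entries; reports the rotation amount in Imm.
static bool isRotatedIdentityMask(const std::vector<int> &M, unsigned &Imm) {
  if (M.empty() || M[0] < 0)
    return false; // be conservative if the first index is undef
  Imm = static_cast<unsigned>(M[0]);
  unsigned Expected = Imm;
  for (size_t i = 1; i < M.size(); ++i) {
    Expected = (Expected + 1) % M.size();
    if (M[i] >= 0 && static_cast<unsigned>(M[i]) != Expected)
      return false;
  }
  return true;
}

int main() {
  unsigned Imm;
  assert(isRotatedIdentityMask({2, 3, 0, 1}, Imm) && Imm == 2); // rotate by 2
  assert(isRotatedIdentityMask({3, -1, 1, 2}, Imm) && Imm == 3); // undef tolerated
  assert(!isRotatedIdentityMask({0, 2, 1, 3}, Imm));
}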
- const int *FirstWrongElt = std::find_if(FirstRealElt + 1, M.end(), - [&](int Elt) {return Elt != ExpectedElt++ && Elt != -1;}); - if (FirstWrongElt != M.end()) - return false; - - // The index of an EXT is the first element if it is not UNDEF. - // Watch out for the beginning UNDEFs. The EXT index should be the expected - // value of the first element. E.g. - // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>. - // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>. - // ExpectedElt is the last mask index plus 1. - Imm = ExpectedElt.getZExtValue(); - - // There are two difference cases requiring to reverse input vectors. - // For example, for vector <4 x i32> we have the following cases, - // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>) - // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>) - // For both cases, we finally use mask <5, 6, 7, 0>, which requires - // to reverse two input vectors. - if (Imm < NumElts) - ReverseEXT = true; - else - Imm -= NumElts; - - return true; -} - -/// isREVMask - Check if a vector shuffle corresponds to a REV -/// instruction with the specified blocksize. (The order of the elements -/// within each block of the vector is reversed.) -static bool isREVMask(ArrayRef M, EVT VT, unsigned BlockSize) { - assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) && - "Only possible block sizes for REV are: 16, 32, 64"); - - unsigned EltSz = VT.getVectorElementType().getSizeInBits(); - if (EltSz == 64) - return false; - - unsigned NumElts = VT.getVectorNumElements(); - unsigned BlockElts = M[0] + 1; - // If the first shuffle index is UNDEF, be optimistic. - if (M[0] < 0) - BlockElts = BlockSize / EltSz; - - if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz) - return false; - - for (unsigned i = 0; i < NumElts; ++i) { - if (M[i] < 0) - continue; // ignore UNDEF indices - if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts)) - return false; - } - - return true; -} - -static bool isZIPMask(ArrayRef M, EVT VT, unsigned &WhichResult) { - unsigned NumElts = VT.getVectorNumElements(); - WhichResult = (M[0] == 0 ? 0 : 1); - unsigned Idx = WhichResult * NumElts / 2; - for (unsigned i = 0; i != NumElts; i += 2) { - if ((M[i] >= 0 && (unsigned)M[i] != Idx) || - (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx + NumElts)) - return false; - Idx += 1; - } - - return true; -} - -static bool isUZPMask(ArrayRef M, EVT VT, unsigned &WhichResult) { - unsigned NumElts = VT.getVectorNumElements(); - WhichResult = (M[0] == 0 ? 0 : 1); - for (unsigned i = 0; i != NumElts; ++i) { - if (M[i] < 0) - continue; // ignore UNDEF indices - if ((unsigned)M[i] != 2 * i + WhichResult) - return false; - } - - return true; -} - -static bool isTRNMask(ArrayRef M, EVT VT, unsigned &WhichResult) { - unsigned NumElts = VT.getVectorNumElements(); - WhichResult = (M[0] == 0 ? 0 : 1); - for (unsigned i = 0; i < NumElts; i += 2) { - if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) || - (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult)) - return false; - } - return true; -} - -/// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of -/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". -/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>. -static bool isZIP_v_undef_Mask(ArrayRef M, EVT VT, unsigned &WhichResult) { - unsigned NumElts = VT.getVectorNumElements(); - WhichResult = (M[0] == 0 ? 
0 : 1); - unsigned Idx = WhichResult * NumElts / 2; - for (unsigned i = 0; i != NumElts; i += 2) { - if ((M[i] >= 0 && (unsigned)M[i] != Idx) || - (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx)) - return false; - Idx += 1; - } - - return true; -} - -/// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of -/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". -/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>, -static bool isUZP_v_undef_Mask(ArrayRef M, EVT VT, unsigned &WhichResult) { - unsigned Half = VT.getVectorNumElements() / 2; - WhichResult = (M[0] == 0 ? 0 : 1); - for (unsigned j = 0; j != 2; ++j) { - unsigned Idx = WhichResult; - for (unsigned i = 0; i != Half; ++i) { - int MIdx = M[i + j * Half]; - if (MIdx >= 0 && (unsigned)MIdx != Idx) - return false; - Idx += 2; - } - } - - return true; -} - -/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of -/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". -/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>. -static bool isTRN_v_undef_Mask(ArrayRef M, EVT VT, unsigned &WhichResult) { - unsigned NumElts = VT.getVectorNumElements(); - WhichResult = (M[0] == 0 ? 0 : 1); - for (unsigned i = 0; i < NumElts; i += 2) { - if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) || - (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult)) - return false; - } - return true; -} - -static bool isINSMask(ArrayRef M, int NumInputElements, - bool &DstIsLeft, int &Anomaly) { - if (M.size() != static_cast(NumInputElements)) - return false; - - int NumLHSMatch = 0, NumRHSMatch = 0; - int LastLHSMismatch = -1, LastRHSMismatch = -1; - - for (int i = 0; i < NumInputElements; ++i) { - if (M[i] == -1) { - ++NumLHSMatch; - ++NumRHSMatch; - continue; - } - - if (M[i] == i) - ++NumLHSMatch; - else - LastLHSMismatch = i; - - if (M[i] == i + NumInputElements) - ++NumRHSMatch; - else - LastRHSMismatch = i; - } - - if (NumLHSMatch == NumInputElements - 1) { - DstIsLeft = true; - Anomaly = LastLHSMismatch; - return true; - } else if (NumRHSMatch == NumInputElements - 1) { - DstIsLeft = false; - Anomaly = LastRHSMismatch; - return true; - } - - return false; -} - -static bool isConcatMask(ArrayRef Mask, EVT VT, bool SplitLHS) { - if (VT.getSizeInBits() != 128) - return false; - - unsigned NumElts = VT.getVectorNumElements(); - - for (int I = 0, E = NumElts / 2; I != E; I++) { - if (Mask[I] != I) - return false; - } - - int Offset = NumElts / 2; - for (int I = NumElts / 2, E = NumElts; I != E; I++) { - if (Mask[I] != I + SplitLHS * Offset) - return false; - } - - return true; -} - -static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) { - SDLoc DL(Op); - EVT VT = Op.getValueType(); - SDValue V0 = Op.getOperand(0); - SDValue V1 = Op.getOperand(1); - ArrayRef Mask = cast(Op)->getMask(); - - if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() || - VT.getVectorElementType() != V1.getValueType().getVectorElementType()) - return SDValue(); - - bool SplitV0 = V0.getValueType().getSizeInBits() == 128; - - if (!isConcatMask(Mask, VT, SplitV0)) - return SDValue(); - - EVT CastVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), - VT.getVectorNumElements() / 2); - if (SplitV0) { - V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0, - DAG.getConstant(0, MVT::i64)); - } - if (V1.getValueType().getSizeInBits() == 128) { - V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1, - DAG.getConstant(0, MVT::i64)); - } - return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, 
V1); -} - -/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit -/// the specified operations to build the shuffle. -static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, - SDValue RHS, SelectionDAG &DAG, - SDLoc dl) { - unsigned OpNum = (PFEntry >> 26) & 0x0F; - unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1); - unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1); - - enum { - OP_COPY = 0, // Copy, used for things like to say it is <0,1,2,3> - OP_VREV, - OP_VDUP0, - OP_VDUP1, - OP_VDUP2, - OP_VDUP3, - OP_VEXT1, - OP_VEXT2, - OP_VEXT3, - OP_VUZPL, // VUZP, left result - OP_VUZPR, // VUZP, right result - OP_VZIPL, // VZIP, left result - OP_VZIPR, // VZIP, right result - OP_VTRNL, // VTRN, left result - OP_VTRNR // VTRN, right result - }; - - if (OpNum == OP_COPY) { - if (LHSID == (1 * 9 + 2) * 9 + 3) - return LHS; - assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!"); - return RHS; - } - - SDValue OpLHS, OpRHS; - OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl); - OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl); - EVT VT = OpLHS.getValueType(); - - switch (OpNum) { - default: - llvm_unreachable("Unknown shuffle opcode!"); - case OP_VREV: - // VREV divides the vector in half and swaps within the half. - if (VT.getVectorElementType() == MVT::i32 || - VT.getVectorElementType() == MVT::f32) - return DAG.getNode(ARM64ISD::REV64, dl, VT, OpLHS); - // vrev <4 x i16> -> REV32 - if (VT.getVectorElementType() == MVT::i16) - return DAG.getNode(ARM64ISD::REV32, dl, VT, OpLHS); - // vrev <4 x i8> -> REV16 - assert(VT.getVectorElementType() == MVT::i8); - return DAG.getNode(ARM64ISD::REV16, dl, VT, OpLHS); - case OP_VDUP0: - case OP_VDUP1: - case OP_VDUP2: - case OP_VDUP3: { - EVT EltTy = VT.getVectorElementType(); - unsigned Opcode; - if (EltTy == MVT::i8) - Opcode = ARM64ISD::DUPLANE8; - else if (EltTy == MVT::i16) - Opcode = ARM64ISD::DUPLANE16; - else if (EltTy == MVT::i32 || EltTy == MVT::f32) - Opcode = ARM64ISD::DUPLANE32; - else if (EltTy == MVT::i64 || EltTy == MVT::f64) - Opcode = ARM64ISD::DUPLANE64; - else - llvm_unreachable("Invalid vector element type?"); - - if (VT.getSizeInBits() == 64) - OpLHS = WidenVector(OpLHS, DAG); - SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, MVT::i64); - return DAG.getNode(Opcode, dl, VT, OpLHS, Lane); - } - case OP_VEXT1: - case OP_VEXT2: - case OP_VEXT3: { - unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS); - return DAG.getNode(ARM64ISD::EXT, dl, VT, OpLHS, OpRHS, - DAG.getConstant(Imm, MVT::i32)); - } - case OP_VUZPL: - return DAG.getNode(ARM64ISD::UZP1, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS); - case OP_VUZPR: - return DAG.getNode(ARM64ISD::UZP2, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS); - case OP_VZIPL: - return DAG.getNode(ARM64ISD::ZIP1, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS); - case OP_VZIPR: - return DAG.getNode(ARM64ISD::ZIP2, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS); - case OP_VTRNL: - return DAG.getNode(ARM64ISD::TRN1, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS); - case OP_VTRNR: - return DAG.getNode(ARM64ISD::TRN2, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS); - } -} - -static SDValue GenerateTBL(SDValue Op, ArrayRef ShuffleMask, - SelectionDAG &DAG) { - // Check to see if we can use the TBL instruction. 
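GeneratePerfectShuffle above unpacks each 32-bit PerfectShuffleTable entry into a cost, an operation number and two 13-bit operand IDs. A small sketch of that decoding, with the field layout read off the shifts in the code above (the table itself lives in a generated header and is not reproduced here):

#include <cstdint>
#include <cstdio>

// Field layout used above:
//   bits 31-30: cost, bits 29-26: opcode, bits 25-13: LHS id, bits 12-0: RHS id.
struct PerfectShuffleEntry {
  unsigned Cost, OpNum, LHSID, RHSID;
};

static PerfectShuffleEntry decodePFEntry(uint32_t PFEntry) {
  PerfectShuffleEntry E;
  E.Cost = (PFEntry >> 30) & 0x3;
  E.OpNum = (PFEntry >> 26) & 0x0F;
  E.LHSID = (PFEntry >> 13) & ((1u << 13) - 1);
  E.RHSID = PFEntry & ((1u << 13) - 1);
  return E;
}

int main() {
  // Illustrative value only, not a real table entry.
  PerfectShuffleEntry E = decodePFEntry(0x8ACF1234u);
  std::printf("cost=%u op=%u lhs=%u rhs=%u\n", E.Cost, E.OpNum, E.LHSID, E.RHSID);
}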
- SDValue V1 = Op.getOperand(0); - SDValue V2 = Op.getOperand(1); - SDLoc DL(Op); - - EVT EltVT = Op.getValueType().getVectorElementType(); - unsigned BytesPerElt = EltVT.getSizeInBits() / 8; - - SmallVector TBLMask; - for (int Val : ShuffleMask) { - for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) { - unsigned Offset = Byte + Val * BytesPerElt; - TBLMask.push_back(DAG.getConstant(Offset, MVT::i32)); - } - } - - MVT IndexVT = MVT::v8i8; - unsigned IndexLen = 8; - if (Op.getValueType().getSizeInBits() == 128) { - IndexVT = MVT::v16i8; - IndexLen = 16; - } - - SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1); - SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2); - - SDValue Shuffle; - if (V2.getNode()->getOpcode() == ISD::UNDEF) { - if (IndexLen == 8) - V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst); - Shuffle = DAG.getNode( - ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, - DAG.getConstant(Intrinsic::arm64_neon_tbl1, MVT::i32), V1Cst, - DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, - makeArrayRef(TBLMask.data(), IndexLen))); - } else { - if (IndexLen == 8) { - V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst); - Shuffle = DAG.getNode( - ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, - DAG.getConstant(Intrinsic::arm64_neon_tbl1, MVT::i32), V1Cst, - DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, - makeArrayRef(TBLMask.data(), IndexLen))); - } else { - // FIXME: We cannot, for the moment, emit a TBL2 instruction because we - // cannot currently represent the register constraints on the input - // table registers. - // Shuffle = DAG.getNode(ARM64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst, - // DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, - // &TBLMask[0], IndexLen)); - Shuffle = DAG.getNode( - ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, - DAG.getConstant(Intrinsic::arm64_neon_tbl2, MVT::i32), V1Cst, V2Cst, - DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, - makeArrayRef(TBLMask.data(), IndexLen))); - } - } - return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle); -} - -static unsigned getDUPLANEOp(EVT EltType) { - if (EltType == MVT::i8) - return ARM64ISD::DUPLANE8; - if (EltType == MVT::i16) - return ARM64ISD::DUPLANE16; - if (EltType == MVT::i32 || EltType == MVT::f32) - return ARM64ISD::DUPLANE32; - if (EltType == MVT::i64 || EltType == MVT::f64) - return ARM64ISD::DUPLANE64; - - llvm_unreachable("Invalid vector element type?"); -} - -SDValue ARM64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, - SelectionDAG &DAG) const { - SDLoc dl(Op); - EVT VT = Op.getValueType(); - - ShuffleVectorSDNode *SVN = cast(Op.getNode()); - - // Convert shuffles that are directly supported on NEON to target-specific - // DAG nodes, instead of keeping them as shuffles and matching them again - // during code selection. This is more efficient and avoids the possibility - // of inconsistencies between legalization and selection. - ArrayRef ShuffleMask = SVN->getMask(); - - SDValue V1 = Op.getOperand(0); - SDValue V2 = Op.getOperand(1); - - if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0], - V1.getValueType().getSimpleVT())) { - int Lane = SVN->getSplatIndex(); - // If this is undef splat, generate it via "just" vdup, if possible. - if (Lane == -1) - Lane = 0; - - if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) - return DAG.getNode(ARM64ISD::DUP, dl, V1.getValueType(), - V1.getOperand(0)); - // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non- - // constant. If so, we can just reference the lane's definition directly. 
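GenerateTBL above rewrites an element-level shuffle mask into the byte-level indices TBL consumes: each element index expands to BytesPerElt consecutive byte indices. A standalone sketch of that expansion (undef entries are not modelled; in the real lowering they simply become out-of-range byte indices, which TBL zeroes):

#include <cassert>
#include <vector>

// Expand an element shuffle mask into TBL byte indices: element index Val
// becomes BytesPerElt consecutive byte indices starting at Val * BytesPerElt.
static std::vector<int> expandToTBLMask(const std::vector<int> &Mask,
                                        int BytesPerElt) {
  std::vector<int> TBLMask;
  for (int Val : Mask)
    for (int Byte = 0; Byte < BytesPerElt; ++Byte)
      TBLMask.push_back(Byte + Val * BytesPerElt);
  return TBLMask;
}

int main() {
  // A <4 x i16> mask <1, 0, 3, 2> becomes byte indices 2,3, 0,1, 6,7, 4,5.
  std::vector<int> T = expandToTBLMask({1, 0, 3, 2}, 2);
  std::vector<int> Expected = {2, 3, 0, 1, 6, 7, 4, 5};
  assert(T == Expected);
}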
- if (V1.getOpcode() == ISD::BUILD_VECTOR && - !isa(V1.getOperand(Lane))) - return DAG.getNode(ARM64ISD::DUP, dl, VT, V1.getOperand(Lane)); - - // Otherwise, duplicate from the lane of the input vector. - unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType()); - - // SelectionDAGBuilder may have "helpfully" already extracted or conatenated - // to make a vector of the same size as this SHUFFLE. We can ignore the - // extract entirely, and canonicalise the concat using WidenVector. - if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) { - Lane += cast(V1.getOperand(1))->getZExtValue(); - V1 = V1.getOperand(0); - } else if (V1.getOpcode() == ISD::CONCAT_VECTORS) { - unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2; - Lane -= Idx * VT.getVectorNumElements() / 2; - V1 = WidenVector(V1.getOperand(Idx), DAG); - } else if (VT.getSizeInBits() == 64) - V1 = WidenVector(V1, DAG); - - return DAG.getNode(Opcode, dl, VT, V1, DAG.getConstant(Lane, MVT::i64)); - } - - if (isREVMask(ShuffleMask, VT, 64)) - return DAG.getNode(ARM64ISD::REV64, dl, V1.getValueType(), V1, V2); - if (isREVMask(ShuffleMask, VT, 32)) - return DAG.getNode(ARM64ISD::REV32, dl, V1.getValueType(), V1, V2); - if (isREVMask(ShuffleMask, VT, 16)) - return DAG.getNode(ARM64ISD::REV16, dl, V1.getValueType(), V1, V2); - - bool ReverseEXT = false; - unsigned Imm; - if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) { - if (ReverseEXT) - std::swap(V1, V2); - Imm *= getExtFactor(V1); - return DAG.getNode(ARM64ISD::EXT, dl, V1.getValueType(), V1, V2, - DAG.getConstant(Imm, MVT::i32)); - } else if (V2->getOpcode() == ISD::UNDEF && - isSingletonEXTMask(ShuffleMask, VT, Imm)) { - Imm *= getExtFactor(V1); - return DAG.getNode(ARM64ISD::EXT, dl, V1.getValueType(), V1, V1, - DAG.getConstant(Imm, MVT::i32)); - } - - unsigned WhichResult; - if (isZIPMask(ShuffleMask, VT, WhichResult)) { - unsigned Opc = (WhichResult == 0) ? ARM64ISD::ZIP1 : ARM64ISD::ZIP2; - return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2); - } - if (isUZPMask(ShuffleMask, VT, WhichResult)) { - unsigned Opc = (WhichResult == 0) ? ARM64ISD::UZP1 : ARM64ISD::UZP2; - return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2); - } - if (isTRNMask(ShuffleMask, VT, WhichResult)) { - unsigned Opc = (WhichResult == 0) ? ARM64ISD::TRN1 : ARM64ISD::TRN2; - return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2); - } - - if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) { - unsigned Opc = (WhichResult == 0) ? ARM64ISD::ZIP1 : ARM64ISD::ZIP2; - return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); - } - if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) { - unsigned Opc = (WhichResult == 0) ? ARM64ISD::UZP1 : ARM64ISD::UZP2; - return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); - } - if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) { - unsigned Opc = (WhichResult == 0) ? ARM64ISD::TRN1 : ARM64ISD::TRN2; - return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); - } - - SDValue Concat = tryFormConcatFromShuffle(Op, DAG); - if (Concat.getNode()) - return Concat; - - bool DstIsLeft; - int Anomaly; - int NumInputElements = V1.getValueType().getVectorNumElements(); - if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) { - SDValue DstVec = DstIsLeft ? 
V1 : V2; - SDValue DstLaneV = DAG.getConstant(Anomaly, MVT::i64); - - SDValue SrcVec = V1; - int SrcLane = ShuffleMask[Anomaly]; - if (SrcLane >= NumInputElements) { - SrcVec = V2; - SrcLane -= VT.getVectorNumElements(); - } - SDValue SrcLaneV = DAG.getConstant(SrcLane, MVT::i64); - - EVT ScalarVT = VT.getVectorElementType(); - if (ScalarVT.getSizeInBits() < 32) - ScalarVT = MVT::i32; - - return DAG.getNode( - ISD::INSERT_VECTOR_ELT, dl, VT, DstVec, - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV), - DstLaneV); - } - - // If the shuffle is not directly supported and it has 4 elements, use - // the PerfectShuffle-generated table to synthesize it from other shuffles. - unsigned NumElts = VT.getVectorNumElements(); - if (NumElts == 4) { - unsigned PFIndexes[4]; - for (unsigned i = 0; i != 4; ++i) { - if (ShuffleMask[i] < 0) - PFIndexes[i] = 8; - else - PFIndexes[i] = ShuffleMask[i]; - } - - // Compute the index in the perfect shuffle table. - unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 + - PFIndexes[2] * 9 + PFIndexes[3]; - unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; - unsigned Cost = (PFEntry >> 30); - - if (Cost <= 4) - return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); - } - - return GenerateTBL(Op, ShuffleMask, DAG); -} - -static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits, - APInt &UndefBits) { - EVT VT = BVN->getValueType(0); - APInt SplatBits, SplatUndef; - unsigned SplatBitSize; - bool HasAnyUndefs; - if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { - unsigned NumSplats = VT.getSizeInBits() / SplatBitSize; - - for (unsigned i = 0; i < NumSplats; ++i) { - CnstBits <<= SplatBitSize; - UndefBits <<= SplatBitSize; - CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits()); - UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits()); - } - - return true; - } - - return false; -} - -SDValue ARM64TargetLowering::LowerVectorAND(SDValue Op, - SelectionDAG &DAG) const { - BuildVectorSDNode *BVN = - dyn_cast(Op.getOperand(1).getNode()); - SDValue LHS = Op.getOperand(0); - SDLoc dl(Op); - EVT VT = Op.getValueType(); - - if (!BVN) - return Op; - - APInt CnstBits(VT.getSizeInBits(), 0); - APInt UndefBits(VT.getSizeInBits(), 0); - if (resolveBuildVector(BVN, CnstBits, UndefBits)) { - // We only have BIC vector immediate instruction, which is and-not. - CnstBits = ~CnstBits; - - // We make use of a little bit of goto ickiness in order to avoid having to - // duplicate the immediate matching logic for the undef toggled case. - bool SecondTry = false; - AttemptModImm: - - if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) { - CnstBits = CnstBits.zextOrTrunc(64); - uint64_t CnstVal = CnstBits.getZExtValue(); - - if (ARM64_AM::isAdvSIMDModImmType1(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType1(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::BICi, dl, MovTy, LHS, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(0, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType2(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType2(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? 
MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::BICi, dl, MovTy, LHS, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(8, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType3(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType3(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::BICi, dl, MovTy, LHS, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(16, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType4(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType4(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::BICi, dl, MovTy, LHS, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(24, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType5(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType5(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; - SDValue Mov = DAG.getNode(ARM64ISD::BICi, dl, MovTy, LHS, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(0, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType6(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType6(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; - SDValue Mov = DAG.getNode(ARM64ISD::BICi, dl, MovTy, LHS, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(8, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - } - - if (SecondTry) - goto FailedModImm; - SecondTry = true; - CnstBits = ~UndefBits; - goto AttemptModImm; - } - -// We can always fall back to a non-immediate AND. -FailedModImm: - return Op; -} - -// Specialized code to quickly find if PotentialBVec is a BuildVector that -// consists of only the same constant int value, returned in reference arg -// ConstVal -static bool isAllConstantBuildVector(const SDValue &PotentialBVec, - uint64_t &ConstVal) { - BuildVectorSDNode *Bvec = dyn_cast(PotentialBVec); - if (!Bvec) - return false; - ConstantSDNode *FirstElt = dyn_cast(Bvec->getOperand(0)); - if (!FirstElt) - return false; - EVT VT = Bvec->getValueType(0); - unsigned NumElts = VT.getVectorNumElements(); - for (unsigned i = 1; i < NumElts; ++i) - if (dyn_cast(Bvec->getOperand(i)) != FirstElt) - return false; - ConstVal = FirstElt->getZExtValue(); - return true; -} - -static unsigned getIntrinsicID(const SDNode *N) { - unsigned Opcode = N->getOpcode(); - switch (Opcode) { - default: - return Intrinsic::not_intrinsic; - case ISD::INTRINSIC_WO_CHAIN: { - unsigned IID = cast(N->getOperand(0))->getZExtValue(); - if (IID < Intrinsic::num_intrinsics) - return IID; - return Intrinsic::not_intrinsic; - } - } -} - -// Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)), -// to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a -// BUILD_VECTORs with constant element C1, C2 is a constant, and C1 == ~C2. -// Also, logical shift right -> sri, with the same structure. -static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) { - EVT VT = N->getValueType(0); - - if (!VT.isVector()) - return SDValue(); - - SDLoc DL(N); - - // Is the first op an AND? - const SDValue And = N->getOperand(0); - if (And.getOpcode() != ISD::AND) - return SDValue(); - - // Is the second op an shl or lshr? 
- SDValue Shift = N->getOperand(1); - // This will have been turned into: ARM64ISD::VSHL vector, #shift - // or ARM64ISD::VLSHR vector, #shift - unsigned ShiftOpc = Shift.getOpcode(); - if ((ShiftOpc != ARM64ISD::VSHL && ShiftOpc != ARM64ISD::VLSHR)) - return SDValue(); - bool IsShiftRight = ShiftOpc == ARM64ISD::VLSHR; - - // Is the shift amount constant? - ConstantSDNode *C2node = dyn_cast(Shift.getOperand(1)); - if (!C2node) - return SDValue(); - - // Is the and mask vector all constant? - uint64_t C1; - if (!isAllConstantBuildVector(And.getOperand(1), C1)) - return SDValue(); - - // Is C1 == ~C2, taking into account how much one can shift elements of a - // particular size? - uint64_t C2 = C2node->getZExtValue(); - unsigned ElemSizeInBits = VT.getVectorElementType().getSizeInBits(); - if (C2 > ElemSizeInBits) - return SDValue(); - unsigned ElemMask = (1 << ElemSizeInBits) - 1; - if ((C1 & ElemMask) != (~C2 & ElemMask)) - return SDValue(); - - SDValue X = And.getOperand(0); - SDValue Y = Shift.getOperand(0); - - unsigned Intrin = - IsShiftRight ? Intrinsic::arm64_neon_vsri : Intrinsic::arm64_neon_vsli; - SDValue ResultSLI = - DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, - DAG.getConstant(Intrin, MVT::i32), X, Y, Shift.getOperand(1)); - - DEBUG(dbgs() << "arm64-lower: transformed: \n"); - DEBUG(N->dump(&DAG)); - DEBUG(dbgs() << "into: \n"); - DEBUG(ResultSLI->dump(&DAG)); - - ++NumShiftInserts; - return ResultSLI; -} - -SDValue ARM64TargetLowering::LowerVectorOR(SDValue Op, - SelectionDAG &DAG) const { - // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2)) - if (EnableARM64SlrGeneration) { - SDValue Res = tryLowerToSLI(Op.getNode(), DAG); - if (Res.getNode()) - return Res; - } - - BuildVectorSDNode *BVN = - dyn_cast(Op.getOperand(0).getNode()); - SDValue LHS = Op.getOperand(1); - SDLoc dl(Op); - EVT VT = Op.getValueType(); - - // OR commutes, so try swapping the operands. - if (!BVN) { - LHS = Op.getOperand(0); - BVN = dyn_cast(Op.getOperand(1).getNode()); - } - if (!BVN) - return Op; - - APInt CnstBits(VT.getSizeInBits(), 0); - APInt UndefBits(VT.getSizeInBits(), 0); - if (resolveBuildVector(BVN, CnstBits, UndefBits)) { - // We make use of a little bit of goto ickiness in order to avoid having to - // duplicate the immediate matching logic for the undef toggled case. - bool SecondTry = false; - AttemptModImm: - - if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) { - CnstBits = CnstBits.zextOrTrunc(64); - uint64_t CnstVal = CnstBits.getZExtValue(); - - if (ARM64_AM::isAdvSIMDModImmType1(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType1(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::ORRi, dl, MovTy, LHS, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(0, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType2(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType2(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::ORRi, dl, MovTy, LHS, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(8, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType3(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType3(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? 
MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::ORRi, dl, MovTy, LHS, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(16, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType4(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType4(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::ORRi, dl, MovTy, LHS, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(24, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType5(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType5(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; - SDValue Mov = DAG.getNode(ARM64ISD::ORRi, dl, MovTy, LHS, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(0, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType6(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType6(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; - SDValue Mov = DAG.getNode(ARM64ISD::ORRi, dl, MovTy, LHS, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(8, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - } - - if (SecondTry) - goto FailedModImm; - SecondTry = true; - CnstBits = UndefBits; - goto AttemptModImm; - } - -// We can always fall back to a non-immediate OR. -FailedModImm: - return Op; -} - -SDValue ARM64TargetLowering::LowerBUILD_VECTOR(SDValue Op, - SelectionDAG &DAG) const { - BuildVectorSDNode *BVN = cast(Op.getNode()); - SDLoc dl(Op); - EVT VT = Op.getValueType(); - - APInt CnstBits(VT.getSizeInBits(), 0); - APInt UndefBits(VT.getSizeInBits(), 0); - if (resolveBuildVector(BVN, CnstBits, UndefBits)) { - // We make use of a little bit of goto ickiness in order to avoid having to - // duplicate the immediate matching logic for the undef toggled case. - bool SecondTry = false; - AttemptModImm: - - if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) { - CnstBits = CnstBits.zextOrTrunc(64); - uint64_t CnstVal = CnstBits.getZExtValue(); - - // Certain magic vector constants (used to express things like NOT - // and NEG) are passed through unmodified. This allows codegen patterns - // for these operations to match. Special-purpose patterns will lower - // these immediates to MOVIs if it proves necessary. - if (VT.isInteger() && (CnstVal == 0 || CnstVal == ~0ULL)) - return Op; - - // The many faces of MOVI... - if (ARM64_AM::isAdvSIMDModImmType10(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType10(CnstVal); - if (VT.getSizeInBits() == 128) { - SDValue Mov = DAG.getNode(ARM64ISD::MOVIedit, dl, MVT::v2i64, - DAG.getConstant(CnstVal, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - // Support the V64 version via subregister insertion. - SDValue Mov = DAG.getNode(ARM64ISD::MOVIedit, dl, MVT::f64, - DAG.getConstant(CnstVal, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType1(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType1(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::MOVIshift, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(0, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType2(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType2(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? 
MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::MOVIshift, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(8, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType3(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType3(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::MOVIshift, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(16, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType4(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType4(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::MOVIshift, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(24, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType5(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType5(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; - SDValue Mov = DAG.getNode(ARM64ISD::MOVIshift, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(0, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType6(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType6(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; - SDValue Mov = DAG.getNode(ARM64ISD::MOVIshift, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(8, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType7(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType7(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::MOVImsl, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(264, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType8(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType8(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::MOVImsl, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(272, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType9(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType9(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8; - SDValue Mov = DAG.getNode(ARM64ISD::MOVI, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - // The few faces of FMOV... - if (ARM64_AM::isAdvSIMDModImmType11(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType11(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4f32 : MVT::v2f32; - SDValue Mov = DAG.getNode(ARM64ISD::FMOV, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType12(CnstVal) && - VT.getSizeInBits() == 128) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType12(CnstVal); - SDValue Mov = DAG.getNode(ARM64ISD::FMOV, dl, MVT::v2f64, - DAG.getConstant(CnstVal, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - // The many faces of MVNI... - CnstVal = ~CnstVal; - if (ARM64_AM::isAdvSIMDModImmType1(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType1(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? 
MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::MVNIshift, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(0, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType2(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType2(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::MVNIshift, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(8, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType3(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType3(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::MVNIshift, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(16, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType4(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType4(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::MVNIshift, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(24, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType5(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType5(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; - SDValue Mov = DAG.getNode(ARM64ISD::MVNIshift, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(0, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType6(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType6(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; - SDValue Mov = DAG.getNode(ARM64ISD::MVNIshift, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(8, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType7(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType7(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::MVNImsl, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(264, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType8(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType8(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::MVNImsl, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(272, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - } - - if (SecondTry) - goto FailedModImm; - SecondTry = true; - CnstBits = UndefBits; - goto AttemptModImm; - } -FailedModImm: - - // Scan through the operands to find some interesting properties we can - // exploit: - // 1) If only one value is used, we can use a DUP, or - // 2) if only the low element is not undef, we can just insert that, or - // 3) if only one constant value is used (w/ some non-constant lanes), - // we can splat the constant value into the whole vector then fill - // in the non-constant lanes. - // 4) FIXME: If different constant values are used, but we can intelligently - // select the values we'll be overwriting for the non-constant - // lanes such that we can directly materialize the vector - // some other way (MOVI, e.g.), we can be sneaky. 
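// The AttemptModImm/FailedModImm dance above is a two-pass retry: first try
// the splat's constant bits as-is, then retry with the undef lanes toggled,
// and fall back to the generic lowering if neither pass encodes. A minimal
// standalone sketch of that control flow; canEncodeAsModImm is an assumed
// stand-in for the ARM64_AM::isAdvSIMDModImmType* predicates, and the helper
// name is illustrative only.
#include <cstdint>
#include <functional>

static bool tryModImmSketch(uint64_t CnstBits, uint64_t UndefToggledBits,
                            const std::function<bool(uint64_t)> &canEncodeAsModImm,
                            uint64_t &Encoded) {
  // Both candidates describe the same splat; the second merely chooses
  // different values for the undef lanes in the hope that they encode.
  const uint64_t Candidates[2] = {CnstBits, UndefToggledBits};
  for (uint64_t Candidate : Candidates) {
    if (canEncodeAsModImm(Candidate)) {
      Encoded = Candidate;
      return true;
    }
  }
  return false; // Caller falls back to the non-immediate form.
}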
- unsigned NumElts = VT.getVectorNumElements(); - bool isOnlyLowElement = true; - bool usesOnlyOneValue = true; - bool usesOnlyOneConstantValue = true; - bool isConstant = true; - unsigned NumConstantLanes = 0; - SDValue Value; - SDValue ConstantValue; - for (unsigned i = 0; i < NumElts; ++i) { - SDValue V = Op.getOperand(i); - if (V.getOpcode() == ISD::UNDEF) - continue; - if (i > 0) - isOnlyLowElement = false; - if (!isa(V) && !isa(V)) - isConstant = false; - - if (isa(V) || isa(V)) { - ++NumConstantLanes; - if (!ConstantValue.getNode()) - ConstantValue = V; - else if (ConstantValue != V) - usesOnlyOneConstantValue = false; - } - - if (!Value.getNode()) - Value = V; - else if (V != Value) - usesOnlyOneValue = false; - } - - if (!Value.getNode()) - return DAG.getUNDEF(VT); - - if (isOnlyLowElement) - return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value); - - // Use DUP for non-constant splats. For f32 constant splats, reduce to - // i32 and try again. - if (usesOnlyOneValue) { - if (!isConstant) { - if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT || - Value.getValueType() != VT) - return DAG.getNode(ARM64ISD::DUP, dl, VT, Value); - - // This is actually a DUPLANExx operation, which keeps everything vectory. - - // DUPLANE works on 128-bit vectors, widen it if necessary. - SDValue Lane = Value.getOperand(1); - Value = Value.getOperand(0); - if (Value.getValueType().getSizeInBits() == 64) - Value = WidenVector(Value, DAG); - - unsigned Opcode = getDUPLANEOp(VT.getVectorElementType()); - return DAG.getNode(Opcode, dl, VT, Value, Lane); - } - - if (VT.getVectorElementType().isFloatingPoint()) { - SmallVector Ops; - MVT NewType = - (VT.getVectorElementType() == MVT::f32) ? MVT::i32 : MVT::i64; - for (unsigned i = 0; i < NumElts; ++i) - Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i))); - EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts); - SDValue Val = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, Ops); - Val = LowerBUILD_VECTOR(Val, DAG); - if (Val.getNode()) - return DAG.getNode(ISD::BITCAST, dl, VT, Val); - } - } - - // If there was only one constant value used and for more than one lane, - // start by splatting that value, then replace the non-constant lanes. This - // is better than the default, which will perform a separate initialization - // for each lane. - if (NumConstantLanes > 0 && usesOnlyOneConstantValue) { - SDValue Val = DAG.getNode(ARM64ISD::DUP, dl, VT, ConstantValue); - // Now insert the non-constant lanes. - for (unsigned i = 0; i < NumElts; ++i) { - SDValue V = Op.getOperand(i); - SDValue LaneIdx = DAG.getConstant(i, MVT::i64); - if (!isa(V) && !isa(V)) { - // Note that type legalization likely mucked about with the VT of the - // source operand, so we may have to convert it here before inserting. - Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx); - } - } - return Val; - } - - // If all elements are constants and the case above didn't get hit, fall back - // to the default expansion, which will generate a load from the constant - // pool. - if (isConstant) - return SDValue(); - - // Empirical tests suggest this is rarely worth it for vectors of length <= 2. - if (NumElts >= 4) { - SDValue shuffle = ReconstructShuffle(Op, DAG); - if (shuffle != SDValue()) - return shuffle; - } - - // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we - // know the default expansion would otherwise fall back on something even - // worse. 
For a vector with one or two non-undef values, that's - // scalar_to_vector for the elements followed by a shuffle (provided the - // shuffle is valid for the target) and materialization element by element - // on the stack followed by a load for everything else. - if (!isConstant && !usesOnlyOneValue) { - SDValue Vec = DAG.getUNDEF(VT); - SDValue Op0 = Op.getOperand(0); - unsigned ElemSize = VT.getVectorElementType().getSizeInBits(); - unsigned i = 0; - // For 32 and 64 bit types, use INSERT_SUBREG for lane zero to - // a) Avoid a RMW dependency on the full vector register, and - // b) Allow the register coalescer to fold away the copy if the - // value is already in an S or D register. - if (Op0.getOpcode() != ISD::UNDEF && (ElemSize == 32 || ElemSize == 64)) { - unsigned SubIdx = ElemSize == 32 ? ARM64::ssub : ARM64::dsub; - MachineSDNode *N = - DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl, VT, Vec, Op0, - DAG.getTargetConstant(SubIdx, MVT::i32)); - Vec = SDValue(N, 0); - ++i; - } - for (; i < NumElts; ++i) { - SDValue V = Op.getOperand(i); - if (V.getOpcode() == ISD::UNDEF) - continue; - SDValue LaneIdx = DAG.getConstant(i, MVT::i64); - Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx); - } - return Vec; - } - - // Just use the default expansion. We failed to find a better alternative. - return SDValue(); -} - -SDValue ARM64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, - SelectionDAG &DAG) const { - assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!"); - - // Check for non-constant lane. - if (!isa(Op.getOperand(2))) - return SDValue(); - - EVT VT = Op.getOperand(0).getValueType(); - - // Insertion/extraction are legal for V128 types. - if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 || - VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64) - return Op; - - if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 && - VT != MVT::v1i64 && VT != MVT::v2f32) - return SDValue(); - - // For V64 types, we perform insertion by expanding the value - // to a V128 type and perform the insertion on that. - SDLoc DL(Op); - SDValue WideVec = WidenVector(Op.getOperand(0), DAG); - EVT WideTy = WideVec.getValueType(); - - SDValue Node = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, WideTy, WideVec, - Op.getOperand(1), Op.getOperand(2)); - // Re-narrow the resultant vector. - return NarrowVector(Node, DAG); -} - -SDValue ARM64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, - SelectionDAG &DAG) const { - assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!"); - - // Check for non-constant lane. - if (!isa(Op.getOperand(1))) - return SDValue(); - - EVT VT = Op.getOperand(0).getValueType(); - - // Insertion/extraction are legal for V128 types. - if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 || - VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64) - return Op; - - if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 && - VT != MVT::v1i64 && VT != MVT::v2f32) - return SDValue(); - - // For V64 types, we perform extraction by expanding the value - // to a V128 type and perform the extraction on that. - SDLoc DL(Op); - SDValue WideVec = WidenVector(Op.getOperand(0), DAG); - EVT WideTy = WideVec.getValueType(); - - EVT ExtrTy = WideTy.getVectorElementType(); - if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8) - ExtrTy = MVT::i32; - - // For extractions, we just return the result directly. 
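// Both V64 paths above widen the operand to a V128, do the insert/extract
// there, and (for inserts) narrow back down. A plain-array sketch of why that
// round trip is lossless for the low half; std::array stands in for the NEON
// registers and the lane numbering is purely illustrative.
#include <algorithm>
#include <array>
#include <cassert>
#include <cstdint>

int main() {
  std::array<uint16_t, 4> Narrow = {1, 2, 3, 4};          // v4i16 value
  std::array<uint16_t, 8> Wide{};                         // widen: high half undef
  std::copy(Narrow.begin(), Narrow.end(), Wide.begin());
  Wide[2] = 42;                                           // insert into lane 2
  std::array<uint16_t, 4> Result;                         // narrow back to v4i16
  std::copy(Wide.begin(), Wide.begin() + 4, Result.begin());
  assert((Result == std::array<uint16_t, 4>{1, 2, 42, 4}));
}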
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec, - Op.getOperand(1)); -} - -SDValue ARM64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, - SelectionDAG &DAG) const { - EVT VT = Op.getOperand(0).getValueType(); - SDLoc dl(Op); - // Just in case... - if (!VT.isVector()) - return SDValue(); - - ConstantSDNode *Cst = dyn_cast(Op.getOperand(1)); - if (!Cst) - return SDValue(); - unsigned Val = Cst->getZExtValue(); - - unsigned Size = Op.getValueType().getSizeInBits(); - if (Val == 0) { - switch (Size) { - case 8: - return DAG.getTargetExtractSubreg(ARM64::bsub, dl, Op.getValueType(), - Op.getOperand(0)); - case 16: - return DAG.getTargetExtractSubreg(ARM64::hsub, dl, Op.getValueType(), - Op.getOperand(0)); - case 32: - return DAG.getTargetExtractSubreg(ARM64::ssub, dl, Op.getValueType(), - Op.getOperand(0)); - case 64: - return DAG.getTargetExtractSubreg(ARM64::dsub, dl, Op.getValueType(), - Op.getOperand(0)); - default: - llvm_unreachable("Unexpected vector type in extract_subvector!"); - } - } - // If this is extracting the upper 64-bits of a 128-bit vector, we match - // that directly. - if (Size == 64 && Val * VT.getVectorElementType().getSizeInBits() == 64) - return Op; - - return SDValue(); -} - -bool ARM64TargetLowering::isShuffleMaskLegal(const SmallVectorImpl &M, - EVT VT) const { - if (VT.getVectorNumElements() == 4 && - (VT.is128BitVector() || VT.is64BitVector())) { - unsigned PFIndexes[4]; - for (unsigned i = 0; i != 4; ++i) { - if (M[i] < 0) - PFIndexes[i] = 8; - else - PFIndexes[i] = M[i]; - } - - // Compute the index in the perfect shuffle table. - unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 + - PFIndexes[2] * 9 + PFIndexes[3]; - unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; - unsigned Cost = (PFEntry >> 30); - - if (Cost <= 4) - return true; - } - - bool DummyBool; - int DummyInt; - unsigned DummyUnsigned; - - return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isREVMask(M, VT, 64) || - isREVMask(M, VT, 32) || isREVMask(M, VT, 16) || - isEXTMask(M, VT, DummyBool, DummyUnsigned) || - // isTBLMask(M, VT) || // FIXME: Port TBL support from ARM. - isTRNMask(M, VT, DummyUnsigned) || isUZPMask(M, VT, DummyUnsigned) || - isZIPMask(M, VT, DummyUnsigned) || - isTRN_v_undef_Mask(M, VT, DummyUnsigned) || - isUZP_v_undef_Mask(M, VT, DummyUnsigned) || - isZIP_v_undef_Mask(M, VT, DummyUnsigned) || - isINSMask(M, VT.getVectorNumElements(), DummyBool, DummyInt) || - isConcatMask(M, VT, VT.getSizeInBits() == 128)); -} - -/// getVShiftImm - Check if this is a valid build_vector for the immediate -/// operand of a vector shift operation, where all the elements of the -/// build_vector must have the same constant integer value. -static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) { - // Ignore bit_converts. - while (Op.getOpcode() == ISD::BITCAST) - Op = Op.getOperand(0); - BuildVectorSDNode *BVN = dyn_cast(Op.getNode()); - APInt SplatBits, SplatUndef; - unsigned SplatBitSize; - bool HasAnyUndefs; - if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, - HasAnyUndefs, ElementBits) || - SplatBitSize > ElementBits) - return false; - Cnt = SplatBits.getSExtValue(); - return true; -} - -/// isVShiftLImm - Check if this is a valid build_vector for the immediate -/// operand of a vector shift left operation. That value must be in the range: -/// 0 <= Value < ElementBits for a left shift; or -/// 0 <= Value <= ElementBits for a long left shift. 
-static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) { - assert(VT.isVector() && "vector shift count is not a vector type"); - unsigned ElementBits = VT.getVectorElementType().getSizeInBits(); - if (!getVShiftImm(Op, ElementBits, Cnt)) - return false; - return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits); -} - -/// isVShiftRImm - Check if this is a valid build_vector for the immediate -/// operand of a vector shift right operation. For a shift opcode, the value -/// is positive, but for an intrinsic the value count must be negative. The -/// absolute value must be in the range: -/// 1 <= |Value| <= ElementBits for a right shift; or -/// 1 <= |Value| <= ElementBits/2 for a narrow right shift. -static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic, - int64_t &Cnt) { - assert(VT.isVector() && "vector shift count is not a vector type"); - unsigned ElementBits = VT.getVectorElementType().getSizeInBits(); - if (!getVShiftImm(Op, ElementBits, Cnt)) - return false; - if (isIntrinsic) - Cnt = -Cnt; - return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits)); -} - -SDValue ARM64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op, - SelectionDAG &DAG) const { - EVT VT = Op.getValueType(); - SDLoc DL(Op); - int64_t Cnt; - - if (!Op.getOperand(1).getValueType().isVector()) - return Op; - unsigned EltSize = VT.getVectorElementType().getSizeInBits(); - - switch (Op.getOpcode()) { - default: - llvm_unreachable("unexpected shift opcode"); - - case ISD::SHL: - if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) - return DAG.getNode(ARM64ISD::VSHL, SDLoc(Op), VT, Op.getOperand(0), - DAG.getConstant(Cnt, MVT::i32)); - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, - DAG.getConstant(Intrinsic::arm64_neon_ushl, MVT::i32), - Op.getOperand(0), Op.getOperand(1)); - case ISD::SRA: - case ISD::SRL: - // Right shift immediate - if (isVShiftRImm(Op.getOperand(1), VT, false, false, Cnt) && - Cnt < EltSize) { - unsigned Opc = - (Op.getOpcode() == ISD::SRA) ? ARM64ISD::VASHR : ARM64ISD::VLSHR; - return DAG.getNode(Opc, SDLoc(Op), VT, Op.getOperand(0), - DAG.getConstant(Cnt, MVT::i32)); - } - - // Right shift register. Note, there is not a shift right register - // instruction, but the shift left register instruction takes a signed - // value, where negative numbers specify a right shift. - unsigned Opc = (Op.getOpcode() == ISD::SRA) ? 
Intrinsic::arm64_neon_sshl - : Intrinsic::arm64_neon_ushl; - // negate the shift amount - SDValue NegShift = DAG.getNode(ARM64ISD::NEG, DL, VT, Op.getOperand(1)); - SDValue NegShiftLeft = - DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, - DAG.getConstant(Opc, MVT::i32), Op.getOperand(0), NegShift); - return NegShiftLeft; - } - - return SDValue(); -} - -static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS, - ARM64CC::CondCode CC, bool NoNans, EVT VT, - SDLoc dl, SelectionDAG &DAG) { - EVT SrcVT = LHS.getValueType(); - - BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode()); - APInt CnstBits(VT.getSizeInBits(), 0); - APInt UndefBits(VT.getSizeInBits(), 0); - bool IsCnst = BVN && resolveBuildVector(BVN, CnstBits, UndefBits); - bool IsZero = IsCnst && (CnstBits == 0); - - if (SrcVT.getVectorElementType().isFloatingPoint()) { - switch (CC) { - default: - return SDValue(); - case ARM64CC::NE: { - SDValue Fcmeq; - if (IsZero) - Fcmeq = DAG.getNode(ARM64ISD::FCMEQz, dl, VT, LHS); - else - Fcmeq = DAG.getNode(ARM64ISD::FCMEQ, dl, VT, LHS, RHS); - return DAG.getNode(ARM64ISD::NOT, dl, VT, Fcmeq); - } - case ARM64CC::EQ: - if (IsZero) - return DAG.getNode(ARM64ISD::FCMEQz, dl, VT, LHS); - return DAG.getNode(ARM64ISD::FCMEQ, dl, VT, LHS, RHS); - case ARM64CC::GE: - if (IsZero) - return DAG.getNode(ARM64ISD::FCMGEz, dl, VT, LHS); - return DAG.getNode(ARM64ISD::FCMGE, dl, VT, LHS, RHS); - case ARM64CC::GT: - if (IsZero) - return DAG.getNode(ARM64ISD::FCMGTz, dl, VT, LHS); - return DAG.getNode(ARM64ISD::FCMGT, dl, VT, LHS, RHS); - case ARM64CC::LS: - if (IsZero) - return DAG.getNode(ARM64ISD::FCMLEz, dl, VT, LHS); - return DAG.getNode(ARM64ISD::FCMGE, dl, VT, RHS, LHS); - case ARM64CC::LT: - if (!NoNans) - return SDValue(); - // If we ignore NaNs then we can use the MI implementation. - // Fallthrough.
- case ARM64CC::MI: - if (IsZero) - return DAG.getNode(ARM64ISD::FCMLTz, dl, VT, LHS); - return DAG.getNode(ARM64ISD::FCMGT, dl, VT, RHS, LHS); - } - } - - switch (CC) { - default: - return SDValue(); - case ARM64CC::NE: { - SDValue Cmeq; - if (IsZero) - Cmeq = DAG.getNode(ARM64ISD::CMEQz, dl, VT, LHS); - else - Cmeq = DAG.getNode(ARM64ISD::CMEQ, dl, VT, LHS, RHS); - return DAG.getNode(ARM64ISD::NOT, dl, VT, Cmeq); - } - case ARM64CC::EQ: - if (IsZero) - return DAG.getNode(ARM64ISD::CMEQz, dl, VT, LHS); - return DAG.getNode(ARM64ISD::CMEQ, dl, VT, LHS, RHS); - case ARM64CC::GE: - if (IsZero) - return DAG.getNode(ARM64ISD::CMGEz, dl, VT, LHS); - return DAG.getNode(ARM64ISD::CMGE, dl, VT, LHS, RHS); - case ARM64CC::GT: - if (IsZero) - return DAG.getNode(ARM64ISD::CMGTz, dl, VT, LHS); - return DAG.getNode(ARM64ISD::CMGT, dl, VT, LHS, RHS); - case ARM64CC::LE: - if (IsZero) - return DAG.getNode(ARM64ISD::CMLEz, dl, VT, LHS); - return DAG.getNode(ARM64ISD::CMGE, dl, VT, RHS, LHS); - case ARM64CC::LS: - return DAG.getNode(ARM64ISD::CMHS, dl, VT, RHS, LHS); - case ARM64CC::LO: - return DAG.getNode(ARM64ISD::CMHI, dl, VT, RHS, LHS); - case ARM64CC::LT: - if (IsZero) - return DAG.getNode(ARM64ISD::CMLTz, dl, VT, LHS); - return DAG.getNode(ARM64ISD::CMGT, dl, VT, RHS, LHS); - case ARM64CC::HI: - return DAG.getNode(ARM64ISD::CMHI, dl, VT, LHS, RHS); - case ARM64CC::HS: - return DAG.getNode(ARM64ISD::CMHS, dl, VT, LHS, RHS); - } -} - -SDValue ARM64TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { - ISD::CondCode CC = cast(Op.getOperand(2))->get(); - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); - SDLoc dl(Op); - - if (LHS.getValueType().getVectorElementType().isInteger()) { - assert(LHS.getValueType() == RHS.getValueType()); - ARM64CC::CondCode ARM64CC = changeIntCCToARM64CC(CC); - return EmitVectorComparison(LHS, RHS, ARM64CC, false, Op.getValueType(), dl, - DAG); - } - - assert(LHS.getValueType().getVectorElementType() == MVT::f32 || - LHS.getValueType().getVectorElementType() == MVT::f64); - - // Unfortunately, the mapping of LLVM FP CC's onto ARM64 CC's isn't totally - // clean. Some of them require two branches to implement. - ARM64CC::CondCode CC1, CC2; - bool ShouldInvert; - changeVectorFPCCToARM64CC(CC, CC1, CC2, ShouldInvert); - - bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath; - SDValue Cmp = - EmitVectorComparison(LHS, RHS, CC1, NoNaNs, Op.getValueType(), dl, DAG); - if (!Cmp.getNode()) - return SDValue(); - - if (CC2 != ARM64CC::AL) { - SDValue Cmp2 = - EmitVectorComparison(LHS, RHS, CC2, NoNaNs, Op.getValueType(), dl, DAG); - if (!Cmp2.getNode()) - return SDValue(); - - Cmp = DAG.getNode(ISD::OR, dl, Cmp.getValueType(), Cmp, Cmp2); - } - - if (ShouldInvert) - return Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType()); - - return Cmp; -} - -/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as -/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment -/// specified in the intrinsic calls. 
-bool ARM64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, - const CallInst &I, - unsigned Intrinsic) const { - switch (Intrinsic) { - case Intrinsic::arm64_neon_ld2: - case Intrinsic::arm64_neon_ld3: - case Intrinsic::arm64_neon_ld4: - case Intrinsic::arm64_neon_ld1x2: - case Intrinsic::arm64_neon_ld1x3: - case Intrinsic::arm64_neon_ld1x4: - case Intrinsic::arm64_neon_ld2lane: - case Intrinsic::arm64_neon_ld3lane: - case Intrinsic::arm64_neon_ld4lane: - case Intrinsic::arm64_neon_ld2r: - case Intrinsic::arm64_neon_ld3r: - case Intrinsic::arm64_neon_ld4r: { - Info.opc = ISD::INTRINSIC_W_CHAIN; - // Conservatively set memVT to the entire set of vectors loaded. - uint64_t NumElts = getDataLayout()->getTypeAllocSize(I.getType()) / 8; - Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); - Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); - Info.offset = 0; - Info.align = 0; - Info.vol = false; // volatile loads with NEON intrinsics not supported - Info.readMem = true; - Info.writeMem = false; - return true; - } - case Intrinsic::arm64_neon_st2: - case Intrinsic::arm64_neon_st3: - case Intrinsic::arm64_neon_st4: - case Intrinsic::arm64_neon_st1x2: - case Intrinsic::arm64_neon_st1x3: - case Intrinsic::arm64_neon_st1x4: - case Intrinsic::arm64_neon_st2lane: - case Intrinsic::arm64_neon_st3lane: - case Intrinsic::arm64_neon_st4lane: { - Info.opc = ISD::INTRINSIC_VOID; - // Conservatively set memVT to the entire set of vectors stored. - unsigned NumElts = 0; - for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) { - Type *ArgTy = I.getArgOperand(ArgI)->getType(); - if (!ArgTy->isVectorTy()) - break; - NumElts += getDataLayout()->getTypeAllocSize(ArgTy) / 8; - } - Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); - Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); - Info.offset = 0; - Info.align = 0; - Info.vol = false; // volatile stores with NEON intrinsics not supported - Info.readMem = false; - Info.writeMem = true; - return true; - } - case Intrinsic::arm64_ldaxr: - case Intrinsic::arm64_ldxr: { - PointerType *PtrTy = cast(I.getArgOperand(0)->getType()); - Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.memVT = MVT::getVT(PtrTy->getElementType()); - Info.ptrVal = I.getArgOperand(0); - Info.offset = 0; - Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType()); - Info.vol = true; - Info.readMem = true; - Info.writeMem = false; - return true; - } - case Intrinsic::arm64_stlxr: - case Intrinsic::arm64_stxr: { - PointerType *PtrTy = cast(I.getArgOperand(1)->getType()); - Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.memVT = MVT::getVT(PtrTy->getElementType()); - Info.ptrVal = I.getArgOperand(1); - Info.offset = 0; - Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType()); - Info.vol = true; - Info.readMem = false; - Info.writeMem = true; - return true; - } - case Intrinsic::arm64_ldaxp: - case Intrinsic::arm64_ldxp: { - Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.memVT = MVT::i128; - Info.ptrVal = I.getArgOperand(0); - Info.offset = 0; - Info.align = 16; - Info.vol = true; - Info.readMem = true; - Info.writeMem = false; - return true; - } - case Intrinsic::arm64_stlxp: - case Intrinsic::arm64_stxp: { - Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.memVT = MVT::i128; - Info.ptrVal = I.getArgOperand(2); - Info.offset = 0; - Info.align = 16; - Info.vol = true; - Info.readMem = false; - Info.writeMem = true; - return true; - } - default: - break; - } - - return false; -} - -// 
Truncations from 64-bit GPR to 32-bit GPR is free. -bool ARM64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { - if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) - return false; - unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); - unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); - if (NumBits1 <= NumBits2) - return false; - return true; -} -bool ARM64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { - if (!VT1.isInteger() || !VT2.isInteger()) - return false; - unsigned NumBits1 = VT1.getSizeInBits(); - unsigned NumBits2 = VT2.getSizeInBits(); - if (NumBits1 <= NumBits2) - return false; - return true; -} - -// All 32-bit GPR operations implicitly zero the high-half of the corresponding -// 64-bit GPR. -bool ARM64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const { - if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) - return false; - unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); - unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); - if (NumBits1 == 32 && NumBits2 == 64) - return true; - return false; -} -bool ARM64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { - if (!VT1.isInteger() || !VT2.isInteger()) - return false; - unsigned NumBits1 = VT1.getSizeInBits(); - unsigned NumBits2 = VT2.getSizeInBits(); - if (NumBits1 == 32 && NumBits2 == 64) - return true; - return false; -} - -bool ARM64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const { - EVT VT1 = Val.getValueType(); - if (isZExtFree(VT1, VT2)) { - return true; - } - - if (Val.getOpcode() != ISD::LOAD) - return false; - - // 8-, 16-, and 32-bit integer loads all implicitly zero-extend. - return (VT1.isSimple() && VT1.isInteger() && VT2.isSimple() && - VT2.isInteger() && VT1.getSizeInBits() <= 32); -} - -bool ARM64TargetLowering::hasPairedLoad(Type *LoadedType, - unsigned &RequiredAligment) const { - if (!LoadedType->isIntegerTy() && !LoadedType->isFloatTy()) - return false; - // Cyclone supports unaligned accesses. - RequiredAligment = 0; - unsigned NumBits = LoadedType->getPrimitiveSizeInBits(); - return NumBits == 32 || NumBits == 64; -} - -bool ARM64TargetLowering::hasPairedLoad(EVT LoadedType, - unsigned &RequiredAligment) const { - if (!LoadedType.isSimple() || - (!LoadedType.isInteger() && !LoadedType.isFloatingPoint())) - return false; - // Cyclone supports unaligned accesses. - RequiredAligment = 0; - unsigned NumBits = LoadedType.getSizeInBits(); - return NumBits == 32 || NumBits == 64; -} - -static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign, - unsigned AlignCheck) { - return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) && - (DstAlign == 0 || DstAlign % AlignCheck == 0)); -} - -EVT ARM64TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, - unsigned SrcAlign, bool IsMemset, - bool ZeroMemset, bool MemcpyStrSrc, - MachineFunction &MF) const { - // Don't use AdvSIMD to implement 16-byte memset. It would have taken one - // instruction to materialize the v2i64 zero and one store (with restrictive - // addressing mode). Just do two i64 store of zero-registers. - bool Fast; - const Function *F = MF.getFunction(); - if (Subtarget->hasFPARMv8() && !IsMemset && Size >= 16 && - !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, - Attribute::NoImplicitFloat) && - (memOpAlign(SrcAlign, DstAlign, 16) || - (allowsUnalignedMemoryAccesses(MVT::f128, 0, &Fast) && Fast))) - return MVT::f128; - - return Size >= 8 ? MVT::i64 : MVT::i32; -} - -// 12-bit optionally shifted immediates are legal for adds. 
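// Worked examples of the rule stated above. The sketch below mirrors the
// check implemented in isLegalAddImmediate just after it; the helper name and
// sample values are illustrative only.
#include <cassert>
#include <cstdint>

static bool isAddImmLegalSketch(int64_t Imm) {
  // Either a plain 12-bit value, or a 12-bit value shifted left by 12.
  return (Imm >> 12) == 0 || ((Imm & 0xfff) == 0 && (Imm >> 24) == 0);
}

int main() {
  assert(isAddImmLegalSketch(4095));       // fits in 12 bits
  assert(isAddImmLegalSketch(0x123000));   // 0x123 << 12
  assert(!isAddImmLegalSketch(4097));      // 13 significant bits, unshifted
  assert(!isAddImmLegalSketch(0x1000001)); // too wide even for the shifted form
}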
-bool ARM64TargetLowering::isLegalAddImmediate(int64_t Immed) const { - if ((Immed >> 12) == 0 || ((Immed & 0xfff) == 0 && Immed >> 24 == 0)) - return true; - return false; -} - -// Integer comparisons are implemented with ADDS/SUBS, so the range of valid -// immediates is the same as for an add or a sub. -bool ARM64TargetLowering::isLegalICmpImmediate(int64_t Immed) const { - if (Immed < 0) - Immed *= -1; - return isLegalAddImmediate(Immed); -} - -/// isLegalAddressingMode - Return true if the addressing mode represented -/// by AM is legal for this target, for a load/store of the specified type. -bool ARM64TargetLowering::isLegalAddressingMode(const AddrMode &AM, - Type *Ty) const { - // ARM64 has five basic addressing modes: - // reg - // reg + 9-bit signed offset - // reg + SIZE_IN_BYTES * 12-bit unsigned offset - // reg1 + reg2 - // reg + SIZE_IN_BYTES * reg - - // No global is ever allowed as a base. - if (AM.BaseGV) - return false; - - // No reg+reg+imm addressing. - if (AM.HasBaseReg && AM.BaseOffs && AM.Scale) - return false; - - // check reg + imm case: - // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12 - uint64_t NumBytes = 0; - if (Ty->isSized()) { - uint64_t NumBits = getDataLayout()->getTypeSizeInBits(Ty); - NumBytes = NumBits / 8; - if (!isPowerOf2_64(NumBits)) - NumBytes = 0; - } - - if (!AM.Scale) { - int64_t Offset = AM.BaseOffs; - - // 9-bit signed offset - if (Offset >= -(1LL << 9) && Offset <= (1LL << 9) - 1) - return true; - - // 12-bit unsigned offset - unsigned shift = Log2_64(NumBytes); - if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 && - // Must be a multiple of NumBytes (NumBytes is a power of 2) - (Offset >> shift) << shift == Offset) - return true; - return false; - } - - // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2 - - if (!AM.Scale || AM.Scale == 1 || - (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes)) - return true; - return false; -} - -int ARM64TargetLowering::getScalingFactorCost(const AddrMode &AM, - Type *Ty) const { - // Scaling factors are not free at all. - // Operands | Rt Latency - // ------------------------------------------- - // Rt, [Xn, Xm] | 4 - // ------------------------------------------- - // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5 - // Rt, [Xn, Wm, #imm] | - if (isLegalAddressingMode(AM, Ty)) - // Scale represents reg2 * scale, thus account for 1 if - // it is not equal to 0 or 1. - return AM.Scale != 0 && AM.Scale != 1; - return -1; -} - -bool ARM64TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { - VT = VT.getScalarType(); - - if (!VT.isSimple()) - return false; - - switch (VT.getSimpleVT().SimpleTy) { - case MVT::f32: - case MVT::f64: - return true; - default: - break; - } - - return false; -} - -const MCPhysReg * -ARM64TargetLowering::getScratchRegisters(CallingConv::ID) const { - // LR is a callee-save register, but we must treat it as clobbered by any call - // site. Hence we include LR in the scratch registers, which are in turn added - // as implicit-defs for stackmaps and patchpoints. - static const MCPhysReg ScratchRegs[] = { - ARM64::X16, ARM64::X17, ARM64::LR, 0 - }; - return ScratchRegs; -} - -bool ARM64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N) const { - EVT VT = N->getValueType(0); - // If N is unsigned bit extraction: ((x >> C) & mask), then do not combine - // it with shift to let it be lowered to UBFX. 
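// The ((x >> C) & mask) shape mentioned above, with mask a contiguous run of
// low bits, is exactly a bitfield extract (UBFX Rd, Rn, #C, #Width). A scalar
// sketch of that equivalence; helper names are illustrative only.
#include <cassert>
#include <cstdint>

static bool isLowBitMaskSketch(uint64_t M) {
  return M != 0 && ((M + 1) & M) == 0; // same idea as isMask_64
}

static uint64_t ubfxSketch(uint64_t X, unsigned Lsb, unsigned Width) {
  uint64_t Mask = (Width == 64) ? ~0ULL : ((1ULL << Width) - 1);
  return (X >> Lsb) & Mask;
}

int main() {
  uint64_t X = 0x123456789ABCDEF0ULL;
  // (X >> 8) & 0xFFFF is the 16-bit field starting at bit 8.
  assert(((X >> 8) & 0xFFFF) == ubfxSketch(X, 8, 16));
  assert(isLowBitMaskSketch(0xFFFF) && !isLowBitMaskSketch(0xFF00));
}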
- if (N->getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) && - isa(N->getOperand(1))) { - uint64_t TruncMask = N->getConstantOperandVal(1); - if (isMask_64(TruncMask) && - N->getOperand(0).getOpcode() == ISD::SRL && - isa(N->getOperand(0)->getOperand(1))) - return false; - } - return true; -} - -bool ARM64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, - Type *Ty) const { - assert(Ty->isIntegerTy()); - - unsigned BitSize = Ty->getPrimitiveSizeInBits(); - if (BitSize == 0) - return false; - - int64_t Val = Imm.getSExtValue(); - if (Val == 0 || ARM64_AM::isLogicalImmediate(Val, BitSize)) - return true; - - if ((int64_t)Val < 0) - Val = ~Val; - if (BitSize == 32) - Val &= (1LL << 32) - 1; - - unsigned LZ = countLeadingZeros((uint64_t)Val); - unsigned Shift = (63 - LZ) / 16; - // MOVZ is free so return true for one or fewer MOVK. - return (Shift < 3) ? true : false; -} - -// Generate SUBS and CSEL for integer abs. -static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) { - EVT VT = N->getValueType(0); - - SDValue N0 = N->getOperand(0); - SDValue N1 = N->getOperand(1); - SDLoc DL(N); - - // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1) - // and change it to SUB and CSEL. - if (VT.isInteger() && N->getOpcode() == ISD::XOR && - N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 && - N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0)) - if (ConstantSDNode *Y1C = dyn_cast(N1.getOperand(1))) - if (Y1C->getAPIntValue() == VT.getSizeInBits() - 1) { - SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT), - N0.getOperand(0)); - // Generate SUBS & CSEL. - SDValue Cmp = - DAG.getNode(ARM64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32), - N0.getOperand(0), DAG.getConstant(0, VT)); - return DAG.getNode(ARM64ISD::CSEL, DL, VT, N0.getOperand(0), Neg, - DAG.getConstant(ARM64CC::PL, MVT::i32), - SDValue(Cmp.getNode(), 1)); - } - return SDValue(); -} - -// performXorCombine - Attempts to handle integer ABS. -static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const ARM64Subtarget *Subtarget) { - if (DCI.isBeforeLegalizeOps()) - return SDValue(); - - return performIntegerAbsCombine(N, DAG); -} - -static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const ARM64Subtarget *Subtarget) { - if (DCI.isBeforeLegalizeOps()) - return SDValue(); - - // Multiplication of a power of two plus/minus one can be done more - // cheaply as as shift+add/sub. For now, this is true unilaterally. If - // future CPUs have a cheaper MADD instruction, this may need to be - // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and - // 64-bit is 5 cycles, so this is always a win. - if (ConstantSDNode *C = dyn_cast(N->getOperand(1))) { - APInt Value = C->getAPIntValue(); - EVT VT = N->getValueType(0); - APInt VP1 = Value + 1; - if (VP1.isPowerOf2()) { - // Multiplying by one less than a power of two, replace with a shift - // and a subtract. - SDValue ShiftedVal = - DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0), - DAG.getConstant(VP1.logBase2(), MVT::i64)); - return DAG.getNode(ISD::SUB, SDLoc(N), VT, ShiftedVal, N->getOperand(0)); - } - APInt VM1 = Value - 1; - if (VM1.isPowerOf2()) { - // Multiplying by one more than a power of two, replace with a shift - // and an add. 
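// A few concrete instances of the strength reduction performed here, covering
// both the 2^k - 1 (shift then subtract) and 2^k + 1 (shift then add) cases;
// plain integer identities, illustrative only.
#include <cassert>
#include <cstdint>

int main() {
  const uint64_t Samples[] = {1, 7, 123456789};
  for (uint64_t X : Samples) {
    assert(X * 7 == (X << 3) - X);   // 7  = 2^3 - 1
    assert(X * 9 == (X << 3) + X);   // 9  = 2^3 + 1
    assert(X * 31 == (X << 5) - X);  // 31 = 2^5 - 1
  }
}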
- SDValue ShiftedVal = - DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0), - DAG.getConstant(VM1.logBase2(), MVT::i64)); - return DAG.getNode(ISD::ADD, SDLoc(N), VT, ShiftedVal, N->getOperand(0)); - } - } - return SDValue(); -} - -static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG) { - EVT VT = N->getValueType(0); - if (VT != MVT::f32 && VT != MVT::f64) - return SDValue(); - // Only optimize when the source and destination types have the same width. - if (VT.getSizeInBits() != N->getOperand(0).getValueType().getSizeInBits()) - return SDValue(); - - // If the result of an integer load is only used by an integer-to-float - // conversion, use a fp load instead and a AdvSIMD scalar {S|U}CVTF instead. - // This eliminates an "integer-to-vector-move UOP and improve throughput. - SDValue N0 = N->getOperand(0); - if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() && - // Do not change the width of a volatile load. - !cast(N0)->isVolatile()) { - LoadSDNode *LN0 = cast(N0); - SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(), - LN0->getPointerInfo(), LN0->isVolatile(), - LN0->isNonTemporal(), LN0->isInvariant(), - LN0->getAlignment()); - - // Make sure successors of the original load stay after it by updating them - // to use the new Chain. - DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1)); - - unsigned Opcode = - (N->getOpcode() == ISD::SINT_TO_FP) ? ARM64ISD::SITOF : ARM64ISD::UITOF; - return DAG.getNode(Opcode, SDLoc(N), VT, Load); - } - - return SDValue(); -} - -/// An EXTR instruction is made up of two shifts, ORed together. This helper -/// searches for and classifies those shifts. -static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount, - bool &FromHi) { - if (N.getOpcode() == ISD::SHL) - FromHi = false; - else if (N.getOpcode() == ISD::SRL) - FromHi = true; - else - return false; - - if (!isa(N.getOperand(1))) - return false; - - ShiftAmount = N->getConstantOperandVal(1); - Src = N->getOperand(0); - return true; -} - -/// EXTR instruction extracts a contiguous chunk of bits from two existing -/// registers viewed as a high/low pair. This function looks for the pattern: -/// (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) and replaces it with an -/// EXTR. Can't quite be done in TableGen because the two immediates aren't -/// independent. -static SDValue tryCombineToEXTR(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI) { - SelectionDAG &DAG = DCI.DAG; - SDLoc DL(N); - EVT VT = N->getValueType(0); - - assert(N->getOpcode() == ISD::OR && "Unexpected root"); - - if (VT != MVT::i32 && VT != MVT::i64) - return SDValue(); - - SDValue LHS; - uint32_t ShiftLHS = 0; - bool LHSFromHi = 0; - if (!findEXTRHalf(N->getOperand(0), LHS, ShiftLHS, LHSFromHi)) - return SDValue(); - - SDValue RHS; - uint32_t ShiftRHS = 0; - bool RHSFromHi = 0; - if (!findEXTRHalf(N->getOperand(1), RHS, ShiftRHS, RHSFromHi)) - return SDValue(); - - // If they're both trying to come from the high part of the register, they're - // not really an EXTR. 
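// A self-checking sketch of the identity the EXTR combine relies on:
// (or (shl Hi, N), (srl Lo, RegWidth - N)) selects a contiguous RegWidth-bit
// window out of the Hi:Lo pair, which is what EXTR computes. 32-bit registers
// and the Hi/Lo naming are assumptions of this sketch, not the combine's own.
#include <cassert>
#include <cstdint>

static uint32_t orOfShifts(uint32_t Hi, uint32_t Lo, unsigned N) {
  return (Hi << N) | (Lo >> (32 - N)); // valid for 0 < N < 32; shifts sum to 32
}

static uint32_t extrWindow(uint32_t Hi, uint32_t Lo, unsigned N) {
  uint64_t Pair = (uint64_t(Hi) << 32) | Lo;
  return uint32_t(Pair >> (32 - N));
}

int main() {
  for (unsigned N = 1; N < 32; ++N)
    assert(orOfShifts(0xDEADBEEF, 0x01234567, N) ==
           extrWindow(0xDEADBEEF, 0x01234567, N));
}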
- if (LHSFromHi == RHSFromHi) - return SDValue(); - - if (ShiftLHS + ShiftRHS != VT.getSizeInBits()) - return SDValue(); - - if (LHSFromHi) { - std::swap(LHS, RHS); - std::swap(ShiftLHS, ShiftRHS); - } - - return DAG.getNode(ARM64ISD::EXTR, DL, VT, LHS, RHS, - DAG.getConstant(ShiftRHS, MVT::i64)); -} - -static SDValue tryCombineToBSL(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI) { - EVT VT = N->getValueType(0); - SelectionDAG &DAG = DCI.DAG; - SDLoc DL(N); - - if (!VT.isVector()) - return SDValue(); - - SDValue N0 = N->getOperand(0); - if (N0.getOpcode() != ISD::AND) - return SDValue(); - - SDValue N1 = N->getOperand(1); - if (N1.getOpcode() != ISD::AND) - return SDValue(); - - // We only have to look for constant vectors here since the general, variable - // case can be handled in TableGen. - unsigned Bits = VT.getVectorElementType().getSizeInBits(); - uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1); - for (int i = 1; i >= 0; --i) - for (int j = 1; j >= 0; --j) { - BuildVectorSDNode *BVN0 = dyn_cast(N0->getOperand(i)); - BuildVectorSDNode *BVN1 = dyn_cast(N1->getOperand(j)); - if (!BVN0 || !BVN1) - continue; - - bool FoundMatch = true; - for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) { - ConstantSDNode *CN0 = dyn_cast(BVN0->getOperand(k)); - ConstantSDNode *CN1 = dyn_cast(BVN1->getOperand(k)); - if (!CN0 || !CN1 || - CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) { - FoundMatch = false; - break; - } - } - - if (FoundMatch) - return DAG.getNode(ARM64ISD::BSL, DL, VT, SDValue(BVN0, 0), - N0->getOperand(1 - i), N1->getOperand(1 - j)); - } - - return SDValue(); -} - -static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, - const ARM64Subtarget *Subtarget) { - // Attempt to form an EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) - if (!EnableARM64ExtrGeneration) - return SDValue(); - SelectionDAG &DAG = DCI.DAG; - EVT VT = N->getValueType(0); - - if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) - return SDValue(); - - SDValue Res = tryCombineToEXTR(N, DCI); - if (Res.getNode()) - return Res; - - Res = tryCombineToBSL(N, DCI); - if (Res.getNode()) - return Res; - - return SDValue(); -} - -static SDValue performBitcastCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - SelectionDAG &DAG) { - // Wait 'til after everything is legalized to try this. That way we have - // legal vector types and such. - if (DCI.isBeforeLegalizeOps()) - return SDValue(); - - // Remove extraneous bitcasts around an extract_subvector. - // For example, - // (v4i16 (bitconvert - // (extract_subvector (v2i64 (bitconvert (v8i16 ...)), (i64 1))))) - // becomes - // (extract_subvector ((v8i16 ...), (i64 4))) - - // Only interested in 64-bit vectors as the ultimate result. - EVT VT = N->getValueType(0); - if (!VT.isVector()) - return SDValue(); - if (VT.getSimpleVT().getSizeInBits() != 64) - return SDValue(); - // Is the operand an extract_subvector starting at the beginning or halfway - // point of the vector? A low half may also come through as an - // EXTRACT_SUBREG, so look for that, too. 
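// A byte-level sketch of the simplification described above, matching the
// comment's own example: taking lane 1 of the v2i64 view of a v8i16 and
// re-viewing it as v4i16 is just elements 4..7 of the original vector, so the
// bitcasts can be dropped and the extract index rescaled (1 * 4 = 4). Plain
// memcpy type punning stands in for the bitcasts; memory-order lane numbering
// is assumed.
#include <array>
#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  std::array<uint16_t, 8> V = {0, 1, 2, 3, 4, 5, 6, 7}; // the v8i16 source
  uint64_t Lane1;                                       // v2i64 view, lane 1
  std::memcpy(&Lane1, V.data() + 4, sizeof(Lane1));
  std::array<uint16_t, 4> HiHalf;                       // v4i16 view of that lane
  std::memcpy(HiHalf.data(), &Lane1, sizeof(Lane1));
  assert((HiHalf == std::array<uint16_t, 4>{4, 5, 6, 7}));
}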
- SDValue Op0 = N->getOperand(0); - if (Op0->getOpcode() != ISD::EXTRACT_SUBVECTOR && - !(Op0->isMachineOpcode() && - Op0->getMachineOpcode() == ARM64::EXTRACT_SUBREG)) - return SDValue(); - uint64_t idx = cast(Op0->getOperand(1))->getZExtValue(); - if (Op0->getOpcode() == ISD::EXTRACT_SUBVECTOR) { - if (Op0->getValueType(0).getVectorNumElements() != idx && idx != 0) - return SDValue(); - } else if (Op0->getMachineOpcode() == ARM64::EXTRACT_SUBREG) { - if (idx != ARM64::dsub) - return SDValue(); - // The dsub reference is equivalent to a lane zero subvector reference. - idx = 0; - } - // Look through the bitcast of the input to the extract. - if (Op0->getOperand(0)->getOpcode() != ISD::BITCAST) - return SDValue(); - SDValue Source = Op0->getOperand(0)->getOperand(0); - // If the source type has twice the number of elements as our destination - // type, we know this is an extract of the high or low half of the vector. - EVT SVT = Source->getValueType(0); - if (SVT.getVectorNumElements() != VT.getVectorNumElements() * 2) - return SDValue(); - - DEBUG(dbgs() << "arm64-lower: bitcast extract_subvector simplification\n"); - - // Create the simplified form to just extract the low or high half of the - // vector directly rather than bothering with the bitcasts. - SDLoc dl(N); - unsigned NumElements = VT.getVectorNumElements(); - if (idx) { - SDValue HalfIdx = DAG.getConstant(NumElements, MVT::i64); - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Source, HalfIdx); - } else { - SDValue SubReg = DAG.getTargetConstant(ARM64::dsub, MVT::i32); - return SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, VT, - Source, SubReg), - 0); - } -} - -static SDValue performConcatVectorsCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - SelectionDAG &DAG) { - // Wait 'til after everything is legalized to try this. That way we have - // legal vector types and such. - if (DCI.isBeforeLegalizeOps()) - return SDValue(); - - SDLoc dl(N); - EVT VT = N->getValueType(0); - - // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector - // splat. The indexed instructions are going to be expecting a DUPLANE64, so - // canonicalise to that. - if (N->getOperand(0) == N->getOperand(1) && VT.getVectorNumElements() == 2) { - assert(VT.getVectorElementType().getSizeInBits() == 64); - return DAG.getNode(ARM64ISD::DUPLANE64, dl, VT, - WidenVector(N->getOperand(0), DAG), - DAG.getConstant(0, MVT::i64)); - } - - // Canonicalise concat_vectors so that the right-hand vector has as few - // bit-casts as possible before its real operation. The primary matching - // destination for these operations will be the narrowing "2" instructions, - // which depend on the operation being performed on this right-hand vector. - // For example, - // (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS)))) - // becomes - // (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS)) - - SDValue Op1 = N->getOperand(1); - if (Op1->getOpcode() != ISD::BITCAST) - return SDValue(); - SDValue RHS = Op1->getOperand(0); - MVT RHSTy = RHS.getValueType().getSimpleVT(); - // If the RHS is not a vector, this is not the pattern we're looking for. 
- if (!RHSTy.isVector()) - return SDValue(); - - DEBUG(dbgs() << "arm64-lower: concat_vectors bitcast simplification\n"); - - MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(), - RHSTy.getVectorNumElements() * 2); - return DAG.getNode( - ISD::BITCAST, dl, VT, - DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy, - DAG.getNode(ISD::BITCAST, dl, RHSTy, N->getOperand(0)), RHS)); -} - -static SDValue tryCombineFixedPointConvert(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - SelectionDAG &DAG) { - // Wait 'til after everything is legalized to try this. That way we have - // legal vector types and such. - if (DCI.isBeforeLegalizeOps()) - return SDValue(); - // Transform a scalar conversion of a value from a lane extract into a - // lane extract of a vector conversion. E.g., from foo1 to foo2: - // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); } - // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; } - // - // The second form interacts better with instruction selection and the - // register allocator to avoid cross-class register copies that aren't - // coalescable due to a lane reference. - - // Check the operand and see if it originates from a lane extract. - SDValue Op1 = N->getOperand(1); - if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { - // Yep, no additional predication needed. Perform the transform. - SDValue IID = N->getOperand(0); - SDValue Shift = N->getOperand(2); - SDValue Vec = Op1.getOperand(0); - SDValue Lane = Op1.getOperand(1); - EVT ResTy = N->getValueType(0); - EVT VecResTy; - SDLoc DL(N); - - // The vector width should be 128 bits by the time we get here, even - // if it started as 64 bits (the extract_vector handling will have - // done so). - assert(Vec.getValueType().getSizeInBits() == 128 && - "unexpected vector size on extract_vector_elt!"); - if (Vec.getValueType() == MVT::v4i32) - VecResTy = MVT::v4f32; - else if (Vec.getValueType() == MVT::v2i64) - VecResTy = MVT::v2f64; - else - assert(0 && "unexpected vector type!"); - - SDValue Convert = - DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift); - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane); - } - return SDValue(); -} - -// AArch64 high-vector "long" operations are formed by performing the non-high -// version on an extract_subvector of each operand which gets the high half: -// -// (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS)) -// -// However, there are cases which don't have an extract_high explicitly, but -// have another operation that can be made compatible with one for free. For -// example: -// -// (dupv64 scalar) --> (extract_high (dup128 scalar)) -// -// This routine does the actual conversion of such DUPs, once outer routines -// have determined that everything else is in order. -static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) { - // We can handle most types of duplicate, but the lane ones have an extra - // operand saying *which* lane, so we need to know. 
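// The (dupv64 scalar) --> (extract_high (dup128 scalar)) rewrite described
// above is lossless because a splat looks the same in either half of the wide
// register. A plain-array illustration; std::array stands in for the NEON
// registers and the element types are illustrative only.
#include <algorithm>
#include <array>
#include <cassert>
#include <cstdint>

int main() {
  const uint16_t S = 42;
  std::array<uint16_t, 4> Dup64;
  Dup64.fill(S);                                  // dup into a v4i16
  std::array<uint16_t, 8> Dup128;
  Dup128.fill(S);                                 // dup into a v8i16 instead
  std::array<uint16_t, 4> High;                   // then take the high half
  std::copy(Dup128.begin() + 4, Dup128.end(), High.begin());
  assert(High == Dup64);                          // same lanes either way
}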
- bool IsDUPLANE; - switch (N.getOpcode()) { - case ARM64ISD::DUP: - IsDUPLANE = false; - break; - case ARM64ISD::DUPLANE8: - case ARM64ISD::DUPLANE16: - case ARM64ISD::DUPLANE32: - case ARM64ISD::DUPLANE64: - IsDUPLANE = true; - break; - default: - return SDValue(); - } - - MVT NarrowTy = N.getSimpleValueType(); - if (!NarrowTy.is64BitVector()) - return SDValue(); - - MVT ElementTy = NarrowTy.getVectorElementType(); - unsigned NumElems = NarrowTy.getVectorNumElements(); - MVT NewDUPVT = MVT::getVectorVT(ElementTy, NumElems * 2); - - SDValue NewDUP; - if (IsDUPLANE) - NewDUP = DAG.getNode(N.getOpcode(), SDLoc(N), NewDUPVT, N.getOperand(0), - N.getOperand(1)); - else - NewDUP = DAG.getNode(ARM64ISD::DUP, SDLoc(N), NewDUPVT, N.getOperand(0)); - - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N.getNode()), NarrowTy, - NewDUP, DAG.getConstant(NumElems, MVT::i64)); -} - -static bool isEssentiallyExtractSubvector(SDValue N) { - if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR) - return true; - - return N.getOpcode() == ISD::BITCAST && - N.getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR; -} - -/// \brief Helper structure to keep track of ISD::SET_CC operands. -struct GenericSetCCInfo { - const SDValue *Opnd0; - const SDValue *Opnd1; - ISD::CondCode CC; -}; - -/// \brief Helper structure to keep track of a SET_CC lowered into ARM64 code. -struct ARM64SetCCInfo { - const SDValue *Cmp; - ARM64CC::CondCode CC; -}; - -/// \brief Helper structure to keep track of SetCC information. -union SetCCInfo { - GenericSetCCInfo Generic; - ARM64SetCCInfo ARM64; -}; - -/// \brief Helper structure to be able to read SetCC information. -/// If set to true, IsARM64 field, Info is a ARM64SetCCInfo, otherwise Info is -/// a GenericSetCCInfo. -struct SetCCInfoAndKind { - SetCCInfo Info; - bool IsARM64; -}; - -/// \brief Check whether or not \p Op is a SET_CC operation, either a generic or -/// an -/// ARM64 lowered one. -/// \p SetCCInfo is filled accordingly. -/// \post SetCCInfo is meanginfull only when this function returns true. -/// \return True when Op is a kind of SET_CC operation. -static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) { - // If this is a setcc, this is straight forward. - if (Op.getOpcode() == ISD::SETCC) { - SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0); - SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1); - SetCCInfo.Info.Generic.CC = cast(Op.getOperand(2))->get(); - SetCCInfo.IsARM64 = false; - return true; - } - // Otherwise, check if this is a matching csel instruction. - // In other words: - // - csel 1, 0, cc - // - csel 0, 1, !cc - if (Op.getOpcode() != ARM64ISD::CSEL) - return false; - // Set the information about the operands. - // TODO: we want the operands of the Cmp not the csel - SetCCInfo.Info.ARM64.Cmp = &Op.getOperand(3); - SetCCInfo.IsARM64 = true; - SetCCInfo.Info.ARM64.CC = static_cast( - cast(Op.getOperand(2))->getZExtValue()); - - // Check that the operands matches the constraints: - // (1) Both operands must be constants. - // (2) One must be 1 and the other must be 0. - ConstantSDNode *TValue = dyn_cast(Op.getOperand(0)); - ConstantSDNode *FValue = dyn_cast(Op.getOperand(1)); - - // Check (1). - if (!TValue || !FValue) - return false; - - // Check (2). - if (!TValue->isOne()) { - // Update the comparison when we are interested in !cc. - std::swap(TValue, FValue); - SetCCInfo.Info.ARM64.CC = - ARM64CC::getInvertedCondCode(SetCCInfo.Info.ARM64.CC); - } - return TValue->isOne() && FValue->isNullValue(); -} - -// Returns true if Op is setcc or zext of setcc. 
-static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) { - if (isSetCC(Op, Info)) - return true; - return ((Op.getOpcode() == ISD::ZERO_EXTEND) && - isSetCC(Op->getOperand(0), Info)); -} - -// The folding we want to perform is: -// (add x, [zext] (setcc cc ...) ) -// --> -// (csel x, (add x, 1), !cc ...) -// -// The latter will get matched to a CSINC instruction. -static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) { - assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!"); - SDValue LHS = Op->getOperand(0); - SDValue RHS = Op->getOperand(1); - SetCCInfoAndKind InfoAndKind; - - // If neither operand is a SET_CC, give up. - if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) { - std::swap(LHS, RHS); - if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) - return SDValue(); - } - - // FIXME: This could be generatized to work for FP comparisons. - EVT CmpVT = InfoAndKind.IsARM64 - ? InfoAndKind.Info.ARM64.Cmp->getOperand(0).getValueType() - : InfoAndKind.Info.Generic.Opnd0->getValueType(); - if (CmpVT != MVT::i32 && CmpVT != MVT::i64) - return SDValue(); - - SDValue CCVal; - SDValue Cmp; - SDLoc dl(Op); - if (InfoAndKind.IsARM64) { - CCVal = DAG.getConstant( - ARM64CC::getInvertedCondCode(InfoAndKind.Info.ARM64.CC), MVT::i32); - Cmp = *InfoAndKind.Info.ARM64.Cmp; - } else - Cmp = getARM64Cmp(*InfoAndKind.Info.Generic.Opnd0, - *InfoAndKind.Info.Generic.Opnd1, - ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, true), - CCVal, DAG, dl); - - EVT VT = Op->getValueType(0); - LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, VT)); - return DAG.getNode(ARM64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp); -} - -// The basic add/sub long vector instructions have variants with "2" on the end -// which act on the high-half of their inputs. They are normally matched by -// patterns like: -// -// (add (zeroext (extract_high LHS)), -// (zeroext (extract_high RHS))) -// -> uaddl2 vD, vN, vM -// -// However, if one of the extracts is something like a duplicate, this -// instruction can still be used profitably. This function puts the DAG into a -// more appropriate form for those patterns to trigger. -static SDValue performAddSubLongCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - SelectionDAG &DAG) { - if (DCI.isBeforeLegalizeOps()) - return SDValue(); - - MVT VT = N->getSimpleValueType(0); - if (!VT.is128BitVector()) { - if (N->getOpcode() == ISD::ADD) - return performSetccAddFolding(N, DAG); - return SDValue(); - } - - // Make sure both branches are extended in the same way. - SDValue LHS = N->getOperand(0); - SDValue RHS = N->getOperand(1); - if ((LHS.getOpcode() != ISD::ZERO_EXTEND && - LHS.getOpcode() != ISD::SIGN_EXTEND) || - LHS.getOpcode() != RHS.getOpcode()) - return SDValue(); - - unsigned ExtType = LHS.getOpcode(); - - // It's not worth doing if at least one of the inputs isn't already an - // extract, but we don't know which it'll be so we have to try both. 
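For context (illustrative only, not part of the patch), performSetccAddFolding above targets source along these lines; the function name is invented and the quoted assembly is the combine's intended outcome rather than a guarantee.

// Illustrative sketch: adding a comparison result to an integer. The folding
// aims to select this as
//   cmp  w1, w2
//   cinc w0, w0, eq      // a CSINC, instead of cset + add
int add_compare_result(int x, int a, int b) {
  return x + (a == b);
}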
- if (isEssentiallyExtractSubvector(LHS.getOperand(0))) { - RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG); - if (!RHS.getNode()) - return SDValue(); - - RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS); - } else if (isEssentiallyExtractSubvector(RHS.getOperand(0))) { - LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG); - if (!LHS.getNode()) - return SDValue(); - - LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS); - } - - return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS); -} - -// Massage DAGs which we can use the high-half "long" operations on into -// something isel will recognize better. E.g. -// -// (arm64_neon_umull (extract_high vec) (dupv64 scalar)) --> -// (arm64_neon_umull (extract_high (v2i64 vec))) -// (extract_high (v2i64 (dup128 scalar))))) -// -static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - SelectionDAG &DAG) { - if (DCI.isBeforeLegalizeOps()) - return SDValue(); - - SDValue LHS = N->getOperand(1); - SDValue RHS = N->getOperand(2); - assert(LHS.getValueType().is64BitVector() && - RHS.getValueType().is64BitVector() && - "unexpected shape for long operation"); - - // Either node could be a DUP, but it's not worth doing both of them (you'd - // just as well use the non-high version) so look for a corresponding extract - // operation on the other "wing". - if (isEssentiallyExtractSubvector(LHS)) { - RHS = tryExtendDUPToExtractHigh(RHS, DAG); - if (!RHS.getNode()) - return SDValue(); - } else if (isEssentiallyExtractSubvector(RHS)) { - LHS = tryExtendDUPToExtractHigh(LHS, DAG); - if (!LHS.getNode()) - return SDValue(); - } - - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0), - N->getOperand(0), LHS, RHS); -} - -static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) { - MVT ElemTy = N->getSimpleValueType(0).getScalarType(); - unsigned ElemBits = ElemTy.getSizeInBits(); - - int64_t ShiftAmount; - if (BuildVectorSDNode *BVN = dyn_cast(N->getOperand(2))) { - APInt SplatValue, SplatUndef; - unsigned SplatBitSize; - bool HasAnyUndefs; - if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, - HasAnyUndefs, ElemBits) || - SplatBitSize != ElemBits) - return SDValue(); - - ShiftAmount = SplatValue.getSExtValue(); - } else if (ConstantSDNode *CVN = dyn_cast(N->getOperand(2))) { - ShiftAmount = CVN->getSExtValue(); - } else - return SDValue(); - - unsigned Opcode; - bool IsRightShift; - switch (IID) { - default: - llvm_unreachable("Unknown shift intrinsic"); - case Intrinsic::arm64_neon_sqshl: - Opcode = ARM64ISD::SQSHL_I; - IsRightShift = false; - break; - case Intrinsic::arm64_neon_uqshl: - Opcode = ARM64ISD::UQSHL_I; - IsRightShift = false; - break; - case Intrinsic::arm64_neon_srshl: - Opcode = ARM64ISD::SRSHR_I; - IsRightShift = true; - break; - case Intrinsic::arm64_neon_urshl: - Opcode = ARM64ISD::URSHR_I; - IsRightShift = true; - break; - case Intrinsic::arm64_neon_sqshlu: - Opcode = ARM64ISD::SQSHLU_I; - IsRightShift = false; - break; - } - - if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) - return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), N->getOperand(1), - DAG.getConstant(-ShiftAmount, MVT::i32)); - else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount <= ElemBits) - return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), N->getOperand(1), - DAG.getConstant(ShiftAmount, MVT::i32)); - - return SDValue(); -} - -// The CRC32[BH] instructions ignore the high bits of their data operand. 
Since -// the intrinsics must be legal and take an i32, this means there's almost -// certainly going to be a zext in the DAG which we can eliminate. -static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) { - SDValue AndN = N->getOperand(2); - if (AndN.getOpcode() != ISD::AND) - return SDValue(); - - ConstantSDNode *CMask = dyn_cast(AndN.getOperand(1)); - if (!CMask || CMask->getZExtValue() != Mask) - return SDValue(); - - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32, - N->getOperand(0), N->getOperand(1), AndN.getOperand(0)); -} - -static SDValue performIntrinsicCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - const ARM64Subtarget *Subtarget) { - SelectionDAG &DAG = DCI.DAG; - unsigned IID = getIntrinsicID(N); - switch (IID) { - default: - break; - case Intrinsic::arm64_neon_vcvtfxs2fp: - case Intrinsic::arm64_neon_vcvtfxu2fp: - return tryCombineFixedPointConvert(N, DCI, DAG); - break; - case Intrinsic::arm64_neon_fmax: - return DAG.getNode(ARM64ISD::FMAX, SDLoc(N), N->getValueType(0), - N->getOperand(1), N->getOperand(2)); - case Intrinsic::arm64_neon_fmin: - return DAG.getNode(ARM64ISD::FMIN, SDLoc(N), N->getValueType(0), - N->getOperand(1), N->getOperand(2)); - case Intrinsic::arm64_neon_smull: - case Intrinsic::arm64_neon_umull: - case Intrinsic::arm64_neon_pmull: - case Intrinsic::arm64_neon_sqdmull: - return tryCombineLongOpWithDup(IID, N, DCI, DAG); - case Intrinsic::arm64_neon_sqshl: - case Intrinsic::arm64_neon_uqshl: - case Intrinsic::arm64_neon_sqshlu: - case Intrinsic::arm64_neon_srshl: - case Intrinsic::arm64_neon_urshl: - return tryCombineShiftImm(IID, N, DAG); - case Intrinsic::arm64_crc32b: - case Intrinsic::arm64_crc32cb: - return tryCombineCRC32(0xff, N, DAG); - case Intrinsic::arm64_crc32h: - case Intrinsic::arm64_crc32ch: - return tryCombineCRC32(0xffff, N, DAG); - } - return SDValue(); -} - -static SDValue performExtendCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - SelectionDAG &DAG) { - // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then - // we can convert that DUP into another extract_high (of a bigger DUP), which - // helps the backend to decide that an sabdl2 would be useful, saving a real - // extract_high operation. - if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND && - N->getOperand(0).getOpcode() == ISD::INTRINSIC_WO_CHAIN) { - SDNode *ABDNode = N->getOperand(0).getNode(); - unsigned IID = getIntrinsicID(ABDNode); - if (IID == Intrinsic::arm64_neon_sabd || - IID == Intrinsic::arm64_neon_uabd) { - SDValue NewABD = tryCombineLongOpWithDup(IID, ABDNode, DCI, DAG); - if (!NewABD.getNode()) - return SDValue(); - - return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), - NewABD); - } - } - - // This is effectively a custom type legalization for ARM64. - // - // Type legalization will split an extend of a small, legal, type to a larger - // illegal type by first splitting the destination type, often creating - // illegal source types, which then get legalized in isel-confusing ways, - // leading to really terrible codegen. E.g., - // %result = v8i32 sext v8i8 %value - // becomes - // %losrc = extract_subreg %value, ... - // %hisrc = extract_subreg %value, ... - // %lo = v4i32 sext v4i8 %losrc - // %hi = v4i32 sext v4i8 %hisrc - // Things go rapidly downhill from there. 
- // - // For ARM64, the [sz]ext vector instructions can only go up one element - // size, so we can, e.g., extend from i8 to i16, but to go from i8 to i32 - // take two instructions. - // - // This implies that the most efficient way to do the extend from v8i8 - // to two v4i32 values is to first extend the v8i8 to v8i16, then do - // the normal splitting to happen for the v8i16->v8i32. - - // This is pre-legalization to catch some cases where the default - // type legalization will create ill-tempered code. - if (!DCI.isBeforeLegalizeOps()) - return SDValue(); - - // We're only interested in cleaning things up for non-legal vector types - // here. If both the source and destination are legal, things will just - // work naturally without any fiddling. - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - EVT ResVT = N->getValueType(0); - if (!ResVT.isVector() || TLI.isTypeLegal(ResVT)) - return SDValue(); - // If the vector type isn't a simple VT, it's beyond the scope of what - // we're worried about here. Let legalization do its thing and hope for - // the best. - if (!ResVT.isSimple()) - return SDValue(); - - SDValue Src = N->getOperand(0); - MVT SrcVT = Src->getValueType(0).getSimpleVT(); - // If the source VT is a 64-bit vector, we can play games and get the - // better results we want. - if (SrcVT.getSizeInBits() != 64) - return SDValue(); - - unsigned SrcEltSize = SrcVT.getVectorElementType().getSizeInBits(); - unsigned ElementCount = SrcVT.getVectorNumElements(); - SrcVT = MVT::getVectorVT(MVT::getIntegerVT(SrcEltSize * 2), ElementCount); - SDLoc DL(N); - Src = DAG.getNode(N->getOpcode(), DL, SrcVT, Src); - - // Now split the rest of the operation into two halves, each with a 64 - // bit source. - EVT LoVT, HiVT; - SDValue Lo, Hi; - unsigned NumElements = ResVT.getVectorNumElements(); - assert(!(NumElements & 1) && "Splitting vector, but not in half!"); - LoVT = HiVT = EVT::getVectorVT(*DAG.getContext(), - ResVT.getVectorElementType(), NumElements / 2); - - EVT InNVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(), - LoVT.getVectorNumElements()); - Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src, - DAG.getIntPtrConstant(0)); - Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src, - DAG.getIntPtrConstant(InNVT.getVectorNumElements())); - Lo = DAG.getNode(N->getOpcode(), DL, LoVT, Lo); - Hi = DAG.getNode(N->getOpcode(), DL, HiVT, Hi); - - // Now combine the parts back together so we still have a single result - // like the combiner expects. - return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi); -} - -/// Replace a splat of a scalar to a vector store by scalar stores of the scalar -/// value. The load store optimizer pass will merge them to store pair stores. -/// This has better performance than a splat of the scalar followed by a split -/// vector store. Even if the stores are not merged it is four stores vs a dup, -/// followed by an ext.b and two stores. -static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode *St) { - SDValue StVal = St->getValue(); - EVT VT = StVal.getValueType(); - - // Don't replace floating point stores, they possibly won't be transformed to - // stp because of the store pair suppress pass. - if (VT.isFloatingPoint()) - return SDValue(); - - // Check for insert vector elements. - if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT) - return SDValue(); - - // We can express a splat as store pair(s) for 2 or 4 elements. 
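As an illustrative aside (not from the patch), replaceSplatVectorStore above is aimed at stores like the following sketch; whether this exact snippet reaches the combine depends on how the splat is legalised into insert_vector_elt nodes, so treat it as an assumption.

#include <arm_neon.h>

// Illustrative sketch: storing a splat of a GPR value. The combine wants this
// to become scalar stores that the load/store optimizer can merge into stp,
// rather than dup + str of a vector register.
void store_splat(int32_t *p, int32_t v) {
  vst1q_s32(p, vdupq_n_s32(v));
}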
- unsigned NumVecElts = VT.getVectorNumElements(); - if (NumVecElts != 4 && NumVecElts != 2) - return SDValue(); - SDValue SplatVal = StVal.getOperand(1); - unsigned RemainInsertElts = NumVecElts - 1; - - // Check that this is a splat. - while (--RemainInsertElts) { - SDValue NextInsertElt = StVal.getOperand(0); - if (NextInsertElt.getOpcode() != ISD::INSERT_VECTOR_ELT) - return SDValue(); - if (NextInsertElt.getOperand(1) != SplatVal) - return SDValue(); - StVal = NextInsertElt; - } - unsigned OrigAlignment = St->getAlignment(); - unsigned EltOffset = NumVecElts == 4 ? 4 : 8; - unsigned Alignment = std::min(OrigAlignment, EltOffset); - - // Create scalar stores. This is at least as good as the code sequence for a - // split unaligned store wich is a dup.s, ext.b, and two stores. - // Most of the time the three stores should be replaced by store pair - // instructions (stp). - SDLoc DL(St); - SDValue BasePtr = St->getBasePtr(); - SDValue NewST1 = - DAG.getStore(St->getChain(), DL, SplatVal, BasePtr, St->getPointerInfo(), - St->isVolatile(), St->isNonTemporal(), St->getAlignment()); - - unsigned Offset = EltOffset; - while (--NumVecElts) { - SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, - DAG.getConstant(Offset, MVT::i64)); - NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr, - St->getPointerInfo(), St->isVolatile(), - St->isNonTemporal(), Alignment); - Offset += EltOffset; - } - return NewST1; -} - -static SDValue performSTORECombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - SelectionDAG &DAG, - const ARM64Subtarget *Subtarget) { - if (!DCI.isBeforeLegalize()) - return SDValue(); - - StoreSDNode *S = cast(N); - if (S->isVolatile()) - return SDValue(); - - // Cyclone has bad performance on unaligned 16B stores when crossing line and - // page boundries. We want to split such stores. - if (!Subtarget->isCyclone()) - return SDValue(); - - // Don't split at Oz. - MachineFunction &MF = DAG.getMachineFunction(); - bool IsMinSize = MF.getFunction()->getAttributes().hasAttribute( - AttributeSet::FunctionIndex, Attribute::MinSize); - if (IsMinSize) - return SDValue(); - - SDValue StVal = S->getValue(); - EVT VT = StVal.getValueType(); - - // Don't split v2i64 vectors. Memcpy lowering produces those and splitting - // those up regresses performance on micro-benchmarks and olden/bh. - if (!VT.isVector() || VT.getVectorNumElements() < 2 || VT == MVT::v2i64) - return SDValue(); - - // Split unaligned 16B stores. They are terrible for performance. - // Don't split stores with alignment of 1 or 2. Code that uses clang vector - // extensions can use this to mark that it does not want splitting to happen - // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of - // eliminating alignment hazards is only 1 in 8 for alignment of 2. - if (VT.getSizeInBits() != 128 || S->getAlignment() >= 16 || - S->getAlignment() <= 2) - return SDValue(); - - // If we get a splat of a scalar convert this vector store to a store of - // scalars. They will be merged into store pairs thereby removing two - // instructions. - SDValue ReplacedSplat = replaceSplatVectorStore(DAG, S); - if (ReplacedSplat != SDValue()) - return ReplacedSplat; - - SDLoc DL(S); - unsigned NumElts = VT.getVectorNumElements() / 2; - // Split VT into two. 
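Illustrative sketch (not part of the patch): the 16-byte store split above targets accesses like this one, assuming the pointer is only element-aligned; the name is invented.

#include <arm_neon.h>

// Illustrative sketch: a 128-bit store through a pointer known to be only
// 4-byte aligned. On Cyclone the combine prefers two 64-bit stores so the
// access cannot straddle a cache-line or page boundary.
void store_16_bytes(uint32_t *p /* only 4-byte aligned */, uint32x4_t v) {
  vst1q_u32(p, v);
}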
- EVT HalfVT = - EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), NumElts); - SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal, - DAG.getIntPtrConstant(0)); - SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal, - DAG.getIntPtrConstant(NumElts)); - SDValue BasePtr = S->getBasePtr(); - SDValue NewST1 = - DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(), - S->isVolatile(), S->isNonTemporal(), S->getAlignment()); - SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, - DAG.getConstant(8, MVT::i64)); - return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr, - S->getPointerInfo(), S->isVolatile(), S->isNonTemporal(), - S->getAlignment()); -} - -/// Target-specific DAG combine function for post-increment LD1 (lane) and -/// post-increment LD1R. -static SDValue performPostLD1Combine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - bool IsLaneOp) { - if (DCI.isBeforeLegalizeOps()) - return SDValue(); - - SelectionDAG &DAG = DCI.DAG; - EVT VT = N->getValueType(0); - - unsigned LoadIdx = IsLaneOp ? 1 : 0; - SDNode *LD = N->getOperand(LoadIdx).getNode(); - // If it is not LOAD, can not do such combine. - if (LD->getOpcode() != ISD::LOAD) - return SDValue(); - - LoadSDNode *LoadSDN = cast(LD); - EVT MemVT = LoadSDN->getMemoryVT(); - // Check if memory operand is the same type as the vector element. - if (MemVT != VT.getVectorElementType()) - return SDValue(); - - // Check if there are other uses. If so, do not combine as it will introduce - // an extra load. - for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE; - ++UI) { - if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result. - continue; - if (*UI != N) - return SDValue(); - } - - SDValue Addr = LD->getOperand(1); - SDValue Vector = N->getOperand(0); - // Search for a use of the address operand that is an increment. - for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE = - Addr.getNode()->use_end(); UI != UE; ++UI) { - SDNode *User = *UI; - if (User->getOpcode() != ISD::ADD - || UI.getUse().getResNo() != Addr.getResNo()) - continue; - - // Check that the add is independent of the load. Otherwise, folding it - // would create a cycle. - if (User->isPredecessorOf(LD) || LD->isPredecessorOf(User)) - continue; - // Also check that add is not used in the vector operand. This would also - // create a cycle. - if (User->isPredecessorOf(Vector.getNode())) - continue; - - // If the increment is a constant, it must match the memory ref size. - SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); - if (ConstantSDNode *CInc = dyn_cast(Inc.getNode())) { - uint32_t IncVal = CInc->getZExtValue(); - unsigned NumBytes = VT.getScalarSizeInBits() / 8; - if (IncVal != NumBytes) - continue; - Inc = DAG.getRegister(ARM64::XZR, MVT::i64); - } - - SmallVector Ops; - Ops.push_back(LD->getOperand(0)); // Chain - if (IsLaneOp) { - Ops.push_back(Vector); // The vector to be inserted - Ops.push_back(N->getOperand(2)); // The lane to be inserted in the vector - } - Ops.push_back(Addr); - Ops.push_back(Inc); - - EVT Tys[3] = { VT, MVT::i64, MVT::Other }; - SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, 3)); - unsigned NewOp = IsLaneOp ? ARM64ISD::LD1LANEpost : ARM64ISD::LD1DUPpost; - SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops, - MemVT, - LoadSDN->getMemOperand()); - - // Update the uses. 
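For context (illustrative only, not from the patch), performPostLD1Combine above corresponds roughly to this source shape; the intrinsic choice and the quoted assembly are assumptions.

#include <arm_neon.h>

// Illustrative sketch: load one lane, then advance the pointer by the element
// size. The combine folds the pointer add into a post-indexed load such as
//   ld1 { v0.s }[0], [x0], #4
float32x4_t load_lane_post_inc(const float *&p, float32x4_t v) {
  v = vld1q_lane_f32(p, v, 0);
  p += 1;
  return v;
}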
- std::vector NewResults; - NewResults.push_back(SDValue(LD, 0)); // The result of load - NewResults.push_back(SDValue(UpdN.getNode(), 2)); // Chain - DCI.CombineTo(LD, NewResults); - DCI.CombineTo(N, SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result - DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register - - break; - } - return SDValue(); -} - -/// Target-specific DAG combine function for NEON load/store intrinsics -/// to merge base address updates. -static SDValue performNEONPostLDSTCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - SelectionDAG &DAG) { - if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) - return SDValue(); - - unsigned AddrOpIdx = N->getNumOperands() - 1; - SDValue Addr = N->getOperand(AddrOpIdx); - - // Search for a use of the address operand that is an increment. - for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), - UE = Addr.getNode()->use_end(); UI != UE; ++UI) { - SDNode *User = *UI; - if (User->getOpcode() != ISD::ADD || - UI.getUse().getResNo() != Addr.getResNo()) - continue; - - // Check that the add is independent of the load/store. Otherwise, folding - // it would create a cycle. - if (User->isPredecessorOf(N) || N->isPredecessorOf(User)) - continue; - - // Find the new opcode for the updating load/store. - bool IsStore = false; - bool IsLaneOp = false; - bool IsDupOp = false; - unsigned NewOpc = 0; - unsigned NumVecs = 0; - unsigned IntNo = cast(N->getOperand(1))->getZExtValue(); - switch (IntNo) { - default: llvm_unreachable("unexpected intrinsic for Neon base update"); - case Intrinsic::arm64_neon_ld2: NewOpc = ARM64ISD::LD2post; - NumVecs = 2; break; - case Intrinsic::arm64_neon_ld3: NewOpc = ARM64ISD::LD3post; - NumVecs = 3; break; - case Intrinsic::arm64_neon_ld4: NewOpc = ARM64ISD::LD4post; - NumVecs = 4; break; - case Intrinsic::arm64_neon_st2: NewOpc = ARM64ISD::ST2post; - NumVecs = 2; IsStore = true; break; - case Intrinsic::arm64_neon_st3: NewOpc = ARM64ISD::ST3post; - NumVecs = 3; IsStore = true; break; - case Intrinsic::arm64_neon_st4: NewOpc = ARM64ISD::ST4post; - NumVecs = 4; IsStore = true; break; - case Intrinsic::arm64_neon_ld1x2: NewOpc = ARM64ISD::LD1x2post; - NumVecs = 2; break; - case Intrinsic::arm64_neon_ld1x3: NewOpc = ARM64ISD::LD1x3post; - NumVecs = 3; break; - case Intrinsic::arm64_neon_ld1x4: NewOpc = ARM64ISD::LD1x4post; - NumVecs = 4; break; - case Intrinsic::arm64_neon_st1x2: NewOpc = ARM64ISD::ST1x2post; - NumVecs = 2; IsStore = true; break; - case Intrinsic::arm64_neon_st1x3: NewOpc = ARM64ISD::ST1x3post; - NumVecs = 3; IsStore = true; break; - case Intrinsic::arm64_neon_st1x4: NewOpc = ARM64ISD::ST1x4post; - NumVecs = 4; IsStore = true; break; - case Intrinsic::arm64_neon_ld2r: NewOpc = ARM64ISD::LD2DUPpost; - NumVecs = 2; IsDupOp = true; break; - case Intrinsic::arm64_neon_ld3r: NewOpc = ARM64ISD::LD3DUPpost; - NumVecs = 3; IsDupOp = true; break; - case Intrinsic::arm64_neon_ld4r: NewOpc = ARM64ISD::LD4DUPpost; - NumVecs = 4; IsDupOp = true; break; - case Intrinsic::arm64_neon_ld2lane: NewOpc = ARM64ISD::LD2LANEpost; - NumVecs = 2; IsLaneOp = true; break; - case Intrinsic::arm64_neon_ld3lane: NewOpc = ARM64ISD::LD3LANEpost; - NumVecs = 3; IsLaneOp = true; break; - case Intrinsic::arm64_neon_ld4lane: NewOpc = ARM64ISD::LD4LANEpost; - NumVecs = 4; IsLaneOp = true; break; - case Intrinsic::arm64_neon_st2lane: NewOpc = ARM64ISD::ST2LANEpost; - NumVecs = 2; IsStore = true; IsLaneOp = true; break; - case Intrinsic::arm64_neon_st3lane: NewOpc = ARM64ISD::ST3LANEpost; - NumVecs = 3; 
IsStore = true; IsLaneOp = true; break; - case Intrinsic::arm64_neon_st4lane: NewOpc = ARM64ISD::ST4LANEpost; - NumVecs = 4; IsStore = true; IsLaneOp = true; break; - } - - EVT VecTy; - if (IsStore) - VecTy = N->getOperand(2).getValueType(); - else - VecTy = N->getValueType(0); - - // If the increment is a constant, it must match the memory ref size. - SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); - if (ConstantSDNode *CInc = dyn_cast(Inc.getNode())) { - uint32_t IncVal = CInc->getZExtValue(); - unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8; - if (IsLaneOp || IsDupOp) - NumBytes /= VecTy.getVectorNumElements(); - if (IncVal != NumBytes) - continue; - Inc = DAG.getRegister(ARM64::XZR, MVT::i64); - } - SmallVector Ops; - Ops.push_back(N->getOperand(0)); // Incoming chain - // Load lane and store have vector list as input. - if (IsLaneOp || IsStore) - for (unsigned i = 2; i < AddrOpIdx; ++i) - Ops.push_back(N->getOperand(i)); - Ops.push_back(Addr); // Base register - Ops.push_back(Inc); - - // Return Types. - EVT Tys[6]; - unsigned NumResultVecs = (IsStore ? 0 : NumVecs); - unsigned n; - for (n = 0; n < NumResultVecs; ++n) - Tys[n] = VecTy; - Tys[n++] = MVT::i64; // Type of write back register - Tys[n] = MVT::Other; // Type of the chain - SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2)); - - MemIntrinsicSDNode *MemInt = cast(N); - SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops, - MemInt->getMemoryVT(), - MemInt->getMemOperand()); - - // Update the uses. - std::vector NewResults; - for (unsigned i = 0; i < NumResultVecs; ++i) { - NewResults.push_back(SDValue(UpdN.getNode(), i)); - } - NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); - DCI.CombineTo(N, NewResults); - DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs)); - - break; - } - return SDValue(); -} - -// Optimize compare with zero and branch. -static SDValue performBRCONDCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - SelectionDAG &DAG) { - SDValue Chain = N->getOperand(0); - SDValue Dest = N->getOperand(1); - SDValue CCVal = N->getOperand(2); - SDValue Cmp = N->getOperand(3); - - assert(isa(CCVal) && "Expected a ConstantSDNode here!"); - unsigned CC = cast(CCVal)->getZExtValue(); - if (CC != ARM64CC::EQ && CC != ARM64CC::NE) - return SDValue(); - - unsigned CmpOpc = Cmp.getOpcode(); - if (CmpOpc != ARM64ISD::ADDS && CmpOpc != ARM64ISD::SUBS) - return SDValue(); - - // Only attempt folding if there is only one use of the flag and no use of the - // value. - if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1)) - return SDValue(); - - SDValue LHS = Cmp.getOperand(0); - SDValue RHS = Cmp.getOperand(1); - - assert(LHS.getValueType() == RHS.getValueType() && - "Expected the value type to be the same for both operands!"); - if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64) - return SDValue(); - - if (isa(LHS) && cast(LHS)->isNullValue()) - std::swap(LHS, RHS); - - if (!isa(RHS) || !cast(RHS)->isNullValue()) - return SDValue(); - - if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA || - LHS.getOpcode() == ISD::SRL) - return SDValue(); - - // Fold the compare into the branch instruction. - SDValue BR; - if (CC == ARM64CC::EQ) - BR = DAG.getNode(ARM64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest); - else - BR = DAG.getNode(ARM64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest); - - // Do not add new nodes to DAG combiner worklist. 
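Aside (illustrative, not from the patch): the compare-with-zero branch folding above (performBRCONDCombine) rewrites branches of this shape; the quoted instruction is the intended selection, not a guarantee in every context.

// Illustrative sketch: branch on a compare with zero. The SUBS + b.eq pair is
// folded into a single compare-and-branch:
//   cbz w0, .Lskip
extern void on_zero();
void branch_on_zero(int x) {
  if (x == 0)
    on_zero();
}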
- DCI.CombineTo(N, BR, false); - - return SDValue(); -} - -// vselect (v1i1 setcc) -> -// vselect (v1iXX setcc) (XX is the size of the compared operand type) -// FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as -// condition. If it can legalize "VSELECT v1i1" correctly, no need to combine -// such VSELECT. -static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) { - SDValue N0 = N->getOperand(0); - EVT CCVT = N0.getValueType(); - - if (N0.getOpcode() != ISD::SETCC || CCVT.getVectorNumElements() != 1 || - CCVT.getVectorElementType() != MVT::i1) - return SDValue(); - - EVT ResVT = N->getValueType(0); - EVT CmpVT = N0.getOperand(0).getValueType(); - // Only combine when the result type is of the same size as the compared - // operands. - if (ResVT.getSizeInBits() != CmpVT.getSizeInBits()) - return SDValue(); - - SDValue IfTrue = N->getOperand(1); - SDValue IfFalse = N->getOperand(2); - SDValue SetCC = - DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(), - N0.getOperand(0), N0.getOperand(1), - cast(N0.getOperand(2))->get()); - return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC, - IfTrue, IfFalse); -} - -/// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with -/// the compare-mask instructions rather than going via NZCV, even if LHS and -/// RHS are really scalar. This replaces any scalar setcc in the above pattern -/// with a vector one followed by a DUP shuffle on the result. -static SDValue performSelectCombine(SDNode *N, SelectionDAG &DAG) { - SDValue N0 = N->getOperand(0); - EVT ResVT = N->getValueType(0); - - if (!N->getOperand(1).getValueType().isVector()) - return SDValue(); - - if (N0.getOpcode() != ISD::SETCC || N0.getValueType() != MVT::i1) - return SDValue(); - - SDLoc DL(N0); - - EVT SrcVT = N0.getOperand(0).getValueType(); - SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, - ResVT.getSizeInBits() / SrcVT.getSizeInBits()); - EVT CCVT = SrcVT.changeVectorElementTypeToInteger(); - - // First perform a vector comparison, where lane 0 is the one we're interested - // in. - SDValue LHS = - DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0)); - SDValue RHS = - DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1)); - SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2)); - - // Now duplicate the comparison mask we want across all other lanes. 
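For context (illustrative only, not part of the patch), performSelectCombine above targets selects like the following sketch, where a scalar comparison chooses between two vectors; the name is invented.

#include <arm_neon.h>

// Illustrative sketch: a scalar fcmp deciding between two vectors. The combine
// performs the comparison in a vector register, broadcasts the resulting mask
// across the lanes, and selects with it, instead of going through NZCV.
float32x4_t select_by_scalar(float a, float b, float32x4_t t, float32x4_t f) {
  return a < b ? t : f;
}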
- SmallVector DUPMask(CCVT.getVectorNumElements(), 0); - SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask.data()); - Mask = DAG.getNode(ISD::BITCAST, DL, ResVT.changeVectorElementTypeToInteger(), - Mask); - - return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2)); -} - -SDValue ARM64TargetLowering::PerformDAGCombine(SDNode *N, - DAGCombinerInfo &DCI) const { - SelectionDAG &DAG = DCI.DAG; - switch (N->getOpcode()) { - default: - break; - case ISD::ADD: - case ISD::SUB: - return performAddSubLongCombine(N, DCI, DAG); - case ISD::XOR: - return performXorCombine(N, DAG, DCI, Subtarget); - case ISD::MUL: - return performMulCombine(N, DAG, DCI, Subtarget); - case ISD::SINT_TO_FP: - case ISD::UINT_TO_FP: - return performIntToFpCombine(N, DAG); - case ISD::OR: - return performORCombine(N, DCI, Subtarget); - case ISD::INTRINSIC_WO_CHAIN: - return performIntrinsicCombine(N, DCI, Subtarget); - case ISD::ANY_EXTEND: - case ISD::ZERO_EXTEND: - case ISD::SIGN_EXTEND: - return performExtendCombine(N, DCI, DAG); - case ISD::BITCAST: - return performBitcastCombine(N, DCI, DAG); - case ISD::CONCAT_VECTORS: - return performConcatVectorsCombine(N, DCI, DAG); - case ISD::SELECT: - return performSelectCombine(N, DAG); - case ISD::VSELECT: - return performVSelectCombine(N, DCI.DAG); - case ISD::STORE: - return performSTORECombine(N, DCI, DAG, Subtarget); - case ARM64ISD::BRCOND: - return performBRCONDCombine(N, DCI, DAG); - case ARM64ISD::DUP: - return performPostLD1Combine(N, DCI, false); - case ISD::INSERT_VECTOR_ELT: - return performPostLD1Combine(N, DCI, true); - case ISD::INTRINSIC_VOID: - case ISD::INTRINSIC_W_CHAIN: - switch (cast(N->getOperand(1))->getZExtValue()) { - case Intrinsic::arm64_neon_ld2: - case Intrinsic::arm64_neon_ld3: - case Intrinsic::arm64_neon_ld4: - case Intrinsic::arm64_neon_ld1x2: - case Intrinsic::arm64_neon_ld1x3: - case Intrinsic::arm64_neon_ld1x4: - case Intrinsic::arm64_neon_ld2lane: - case Intrinsic::arm64_neon_ld3lane: - case Intrinsic::arm64_neon_ld4lane: - case Intrinsic::arm64_neon_ld2r: - case Intrinsic::arm64_neon_ld3r: - case Intrinsic::arm64_neon_ld4r: - case Intrinsic::arm64_neon_st2: - case Intrinsic::arm64_neon_st3: - case Intrinsic::arm64_neon_st4: - case Intrinsic::arm64_neon_st1x2: - case Intrinsic::arm64_neon_st1x3: - case Intrinsic::arm64_neon_st1x4: - case Intrinsic::arm64_neon_st2lane: - case Intrinsic::arm64_neon_st3lane: - case Intrinsic::arm64_neon_st4lane: - return performNEONPostLDSTCombine(N, DCI, DAG); - default: - break; - } - } - return SDValue(); -} - -// Check if the return value is used as only a return value, as otherwise -// we can't perform a tail-call. In particular, we need to check for -// target ISD nodes that are returns and any other "odd" constructs -// that the generic analysis code won't necessarily catch. -bool ARM64TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { - if (N->getNumValues() != 1) - return false; - if (!N->hasNUsesOfValue(1, 0)) - return false; - - SDValue TCChain = Chain; - SDNode *Copy = *N->use_begin(); - if (Copy->getOpcode() == ISD::CopyToReg) { - // If the copy has a glue operand, we conservatively assume it isn't safe to - // perform a tail call. 
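Illustrative aside (not from the patch): the return-only check above, together with the tail-call hook that follows, describes the usual sibling-call shape; the function names are invented and the "b" vs "bl; ret" contrast is the usual outcome rather than a guarantee.

// Illustrative sketch: the call's result feeds straight into our own return,
// so it can be emitted as a tail call ("b callee") instead of "bl callee; ret".
extern long callee(long);
long caller(long x) {
  return callee(x + 1);
}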
- if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() == - MVT::Glue) - return false; - TCChain = Copy->getOperand(0); - } else if (Copy->getOpcode() != ISD::FP_EXTEND) - return false; - - bool HasRet = false; - for (SDNode *Node : Copy->uses()) { - if (Node->getOpcode() != ARM64ISD::RET_FLAG) - return false; - HasRet = true; - } - - if (!HasRet) - return false; - - Chain = TCChain; - return true; -} - -// Return whether the an instruction can potentially be optimized to a tail -// call. This will cause the optimizers to attempt to move, or duplicate, -// return instructions to help enable tail call optimizations for this -// instruction. -bool ARM64TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { - if (!CI->isTailCall()) - return false; - - return true; -} - -bool ARM64TargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base, - SDValue &Offset, - ISD::MemIndexedMode &AM, - bool &IsInc, - SelectionDAG &DAG) const { - if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB) - return false; - - Base = Op->getOperand(0); - // All of the indexed addressing mode instructions take a signed - // 9 bit immediate offset. - if (ConstantSDNode *RHS = dyn_cast(Op->getOperand(1))) { - int64_t RHSC = (int64_t)RHS->getZExtValue(); - if (RHSC >= 256 || RHSC <= -256) - return false; - IsInc = (Op->getOpcode() == ISD::ADD); - Offset = Op->getOperand(1); - return true; - } - return false; -} - -bool ARM64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, - SDValue &Offset, - ISD::MemIndexedMode &AM, - SelectionDAG &DAG) const { - EVT VT; - SDValue Ptr; - if (LoadSDNode *LD = dyn_cast(N)) { - VT = LD->getMemoryVT(); - Ptr = LD->getBasePtr(); - } else if (StoreSDNode *ST = dyn_cast(N)) { - VT = ST->getMemoryVT(); - Ptr = ST->getBasePtr(); - } else - return false; - - bool IsInc; - if (!getIndexedAddressParts(Ptr.getNode(), Base, Offset, AM, IsInc, DAG)) - return false; - AM = IsInc ? ISD::PRE_INC : ISD::PRE_DEC; - return true; -} - -bool ARM64TargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op, - SDValue &Base, - SDValue &Offset, - ISD::MemIndexedMode &AM, - SelectionDAG &DAG) const { - EVT VT; - SDValue Ptr; - if (LoadSDNode *LD = dyn_cast(N)) { - VT = LD->getMemoryVT(); - Ptr = LD->getBasePtr(); - } else if (StoreSDNode *ST = dyn_cast(N)) { - VT = ST->getMemoryVT(); - Ptr = ST->getBasePtr(); - } else - return false; - - bool IsInc; - if (!getIndexedAddressParts(Op, Base, Offset, AM, IsInc, DAG)) - return false; - // Post-indexing updates the base, so it's not a valid transform - // if that's not the same as the load's pointer. - if (Ptr != Base) - return false; - AM = IsInc ? ISD::POST_INC : ISD::POST_DEC; - return true; -} - -void ARM64TargetLowering::ReplaceNodeResults(SDNode *N, - SmallVectorImpl &Results, - SelectionDAG &DAG) const { - switch (N->getOpcode()) { - default: - llvm_unreachable("Don't know how to custom expand this"); - case ISD::FP_TO_UINT: - case ISD::FP_TO_SINT: - assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion"); - // Let normal code take care of it by not adding anything to Results. 
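For context (illustrative only, not part of the patch), the pre/post-indexed addressing hooks above fold the classic load-and-advance pattern; the quoted assembly is illustrative, and the update must fit the signed 9-bit immediate range checked above.

// Illustrative sketch: load, then advance the pointer. The hooks let this
// select to a single post-indexed load such as
//   ldr w0, [x1], #4
int load_post_inc(int *&p) {
  return *p++;
}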
- return; - } -} - -bool ARM64TargetLowering::shouldExpandAtomicInIR(Instruction *Inst) const { - // Loads and stores less than 128-bits are already atomic; ones above that - // are doomed anyway, so defer to the default libcall and blame the OS when - // things go wrong: - if (StoreInst *SI = dyn_cast(Inst)) - return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128; - else if (LoadInst *LI = dyn_cast(Inst)) - return LI->getType()->getPrimitiveSizeInBits() == 128; - - // For the real atomic operations, we have ldxr/stxr up to 128 bits. - return Inst->getType()->getPrimitiveSizeInBits() <= 128; -} - -Value *ARM64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, - AtomicOrdering Ord) const { - Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - Type *ValTy = cast(Addr->getType())->getElementType(); - bool IsAcquire = - Ord == Acquire || Ord == AcquireRelease || Ord == SequentiallyConsistent; - - // Since i128 isn't legal and intrinsics don't get type-lowered, the ldrexd - // intrinsic must return {i64, i64} and we have to recombine them into a - // single i128 here. - if (ValTy->getPrimitiveSizeInBits() == 128) { - Intrinsic::ID Int = - IsAcquire ? Intrinsic::arm64_ldaxp : Intrinsic::arm64_ldxp; - Function *Ldxr = llvm::Intrinsic::getDeclaration(M, Int); - - Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); - Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi"); - - Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo"); - Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi"); - Lo = Builder.CreateZExt(Lo, ValTy, "lo64"); - Hi = Builder.CreateZExt(Hi, ValTy, "hi64"); - return Builder.CreateOr( - Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 64)), "val64"); - } - - Type *Tys[] = { Addr->getType() }; - Intrinsic::ID Int = - IsAcquire ? Intrinsic::arm64_ldaxr : Intrinsic::arm64_ldxr; - Function *Ldxr = llvm::Intrinsic::getDeclaration(M, Int, Tys); - - return Builder.CreateTruncOrBitCast( - Builder.CreateCall(Ldxr, Addr), - cast(Addr->getType())->getElementType()); -} - -Value *ARM64TargetLowering::emitStoreConditional(IRBuilder<> &Builder, - Value *Val, Value *Addr, - AtomicOrdering Ord) const { - Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - bool IsRelease = - Ord == Release || Ord == AcquireRelease || Ord == SequentiallyConsistent; - - // Since the intrinsics must have legal type, the i128 intrinsics take two - // parameters: "i64, i64". We must marshal Val into the appropriate form - // before the call. - if (Val->getType()->getPrimitiveSizeInBits() == 128) { - Intrinsic::ID Int = - IsRelease ? Intrinsic::arm64_stlxp : Intrinsic::arm64_stxp; - Function *Stxr = Intrinsic::getDeclaration(M, Int); - Type *Int64Ty = Type::getInt64Ty(M->getContext()); - - Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo"); - Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi"); - Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); - return Builder.CreateCall3(Stxr, Lo, Hi, Addr); - } - - Intrinsic::ID Int = - IsRelease ? 
Intrinsic::arm64_stlxr : Intrinsic::arm64_stxr; - Type *Tys[] = { Addr->getType() }; - Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys); - - return Builder.CreateCall2( - Stxr, Builder.CreateZExtOrBitCast( - Val, Stxr->getFunctionType()->getParamType(0)), - Addr); -} diff --git a/lib/Target/ARM64/ARM64ISelLowering.h b/lib/Target/ARM64/ARM64ISelLowering.h deleted file mode 100644 index b2402c9791c..00000000000 --- a/lib/Target/ARM64/ARM64ISelLowering.h +++ /dev/null @@ -1,464 +0,0 @@ -//==-- ARM64ISelLowering.h - ARM64 DAG Lowering Interface --------*- C++ -*-==// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines the interfaces that ARM64 uses to lower LLVM code into a -// selection DAG. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_TARGET_ARM64_ISELLOWERING_H -#define LLVM_TARGET_ARM64_ISELLOWERING_H - -#include "llvm/CodeGen/CallingConvLower.h" -#include "llvm/CodeGen/SelectionDAG.h" -#include "llvm/IR/CallingConv.h" -#include "llvm/Target/TargetLowering.h" - -namespace llvm { - -namespace ARM64ISD { - -enum { - FIRST_NUMBER = ISD::BUILTIN_OP_END, - WrapperLarge, // 4-instruction MOVZ/MOVK sequence for 64-bit addresses. - CALL, // Function call. - - // Almost the same as a normal call node, except that a TLSDesc relocation is - // needed so the linker can relax it correctly if possible. - TLSDESC_CALL, - ADRP, // Page address of a TargetGlobalAddress operand. - ADDlow, // Add the low 12 bits of a TargetGlobalAddress operand. - LOADgot, // Load from automatically generated descriptor (e.g. Global - // Offset Table, TLS record). - RET_FLAG, // Return with a flag operand. Operand 0 is the chain operand. - BRCOND, // Conditional branch instruction; "b.cond". - CSEL, - FCSEL, // Conditional move instruction. - CSINV, // Conditional select invert. - CSNEG, // Conditional select negate. - CSINC, // Conditional select increment. - - // Pointer to the thread's local storage area. Materialised from TPIDR_EL0 on - // ELF. - THREAD_POINTER, - ADC, - SBC, // adc, sbc instructions - - // Arithmetic instructions which write flags. - ADDS, - SUBS, - ADCS, - SBCS, - ANDS, - - // Floating point comparison - FCMP, - - // Floating point max and min instructions. - FMAX, - FMIN, - - // Scalar extract - EXTR, - - // Scalar-to-vector duplication - DUP, - DUPLANE8, - DUPLANE16, - DUPLANE32, - DUPLANE64, - - // Vector immedate moves - MOVI, - MOVIshift, - MOVIedit, - MOVImsl, - FMOV, - MVNIshift, - MVNImsl, - - // Vector immediate ops - BICi, - ORRi, - - // Vector bit select: similar to ISD::VSELECT but not all bits within an - // element must be identical. 
- BSL, - - // Vector arithmetic negation - NEG, - - // Vector shuffles - ZIP1, - ZIP2, - UZP1, - UZP2, - TRN1, - TRN2, - REV16, - REV32, - REV64, - EXT, - - // Vector shift by scalar - VSHL, - VLSHR, - VASHR, - - // Vector shift by scalar (again) - SQSHL_I, - UQSHL_I, - SQSHLU_I, - SRSHR_I, - URSHR_I, - - // Vector comparisons - CMEQ, - CMGE, - CMGT, - CMHI, - CMHS, - FCMEQ, - FCMGE, - FCMGT, - - // Vector zero comparisons - CMEQz, - CMGEz, - CMGTz, - CMLEz, - CMLTz, - FCMEQz, - FCMGEz, - FCMGTz, - FCMLEz, - FCMLTz, - - // Vector bitwise negation - NOT, - - // Vector bitwise selection - BIT, - - // Compare-and-branch - CBZ, - CBNZ, - TBZ, - TBNZ, - - // Tail calls - TC_RETURN, - - // Custom prefetch handling - PREFETCH, - - // {s|u}int to FP within a FP register. - SITOF, - UITOF, - - // NEON Load/Store with post-increment base updates - LD2post = ISD::FIRST_TARGET_MEMORY_OPCODE, - LD3post, - LD4post, - ST2post, - ST3post, - ST4post, - LD1x2post, - LD1x3post, - LD1x4post, - ST1x2post, - ST1x3post, - ST1x4post, - LD1DUPpost, - LD2DUPpost, - LD3DUPpost, - LD4DUPpost, - LD1LANEpost, - LD2LANEpost, - LD3LANEpost, - LD4LANEpost, - ST2LANEpost, - ST3LANEpost, - ST4LANEpost -}; - -} // end namespace ARM64ISD - -class ARM64Subtarget; -class ARM64TargetMachine; - -class ARM64TargetLowering : public TargetLowering { - bool RequireStrictAlign; - -public: - explicit ARM64TargetLowering(ARM64TargetMachine &TM); - - /// Selects the correct CCAssignFn for a the given CallingConvention - /// value. - CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const; - - /// computeKnownBitsForTargetNode - Determine which of the bits specified in - /// Mask are known to be either zero or one and return them in the - /// KnownZero/KnownOne bitsets. - void computeKnownBitsForTargetNode(const SDValue Op, APInt &KnownZero, - APInt &KnownOne, const SelectionDAG &DAG, - unsigned Depth = 0) const override; - - MVT getScalarShiftAmountTy(EVT LHSTy) const override; - - /// allowsUnalignedMemoryAccesses - Returns true if the target allows - /// unaligned memory accesses. of the specified type. - bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AddrSpace = 0, - bool *Fast = nullptr) const override { - if (RequireStrictAlign) - return false; - // FIXME: True for Cyclone, but not necessary others. - if (Fast) - *Fast = true; - return true; - } - - /// LowerOperation - Provide custom lowering hooks for some operations. - SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; - - const char *getTargetNodeName(unsigned Opcode) const override; - - SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; - - /// getFunctionAlignment - Return the Log2 alignment of this function. - unsigned getFunctionAlignment(const Function *F) const; - - /// getMaximalGlobalOffset - Returns the maximal possible offset which can - /// be used for loads / stores from the global. - unsigned getMaximalGlobalOffset() const override; - - /// Returns true if a cast between SrcAS and DestAS is a noop. - bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override { - // Addrspacecasts are always noops. - return true; - } - - /// createFastISel - This method returns a target specific FastISel object, - /// or null if the target does not support "fast" ISel. 
- FastISel *createFastISel(FunctionLoweringInfo &funcInfo, - const TargetLibraryInfo *libInfo) const override; - - bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override; - - bool isFPImmLegal(const APFloat &Imm, EVT VT) const override; - - /// isShuffleMaskLegal - Return true if the given shuffle mask can be - /// codegen'd directly, or if it should be stack expanded. - bool isShuffleMaskLegal(const SmallVectorImpl &M, EVT VT) const override; - - /// getSetCCResultType - Return the ISD::SETCC ValueType - EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override; - - SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const; - - MachineBasicBlock *EmitF128CSEL(MachineInstr *MI, - MachineBasicBlock *BB) const; - - MachineBasicBlock * - EmitInstrWithCustomInserter(MachineInstr *MI, - MachineBasicBlock *MBB) const override; - - bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, - unsigned Intrinsic) const override; - - bool isTruncateFree(Type *Ty1, Type *Ty2) const override; - bool isTruncateFree(EVT VT1, EVT VT2) const override; - - bool isZExtFree(Type *Ty1, Type *Ty2) const override; - bool isZExtFree(EVT VT1, EVT VT2) const override; - bool isZExtFree(SDValue Val, EVT VT2) const override; - - bool hasPairedLoad(Type *LoadedType, - unsigned &RequiredAligment) const override; - bool hasPairedLoad(EVT LoadedType, unsigned &RequiredAligment) const override; - - bool isLegalAddImmediate(int64_t) const override; - bool isLegalICmpImmediate(int64_t) const override; - - EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, - bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, - MachineFunction &MF) const override; - - /// isLegalAddressingMode - Return true if the addressing mode represented - /// by AM is legal for this target, for a load/store of the specified type. - bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const override; - - /// \brief Return the cost of the scaling factor used in the addressing - /// mode represented by AM for this target, for a load/store - /// of the specified type. - /// If the AM is supported, the return value must be >= 0. - /// If the AM is not supported, it returns a negative value. - int getScalingFactorCost(const AddrMode &AM, Type *Ty) const override; - - /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster - /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be - /// expanded to FMAs when this method returns true, otherwise fmuladd is - /// expanded to fmul + fadd. - bool isFMAFasterThanFMulAndFAdd(EVT VT) const override; - - const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override; - - /// \brief Returns false if N is a bit extraction pattern of (X >> C) & Mask. - bool isDesirableToCommuteWithShift(const SDNode *N) const override; - - /// \brief Returns true if it is beneficial to convert a load of a constant - /// to just the constant itself. - bool shouldConvertConstantLoadToIntImm(const APInt &Imm, - Type *Ty) const override; - - Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr, - AtomicOrdering Ord) const override; - Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val, - Value *Addr, AtomicOrdering Ord) const override; - - bool shouldExpandAtomicInIR(Instruction *Inst) const override; - -private: - /// Subtarget - Keep a pointer to the ARM64Subtarget around so that we can - /// make the right decision when generating code for different targets. 
- const ARM64Subtarget *Subtarget; - - void addTypeForNEON(EVT VT, EVT PromotedBitwiseVT); - void addDRTypeForNEON(MVT VT); - void addQRTypeForNEON(MVT VT); - - SDValue - LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl &Ins, SDLoc DL, - SelectionDAG &DAG, - SmallVectorImpl &InVals) const override; - - SDValue LowerCall(CallLoweringInfo & /*CLI*/, - SmallVectorImpl &InVals) const override; - - SDValue LowerCallResult(SDValue Chain, SDValue InFlag, - CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl &Ins, SDLoc DL, - SelectionDAG &DAG, SmallVectorImpl &InVals, - bool isThisReturn, SDValue ThisVal) const; - - bool isEligibleForTailCallOptimization( - SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, - bool isCalleeStructRet, bool isCallerStructRet, - const SmallVectorImpl &Outs, - const SmallVectorImpl &OutVals, - const SmallVectorImpl &Ins, SelectionDAG &DAG) const; - - /// Finds the incoming stack arguments which overlap the given fixed stack - /// object and incorporates their load into the current chain. This prevents - /// an upcoming store from clobbering the stack argument before it's used. - SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, - MachineFrameInfo *MFI, int ClobberedFI) const; - - bool DoesCalleeRestoreStack(CallingConv::ID CallCC, bool TailCallOpt) const; - - bool IsTailCallConvention(CallingConv::ID CallCC) const; - - void saveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, SDLoc DL, - SDValue &Chain) const; - - bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, - bool isVarArg, - const SmallVectorImpl &Outs, - LLVMContext &Context) const override; - - SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl &Outs, - const SmallVectorImpl &OutVals, SDLoc DL, - SelectionDAG &DAG) const override; - - SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerDarwinGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerELFGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerELFTLSDescCall(SDValue SymAddr, SDValue DescAddr, SDLoc DL, - SelectionDAG &DAG) const; - SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerAAPCS_VASTART(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerDarwin_VASTART(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerVACOPY(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, 
SelectionDAG &DAG) const; - SDValue LowerVectorSRA_SRL_SHL(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerF128Call(SDValue Op, SelectionDAG &DAG, - RTLIB::Libcall Call) const; - SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerVectorAND(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerVectorOR(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const; - - ConstraintType - getConstraintType(const std::string &Constraint) const override; - unsigned getRegisterByName(const char* RegName, EVT VT) const override; - - /// Examine constraint string and operand type and determine a weight value. - /// The operand object must already have been set up with the operand type. - ConstraintWeight - getSingleConstraintMatchWeight(AsmOperandInfo &info, - const char *constraint) const override; - - std::pair - getRegForInlineAsmConstraint(const std::string &Constraint, - MVT VT) const override; - void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint, - std::vector &Ops, - SelectionDAG &DAG) const override; - - bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override; - bool mayBeEmittedAsTailCall(CallInst *CI) const override; - bool getIndexedAddressParts(SDNode *Op, SDValue &Base, SDValue &Offset, - ISD::MemIndexedMode &AM, bool &IsInc, - SelectionDAG &DAG) const; - bool getPreIndexedAddressParts(SDNode *N, SDValue &Base, SDValue &Offset, - ISD::MemIndexedMode &AM, - SelectionDAG &DAG) const override; - bool getPostIndexedAddressParts(SDNode *N, SDNode *Op, SDValue &Base, - SDValue &Offset, ISD::MemIndexedMode &AM, - SelectionDAG &DAG) const override; - - void ReplaceNodeResults(SDNode *N, SmallVectorImpl &Results, - SelectionDAG &DAG) const override; -}; - -namespace ARM64 { -FastISel *createFastISel(FunctionLoweringInfo &funcInfo, - const TargetLibraryInfo *libInfo); -} // end namespace ARM64 - -} // end namespace llvm - -#endif // LLVM_TARGET_ARM64_ISELLOWERING_H diff --git a/lib/Target/ARM64/ARM64InstrAtomics.td b/lib/Target/ARM64/ARM64InstrAtomics.td deleted file mode 100644 index 1d1483ac126..00000000000 --- a/lib/Target/ARM64/ARM64InstrAtomics.td +++ /dev/null @@ -1,364 +0,0 @@ -//===- ARM64InstrAtomics.td - ARM64 Atomic codegen support -*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// ARM64 Atomic operand code-gen constructs. 
-// -//===----------------------------------------------------------------------===// - -//===---------------------------------- -// Atomic fences -//===---------------------------------- -def : Pat<(atomic_fence (i64 4), (imm)), (DMB (i32 0x9))>; -def : Pat<(atomic_fence (imm), (imm)), (DMB (i32 0xb))>; - -//===---------------------------------- -// Atomic loads -//===---------------------------------- - -// When they're actually atomic, only one addressing mode (GPR64sp) is -// supported, but when they're relaxed and anything can be used, all the -// standard modes would be valid and may give efficiency gains. - -// A atomic load operation that actually needs acquire semantics. -class acquiring_load - : PatFrag<(ops node:$ptr), (base node:$ptr), [{ - AtomicOrdering Ordering = cast(N)->getOrdering(); - assert(Ordering != AcquireRelease && "unexpected load ordering"); - return Ordering == Acquire || Ordering == SequentiallyConsistent; -}]>; - -// An atomic load operation that does not need either acquire or release -// semantics. -class relaxed_load - : PatFrag<(ops node:$ptr), (base node:$ptr), [{ - AtomicOrdering Ordering = cast(N)->getOrdering(); - return Ordering == Monotonic || Ordering == Unordered; -}]>; - -// 8-bit loads -def : Pat<(acquiring_load GPR64sp:$ptr), (LDARB GPR64sp:$ptr)>; -def : Pat<(relaxed_load (ro_Windexed8 GPR64sp:$Rn, GPR32:$Rm, - ro_Wextend8:$offset)), - (LDRBBroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend8:$offset)>; -def : Pat<(relaxed_load (ro_Xindexed8 GPR64sp:$Rn, GPR64:$Rm, - ro_Xextend8:$offset)), - (LDRBBroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend8:$offset)>; -def : Pat<(relaxed_load (am_indexed8 GPR64sp:$Rn, - uimm12s1:$offset)), - (LDRBBui GPR64sp:$Rn, uimm12s1:$offset)>; -def : Pat<(relaxed_load - (am_unscaled8 GPR64sp:$Rn, simm9:$offset)), - (LDURBBi GPR64sp:$Rn, simm9:$offset)>; - -// 16-bit loads -def : Pat<(acquiring_load GPR64sp:$ptr), (LDARH GPR64sp:$ptr)>; -def : Pat<(relaxed_load (ro_Windexed16 GPR64sp:$Rn, GPR32:$Rm, - ro_Wextend16:$extend)), - (LDRHHroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend16:$extend)>; -def : Pat<(relaxed_load (ro_Xindexed16 GPR64sp:$Rn, GPR64:$Rm, - ro_Xextend16:$extend)), - (LDRHHroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend16:$extend)>; -def : Pat<(relaxed_load (am_indexed16 GPR64sp:$Rn, - uimm12s2:$offset)), - (LDRHHui GPR64sp:$Rn, uimm12s2:$offset)>; -def : Pat<(relaxed_load - (am_unscaled16 GPR64sp:$Rn, simm9:$offset)), - (LDURHHi GPR64sp:$Rn, simm9:$offset)>; - -// 32-bit loads -def : Pat<(acquiring_load GPR64sp:$ptr), (LDARW GPR64sp:$ptr)>; -def : Pat<(relaxed_load (ro_Windexed32 GPR64sp:$Rn, GPR32:$Rm, - ro_Wextend32:$extend)), - (LDRWroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend32:$extend)>; -def : Pat<(relaxed_load (ro_Xindexed32 GPR64sp:$Rn, GPR64:$Rm, - ro_Xextend32:$extend)), - (LDRWroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend32:$extend)>; -def : Pat<(relaxed_load (am_indexed32 GPR64sp:$Rn, - uimm12s4:$offset)), - (LDRWui GPR64sp:$Rn, uimm12s4:$offset)>; -def : Pat<(relaxed_load - (am_unscaled32 GPR64sp:$Rn, simm9:$offset)), - (LDURWi GPR64sp:$Rn, simm9:$offset)>; - -// 64-bit loads -def : Pat<(acquiring_load GPR64sp:$ptr), (LDARX GPR64sp:$ptr)>; -def : Pat<(relaxed_load (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm, - ro_Wextend64:$extend)), - (LDRXroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend)>; -def : Pat<(relaxed_load (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm, - ro_Xextend64:$extend)), - (LDRXroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend)>; -def : Pat<(relaxed_load (am_indexed64 GPR64sp:$Rn, - uimm12s8:$offset)), - (LDRXui GPR64sp:$Rn, 
uimm12s8:$offset)>; -def : Pat<(relaxed_load - (am_unscaled64 GPR64sp:$Rn, simm9:$offset)), - (LDURXi GPR64sp:$Rn, simm9:$offset)>; - -//===---------------------------------- -// Atomic stores -//===---------------------------------- - -// When they're actually atomic, only one addressing mode (GPR64sp) is -// supported, but when they're relaxed and anything can be used, all the -// standard modes would be valid and may give efficiency gains. - -// A store operation that actually needs release semantics. -class releasing_store - : PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{ - AtomicOrdering Ordering = cast(N)->getOrdering(); - assert(Ordering != AcquireRelease && "unexpected store ordering"); - return Ordering == Release || Ordering == SequentiallyConsistent; -}]>; - -// An atomic store operation that doesn't actually need to be atomic on ARM64. -class relaxed_store - : PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{ - AtomicOrdering Ordering = cast(N)->getOrdering(); - return Ordering == Monotonic || Ordering == Unordered; -}]>; - -// 8-bit stores -def : Pat<(releasing_store GPR64sp:$ptr, GPR32:$val), - (STLRB GPR32:$val, GPR64sp:$ptr)>; -def : Pat<(relaxed_store - (ro_Windexed8 GPR64sp:$Rn, GPR32:$Rm, ro_Wextend8:$extend), - GPR32:$val), - (STRBBroW GPR32:$val, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend8:$extend)>; -def : Pat<(relaxed_store - (ro_Xindexed8 GPR64sp:$Rn, GPR64:$Rm, ro_Xextend8:$extend), - GPR32:$val), - (STRBBroX GPR32:$val, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend8:$extend)>; -def : Pat<(relaxed_store - (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset), GPR32:$val), - (STRBBui GPR32:$val, GPR64sp:$Rn, uimm12s1:$offset)>; -def : Pat<(relaxed_store - (am_unscaled8 GPR64sp:$Rn, simm9:$offset), GPR32:$val), - (STURBBi GPR32:$val, GPR64sp:$Rn, simm9:$offset)>; - -// 16-bit stores -def : Pat<(releasing_store GPR64sp:$ptr, GPR32:$val), - (STLRH GPR32:$val, GPR64sp:$ptr)>; -def : Pat<(relaxed_store (ro_Windexed16 GPR64sp:$Rn, GPR32:$Rm, - ro_Wextend16:$extend), - GPR32:$val), - (STRHHroW GPR32:$val, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend16:$extend)>; -def : Pat<(relaxed_store (ro_Xindexed16 GPR64sp:$Rn, GPR64:$Rm, - ro_Xextend16:$extend), - GPR32:$val), - (STRHHroX GPR32:$val, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend16:$extend)>; -def : Pat<(relaxed_store - (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset), GPR32:$val), - (STRHHui GPR32:$val, GPR64sp:$Rn, uimm12s2:$offset)>; -def : Pat<(relaxed_store - (am_unscaled16 GPR64sp:$Rn, simm9:$offset), GPR32:$val), - (STURHHi GPR32:$val, GPR64sp:$Rn, simm9:$offset)>; - -// 32-bit stores -def : Pat<(releasing_store GPR64sp:$ptr, GPR32:$val), - (STLRW GPR32:$val, GPR64sp:$ptr)>; -def : Pat<(relaxed_store (ro_Windexed32 GPR64sp:$Rn, GPR32:$Rm, - ro_Wextend32:$extend), - GPR32:$val), - (STRWroW GPR32:$val, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend32:$extend)>; -def : Pat<(relaxed_store (ro_Xindexed32 GPR64sp:$Rn, GPR64:$Rm, - ro_Xextend32:$extend), - GPR32:$val), - (STRWroX GPR32:$val, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend32:$extend)>; -def : Pat<(relaxed_store - (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset), GPR32:$val), - (STRWui GPR32:$val, GPR64sp:$Rn, uimm12s4:$offset)>; -def : Pat<(relaxed_store - (am_unscaled32 GPR64sp:$Rn, simm9:$offset), GPR32:$val), - (STURWi GPR32:$val, GPR64sp:$Rn, simm9:$offset)>; - -// 64-bit stores -def : Pat<(releasing_store GPR64sp:$ptr, GPR64:$val), - (STLRX GPR64:$val, GPR64sp:$ptr)>; -def : Pat<(relaxed_store (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm, - ro_Wextend16:$extend), - GPR64:$val), - (STRXroW 
GPR64:$val, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend)>; -def : Pat<(relaxed_store (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm, - ro_Xextend16:$extend), - GPR64:$val), - (STRXroX GPR64:$val, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend)>; -def : Pat<(relaxed_store - (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset), GPR64:$val), - (STRXui GPR64:$val, GPR64sp:$Rn, uimm12s8:$offset)>; -def : Pat<(relaxed_store - (am_unscaled64 GPR64sp:$Rn, simm9:$offset), GPR64:$val), - (STURXi GPR64:$val, GPR64sp:$Rn, simm9:$offset)>; - -//===---------------------------------- -// Low-level exclusive operations -//===---------------------------------- - -// Load-exclusives. - -def ldxr_1 : PatFrag<(ops node:$ptr), (int_arm64_ldxr node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i8; -}]>; - -def ldxr_2 : PatFrag<(ops node:$ptr), (int_arm64_ldxr node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i16; -}]>; - -def ldxr_4 : PatFrag<(ops node:$ptr), (int_arm64_ldxr node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i32; -}]>; - -def ldxr_8 : PatFrag<(ops node:$ptr), (int_arm64_ldxr node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i64; -}]>; - -def : Pat<(ldxr_1 GPR64sp:$addr), - (SUBREG_TO_REG (i64 0), (LDXRB GPR64sp:$addr), sub_32)>; -def : Pat<(ldxr_2 GPR64sp:$addr), - (SUBREG_TO_REG (i64 0), (LDXRH GPR64sp:$addr), sub_32)>; -def : Pat<(ldxr_4 GPR64sp:$addr), - (SUBREG_TO_REG (i64 0), (LDXRW GPR64sp:$addr), sub_32)>; -def : Pat<(ldxr_8 GPR64sp:$addr), (LDXRX GPR64sp:$addr)>; - -def : Pat<(and (ldxr_1 GPR64sp:$addr), 0xff), - (SUBREG_TO_REG (i64 0), (LDXRB GPR64sp:$addr), sub_32)>; -def : Pat<(and (ldxr_2 GPR64sp:$addr), 0xffff), - (SUBREG_TO_REG (i64 0), (LDXRH GPR64sp:$addr), sub_32)>; -def : Pat<(and (ldxr_4 GPR64sp:$addr), 0xffffffff), - (SUBREG_TO_REG (i64 0), (LDXRW GPR64sp:$addr), sub_32)>; - -// Load-exclusives. - -def ldaxr_1 : PatFrag<(ops node:$ptr), (int_arm64_ldaxr node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i8; -}]>; - -def ldaxr_2 : PatFrag<(ops node:$ptr), (int_arm64_ldaxr node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i16; -}]>; - -def ldaxr_4 : PatFrag<(ops node:$ptr), (int_arm64_ldaxr node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i32; -}]>; - -def ldaxr_8 : PatFrag<(ops node:$ptr), (int_arm64_ldaxr node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i64; -}]>; - -def : Pat<(ldaxr_1 GPR64sp:$addr), - (SUBREG_TO_REG (i64 0), (LDAXRB GPR64sp:$addr), sub_32)>; -def : Pat<(ldaxr_2 GPR64sp:$addr), - (SUBREG_TO_REG (i64 0), (LDAXRH GPR64sp:$addr), sub_32)>; -def : Pat<(ldaxr_4 GPR64sp:$addr), - (SUBREG_TO_REG (i64 0), (LDAXRW GPR64sp:$addr), sub_32)>; -def : Pat<(ldaxr_8 GPR64sp:$addr), (LDAXRX GPR64sp:$addr)>; - -def : Pat<(and (ldaxr_1 GPR64sp:$addr), 0xff), - (SUBREG_TO_REG (i64 0), (LDAXRB GPR64sp:$addr), sub_32)>; -def : Pat<(and (ldaxr_2 GPR64sp:$addr), 0xffff), - (SUBREG_TO_REG (i64 0), (LDAXRH GPR64sp:$addr), sub_32)>; -def : Pat<(and (ldaxr_4 GPR64sp:$addr), 0xffffffff), - (SUBREG_TO_REG (i64 0), (LDAXRW GPR64sp:$addr), sub_32)>; - -// Store-exclusives. 
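[Editor's note, not part of the patch] The acquiring_load/relaxed_load and releasing_store/relaxed_store fragments above all reduce to the same ordering test on the node's AtomicOrdering. A minimal standalone sketch of that classification, using a local enum in place of LLVM's types (names here are illustrative):

#include <cassert>

// Local stand-in for the atomic ordering values used by the PatFrag
// predicates above.
enum class Ordering { Unordered, Monotonic, Acquire, Release,
                      AcquireRelease, SequentiallyConsistent };

// Mirrors acquiring_load: the load must use LDAR*.
bool needsAcquire(Ordering O) {
  assert(O != Ordering::AcquireRelease && "unexpected load ordering");
  return O == Ordering::Acquire || O == Ordering::SequentiallyConsistent;
}

// Mirrors releasing_store: the store must use STLR*.
bool needsRelease(Ordering O) {
  assert(O != Ordering::AcquireRelease && "unexpected store ordering");
  return O == Ordering::Release || O == Ordering::SequentiallyConsistent;
}

// Mirrors relaxed_load / relaxed_store: any addressing mode is acceptable.
bool isRelaxed(Ordering O) {
  return O == Ordering::Monotonic || O == Ordering::Unordered;
}

Only the relaxed forms are allowed the register-offset and unscaled addressing modes; the acquire/release forms are restricted to a plain GPR64sp base, as the comment at the top of each section notes.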
- -def stxr_1 : PatFrag<(ops node:$val, node:$ptr), - (int_arm64_stxr node:$val, node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i8; -}]>; - -def stxr_2 : PatFrag<(ops node:$val, node:$ptr), - (int_arm64_stxr node:$val, node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i16; -}]>; - -def stxr_4 : PatFrag<(ops node:$val, node:$ptr), - (int_arm64_stxr node:$val, node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i32; -}]>; - -def stxr_8 : PatFrag<(ops node:$val, node:$ptr), - (int_arm64_stxr node:$val, node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i64; -}]>; - - -def : Pat<(stxr_1 GPR64:$val, GPR64sp:$addr), - (STXRB (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; -def : Pat<(stxr_2 GPR64:$val, GPR64sp:$addr), - (STXRH (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; -def : Pat<(stxr_4 GPR64:$val, GPR64sp:$addr), - (STXRW (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; -def : Pat<(stxr_8 GPR64:$val, GPR64sp:$addr), - (STXRX GPR64:$val, GPR64sp:$addr)>; - -def : Pat<(stxr_1 (zext (and GPR32:$val, 0xff)), GPR64sp:$addr), - (STXRB GPR32:$val, GPR64sp:$addr)>; -def : Pat<(stxr_2 (zext (and GPR32:$val, 0xffff)), GPR64sp:$addr), - (STXRH GPR32:$val, GPR64sp:$addr)>; -def : Pat<(stxr_4 (zext GPR32:$val), GPR64sp:$addr), - (STXRW GPR32:$val, GPR64sp:$addr)>; - -def : Pat<(stxr_1 (and GPR64:$val, 0xff), GPR64sp:$addr), - (STXRB (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; -def : Pat<(stxr_2 (and GPR64:$val, 0xffff), GPR64sp:$addr), - (STXRH (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; -def : Pat<(stxr_4 (and GPR64:$val, 0xffffffff), GPR64sp:$addr), - (STXRW (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; - -// Store-release-exclusives. - -def stlxr_1 : PatFrag<(ops node:$val, node:$ptr), - (int_arm64_stlxr node:$val, node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i8; -}]>; - -def stlxr_2 : PatFrag<(ops node:$val, node:$ptr), - (int_arm64_stlxr node:$val, node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i16; -}]>; - -def stlxr_4 : PatFrag<(ops node:$val, node:$ptr), - (int_arm64_stlxr node:$val, node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i32; -}]>; - -def stlxr_8 : PatFrag<(ops node:$val, node:$ptr), - (int_arm64_stlxr node:$val, node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i64; -}]>; - - -def : Pat<(stlxr_1 GPR64:$val, GPR64sp:$addr), - (STLXRB (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; -def : Pat<(stlxr_2 GPR64:$val, GPR64sp:$addr), - (STLXRH (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; -def : Pat<(stlxr_4 GPR64:$val, GPR64sp:$addr), - (STLXRW (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; -def : Pat<(stlxr_8 GPR64:$val, GPR64sp:$addr), - (STLXRX GPR64:$val, GPR64sp:$addr)>; - -def : Pat<(stlxr_1 (zext (and GPR32:$val, 0xff)), GPR64sp:$addr), - (STLXRB GPR32:$val, GPR64sp:$addr)>; -def : Pat<(stlxr_2 (zext (and GPR32:$val, 0xffff)), GPR64sp:$addr), - (STLXRH GPR32:$val, GPR64sp:$addr)>; -def : Pat<(stlxr_4 (zext GPR32:$val), GPR64sp:$addr), - (STLXRW GPR32:$val, GPR64sp:$addr)>; - -def : Pat<(stlxr_1 (and GPR64:$val, 0xff), GPR64sp:$addr), - (STLXRB (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; -def : Pat<(stlxr_2 (and GPR64:$val, 0xffff), GPR64sp:$addr), - (STLXRH (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; -def : Pat<(stlxr_4 (and GPR64:$val, 0xffffffff), GPR64sp:$addr), - (STLXRW (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; - - -// And clear exclusive. 
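[Editor's note, not part of the patch] The int_arm64_ldxr/ldaxr/stxr/stlxr intrinsics matched above are the building blocks of exclusive-monitor loops. As a rough illustration of what such a loop implements at the source level (plain std::atomic, not the intrinsics themselves):

#include <atomic>
#include <cstdint>

// A sequentially consistent compare-and-swap.  Without a single-instruction
// CAS, AArch64 expands this to an exclusive loop of roughly this shape:
//     loop: ldaxr   x8, [x0]        // load-acquire-exclusive
//           cmp     x8, x1
//           b.ne    out             // value differs: give up
//           stlxr   w9, x2, [x0]    // store-release-exclusive, w9 = status
//           cbnz    w9, loop        // monitor lost: retry
//     out:  ...
// i.e. the shape the ldaxr/stlxr intrinsic patterns above exist to express.
bool cas64(std::atomic<uint64_t> &V, uint64_t Expected, uint64_t Desired) {
  return V.compare_exchange_strong(Expected, Desired,
                                   std::memory_order_seq_cst);
}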
- -def : Pat<(int_arm64_clrex), (CLREX 0xf)>; diff --git a/lib/Target/ARM64/ARM64InstrFormats.td b/lib/Target/ARM64/ARM64InstrFormats.td deleted file mode 100644 index ea45b3d4fb2..00000000000 --- a/lib/Target/ARM64/ARM64InstrFormats.td +++ /dev/null @@ -1,8574 +0,0 @@ -//===- ARM64InstrFormats.td - ARM64 Instruction Formats ------*- tblgen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -// Describe ARM64 instructions format here -// - -// Format specifies the encoding used by the instruction. This is part of the -// ad-hoc solution used to emit machine instruction encodings by our machine -// code emitter. -class Format val> { - bits<2> Value = val; -} - -def PseudoFrm : Format<0>; -def NormalFrm : Format<1>; // Do we need any others? - -// ARM64 Instruction Format -class ARM64Inst : Instruction { - field bits<32> Inst; // Instruction encoding. - // Mask of bits that cause an encoding to be UNPREDICTABLE. - // If a bit is set, then if the corresponding bit in the - // target encoding differs from its value in the "Inst" field, - // the instruction is UNPREDICTABLE (SoftFail in abstract parlance). - field bits<32> Unpredictable = 0; - // SoftFail is the generic name for this field, but we alias it so - // as to make it more obvious what it means in ARM-land. - field bits<32> SoftFail = Unpredictable; - let Namespace = "ARM64"; - Format F = f; - bits<2> Form = F.Value; - let Pattern = []; - let Constraints = cstr; -} - -// Pseudo instructions (don't have encoding information) -class Pseudo pattern, string cstr = ""> - : ARM64Inst { - dag OutOperandList = oops; - dag InOperandList = iops; - let Pattern = pattern; - let isCodeGenOnly = 1; -} - -// Real instructions (have encoding information) -class EncodedI pattern> : ARM64Inst { - let Pattern = pattern; - let Size = 4; -} - -// Normal instructions -class I pattern> - : EncodedI { - dag OutOperandList = oops; - dag InOperandList = iops; - let AsmString = !strconcat(asm, operands); -} - -class TriOpFrag : PatFrag<(ops node:$LHS, node:$MHS, node:$RHS), res>; -class BinOpFrag : PatFrag<(ops node:$LHS, node:$RHS), res>; -class UnOpFrag : PatFrag<(ops node:$LHS), res>; - -// Helper fragment for an extract of the high portion of a 128-bit vector. -def extract_high_v16i8 : - UnOpFrag<(extract_subvector (v16i8 node:$LHS), (i64 8))>; -def extract_high_v8i16 : - UnOpFrag<(extract_subvector (v8i16 node:$LHS), (i64 4))>; -def extract_high_v4i32 : - UnOpFrag<(extract_subvector (v4i32 node:$LHS), (i64 2))>; -def extract_high_v2i64 : - UnOpFrag<(extract_subvector (v2i64 node:$LHS), (i64 1))>; - -//===----------------------------------------------------------------------===// -// Asm Operand Classes. -// - -// Shifter operand for arithmetic shifted encodings. -def ShifterOperand : AsmOperandClass { - let Name = "Shifter"; -} - -// Shifter operand for mov immediate encodings. 
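[Editor's note, not part of the patch] The ARM64Inst/EncodedI base classes above build every instruction as a fixed 4-byte word assembled from "let Inst{hi-lo} = value" field assignments. A rough C++ analogue of that packing (not how TableGen actually implements it; the example layout is the branch-register format defined further below in this file):

#include <cassert>
#include <cstdint>

struct Encoding {
  uint32_t Inst = 0;

  // Equivalent of "let Inst{Hi-Lo} = Value".
  void setBits(unsigned Hi, unsigned Lo, uint32_t Value) {
    assert(Hi >= Lo && Hi < 32);
    unsigned Width = Hi - Lo + 1;
    uint32_t Mask = (Width == 32) ? ~0u : ((1u << Width) - 1u);
    assert((Value & ~Mask) == 0 && "field value too wide");
    Inst = (Inst & ~(Mask << Lo)) | (Value << Lo);
  }
};

// Illustrative only: the BaseBranchReg/BranchReg layout (BR/BLR/RET family).
uint32_t encodeBranchReg(uint32_t Opc4, uint32_t Rn) {
  Encoding E;
  E.setBits(31, 25, 0b1101011);
  E.setBits(24, 21, Opc4);
  E.setBits(20, 16, 0b11111);
  E.setBits(15, 10, 0b000000);
  E.setBits(9, 5, Rn);
  E.setBits(4, 0, 0b00000);
  return E.Inst;
}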
-def MovImm32ShifterOperand : AsmOperandClass { - let SuperClasses = [ShifterOperand]; - let Name = "MovImm32Shifter"; - let RenderMethod = "addShifterOperands"; - let DiagnosticType = "InvalidMovImm32Shift"; -} -def MovImm64ShifterOperand : AsmOperandClass { - let SuperClasses = [ShifterOperand]; - let Name = "MovImm64Shifter"; - let RenderMethod = "addShifterOperands"; - let DiagnosticType = "InvalidMovImm64Shift"; -} - -// Shifter operand for arithmetic register shifted encodings. -class ArithmeticShifterOperand : AsmOperandClass { - let SuperClasses = [ShifterOperand]; - let Name = "ArithmeticShifter" # width; - let PredicateMethod = "isArithmeticShifter<" # width # ">"; - let RenderMethod = "addShifterOperands"; - let DiagnosticType = "AddSubRegShift" # width; -} - -def ArithmeticShifterOperand32 : ArithmeticShifterOperand<32>; -def ArithmeticShifterOperand64 : ArithmeticShifterOperand<64>; - -// Shifter operand for logical register shifted encodings. -class LogicalShifterOperand : AsmOperandClass { - let SuperClasses = [ShifterOperand]; - let Name = "LogicalShifter" # width; - let PredicateMethod = "isLogicalShifter<" # width # ">"; - let RenderMethod = "addShifterOperands"; - let DiagnosticType = "AddSubRegShift" # width; -} - -def LogicalShifterOperand32 : LogicalShifterOperand<32>; -def LogicalShifterOperand64 : LogicalShifterOperand<64>; - -// Shifter operand for logical vector 128/64-bit shifted encodings. -def LogicalVecShifterOperand : AsmOperandClass { - let SuperClasses = [ShifterOperand]; - let Name = "LogicalVecShifter"; - let RenderMethod = "addShifterOperands"; -} -def LogicalVecHalfWordShifterOperand : AsmOperandClass { - let SuperClasses = [LogicalVecShifterOperand]; - let Name = "LogicalVecHalfWordShifter"; - let RenderMethod = "addShifterOperands"; -} - -// The "MSL" shifter on the vector MOVI instruction. -def MoveVecShifterOperand : AsmOperandClass { - let SuperClasses = [ShifterOperand]; - let Name = "MoveVecShifter"; - let RenderMethod = "addShifterOperands"; -} - -// Extend operand for arithmetic encodings. -def ExtendOperand : AsmOperandClass { - let Name = "Extend"; - let DiagnosticType = "AddSubRegExtendLarge"; -} -def ExtendOperand64 : AsmOperandClass { - let SuperClasses = [ExtendOperand]; - let Name = "Extend64"; - let DiagnosticType = "AddSubRegExtendSmall"; -} -// 'extend' that's a lsl of a 64-bit register. -def ExtendOperandLSL64 : AsmOperandClass { - let SuperClasses = [ExtendOperand]; - let Name = "ExtendLSL64"; - let RenderMethod = "addExtend64Operands"; - let DiagnosticType = "AddSubRegExtendLarge"; -} - -// 8-bit floating-point immediate encodings. -def FPImmOperand : AsmOperandClass { - let Name = "FPImm"; - let ParserMethod = "tryParseFPImm"; - let DiagnosticType = "InvalidFPImm"; -} - -def CondCode : AsmOperandClass { - let Name = "CondCode"; - let DiagnosticType = "InvalidCondCode"; -} - -// A 32-bit register pasrsed as 64-bit -def GPR32as64Operand : AsmOperandClass { - let Name = "GPR32as64"; -} -def GPR32as64 : RegisterOperand { - let ParserMatchClass = GPR32as64Operand; -} - -// 8-bit immediate for AdvSIMD where 64-bit values of the form: -// aaaaaaaa bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh -// are encoded as the eight bit value 'abcdefgh'. -def SIMDImmType10Operand : AsmOperandClass { let Name = "SIMDImmType10"; } - - -//===----------------------------------------------------------------------===// -// Operand Definitions. -// - -// ADR[P] instruction labels. 
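[Editor's note, not part of the patch] The adrlabel/adrplabel operands defined just below (and the ADRI format class later in this file, with its "page" bit) encode PC-relative targets for ADR and ADRP. A short sketch of the address arithmetic they stand for:

#include <cstdint>

// ADR forms a PC-relative address directly: PC + simm21.
uint64_t adrTarget(uint64_t PC, int64_t Imm21) {
  return PC + Imm21;
}

// ADRP forms the 4 KiB page address of a PC-relative target: the 21-bit
// immediate selects a page and the low 12 bits of the PC are ignored, so a
// full address is normally completed by a following ADD/LDR carrying the
// low-12-bit offset.
uint64_t adrpTarget(uint64_t PC, int64_t Imm21) {
  return (PC & ~uint64_t(0xfff)) + (uint64_t(Imm21) << 12);
}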
-def AdrpOperand : AsmOperandClass { - let Name = "AdrpLabel"; - let ParserMethod = "tryParseAdrpLabel"; - let DiagnosticType = "InvalidLabel"; -} -def adrplabel : Operand { - let EncoderMethod = "getAdrLabelOpValue"; - let PrintMethod = "printAdrpLabel"; - let ParserMatchClass = AdrpOperand; -} - -def AdrOperand : AsmOperandClass { - let Name = "AdrLabel"; - let ParserMethod = "tryParseAdrLabel"; - let DiagnosticType = "InvalidLabel"; -} -def adrlabel : Operand { - let EncoderMethod = "getAdrLabelOpValue"; - let ParserMatchClass = AdrOperand; -} - -// simm9 predicate - True if the immediate is in the range [-256, 255]. -def SImm9Operand : AsmOperandClass { - let Name = "SImm9"; - let DiagnosticType = "InvalidMemoryIndexedSImm9"; -} -def simm9 : Operand, ImmLeaf= -256 && Imm < 256; }]> { - let ParserMatchClass = SImm9Operand; -} - -// simm7sN predicate - True if the immediate is a multiple of N in the range -// [-64 * N, 63 * N]. -class SImm7Scaled : AsmOperandClass { - let Name = "SImm7s" # Scale; - let DiagnosticType = "InvalidMemoryIndexed" # Scale # "SImm7"; -} - -def SImm7s4Operand : SImm7Scaled<4>; -def SImm7s8Operand : SImm7Scaled<8>; -def SImm7s16Operand : SImm7Scaled<16>; - -def simm7s4 : Operand { - let ParserMatchClass = SImm7s4Operand; - let PrintMethod = "printImmScale<4>"; -} - -def simm7s8 : Operand { - let ParserMatchClass = SImm7s8Operand; - let PrintMethod = "printImmScale<8>"; -} - -def simm7s16 : Operand { - let ParserMatchClass = SImm7s16Operand; - let PrintMethod = "printImmScale<16>"; -} - -class AsmImmRange : AsmOperandClass { - let Name = "Imm" # Low # "_" # High; - let DiagnosticType = "InvalidImm" # Low # "_" # High; -} - -def Imm1_8Operand : AsmImmRange<1, 8>; -def Imm1_16Operand : AsmImmRange<1, 16>; -def Imm1_32Operand : AsmImmRange<1, 32>; -def Imm1_64Operand : AsmImmRange<1, 64>; - -def MovZSymbolG3AsmOperand : AsmOperandClass { - let Name = "MovZSymbolG3"; - let RenderMethod = "addImmOperands"; -} - -def movz_symbol_g3 : Operand { - let ParserMatchClass = MovZSymbolG3AsmOperand; -} - -def MovZSymbolG2AsmOperand : AsmOperandClass { - let Name = "MovZSymbolG2"; - let RenderMethod = "addImmOperands"; -} - -def movz_symbol_g2 : Operand { - let ParserMatchClass = MovZSymbolG2AsmOperand; -} - -def MovZSymbolG1AsmOperand : AsmOperandClass { - let Name = "MovZSymbolG1"; - let RenderMethod = "addImmOperands"; -} - -def movz_symbol_g1 : Operand { - let ParserMatchClass = MovZSymbolG1AsmOperand; -} - -def MovZSymbolG0AsmOperand : AsmOperandClass { - let Name = "MovZSymbolG0"; - let RenderMethod = "addImmOperands"; -} - -def movz_symbol_g0 : Operand { - let ParserMatchClass = MovZSymbolG0AsmOperand; -} - -def MovKSymbolG3AsmOperand : AsmOperandClass { - let Name = "MovKSymbolG3"; - let RenderMethod = "addImmOperands"; -} - -def movk_symbol_g3 : Operand { - let ParserMatchClass = MovKSymbolG3AsmOperand; -} - -def MovKSymbolG2AsmOperand : AsmOperandClass { - let Name = "MovKSymbolG2"; - let RenderMethod = "addImmOperands"; -} - -def movk_symbol_g2 : Operand { - let ParserMatchClass = MovKSymbolG2AsmOperand; -} - -def MovKSymbolG1AsmOperand : AsmOperandClass { - let Name = "MovKSymbolG1"; - let RenderMethod = "addImmOperands"; -} - -def movk_symbol_g1 : Operand { - let ParserMatchClass = MovKSymbolG1AsmOperand; -} - -def MovKSymbolG0AsmOperand : AsmOperandClass { - let Name = "MovKSymbolG0"; - let RenderMethod = "addImmOperands"; -} - -def movk_symbol_g0 : Operand { - let ParserMatchClass = MovKSymbolG0AsmOperand; -} - -class fixedpoint_i32 - : Operand, - ComplexPattern", 
[fpimm, ld]> { - let EncoderMethod = "getFixedPointScaleOpValue"; - let DecoderMethod = "DecodeFixedPointScaleImm32"; - let ParserMatchClass = Imm1_32Operand; -} - -class fixedpoint_i64 - : Operand, - ComplexPattern", [fpimm, ld]> { - let EncoderMethod = "getFixedPointScaleOpValue"; - let DecoderMethod = "DecodeFixedPointScaleImm64"; - let ParserMatchClass = Imm1_64Operand; -} - -def fixedpoint_f32_i32 : fixedpoint_i32; -def fixedpoint_f64_i32 : fixedpoint_i32; - -def fixedpoint_f32_i64 : fixedpoint_i64; -def fixedpoint_f64_i64 : fixedpoint_i64; - -def vecshiftR8 : Operand, ImmLeaf 0) && (((uint32_t)Imm) < 9); -}]> { - let EncoderMethod = "getVecShiftR8OpValue"; - let DecoderMethod = "DecodeVecShiftR8Imm"; - let ParserMatchClass = Imm1_8Operand; -} -def vecshiftR16 : Operand, ImmLeaf 0) && (((uint32_t)Imm) < 17); -}]> { - let EncoderMethod = "getVecShiftR16OpValue"; - let DecoderMethod = "DecodeVecShiftR16Imm"; - let ParserMatchClass = Imm1_16Operand; -} -def vecshiftR16Narrow : Operand, ImmLeaf 0) && (((uint32_t)Imm) < 9); -}]> { - let EncoderMethod = "getVecShiftR16OpValue"; - let DecoderMethod = "DecodeVecShiftR16ImmNarrow"; - let ParserMatchClass = Imm1_8Operand; -} -def vecshiftR32 : Operand, ImmLeaf 0) && (((uint32_t)Imm) < 33); -}]> { - let EncoderMethod = "getVecShiftR32OpValue"; - let DecoderMethod = "DecodeVecShiftR32Imm"; - let ParserMatchClass = Imm1_32Operand; -} -def vecshiftR32Narrow : Operand, ImmLeaf 0) && (((uint32_t)Imm) < 17); -}]> { - let EncoderMethod = "getVecShiftR32OpValue"; - let DecoderMethod = "DecodeVecShiftR32ImmNarrow"; - let ParserMatchClass = Imm1_16Operand; -} -def vecshiftR64 : Operand, ImmLeaf 0) && (((uint32_t)Imm) < 65); -}]> { - let EncoderMethod = "getVecShiftR64OpValue"; - let DecoderMethod = "DecodeVecShiftR64Imm"; - let ParserMatchClass = Imm1_64Operand; -} -def vecshiftR64Narrow : Operand, ImmLeaf 0) && (((uint32_t)Imm) < 33); -}]> { - let EncoderMethod = "getVecShiftR64OpValue"; - let DecoderMethod = "DecodeVecShiftR64ImmNarrow"; - let ParserMatchClass = Imm1_32Operand; -} - -def Imm0_7Operand : AsmImmRange<0, 7>; -def Imm0_15Operand : AsmImmRange<0, 15>; -def Imm0_31Operand : AsmImmRange<0, 31>; -def Imm0_63Operand : AsmImmRange<0, 63>; - -def vecshiftL8 : Operand, ImmLeaf { - let EncoderMethod = "getVecShiftL8OpValue"; - let DecoderMethod = "DecodeVecShiftL8Imm"; - let ParserMatchClass = Imm0_7Operand; -} -def vecshiftL16 : Operand, ImmLeaf { - let EncoderMethod = "getVecShiftL16OpValue"; - let DecoderMethod = "DecodeVecShiftL16Imm"; - let ParserMatchClass = Imm0_15Operand; -} -def vecshiftL32 : Operand, ImmLeaf { - let EncoderMethod = "getVecShiftL32OpValue"; - let DecoderMethod = "DecodeVecShiftL32Imm"; - let ParserMatchClass = Imm0_31Operand; -} -def vecshiftL64 : Operand, ImmLeaf { - let EncoderMethod = "getVecShiftL64OpValue"; - let DecoderMethod = "DecodeVecShiftL64Imm"; - let ParserMatchClass = Imm0_63Operand; -} - - -// Crazy immediate formats used by 32-bit and 64-bit logical immediate -// instructions for splatting repeating bit patterns across the immediate. 
-def logical_imm32_XFORM : SDNodeXFormgetZExtValue(), 32); - return CurDAG->getTargetConstant(enc, MVT::i32); -}]>; -def logical_imm64_XFORM : SDNodeXFormgetZExtValue(), 64); - return CurDAG->getTargetConstant(enc, MVT::i32); -}]>; - -def LogicalImm32Operand : AsmOperandClass { - let Name = "LogicalImm32"; - let DiagnosticType = "LogicalSecondSource"; -} -def LogicalImm64Operand : AsmOperandClass { - let Name = "LogicalImm64"; - let DiagnosticType = "LogicalSecondSource"; -} -def logical_imm32 : Operand, PatLeaf<(imm), [{ - return ARM64_AM::isLogicalImmediate(N->getZExtValue(), 32); -}], logical_imm32_XFORM> { - let PrintMethod = "printLogicalImm32"; - let ParserMatchClass = LogicalImm32Operand; -} -def logical_imm64 : Operand, PatLeaf<(imm), [{ - return ARM64_AM::isLogicalImmediate(N->getZExtValue(), 64); -}], logical_imm64_XFORM> { - let PrintMethod = "printLogicalImm64"; - let ParserMatchClass = LogicalImm64Operand; -} - -// imm0_65535 predicate - True if the immediate is in the range [0,65535]. -def Imm0_65535Operand : AsmImmRange<0, 65535>; -def imm0_65535 : Operand, ImmLeaf { - let ParserMatchClass = Imm0_65535Operand; - let PrintMethod = "printHexImm"; -} - -// imm0_255 predicate - True if the immediate is in the range [0,255]. -def Imm0_255Operand : AsmOperandClass { let Name = "Imm0_255"; } -def imm0_255 : Operand, ImmLeaf { - let ParserMatchClass = Imm0_255Operand; - let PrintMethod = "printHexImm"; -} - -// imm0_127 predicate - True if the immediate is in the range [0,127] -def Imm0_127Operand : AsmImmRange<0, 127>; -def imm0_127 : Operand, ImmLeaf { - let ParserMatchClass = Imm0_127Operand; - let PrintMethod = "printHexImm"; -} - -// NOTE: These imm0_N operands have to be of type i64 because i64 is the size -// for all shift-amounts. - -// imm0_63 predicate - True if the immediate is in the range [0,63] -def imm0_63 : Operand, ImmLeaf { - let ParserMatchClass = Imm0_63Operand; -} - -// imm0_31 predicate - True if the immediate is in the range [0,31] -def imm0_31 : Operand, ImmLeaf { - let ParserMatchClass = Imm0_31Operand; -} - -// imm0_15 predicate - True if the immediate is in the range [0,15] -def imm0_15 : Operand, ImmLeaf { - let ParserMatchClass = Imm0_15Operand; -} - -// imm0_7 predicate - True if the immediate is in the range [0,7] -def imm0_7 : Operand, ImmLeaf { - let ParserMatchClass = Imm0_7Operand; -} - -// An arithmetic shifter operand: -// {7-6} - shift type: 00 = lsl, 01 = lsr, 10 = asr -// {5-0} - imm6 -class arith_shift : Operand { - let PrintMethod = "printShifter"; - let ParserMatchClass = !cast( - "ArithmeticShifterOperand" # width); -} - -def arith_shift32 : arith_shift; -def arith_shift64 : arith_shift; - -class arith_shifted_reg - : Operand, - ComplexPattern { - let PrintMethod = "printShiftedRegister"; - let MIOperandInfo = (ops regclass, !cast("arith_shift" # width)); -} - -def arith_shifted_reg32 : arith_shifted_reg; -def arith_shifted_reg64 : arith_shifted_reg; - -// An arithmetic shifter operand: -// {7-6} - shift type: 00 = lsl, 01 = lsr, 10 = asr, 11 = ror -// {5-0} - imm6 -class logical_shift : Operand { - let PrintMethod = "printShifter"; - let ParserMatchClass = !cast( - "LogicalShifterOperand" # width); -} - -def logical_shift32 : logical_shift<32>; -def logical_shift64 : logical_shift<64>; - -class logical_shifted_reg - : Operand, - ComplexPattern { - let PrintMethod = "printShiftedRegister"; - let MIOperandInfo = (ops regclass, shiftop); -} - -def logical_shifted_reg32 : logical_shifted_reg; -def logical_shifted_reg64 : logical_shifted_reg; 
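[Editor's note, not part of the patch] The logical_imm32/logical_imm64 leaves above gate on ARM64_AM::isLogicalImmediate. A rough standalone equivalent of that test, under my reading of the encoding (the in-tree helper also produces the actual N:immr:imms value, which this sketch omits):

#include <cstdint>

// A value of the form 0...01...1 ?
static bool isMask(uint64_t V) { return V != 0 && ((V + 1) & V) == 0; }
// A single contiguous, non-wrapping run of ones somewhere in the word?
static bool isShiftedMask(uint64_t V) { return V != 0 && isMask((V - 1) | V); }

// Encodable as a logical immediate of the given register width: the value
// must be a replication, at some power-of-two element size, of an element
// that is a rotated run of ones, and must not be all-zeros or all-ones.
bool isLogicalImm(uint64_t Val, unsigned RegWidth) {
  uint64_t RegMask = (RegWidth == 64) ? ~0ULL : ((1ULL << RegWidth) - 1);
  Val &= RegMask;
  if (Val == 0 || Val == RegMask)
    return false;

  // Find the smallest power-of-two element size that replicates to Val.
  unsigned Size = RegWidth;
  while (Size > 2) {
    unsigned Half = Size / 2;
    uint64_t HalfMask = (1ULL << Half) - 1;
    if ((Val & HalfMask) != ((Val >> Half) & HalfMask))
      break;
    Size = Half;
  }

  // Within that element, either the ones or the zeros form one contiguous
  // run (a wrapping run of ones has contiguous zeros, and vice versa).
  uint64_t ElemMask = (Size == 64) ? ~0ULL : ((1ULL << Size) - 1);
  uint64_t Elem = Val & ElemMask;
  return isShiftedMask(Elem) || isShiftedMask(~Elem & ElemMask);
}

For example, isLogicalImm(0x00ff00ff00ff00ffULL, 64) and isLogicalImm(0xfffffff0u, 32) hold, while isLogicalImm(0xabcdefULL, 64) does not.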
- -// A logical vector shifter operand: -// {7-6} - shift type: 00 = lsl -// {5-0} - imm6: #0, #8, #16, or #24 -def logical_vec_shift : Operand { - let PrintMethod = "printShifter"; - let EncoderMethod = "getVecShifterOpValue"; - let ParserMatchClass = LogicalVecShifterOperand; -} - -// A logical vector half-word shifter operand: -// {7-6} - shift type: 00 = lsl -// {5-0} - imm6: #0 or #8 -def logical_vec_hw_shift : Operand { - let PrintMethod = "printShifter"; - let EncoderMethod = "getVecShifterOpValue"; - let ParserMatchClass = LogicalVecHalfWordShifterOperand; -} - -// A vector move shifter operand: -// {0} - imm1: #8 or #16 -def move_vec_shift : Operand { - let PrintMethod = "printShifter"; - let EncoderMethod = "getMoveVecShifterOpValue"; - let ParserMatchClass = MoveVecShifterOperand; -} - -def AddSubImmOperand : AsmOperandClass { - let Name = "AddSubImm"; - let ParserMethod = "tryParseAddSubImm"; - let DiagnosticType = "AddSubSecondSource"; -} -// An ADD/SUB immediate shifter operand: -// second operand: -// {7-6} - shift type: 00 = lsl -// {5-0} - imm6: #0 or #12 -class addsub_shifted_imm - : Operand, ComplexPattern { - let PrintMethod = "printAddSubImm"; - let EncoderMethod = "getAddSubImmOpValue"; - let ParserMatchClass = AddSubImmOperand; - let MIOperandInfo = (ops i32imm, i32imm); -} - -def addsub_shifted_imm32 : addsub_shifted_imm; -def addsub_shifted_imm64 : addsub_shifted_imm; - -class neg_addsub_shifted_imm - : Operand, ComplexPattern { - let PrintMethod = "printAddSubImm"; - let EncoderMethod = "getAddSubImmOpValue"; - let ParserMatchClass = AddSubImmOperand; - let MIOperandInfo = (ops i32imm, i32imm); -} - -def neg_addsub_shifted_imm32 : neg_addsub_shifted_imm; -def neg_addsub_shifted_imm64 : neg_addsub_shifted_imm; - -// An extend operand: -// {5-3} - extend type -// {2-0} - imm3 -def arith_extend : Operand { - let PrintMethod = "printArithExtend"; - let ParserMatchClass = ExtendOperand; -} -def arith_extend64 : Operand { - let PrintMethod = "printArithExtend"; - let ParserMatchClass = ExtendOperand64; -} - -// 'extend' that's a lsl of a 64-bit register. -def arith_extendlsl64 : Operand { - let PrintMethod = "printArithExtend"; - let ParserMatchClass = ExtendOperandLSL64; -} - -class arith_extended_reg32 : Operand, - ComplexPattern { - let PrintMethod = "printExtendedRegister"; - let MIOperandInfo = (ops GPR32, arith_extend); -} - -class arith_extended_reg32to64 : Operand, - ComplexPattern { - let PrintMethod = "printExtendedRegister"; - let MIOperandInfo = (ops GPR32, arith_extend64); -} - -// Floating-point immediate. 
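[Editor's note, not part of the patch] The fpimm32/fpimm64 operands that follow accept only constants representable as the 8-bit FMOV immediate (ARM64_AM::getFP32Imm/getFP64Imm return -1 otherwise). A standalone sketch of that format, assuming the standard expand-imm layout:

#include <cstdint>
#include <cstring>

// Expand an 8-bit immediate "abcdefgh" to the single-precision value it
// denotes: sign = a, exponent = NOT(b):b:b:b:b:b:c:d, mantissa = efgh0...0.
// Representable values are +/- (16..31)/16 * 2^(-3..4).
float expandFPImm8(uint8_t Imm) {
  uint32_t A = (Imm >> 7) & 1, B = (Imm >> 6) & 1;
  uint32_t CD = (Imm >> 4) & 3, EFGH = Imm & 0xf;
  uint32_t Exp = ((B ^ 1) << 7) | (B ? 0x7c : 0) | CD; // NOT(b), b x5, c, d
  uint32_t Bits = (A << 31) | (Exp << 23) | (EFGH << 19);
  float F;
  std::memcpy(&F, &Bits, sizeof F);
  return F;
}

// Brute-force version of the "is this constant encodable?" query, keeping the
// -1 convention used by the PatLeafs below.
int getFP32ImmApprox(float V) {
  for (int Imm = 0; Imm < 256; ++Imm)
    if (expandFPImm8(uint8_t(Imm)) == V)
      return Imm;
  return -1;
}

For instance, expandFPImm8(0x70) is 1.0f, so getFP32ImmApprox(1.0f) returns 0x70; +0.0 is deliberately not encodable here, which is why a separate fpimm0 leaf exists below.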
-def fpimm32 : Operand, - PatLeaf<(f32 fpimm), [{ - return ARM64_AM::getFP32Imm(N->getValueAPF()) != -1; - }], SDNodeXFormgetValueAPF(); - uint32_t enc = ARM64_AM::getFP32Imm(InVal); - return CurDAG->getTargetConstant(enc, MVT::i32); - }]>> { - let ParserMatchClass = FPImmOperand; - let PrintMethod = "printFPImmOperand"; -} -def fpimm64 : Operand, - PatLeaf<(f64 fpimm), [{ - return ARM64_AM::getFP64Imm(N->getValueAPF()) != -1; - }], SDNodeXFormgetValueAPF(); - uint32_t enc = ARM64_AM::getFP64Imm(InVal); - return CurDAG->getTargetConstant(enc, MVT::i32); - }]>> { - let ParserMatchClass = FPImmOperand; - let PrintMethod = "printFPImmOperand"; -} - -def fpimm8 : Operand { - let ParserMatchClass = FPImmOperand; - let PrintMethod = "printFPImmOperand"; -} - -def fpimm0 : PatLeaf<(fpimm), [{ - return N->isExactlyValue(+0.0); -}]>; - -// Vector lane operands -class AsmVectorIndex : AsmOperandClass { - let Name = "VectorIndex" # Suffix; - let DiagnosticType = "InvalidIndex" # Suffix; -} -def VectorIndex1Operand : AsmVectorIndex<"1">; -def VectorIndexBOperand : AsmVectorIndex<"B">; -def VectorIndexHOperand : AsmVectorIndex<"H">; -def VectorIndexSOperand : AsmVectorIndex<"S">; -def VectorIndexDOperand : AsmVectorIndex<"D">; - -def VectorIndex1 : Operand, ImmLeaf { - let ParserMatchClass = VectorIndex1Operand; - let PrintMethod = "printVectorIndex"; - let MIOperandInfo = (ops i64imm); -} -def VectorIndexB : Operand, ImmLeaf { - let ParserMatchClass = VectorIndexBOperand; - let PrintMethod = "printVectorIndex"; - let MIOperandInfo = (ops i64imm); -} -def VectorIndexH : Operand, ImmLeaf { - let ParserMatchClass = VectorIndexHOperand; - let PrintMethod = "printVectorIndex"; - let MIOperandInfo = (ops i64imm); -} -def VectorIndexS : Operand, ImmLeaf { - let ParserMatchClass = VectorIndexSOperand; - let PrintMethod = "printVectorIndex"; - let MIOperandInfo = (ops i64imm); -} -def VectorIndexD : Operand, ImmLeaf { - let ParserMatchClass = VectorIndexDOperand; - let PrintMethod = "printVectorIndex"; - let MIOperandInfo = (ops i64imm); -} - -// 8-bit immediate for AdvSIMD where 64-bit values of the form: -// aaaaaaaa bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh -// are encoded as the eight bit value 'abcdefgh'. -def simdimmtype10 : Operand, - PatLeaf<(f64 fpimm), [{ - return ARM64_AM::isAdvSIMDModImmType10(N->getValueAPF() - .bitcastToAPInt() - .getZExtValue()); - }], SDNodeXFormgetValueAPF(); - uint32_t enc = ARM64_AM::encodeAdvSIMDModImmType10(N->getValueAPF() - .bitcastToAPInt() - .getZExtValue()); - return CurDAG->getTargetConstant(enc, MVT::i32); - }]>> { - let ParserMatchClass = SIMDImmType10Operand; - let PrintMethod = "printSIMDType10Operand"; -} - - -//--- -// System management -//--- - -// Base encoding for system instruction operands. -let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in -class BaseSystemI - : I { - let Inst{31-22} = 0b1101010100; - let Inst{21} = L; -} - -// System instructions which do not have an Rt register. -class SimpleSystemI - : BaseSystemI { - let Inst{4-0} = 0b11111; -} - -// System instructions which have an Rt register. -class RtSystemI - : BaseSystemI, - Sched<[WriteSys]> { - bits<5> Rt; - let Inst{4-0} = Rt; -} - -// Hint instructions that take both a CRm and a 3-bit immediate. -class HintI - : SimpleSystemI<0, (ins imm0_127:$imm), mnemonic#" $imm", "">, - Sched<[WriteHint]> { - bits <7> imm; - let Inst{20-12} = 0b000110010; - let Inst{11-5} = imm; -} - -// System instructions taking a single literal operand which encodes into -// CRm. 
op2 differentiates the opcodes. -def BarrierAsmOperand : AsmOperandClass { - let Name = "Barrier"; - let ParserMethod = "tryParseBarrierOperand"; -} -def barrier_op : Operand { - let PrintMethod = "printBarrierOption"; - let ParserMatchClass = BarrierAsmOperand; -} -class CRmSystemI opc, string asm> - : SimpleSystemI<0, (ins crmtype:$CRm), asm, "\t$CRm">, - Sched<[WriteBarrier]> { - bits<4> CRm; - let Inst{20-12} = 0b000110011; - let Inst{11-8} = CRm; - let Inst{7-5} = opc; -} - -// MRS/MSR system instructions. These have different operand classes because -// a different subset of registers can be accessed through each instruction. -def MRSSystemRegisterOperand : AsmOperandClass { - let Name = "MRSSystemRegister"; - let ParserMethod = "tryParseSysReg"; - let DiagnosticType = "MRS"; -} -// concatenation of 1, op0, op1, CRn, CRm, op2. 16-bit immediate. -def mrs_sysreg_op : Operand { - let ParserMatchClass = MRSSystemRegisterOperand; - let DecoderMethod = "DecodeMRSSystemRegister"; - let PrintMethod = "printMRSSystemRegister"; -} - -def MSRSystemRegisterOperand : AsmOperandClass { - let Name = "MSRSystemRegister"; - let ParserMethod = "tryParseSysReg"; - let DiagnosticType = "MSR"; -} -def msr_sysreg_op : Operand { - let ParserMatchClass = MSRSystemRegisterOperand; - let DecoderMethod = "DecodeMSRSystemRegister"; - let PrintMethod = "printMSRSystemRegister"; -} - -class MRSI : RtSystemI<1, (outs GPR64:$Rt), (ins mrs_sysreg_op:$systemreg), - "mrs", "\t$Rt, $systemreg"> { - bits<15> systemreg; - let Inst{20} = 1; - let Inst{19-5} = systemreg; -} - -// FIXME: Some of these def NZCV, others don't. Best way to model that? -// Explicitly modeling each of the system register as a register class -// would do it, but feels like overkill at this point. -class MSRI : RtSystemI<0, (outs), (ins msr_sysreg_op:$systemreg, GPR64:$Rt), - "msr", "\t$systemreg, $Rt"> { - bits<15> systemreg; - let Inst{20} = 1; - let Inst{19-5} = systemreg; -} - -def SystemPStateFieldOperand : AsmOperandClass { - let Name = "SystemPStateField"; - let ParserMethod = "tryParseSysReg"; -} -def pstatefield_op : Operand { - let ParserMatchClass = SystemPStateFieldOperand; - let PrintMethod = "printSystemPStateField"; -} - -let Defs = [NZCV] in -class MSRpstateI - : SimpleSystemI<0, (ins pstatefield_op:$pstate_field, imm0_15:$imm), - "msr", "\t$pstate_field, $imm">, - Sched<[WriteSys]> { - bits<6> pstatefield; - bits<4> imm; - let Inst{20-19} = 0b00; - let Inst{18-16} = pstatefield{5-3}; - let Inst{15-12} = 0b0100; - let Inst{11-8} = imm; - let Inst{7-5} = pstatefield{2-0}; - - let DecoderMethod = "DecodeSystemPStateInstruction"; -} - -// SYS and SYSL generic system instructions. 
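[Editor's note, not part of the patch] The MRS/MSR operand classes above describe the system-register immediate as "concatenation of 1, op0, op1, CRn, CRm, op2", with the leading 1 landing in Inst{20} and the remaining 15 bits in Inst{19-5}. A small packing helper following that comment (the sysreg name below is just an example):

#include <cassert>
#include <cstdint>

// Pack op0:op1:CRn:CRm:op2 into the 16-bit form 1:op0:op1:CRn:CRm:op2, where
// "op0" here is the low bit of the architectural 2-bit op0 (always 0b1x for
// MRS/MSR-visible registers).
uint16_t packSysReg(unsigned Op0, unsigned Op1, unsigned CRn, unsigned CRm,
                    unsigned Op2) {
  assert((Op0 == 2 || Op0 == 3) && Op1 < 8 && CRn < 16 && CRm < 16 && Op2 < 8);
  unsigned V = (1u << 15) | ((Op0 & 1u) << 14) | (Op1 << 11) | (CRn << 7) |
               (CRm << 3) | Op2;
  return uint16_t(V);
}

// Example: NZCV is op0=3, op1=3, CRn=4, CRm=2, op2=0, so packSysReg(3,3,4,2,0)
// yields the immediate behind "mrs x0, NZCV".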
-def SysCRAsmOperand : AsmOperandClass { - let Name = "SysCR"; - let ParserMethod = "tryParseSysCROperand"; -} - -def sys_cr_op : Operand { - let PrintMethod = "printSysCROperand"; - let ParserMatchClass = SysCRAsmOperand; -} - -class SystemXtI - : RtSystemI { - bits<3> op1; - bits<4> Cn; - bits<4> Cm; - bits<3> op2; - let Inst{20-19} = 0b01; - let Inst{18-16} = op1; - let Inst{15-12} = Cn; - let Inst{11-8} = Cm; - let Inst{7-5} = op2; -} - -class SystemLXtI - : RtSystemI { - bits<3> op1; - bits<4> Cn; - bits<4> Cm; - bits<3> op2; - let Inst{20-19} = 0b01; - let Inst{18-16} = op1; - let Inst{15-12} = Cn; - let Inst{11-8} = Cm; - let Inst{7-5} = op2; -} - - -// Branch (register) instructions: -// -// case opc of -// 0001 blr -// 0000 br -// 0101 dret -// 0100 eret -// 0010 ret -// otherwise UNDEFINED -class BaseBranchReg opc, dag oops, dag iops, string asm, - string operands, list pattern> - : I, Sched<[WriteBrReg]> { - let Inst{31-25} = 0b1101011; - let Inst{24-21} = opc; - let Inst{20-16} = 0b11111; - let Inst{15-10} = 0b000000; - let Inst{4-0} = 0b00000; -} - -class BranchReg opc, string asm, list pattern> - : BaseBranchReg { - bits<5> Rn; - let Inst{9-5} = Rn; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 1, isReturn = 1 in -class SpecialReturn opc, string asm> - : BaseBranchReg { - let Inst{9-5} = 0b11111; -} - -//--- -// Conditional branch instruction. -//--- - -// Condition code. -// 4-bit immediate. Pretty-printed as -def ccode : Operand { - let PrintMethod = "printCondCode"; - let ParserMatchClass = CondCode; -} -def inv_ccode : Operand { - let PrintMethod = "printInverseCondCode"; - let ParserMatchClass = CondCode; -} - -// Conditional branch target. 19-bit immediate. The low two bits of the target -// offset are implied zero and so are not part of the immediate. -def PCRelLabel19Operand : AsmOperandClass { - let Name = "PCRelLabel19"; - let DiagnosticType = "InvalidLabel"; -} -def am_brcond : Operand { - let EncoderMethod = "getCondBranchTargetOpValue"; - let DecoderMethod = "DecodePCRelLabel19"; - let PrintMethod = "printAlignedLabel"; - let ParserMatchClass = PCRelLabel19Operand; -} - -class BranchCond : I<(outs), (ins ccode:$cond, am_brcond:$target), - "b", ".$cond\t$target", "", - [(ARM64brcond bb:$target, imm:$cond, NZCV)]>, - Sched<[WriteBr]> { - let isBranch = 1; - let isTerminator = 1; - let Uses = [NZCV]; - - bits<4> cond; - bits<19> target; - let Inst{31-24} = 0b01010100; - let Inst{23-5} = target; - let Inst{4} = 0; - let Inst{3-0} = cond; -} - -//--- -// Compare-and-branch instructions. -//--- -class BaseCmpBranch - : I<(outs), (ins regtype:$Rt, am_brcond:$target), - asm, "\t$Rt, $target", "", - [(node regtype:$Rt, bb:$target)]>, - Sched<[WriteBr]> { - let isBranch = 1; - let isTerminator = 1; - - bits<5> Rt; - bits<19> target; - let Inst{30-25} = 0b011010; - let Inst{24} = op; - let Inst{23-5} = target; - let Inst{4-0} = Rt; -} - -multiclass CmpBranch { - def W : BaseCmpBranch { - let Inst{31} = 0; - } - def X : BaseCmpBranch { - let Inst{31} = 1; - } -} - -//--- -// Test-bit-and-branch instructions. -//--- -// Test-and-branch target. 14-bit sign-extended immediate. The low two bits of -// the target offset are implied zero and so are not part of the immediate. 
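[Editor's note, not part of the patch] All of these branch-target operands encode word offsets; because the low two bits of the byte offset are implied zero, an N-bit signed immediate spans roughly +/- 2^(N-1) words from the branch. A one-liner making the ranges concrete:

#include <cstdint>

constexpr int64_t branchRangeBytes(unsigned ImmBits) {
  return (int64_t(1) << (ImmBits - 1)) * 4;
}

static_assert(branchRangeBytes(19) == 1 << 20, "B.cond/CBZ reach +/-1 MiB");
static_assert(branchRangeBytes(14) == 32 * 1024, "TBZ/TBNZ reach +/-32 KiB");
static_assert(branchRangeBytes(26) == 128 << 20, "B/BL reach +/-128 MiB");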
-def BranchTarget14Operand : AsmOperandClass { - let Name = "BranchTarget14"; -} -def am_tbrcond : Operand { - let EncoderMethod = "getTestBranchTargetOpValue"; - let PrintMethod = "printAlignedLabel"; - let ParserMatchClass = BranchTarget14Operand; -} - -// AsmOperand classes to emit (or not) special diagnostics -def TBZImm0_31Operand : AsmOperandClass { - let Name = "TBZImm0_31"; - let PredicateMethod = "isImm0_31"; - let RenderMethod = "addImm0_31Operands"; -} -def TBZImm32_63Operand : AsmOperandClass { - let Name = "Imm32_63"; - let DiagnosticType = "InvalidImm0_63"; -} - -class tbz_imm0_31 : Operand, ImmLeaf { - let ParserMatchClass = matcher; -} - -def tbz_imm0_31_diag : tbz_imm0_31; -def tbz_imm0_31_nodiag : tbz_imm0_31; - -def tbz_imm32_63 : Operand, ImmLeaf 31) && (((uint32_t)Imm) < 64); -}]> { - let ParserMatchClass = TBZImm32_63Operand; -} - -class BaseTestBranch - : I<(outs), (ins regtype:$Rt, immtype:$bit_off, am_tbrcond:$target), - asm, "\t$Rt, $bit_off, $target", "", - [(node regtype:$Rt, immtype:$bit_off, bb:$target)]>, - Sched<[WriteBr]> { - let isBranch = 1; - let isTerminator = 1; - - bits<5> Rt; - bits<6> bit_off; - bits<14> target; - - let Inst{30-25} = 0b011011; - let Inst{24} = op; - let Inst{23-19} = bit_off{4-0}; - let Inst{18-5} = target; - let Inst{4-0} = Rt; - - let DecoderMethod = "DecodeTestAndBranch"; -} - -multiclass TestBranch { - def W : BaseTestBranch { - let Inst{31} = 0; - } - - def X : BaseTestBranch { - let Inst{31} = 1; - } - - // Alias X-reg with 0-31 imm to W-Reg. - def : InstAlias(NAME#"W") GPR32as64:$Rd, - tbz_imm0_31_nodiag:$imm, am_tbrcond:$target), 0>; - def : Pat<(node GPR64:$Rn, tbz_imm0_31_diag:$imm, bb:$target), - (!cast(NAME#"W") (EXTRACT_SUBREG GPR64:$Rn, sub_32), - tbz_imm0_31_diag:$imm, bb:$target)>; -} - -//--- -// Unconditional branch (immediate) instructions. -//--- -def BranchTarget26Operand : AsmOperandClass { - let Name = "BranchTarget26"; - let DiagnosticType = "InvalidLabel"; -} -def am_b_target : Operand { - let EncoderMethod = "getBranchTargetOpValue"; - let PrintMethod = "printAlignedLabel"; - let ParserMatchClass = BranchTarget26Operand; -} -def am_bl_target : Operand { - let EncoderMethod = "getBranchTargetOpValue"; - let PrintMethod = "printAlignedLabel"; - let ParserMatchClass = BranchTarget26Operand; -} - -class BImm pattern> - : I<(outs), iops, asm, "\t$addr", "", pattern>, Sched<[WriteBr]> { - bits<26> addr; - let Inst{31} = op; - let Inst{30-26} = 0b00101; - let Inst{25-0} = addr; - - let DecoderMethod = "DecodeUnconditionalBranch"; -} - -class BranchImm pattern> - : BImm; -class CallImm pattern> - : BImm; - -//--- -// Basic one-operand data processing instructions. 
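[Editor's note, not part of the patch] In the test-bit-and-branch classes above, the bit number is six bits wide but only bit_off{4-0} is encoded; the sixth bit is the W/X selection bit (Inst{31}). That is why the alias and pattern above rewrite "test a low bit of an X register" onto the W form via EXTRACT_SUBREG ... sub_32. A sketch of the underlying observation:

#include <cassert>
#include <cstdint>

// Testing bit 0-31 of a 64-bit value only needs its low 32 bits, so the W
// form of TBZ/TBNZ suffices; bits 32-63 require the X form.
bool testBit(uint64_t Value, unsigned Bit) {
  assert(Bit < 64);
  if (Bit < 32)                      // TBZ/TBNZ Wn, #Bit, label
    return (uint32_t(Value) >> Bit) & 1;
  return (Value >> Bit) & 1;         // TBZ/TBNZ Xn, #Bit, label
}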
-//--- - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseOneOperandData opc, RegisterClass regtype, string asm, - SDPatternOperator node> - : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, "\t$Rd, $Rn", "", - [(set regtype:$Rd, (node regtype:$Rn))]>, - Sched<[WriteI, ReadI]> { - bits<5> Rd; - bits<5> Rn; - - let Inst{30-13} = 0b101101011000000000; - let Inst{12-10} = opc; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -multiclass OneOperandData opc, string asm, - SDPatternOperator node = null_frag> { - def Wr : BaseOneOperandData { - let Inst{31} = 0; - } - - def Xr : BaseOneOperandData { - let Inst{31} = 1; - } -} - -class OneWRegData opc, string asm, SDPatternOperator node> - : BaseOneOperandData { - let Inst{31} = 0; -} - -class OneXRegData opc, string asm, SDPatternOperator node> - : BaseOneOperandData { - let Inst{31} = 1; -} - -//--- -// Basic two-operand data processing instructions. -//--- -class BaseBaseAddSubCarry pattern> - : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), - asm, "\t$Rd, $Rn, $Rm", "", pattern>, - Sched<[WriteI, ReadI, ReadI]> { - let Uses = [NZCV]; - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - let Inst{30} = isSub; - let Inst{28-21} = 0b11010000; - let Inst{20-16} = Rm; - let Inst{15-10} = 0; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -class BaseAddSubCarry - : BaseBaseAddSubCarry; - -class BaseAddSubCarrySetFlags - : BaseBaseAddSubCarry { - let Defs = [NZCV]; -} - -multiclass AddSubCarry { - def Wr : BaseAddSubCarry { - let Inst{31} = 0; - let Inst{29} = 0; - } - def Xr : BaseAddSubCarry { - let Inst{31} = 1; - let Inst{29} = 0; - } - - // Sets flags. - def SWr : BaseAddSubCarrySetFlags { - let Inst{31} = 0; - let Inst{29} = 1; - } - def SXr : BaseAddSubCarrySetFlags { - let Inst{31} = 1; - let Inst{29} = 1; - } -} - -class BaseTwoOperand opc, RegisterClass regtype, string asm, - SDPatternOperator OpNode> - : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), - asm, "\t$Rd, $Rn, $Rm", "", - [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm))]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - let Inst{30-21} = 0b0011010110; - let Inst{20-16} = Rm; - let Inst{15-14} = 0b00; - let Inst{13-10} = opc; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -class BaseDiv - : BaseTwoOperand<{0,0,1,?}, regtype, asm, OpNode> { - let Inst{10} = isSigned; -} - -multiclass Div { - def Wr : BaseDiv, - Sched<[WriteID32, ReadID, ReadID]> { - let Inst{31} = 0; - } - def Xr : BaseDiv, - Sched<[WriteID64, ReadID, ReadID]> { - let Inst{31} = 1; - } -} - -class BaseShift shift_type, RegisterClass regtype, string asm, - SDPatternOperator OpNode = null_frag> - : BaseTwoOperand<{1,0,?,?}, regtype, asm, OpNode>, - Sched<[WriteIS, ReadI]> { - let Inst{11-10} = shift_type; -} - -multiclass Shift shift_type, string asm, SDNode OpNode> { - def Wr : BaseShift { - let Inst{31} = 0; - } - - def Xr : BaseShift { - let Inst{31} = 1; - } - - def : Pat<(i32 (OpNode GPR32:$Rn, i64:$Rm)), - (!cast(NAME # "Wr") GPR32:$Rn, - (EXTRACT_SUBREG i64:$Rm, sub_32))>; - - def : Pat<(i32 (OpNode GPR32:$Rn, (i64 (zext GPR32:$Rm)))), - (!cast(NAME # "Wr") GPR32:$Rn, GPR32:$Rm)>; - - def : Pat<(i32 (OpNode GPR32:$Rn, (i64 (anyext GPR32:$Rm)))), - (!cast(NAME # "Wr") GPR32:$Rn, GPR32:$Rm)>; - - def : Pat<(i32 (OpNode GPR32:$Rn, (i64 (sext GPR32:$Rm)))), - (!cast(NAME # "Wr") GPR32:$Rn, GPR32:$Rm)>; -} - -class ShiftAlias - : InstAlias; - -class BaseMulAccum opc, RegisterClass multype, - RegisterClass addtype, string asm, - list 
pattern> - : I<(outs addtype:$Rd), (ins multype:$Rn, multype:$Rm, addtype:$Ra), - asm, "\t$Rd, $Rn, $Rm, $Ra", "", pattern> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - bits<5> Ra; - let Inst{30-24} = 0b0011011; - let Inst{23-21} = opc; - let Inst{20-16} = Rm; - let Inst{15} = isSub; - let Inst{14-10} = Ra; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass MulAccum { - def Wrrr : BaseMulAccum, - Sched<[WriteIM32, ReadIMA, ReadIM, ReadIM]> { - let Inst{31} = 0; - } - - def Xrrr : BaseMulAccum, - Sched<[WriteIM64, ReadIMA, ReadIM, ReadIM]> { - let Inst{31} = 1; - } -} - -class WideMulAccum opc, string asm, - SDNode AccNode, SDNode ExtNode> - : BaseMulAccum, - Sched<[WriteIM32, ReadIMA, ReadIM, ReadIM]> { - let Inst{31} = 1; -} - -class MulHi opc, string asm, SDNode OpNode> - : I<(outs GPR64:$Rd), (ins GPR64:$Rn, GPR64:$Rm), - asm, "\t$Rd, $Rn, $Rm", "", - [(set GPR64:$Rd, (OpNode GPR64:$Rn, GPR64:$Rm))]>, - Sched<[WriteIM64, ReadIM, ReadIM]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - let Inst{31-24} = 0b10011011; - let Inst{23-21} = opc; - let Inst{20-16} = Rm; - let Inst{15} = 0; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; - - // The Ra field of SMULH and UMULH is unused: it should be assembled as 31 - // (i.e. all bits 1) but is ignored by the processor. - let PostEncoderMethod = "fixMulHigh"; -} - -class MulAccumWAlias - : InstAlias; -class MulAccumXAlias - : InstAlias; -class WideMulAccumAlias - : InstAlias; - -class BaseCRC32 sz, bit C, RegisterClass StreamReg, - SDPatternOperator OpNode, string asm> - : I<(outs GPR32:$Rd), (ins GPR32:$Rn, StreamReg:$Rm), - asm, "\t$Rd, $Rn, $Rm", "", - [(set GPR32:$Rd, (OpNode GPR32:$Rn, StreamReg:$Rm))]>, - Sched<[WriteISReg, ReadI, ReadISReg]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - - let Inst{31} = sf; - let Inst{30-21} = 0b0011010110; - let Inst{20-16} = Rm; - let Inst{15-13} = 0b010; - let Inst{12} = C; - let Inst{11-10} = sz; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; - let Predicates = [HasCRC]; -} - -//--- -// Address generation. -//--- - -class ADRI pattern> - : I<(outs GPR64:$Xd), (ins adr:$label), asm, "\t$Xd, $label", "", - pattern>, - Sched<[WriteI]> { - bits<5> Xd; - bits<21> label; - let Inst{31} = page; - let Inst{30-29} = label{1-0}; - let Inst{28-24} = 0b10000; - let Inst{23-5} = label{20-2}; - let Inst{4-0} = Xd; - - let DecoderMethod = "DecodeAdrInstruction"; -} - -//--- -// Move immediate. 
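[Editor's note, not part of the patch] The move-immediate classes that follow encode a 16-bit immediate plus a 16-bit-granule shift: MOVZ (MoveImmediate) zeroes the other slots, MOVK (InsertImmediate, note the "$src = $Rd" tie) overwrites just one slot. A hedged sketch of the obvious constant-materialization strategy built from them (illustrative only; a real selector also considers MOVN and logical immediates):

#include <cstdint>
#include <cstdio>

// Materialize a 64-bit constant as one MOVZ plus one MOVK per remaining
// non-zero 16-bit chunk.
void materialize(uint64_t Imm) {
  bool First = true;
  for (unsigned Shift = 0; Shift < 64; Shift += 16) {
    uint16_t Chunk = uint16_t(Imm >> Shift);
    if (Chunk == 0)
      continue;                    // zero slots come for free from the MOVZ
    std::printf("%s x0, #0x%x, lsl #%u\n", First ? "movz" : "movk",
                unsigned(Chunk), Shift);
    First = false;
  }
  if (First)
    std::printf("movz x0, #0\n");  // the constant was zero
}

For example, materialize(0x123400005678) prints a MOVZ for the low chunk and a single MOVK for the chunk at bit 32.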
-//--- - -def movimm32_imm : Operand { - let ParserMatchClass = Imm0_65535Operand; - let EncoderMethod = "getMoveWideImmOpValue"; - let PrintMethod = "printHexImm"; -} -def movimm32_shift : Operand { - let PrintMethod = "printShifter"; - let ParserMatchClass = MovImm32ShifterOperand; -} -def movimm64_shift : Operand { - let PrintMethod = "printShifter"; - let ParserMatchClass = MovImm64ShifterOperand; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseMoveImmediate opc, RegisterClass regtype, Operand shifter, - string asm> - : I<(outs regtype:$Rd), (ins movimm32_imm:$imm, shifter:$shift), - asm, "\t$Rd, $imm$shift", "", []>, - Sched<[WriteImm]> { - bits<5> Rd; - bits<16> imm; - bits<6> shift; - let Inst{30-29} = opc; - let Inst{28-23} = 0b100101; - let Inst{22-21} = shift{5-4}; - let Inst{20-5} = imm; - let Inst{4-0} = Rd; - - let DecoderMethod = "DecodeMoveImmInstruction"; -} - -multiclass MoveImmediate opc, string asm> { - def Wi : BaseMoveImmediate { - let Inst{31} = 0; - } - - def Xi : BaseMoveImmediate { - let Inst{31} = 1; - } -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseInsertImmediate opc, RegisterClass regtype, Operand shifter, - string asm> - : I<(outs regtype:$Rd), - (ins regtype:$src, movimm32_imm:$imm, shifter:$shift), - asm, "\t$Rd, $imm$shift", "$src = $Rd", []>, - Sched<[WriteI, ReadI]> { - bits<5> Rd; - bits<16> imm; - bits<6> shift; - let Inst{30-29} = opc; - let Inst{28-23} = 0b100101; - let Inst{22-21} = shift{5-4}; - let Inst{20-5} = imm; - let Inst{4-0} = Rd; - - let DecoderMethod = "DecodeMoveImmInstruction"; -} - -multiclass InsertImmediate opc, string asm> { - def Wi : BaseInsertImmediate { - let Inst{31} = 0; - } - - def Xi : BaseInsertImmediate { - let Inst{31} = 1; - } -} - -//--- -// Add/Subtract -//--- - -class BaseAddSubImm - : I<(outs dstRegtype:$Rd), (ins srcRegtype:$Rn, immtype:$imm), - asm, "\t$Rd, $Rn, $imm", "", - [(set dstRegtype:$Rd, (OpNode srcRegtype:$Rn, immtype:$imm))]>, - Sched<[WriteI, ReadI]> { - bits<5> Rd; - bits<5> Rn; - bits<14> imm; - let Inst{30} = isSub; - let Inst{29} = setFlags; - let Inst{28-24} = 0b10001; - let Inst{23-22} = imm{13-12}; // '00' => lsl #0, '01' => lsl #12 - let Inst{21-10} = imm{11-0}; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; - let DecoderMethod = "DecodeBaseAddSubImm"; -} - -class BaseAddSubRegPseudo - : Pseudo<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), - [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm))]>, - Sched<[WriteI, ReadI, ReadI]>; - -class BaseAddSubSReg - : I<(outs regtype:$Rd), (ins regtype:$Rn, shifted_regtype:$Rm), - asm, "\t$Rd, $Rn, $Rm", "", - [(set regtype:$Rd, (OpNode regtype:$Rn, shifted_regtype:$Rm))]>, - Sched<[WriteISReg, ReadI, ReadISReg]> { - // The operands are in order to match the 'addr' MI operands, so we - // don't need an encoder method and by-name matching. Just use the default - // in-order handling. Since we're using by-order, make sure the names - // do not match. 
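[Editor's note, not part of the patch] BaseAddSubImm above carries a 12-bit immediate plus the two-bit shift field ('00' => lsl #0, '01' => lsl #12). The legality test that implies is simple enough to state directly:

#include <cstdint>

// ADD/SUB (immediate) accepts a 12-bit unsigned immediate, optionally shifted
// left by 12; anything else needs a register operand or a MOVZ/MOVK sequence.
bool isLegalAddSubImm(uint64_t Imm) {
  if ((Imm & ~uint64_t(0xfff)) == 0)
    return true;                            // uimm12, lsl #0
  return (Imm & ~uint64_t(0xfff000)) == 0;  // uimm12, lsl #12
}

// e.g. isLegalAddSubImm(0xabc)    -> true  (lsl #0)
//      isLegalAddSubImm(0xabc000) -> true  (lsl #12)
//      isLegalAddSubImm(0xabc001) -> false (needs two instructions)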
- bits<5> dst; - bits<5> src1; - bits<5> src2; - bits<8> shift; - let Inst{30} = isSub; - let Inst{29} = setFlags; - let Inst{28-24} = 0b01011; - let Inst{23-22} = shift{7-6}; - let Inst{21} = 0; - let Inst{20-16} = src2; - let Inst{15-10} = shift{5-0}; - let Inst{9-5} = src1; - let Inst{4-0} = dst; - - let DecoderMethod = "DecodeThreeAddrSRegInstruction"; -} - -class BaseAddSubEReg - : I<(outs dstRegtype:$R1), - (ins src1Regtype:$R2, src2Regtype:$R3), - asm, "\t$R1, $R2, $R3", "", - [(set dstRegtype:$R1, (OpNode src1Regtype:$R2, src2Regtype:$R3))]>, - Sched<[WriteIEReg, ReadI, ReadIEReg]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - bits<6> ext; - let Inst{30} = isSub; - let Inst{29} = setFlags; - let Inst{28-24} = 0b01011; - let Inst{23-21} = 0b001; - let Inst{20-16} = Rm; - let Inst{15-13} = ext{5-3}; - let Inst{12-10} = ext{2-0}; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; - - let DecoderMethod = "DecodeAddSubERegInstruction"; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseAddSubEReg64 - : I<(outs dstRegtype:$Rd), - (ins src1Regtype:$Rn, src2Regtype:$Rm, ext_op:$ext), - asm, "\t$Rd, $Rn, $Rm$ext", "", []>, - Sched<[WriteIEReg, ReadI, ReadIEReg]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - bits<6> ext; - let Inst{30} = isSub; - let Inst{29} = setFlags; - let Inst{28-24} = 0b01011; - let Inst{23-21} = 0b001; - let Inst{20-16} = Rm; - let Inst{15} = ext{5}; - let Inst{12-10} = ext{2-0}; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; - - let DecoderMethod = "DecodeAddSubERegInstruction"; -} - -// Aliases for register+register add/subtract. -class AddSubRegAlias - : InstAlias; - -multiclass AddSub { - let hasSideEffects = 0 in { - // Add/Subtract immediate - def Wri : BaseAddSubImm { - let Inst{31} = 0; - } - def Xri : BaseAddSubImm { - let Inst{31} = 1; - } - - // Add/Subtract register - Only used for CodeGen - def Wrr : BaseAddSubRegPseudo; - def Xrr : BaseAddSubRegPseudo; - - // Add/Subtract shifted register - def Wrs : BaseAddSubSReg { - let Inst{31} = 0; - } - def Xrs : BaseAddSubSReg { - let Inst{31} = 1; - } - } - - // Add/Subtract extended register - let AddedComplexity = 1, hasSideEffects = 0 in { - def Wrx : BaseAddSubEReg, mnemonic, OpNode> { - let Inst{31} = 0; - } - def Xrx : BaseAddSubEReg, mnemonic, OpNode> { - let Inst{31} = 1; - } - } - - def Xrx64 : BaseAddSubEReg64 { - // UXTX and SXTX only. - let Inst{14-13} = 0b11; - let Inst{31} = 1; - } - - // Register/register aliases with no shift when SP is not used. - def : AddSubRegAlias(NAME#"Wrs"), - GPR32, GPR32, GPR32, 0>; - def : AddSubRegAlias(NAME#"Xrs"), - GPR64, GPR64, GPR64, 0>; - - // Register/register aliases with no shift when either the destination or - // first source register is SP. 
- def : AddSubRegAlias(NAME#"Wrx"), - GPR32sponly, GPR32sp, GPR32, 16>; // UXTW #0 - def : AddSubRegAlias(NAME#"Wrx"), - GPR32sp, GPR32sponly, GPR32, 16>; // UXTW #0 - def : AddSubRegAlias(NAME#"Xrx64"), - GPR64sponly, GPR64sp, GPR64, 24>; // UXTX #0 - def : AddSubRegAlias(NAME#"Xrx64"), - GPR64sp, GPR64sponly, GPR64, 24>; // UXTX #0 -} - -multiclass AddSubS { - let isCompare = 1, Defs = [NZCV] in { - // Add/Subtract immediate - def Wri : BaseAddSubImm { - let Inst{31} = 0; - } - def Xri : BaseAddSubImm { - let Inst{31} = 1; - } - - // Add/Subtract register - def Wrr : BaseAddSubRegPseudo; - def Xrr : BaseAddSubRegPseudo; - - // Add/Subtract shifted register - def Wrs : BaseAddSubSReg { - let Inst{31} = 0; - } - def Xrs : BaseAddSubSReg { - let Inst{31} = 1; - } - - // Add/Subtract extended register - let AddedComplexity = 1 in { - def Wrx : BaseAddSubEReg, mnemonic, OpNode> { - let Inst{31} = 0; - } - def Xrx : BaseAddSubEReg, mnemonic, OpNode> { - let Inst{31} = 1; - } - } - - def Xrx64 : BaseAddSubEReg64 { - // UXTX and SXTX only. - let Inst{14-13} = 0b11; - let Inst{31} = 1; - } - } // Defs = [NZCV] - - // Compare aliases - def : InstAlias(NAME#"Wri") - WZR, GPR32sp:$src, addsub_shifted_imm32:$imm), 5>; - def : InstAlias(NAME#"Xri") - XZR, GPR64sp:$src, addsub_shifted_imm64:$imm), 5>; - def : InstAlias(NAME#"Wrx") - WZR, GPR32sp:$src1, GPR32:$src2, arith_extend:$sh), 4>; - def : InstAlias(NAME#"Xrx") - XZR, GPR64sp:$src1, GPR32:$src2, arith_extend:$sh), 4>; - def : InstAlias(NAME#"Xrx64") - XZR, GPR64sp:$src1, GPR64:$src2, arith_extendlsl64:$sh), 4>; - def : InstAlias(NAME#"Wrs") - WZR, GPR32:$src1, GPR32:$src2, arith_shift32:$sh), 4>; - def : InstAlias(NAME#"Xrs") - XZR, GPR64:$src1, GPR64:$src2, arith_shift64:$sh), 4>; - - // Compare shorthands - def : InstAlias(NAME#"Wrs") - WZR, GPR32:$src1, GPR32:$src2, 0), 5>; - def : InstAlias(NAME#"Xrs") - XZR, GPR64:$src1, GPR64:$src2, 0), 5>; - - // Register/register aliases with no shift when SP is not used. - def : AddSubRegAlias(NAME#"Wrs"), - GPR32, GPR32, GPR32, 0>; - def : AddSubRegAlias(NAME#"Xrs"), - GPR64, GPR64, GPR64, 0>; - - // Register/register aliases with no shift when the first source register - // is SP. - def : AddSubRegAlias(NAME#"Wrx"), - GPR32, GPR32sponly, GPR32, 16>; // UXTW #0 - def : AddSubRegAlias(NAME#"Xrx64"), - GPR64, GPR64sponly, GPR64, 24>; // UXTX #0 -} - -//--- -// Extract -//--- -def SDTA64EXTR : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, - SDTCisPtrTy<3>]>; -def ARM64Extr : SDNode<"ARM64ISD::EXTR", SDTA64EXTR>; - -class BaseExtractImm patterns> - : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, imm_type:$imm), - asm, "\t$Rd, $Rn, $Rm, $imm", "", patterns>, - Sched<[WriteExtr, ReadExtrHi]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - bits<6> imm; - - let Inst{30-23} = 0b00100111; - let Inst{21} = 0; - let Inst{20-16} = Rm; - let Inst{15-10} = imm; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass ExtractImm { - def Wrri : BaseExtractImm { - let Inst{31} = 0; - let Inst{22} = 0; - // imm<5> must be zero. 
- let imm{5} = 0; - } - def Xrri : BaseExtractImm { - - let Inst{31} = 1; - let Inst{22} = 1; - } -} - -//--- -// Bitfield -//--- - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseBitfieldImm opc, - RegisterClass regtype, Operand imm_type, string asm> - : I<(outs regtype:$Rd), (ins regtype:$Rn, imm_type:$immr, imm_type:$imms), - asm, "\t$Rd, $Rn, $immr, $imms", "", []>, - Sched<[WriteIS, ReadI]> { - bits<5> Rd; - bits<5> Rn; - bits<6> immr; - bits<6> imms; - - let Inst{30-29} = opc; - let Inst{28-23} = 0b100110; - let Inst{21-16} = immr; - let Inst{15-10} = imms; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass BitfieldImm opc, string asm> { - def Wri : BaseBitfieldImm { - let Inst{31} = 0; - let Inst{22} = 0; - // imms<5> and immr<5> must be zero, else ReservedValue(). - let Inst{21} = 0; - let Inst{15} = 0; - } - def Xri : BaseBitfieldImm { - let Inst{31} = 1; - let Inst{22} = 1; - } -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseBitfieldImmWith2RegArgs opc, - RegisterClass regtype, Operand imm_type, string asm> - : I<(outs regtype:$Rd), (ins regtype:$src, regtype:$Rn, imm_type:$immr, - imm_type:$imms), - asm, "\t$Rd, $Rn, $immr, $imms", "$src = $Rd", []>, - Sched<[WriteIS, ReadI]> { - bits<5> Rd; - bits<5> Rn; - bits<6> immr; - bits<6> imms; - - let Inst{30-29} = opc; - let Inst{28-23} = 0b100110; - let Inst{21-16} = immr; - let Inst{15-10} = imms; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass BitfieldImmWith2RegArgs opc, string asm> { - def Wri : BaseBitfieldImmWith2RegArgs { - let Inst{31} = 0; - let Inst{22} = 0; - // imms<5> and immr<5> must be zero, else ReservedValue(). - let Inst{21} = 0; - let Inst{15} = 0; - } - def Xri : BaseBitfieldImmWith2RegArgs { - let Inst{31} = 1; - let Inst{22} = 1; - } -} - -//--- -// Logical -//--- - -// Logical (immediate) -class BaseLogicalImm opc, RegisterClass dregtype, - RegisterClass sregtype, Operand imm_type, string asm, - list pattern> - : I<(outs dregtype:$Rd), (ins sregtype:$Rn, imm_type:$imm), - asm, "\t$Rd, $Rn, $imm", "", pattern>, - Sched<[WriteI, ReadI]> { - bits<5> Rd; - bits<5> Rn; - bits<13> imm; - let Inst{30-29} = opc; - let Inst{28-23} = 0b100100; - let Inst{22} = imm{12}; - let Inst{21-16} = imm{11-6}; - let Inst{15-10} = imm{5-0}; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; - - let DecoderMethod = "DecodeLogicalImmInstruction"; -} - -// Logical (shifted register) -class BaseLogicalSReg opc, bit N, RegisterClass regtype, - logical_shifted_reg shifted_regtype, string asm, - list pattern> - : I<(outs regtype:$Rd), (ins regtype:$Rn, shifted_regtype:$Rm), - asm, "\t$Rd, $Rn, $Rm", "", pattern>, - Sched<[WriteISReg, ReadI, ReadISReg]> { - // The operands are in order to match the 'addr' MI operands, so we - // don't need an encoder method and by-name matching. Just use the default - // in-order handling. Since we're using by-order, make sure the names - // do not match. - bits<5> dst; - bits<5> src1; - bits<5> src2; - bits<8> shift; - let Inst{30-29} = opc; - let Inst{28-24} = 0b01010; - let Inst{23-22} = shift{7-6}; - let Inst{21} = N; - let Inst{20-16} = src2; - let Inst{15-10} = shift{5-0}; - let Inst{9-5} = src1; - let Inst{4-0} = dst; - - let DecoderMethod = "DecodeThreeAddrSRegInstruction"; -} - -// Aliases for register+register logical instructions. 
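The logical (immediate) format above carries its operand as one pre-encoded 13-bit value: bit 12 is the N bit (meaningful only for the 64-bit form, which is why the W variants pin Inst{22} to 0), bits 11-6 are immr and bits 5-0 are imms. A short C++ sketch of splitting such a value into the three instruction fields; splitLogicalImm is a name invented here for illustration:

#include <cassert>
#include <cstdint>

struct LogicalImmFields { unsigned N, Immr, Imms; };

// Split a pre-encoded 13-bit logical immediate (N:immr:imms) into the
// fields that land in Inst{22}, Inst{21-16} and Inst{15-10} above.
LogicalImmFields splitLogicalImm(uint16_t Enc13) {
  assert(Enc13 < (1u << 13) && "operand is 13 bits");
  return { (Enc13 >> 12) & 0x1u,    // imm{12}:   N
           (Enc13 >> 6)  & 0x3fu,   // imm{11-6}: immr
            Enc13        & 0x3fu }; // imm{5-0}:  imms
}

int main() {
  // #1 as a 64-bit logical immediate encodes as N=1, immr=0, imms=0.
  LogicalImmFields F = splitLogicalImm(1u << 12);
  assert(F.N == 1 && F.Immr == 0 && F.Imms == 0);
  return 0;
}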
-class LogicalRegAlias - : InstAlias; - -let AddedComplexity = 6 in -multiclass LogicalImm opc, string mnemonic, SDNode OpNode> { - def Wri : BaseLogicalImm { - let Inst{31} = 0; - let Inst{22} = 0; // 64-bit version has an additional bit of immediate. - } - def Xri : BaseLogicalImm { - let Inst{31} = 1; - } -} - -multiclass LogicalImmS opc, string mnemonic, SDNode OpNode> { - let isCompare = 1, Defs = [NZCV] in { - def Wri : BaseLogicalImm { - let Inst{31} = 0; - let Inst{22} = 0; // 64-bit version has an additional bit of immediate. - } - def Xri : BaseLogicalImm { - let Inst{31} = 1; - } - } // end Defs = [NZCV] -} - -class BaseLogicalRegPseudo - : Pseudo<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), - [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm))]>, - Sched<[WriteI, ReadI, ReadI]>; - -// Split from LogicalImm as not all instructions have both. -multiclass LogicalReg opc, bit N, string mnemonic, - SDPatternOperator OpNode> { - def Wrr : BaseLogicalRegPseudo; - def Xrr : BaseLogicalRegPseudo; - - def Wrs : BaseLogicalSReg { - let Inst{31} = 0; - } - def Xrs : BaseLogicalSReg { - let Inst{31} = 1; - } - - def : LogicalRegAlias(NAME#"Wrs"), GPR32>; - def : LogicalRegAlias(NAME#"Xrs"), GPR64>; -} - -// Split from LogicalReg to allow setting NZCV Defs -multiclass LogicalRegS opc, bit N, string mnemonic, - SDPatternOperator OpNode = null_frag> { - let Defs = [NZCV], mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { - def Wrr : BaseLogicalRegPseudo; - def Xrr : BaseLogicalRegPseudo; - - def Wrs : BaseLogicalSReg { - let Inst{31} = 0; - } - def Xrs : BaseLogicalSReg { - let Inst{31} = 1; - } - } // Defs = [NZCV] - - def : LogicalRegAlias(NAME#"Wrs"), GPR32>; - def : LogicalRegAlias(NAME#"Xrs"), GPR64>; -} - -//--- -// Conditionally set flags -//--- - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseCondSetFlagsImm - : I<(outs), (ins regtype:$Rn, imm0_31:$imm, imm0_15:$nzcv, ccode:$cond), - asm, "\t$Rn, $imm, $nzcv, $cond", "", []>, - Sched<[WriteI, ReadI]> { - let Uses = [NZCV]; - let Defs = [NZCV]; - - bits<5> Rn; - bits<5> imm; - bits<4> nzcv; - bits<4> cond; - - let Inst{30} = op; - let Inst{29-21} = 0b111010010; - let Inst{20-16} = imm; - let Inst{15-12} = cond; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4} = 0b0; - let Inst{3-0} = nzcv; -} - -multiclass CondSetFlagsImm { - def Wi : BaseCondSetFlagsImm { - let Inst{31} = 0; - } - def Xi : BaseCondSetFlagsImm { - let Inst{31} = 1; - } -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseCondSetFlagsReg - : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm0_15:$nzcv, ccode:$cond), - asm, "\t$Rn, $Rm, $nzcv, $cond", "", []>, - Sched<[WriteI, ReadI, ReadI]> { - let Uses = [NZCV]; - let Defs = [NZCV]; - - bits<5> Rn; - bits<5> Rm; - bits<4> nzcv; - bits<4> cond; - - let Inst{30} = op; - let Inst{29-21} = 0b111010010; - let Inst{20-16} = Rm; - let Inst{15-12} = cond; - let Inst{11-10} = 0b00; - let Inst{9-5} = Rn; - let Inst{4} = 0b0; - let Inst{3-0} = nzcv; -} - -multiclass CondSetFlagsReg { - def Wr : BaseCondSetFlagsReg { - let Inst{31} = 0; - } - def Xr : BaseCondSetFlagsReg { - let Inst{31} = 1; - } -} - -//--- -// Conditional select -//--- - -class BaseCondSelect op2, RegisterClass regtype, string asm> - : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, ccode:$cond), - asm, "\t$Rd, $Rn, $Rm, $cond", "", - [(set regtype:$Rd, - (ARM64csel regtype:$Rn, regtype:$Rm, (i32 imm:$cond), NZCV))]>, - Sched<[WriteI, ReadI, ReadI]> { - let Uses = [NZCV]; - - bits<5> Rd; - bits<5> Rn; - 
bits<5> Rm; - bits<4> cond; - - let Inst{30} = op; - let Inst{29-21} = 0b011010100; - let Inst{20-16} = Rm; - let Inst{15-12} = cond; - let Inst{11-10} = op2; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass CondSelect op2, string asm> { - def Wr : BaseCondSelect { - let Inst{31} = 0; - } - def Xr : BaseCondSelect { - let Inst{31} = 1; - } -} - -class BaseCondSelectOp op2, RegisterClass regtype, string asm, - PatFrag frag> - : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, ccode:$cond), - asm, "\t$Rd, $Rn, $Rm, $cond", "", - [(set regtype:$Rd, - (ARM64csel regtype:$Rn, (frag regtype:$Rm), - (i32 imm:$cond), NZCV))]>, - Sched<[WriteI, ReadI, ReadI]> { - let Uses = [NZCV]; - - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - bits<4> cond; - - let Inst{30} = op; - let Inst{29-21} = 0b011010100; - let Inst{20-16} = Rm; - let Inst{15-12} = cond; - let Inst{11-10} = op2; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -def inv_cond_XFORM : SDNodeXForm(N->getZExtValue()); - return CurDAG->getTargetConstant(ARM64CC::getInvertedCondCode(CC), MVT::i32); -}]>; - -multiclass CondSelectOp op2, string asm, PatFrag frag> { - def Wr : BaseCondSelectOp { - let Inst{31} = 0; - } - def Xr : BaseCondSelectOp { - let Inst{31} = 1; - } - - def : Pat<(ARM64csel (frag GPR32:$Rm), GPR32:$Rn, (i32 imm:$cond), NZCV), - (!cast(NAME # Wr) GPR32:$Rn, GPR32:$Rm, - (inv_cond_XFORM imm:$cond))>; - - def : Pat<(ARM64csel (frag GPR64:$Rm), GPR64:$Rn, (i32 imm:$cond), NZCV), - (!cast(NAME # Xr) GPR64:$Rn, GPR64:$Rm, - (inv_cond_XFORM imm:$cond))>; -} - -//--- -// Special Mask Value -//--- -def maski8_or_more : Operand, - ImmLeaf { -} -def maski16_or_more : Operand, - ImmLeaf { -} - - -//--- -// Load/store -//--- - -// (unsigned immediate) -// Indexed for 8-bit registers. offset is in range [0,4095]. 
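For these unsigned-offset forms the 12-bit field holds the byte offset divided by the access size, so larger accesses reach further (0-4095 bytes for byte loads, up to 65520 bytes for the 16-byte case) via the scaled operand classes defined next. A minimal sketch of the encoding check, assuming offsets that are negative, misaligned or out of range simply fail to match this form and fall through to other addressing modes:

#include <cstdint>
#include <optional>

// Encode a byte offset for the "unsigned offset" load/store form with the
// given access size in bytes (1, 2, 4, 8 or 16). Returns the 12-bit scaled
// immediate, or nullopt if this form cannot represent the offset.
std::optional<uint16_t> encodeUImm12(int64_t ByteOffset, unsigned Scale) {
  if (ByteOffset < 0 || ByteOffset % Scale != 0)
    return std::nullopt;               // negative or unaligned
  int64_t Scaled = ByteOffset / Scale;
  if (Scaled > 4095)
    return std::nullopt;               // the field is only 12 bits
  return static_cast<uint16_t>(Scaled);
}

int main() {
  // "ldr x0, [x1, #32760]" is the largest 8-byte unsigned offset: 4095 * 8.
  return (encodeUImm12(32760, 8).value_or(0) == 4095) ? 0 : 1;
}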
-def am_indexed8 : ComplexPattern; -def am_indexed16 : ComplexPattern; -def am_indexed32 : ComplexPattern; -def am_indexed64 : ComplexPattern; -def am_indexed128 : ComplexPattern; - -class UImm12OffsetOperand : AsmOperandClass { - let Name = "UImm12Offset" # Scale; - let RenderMethod = "addUImm12OffsetOperands<" # Scale # ">"; - let PredicateMethod = "isUImm12Offset<" # Scale # ">"; - let DiagnosticType = "InvalidMemoryIndexed" # Scale; -} - -def UImm12OffsetScale1Operand : UImm12OffsetOperand<1>; -def UImm12OffsetScale2Operand : UImm12OffsetOperand<2>; -def UImm12OffsetScale4Operand : UImm12OffsetOperand<4>; -def UImm12OffsetScale8Operand : UImm12OffsetOperand<8>; -def UImm12OffsetScale16Operand : UImm12OffsetOperand<16>; - -class uimm12_scaled : Operand { - let ParserMatchClass - = !cast("UImm12OffsetScale" # Scale # "Operand"); - let EncoderMethod - = "getLdStUImm12OpValue"; - let PrintMethod = "printUImm12Offset<" # Scale # ">"; -} - -def uimm12s1 : uimm12_scaled<1>; -def uimm12s2 : uimm12_scaled<2>; -def uimm12s4 : uimm12_scaled<4>; -def uimm12s8 : uimm12_scaled<8>; -def uimm12s16 : uimm12_scaled<16>; - -class BaseLoadStoreUI sz, bit V, bits<2> opc, dag oops, dag iops, - string asm, list pattern> - : I { - bits<5> Rt; - - bits<5> Rn; - bits<12> offset; - - let Inst{31-30} = sz; - let Inst{29-27} = 0b111; - let Inst{26} = V; - let Inst{25-24} = 0b01; - let Inst{23-22} = opc; - let Inst{21-10} = offset; - let Inst{9-5} = Rn; - let Inst{4-0} = Rt; - - let DecoderMethod = "DecodeUnsignedLdStInstruction"; -} - -multiclass LoadUI sz, bit V, bits<2> opc, RegisterClass regtype, - Operand indextype, string asm, list pattern> { - let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in - def ui : BaseLoadStoreUI, - Sched<[WriteLD]>; - - def : InstAlias(NAME # "ui") regtype:$Rt, GPR64sp:$Rn, 0)>; -} - -multiclass StoreUI sz, bit V, bits<2> opc, RegisterClass regtype, - Operand indextype, string asm, list pattern> { - let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in - def ui : BaseLoadStoreUI, - Sched<[WriteST]>; - - def : InstAlias(NAME # "ui") regtype:$Rt, GPR64sp:$Rn, 0)>; -} - -def PrefetchOperand : AsmOperandClass { - let Name = "Prefetch"; - let ParserMethod = "tryParsePrefetch"; -} -def prfop : Operand { - let PrintMethod = "printPrefetchOp"; - let ParserMatchClass = PrefetchOperand; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in -class PrefetchUI sz, bit V, bits<2> opc, string asm, list pat> - : BaseLoadStoreUI, - Sched<[WriteLD]>; - -//--- -// Load literal -//--- - -// Load literal address: 19-bit immediate. The low two bits of the target -// offset are implied zero and so are not part of the immediate. 
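As the comment above says, the literal forms store a word-aligned PC-relative offset: the 19-bit label field is the byte displacement divided by 4, sign-extended, giving roughly a +/-1MiB reach. A small sketch of that relationship; the helper names are invented for illustration:

#include <cassert>
#include <cstdint>

// Is this byte displacement representable in a 19-bit load-literal label?
bool isValidLiteralOffset(int64_t ByteOffset) {
  return (ByteOffset & 0x3) == 0 &&          // low two bits implied zero
         ByteOffset >= -(1 << 20) &&         // -1 MiB
         ByteOffset <= (1 << 20) - 4;        // +1 MiB minus one word
}

int32_t encodeLiteralLabel(int64_t ByteOffset) {
  assert(isValidLiteralOffset(ByteOffset));
  return static_cast<int32_t>(ByteOffset / 4); // the 19-bit 'label' field
}

int main() {
  assert(encodeLiteralLabel(1048572) == 262143);   // largest forward reach
  assert(encodeLiteralLabel(-1048576) == -262144); // largest backward reach
  return 0;
}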
-def am_ldrlit : Operand { - let EncoderMethod = "getLoadLiteralOpValue"; - let DecoderMethod = "DecodePCRelLabel19"; - let PrintMethod = "printAlignedLabel"; - let ParserMatchClass = PCRelLabel19Operand; -} - -let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in -class LoadLiteral opc, bit V, RegisterClass regtype, string asm> - : I<(outs regtype:$Rt), (ins am_ldrlit:$label), - asm, "\t$Rt, $label", "", []>, - Sched<[WriteLD]> { - bits<5> Rt; - bits<19> label; - let Inst{31-30} = opc; - let Inst{29-27} = 0b011; - let Inst{26} = V; - let Inst{25-24} = 0b00; - let Inst{23-5} = label; - let Inst{4-0} = Rt; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in -class PrefetchLiteral opc, bit V, string asm, list pat> - : I<(outs), (ins prfop:$Rt, am_ldrlit:$label), - asm, "\t$Rt, $label", "", pat>, - Sched<[WriteLD]> { - bits<5> Rt; - bits<19> label; - let Inst{31-30} = opc; - let Inst{29-27} = 0b011; - let Inst{26} = V; - let Inst{25-24} = 0b00; - let Inst{23-5} = label; - let Inst{4-0} = Rt; -} - -//--- -// Load/store register offset -//--- - -def ro_Xindexed8 : ComplexPattern", []>; -def ro_Xindexed16 : ComplexPattern", []>; -def ro_Xindexed32 : ComplexPattern", []>; -def ro_Xindexed64 : ComplexPattern", []>; -def ro_Xindexed128 : ComplexPattern", []>; - -def ro_Windexed8 : ComplexPattern", []>; -def ro_Windexed16 : ComplexPattern", []>; -def ro_Windexed32 : ComplexPattern", []>; -def ro_Windexed64 : ComplexPattern", []>; -def ro_Windexed128 : ComplexPattern", []>; - -class MemExtendOperand : AsmOperandClass { - let Name = "Mem" # Reg # "Extend" # Width; - let PredicateMethod = "isMem" # Reg # "Extend<" # Width # ">"; - let RenderMethod = "addMemExtendOperands"; - let DiagnosticType = "InvalidMemory" # Reg # "Extend" # Width; -} - -def MemWExtend8Operand : MemExtendOperand<"W", 8> { - // The address "[x0, x1, lsl #0]" actually maps to the variant which performs - // the trivial shift. - let RenderMethod = "addMemExtend8Operands"; -} -def MemWExtend16Operand : MemExtendOperand<"W", 16>; -def MemWExtend32Operand : MemExtendOperand<"W", 32>; -def MemWExtend64Operand : MemExtendOperand<"W", 64>; -def MemWExtend128Operand : MemExtendOperand<"W", 128>; - -def MemXExtend8Operand : MemExtendOperand<"X", 8> { - // The address "[x0, x1, lsl #0]" actually maps to the variant which performs - // the trivial shift. - let RenderMethod = "addMemExtend8Operands"; -} -def MemXExtend16Operand : MemExtendOperand<"X", 16>; -def MemXExtend32Operand : MemExtendOperand<"X", 32>; -def MemXExtend64Operand : MemExtendOperand<"X", 64>; -def MemXExtend128Operand : MemExtendOperand<"X", 128>; - -class ro_extend - : Operand { - let ParserMatchClass = ParserClass; - let PrintMethod = "printMemExtend<'" # Reg # "', " # Width # ">"; - let DecoderMethod = "DecodeMemExtend"; - let EncoderMethod = "getMemExtendOpValue"; - let MIOperandInfo = (ops i32imm:$signed, i32imm:$doshift); -} - -def ro_Wextend8 : ro_extend; -def ro_Wextend16 : ro_extend; -def ro_Wextend32 : ro_extend; -def ro_Wextend64 : ro_extend; -def ro_Wextend128 : ro_extend; - -def ro_Xextend8 : ro_extend; -def ro_Xextend16 : ro_extend; -def ro_Xextend32 : ro_extend; -def ro_Xextend64 : ro_extend; -def ro_Xextend128 : ro_extend; - -class ROAddrMode { - // CodeGen-level pattern covering the entire addressing mode. - ComplexPattern Wpat = windex; - ComplexPattern Xpat = xindex; - - // Asm-level Operand covering the valid "uxtw #3" style syntax. 
- Operand Wext = wextend; - Operand Xext = xextend; -} - -def ro8 : ROAddrMode; -def ro16 : ROAddrMode; -def ro32 : ROAddrMode; -def ro64 : ROAddrMode; -def ro128 : ROAddrMode; - -class LoadStore8RO sz, bit V, bits<2> opc, RegisterClass regtype, - string asm, dag ins, dag outs, list pat> - : I { - bits<5> Rt; - bits<5> Rn; - bits<5> Rm; - bits<2> extend; - let Inst{31-30} = sz; - let Inst{29-27} = 0b111; - let Inst{26} = V; - let Inst{25-24} = 0b00; - let Inst{23-22} = opc; - let Inst{21} = 1; - let Inst{20-16} = Rm; - let Inst{15} = extend{1}; // sign extend Rm? - let Inst{14} = 1; - let Inst{12} = extend{0}; // do shift? - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rt; -} - -class ROInstAlias - : InstAlias; - -multiclass Load8RO sz, bit V, bits<2> opc, RegisterClass regtype, - string asm, ValueType Ty, SDPatternOperator loadop> { - let AddedComplexity = 10 in - def roW : LoadStore8RO, - Sched<[WriteLDIdx, ReadAdrBase]> { - let Inst{13} = 0b0; - } - - let AddedComplexity = 10 in - def roX : LoadStore8RO, - Sched<[WriteLDIdx, ReadAdrBase]> { - let Inst{13} = 0b1; - } - - def : ROInstAlias(NAME # "roX")>; -} - -multiclass Store8RO sz, bit V, bits<2> opc, RegisterClass regtype, - string asm, ValueType Ty, SDPatternOperator storeop> { - let AddedComplexity = 10 in - def roW : LoadStore8RO, - Sched<[WriteSTIdx, ReadAdrBase]> { - let Inst{13} = 0b0; - } - - let AddedComplexity = 10 in - def roX : LoadStore8RO, - Sched<[WriteSTIdx, ReadAdrBase]> { - let Inst{13} = 0b1; - } - - def : ROInstAlias(NAME # "roX")>; -} - -class LoadStore16RO sz, bit V, bits<2> opc, RegisterClass regtype, - string asm, dag ins, dag outs, list pat> - : I { - bits<5> Rt; - bits<5> Rn; - bits<5> Rm; - bits<2> extend; - let Inst{31-30} = sz; - let Inst{29-27} = 0b111; - let Inst{26} = V; - let Inst{25-24} = 0b00; - let Inst{23-22} = opc; - let Inst{21} = 1; - let Inst{20-16} = Rm; - let Inst{15} = extend{1}; // sign extend Rm? - let Inst{14} = 1; - let Inst{12} = extend{0}; // do shift? - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rt; -} - -multiclass Load16RO sz, bit V, bits<2> opc, RegisterClass regtype, - string asm, ValueType Ty, SDPatternOperator loadop> { - let AddedComplexity = 10 in - def roW : LoadStore16RO, - Sched<[WriteLDIdx, ReadAdrBase]> { - let Inst{13} = 0b0; - } - - let AddedComplexity = 10 in - def roX : LoadStore16RO, - Sched<[WriteLDIdx, ReadAdrBase]> { - let Inst{13} = 0b1; - } - - def : ROInstAlias(NAME # "roX")>; -} - -multiclass Store16RO sz, bit V, bits<2> opc, RegisterClass regtype, - string asm, ValueType Ty, SDPatternOperator storeop> { - let AddedComplexity = 10 in - def roW : LoadStore16RO, - Sched<[WriteSTIdx, ReadAdrBase]> { - let Inst{13} = 0b0; - } - - let AddedComplexity = 10 in - def roX : LoadStore16RO, - Sched<[WriteSTIdx, ReadAdrBase]> { - let Inst{13} = 0b1; - } - - def : ROInstAlias(NAME # "roX")>; -} - -class LoadStore32RO sz, bit V, bits<2> opc, RegisterClass regtype, - string asm, dag ins, dag outs, list pat> - : I { - bits<5> Rt; - bits<5> Rn; - bits<5> Rm; - bits<2> extend; - let Inst{31-30} = sz; - let Inst{29-27} = 0b111; - let Inst{26} = V; - let Inst{25-24} = 0b00; - let Inst{23-22} = opc; - let Inst{21} = 1; - let Inst{20-16} = Rm; - let Inst{15} = extend{1}; // sign extend Rm? - let Inst{14} = 1; - let Inst{12} = extend{0}; // do shift? 
- let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rt; -} - -multiclass Load32RO sz, bit V, bits<2> opc, RegisterClass regtype, - string asm, ValueType Ty, SDPatternOperator loadop> { - let AddedComplexity = 10 in - def roW : LoadStore32RO, - Sched<[WriteLDIdx, ReadAdrBase]> { - let Inst{13} = 0b0; - } - - let AddedComplexity = 10 in - def roX : LoadStore32RO, - Sched<[WriteLDIdx, ReadAdrBase]> { - let Inst{13} = 0b1; - } - - def : ROInstAlias(NAME # "roX")>; -} - -multiclass Store32RO sz, bit V, bits<2> opc, RegisterClass regtype, - string asm, ValueType Ty, SDPatternOperator storeop> { - let AddedComplexity = 10 in - def roW : LoadStore32RO, - Sched<[WriteSTIdx, ReadAdrBase]> { - let Inst{13} = 0b0; - } - - let AddedComplexity = 10 in - def roX : LoadStore32RO, - Sched<[WriteSTIdx, ReadAdrBase]> { - let Inst{13} = 0b1; - } - - def : ROInstAlias(NAME # "roX")>; -} - -class LoadStore64RO sz, bit V, bits<2> opc, RegisterClass regtype, - string asm, dag ins, dag outs, list pat> - : I { - bits<5> Rt; - bits<5> Rn; - bits<5> Rm; - bits<2> extend; - let Inst{31-30} = sz; - let Inst{29-27} = 0b111; - let Inst{26} = V; - let Inst{25-24} = 0b00; - let Inst{23-22} = opc; - let Inst{21} = 1; - let Inst{20-16} = Rm; - let Inst{15} = extend{1}; // sign extend Rm? - let Inst{14} = 1; - let Inst{12} = extend{0}; // do shift? - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rt; -} - -multiclass Load64RO sz, bit V, bits<2> opc, RegisterClass regtype, - string asm, ValueType Ty, SDPatternOperator loadop> { - let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in - def roW : LoadStore64RO, - Sched<[WriteLDIdx, ReadAdrBase]> { - let Inst{13} = 0b0; - } - - let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in - def roX : LoadStore64RO, - Sched<[WriteLDIdx, ReadAdrBase]> { - let Inst{13} = 0b1; - } - - def : ROInstAlias(NAME # "roX")>; -} - -multiclass Store64RO sz, bit V, bits<2> opc, RegisterClass regtype, - string asm, ValueType Ty, SDPatternOperator storeop> { - let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in - def roW : LoadStore64RO, - Sched<[WriteSTIdx, ReadAdrBase]> { - let Inst{13} = 0b0; - } - - let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in - def roX : LoadStore64RO, - Sched<[WriteSTIdx, ReadAdrBase]> { - let Inst{13} = 0b1; - } - - def : ROInstAlias(NAME # "roX")>; -} - -class LoadStore128RO sz, bit V, bits<2> opc, RegisterClass regtype, - string asm, dag ins, dag outs, list pat> - : I { - bits<5> Rt; - bits<5> Rn; - bits<5> Rm; - bits<2> extend; - let Inst{31-30} = sz; - let Inst{29-27} = 0b111; - let Inst{26} = V; - let Inst{25-24} = 0b00; - let Inst{23-22} = opc; - let Inst{21} = 1; - let Inst{20-16} = Rm; - let Inst{15} = extend{1}; // sign extend Rm? - let Inst{14} = 1; - let Inst{12} = extend{0}; // do shift? 
- let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rt; -} - -multiclass Load128RO sz, bit V, bits<2> opc, RegisterClass regtype, - string asm, ValueType Ty, SDPatternOperator loadop> { - let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in - def roW : LoadStore128RO, - Sched<[WriteLDIdx, ReadAdrBase]> { - let Inst{13} = 0b0; - } - - let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in - def roX : LoadStore128RO, - Sched<[WriteLDIdx, ReadAdrBase]> { - let Inst{13} = 0b1; - } - - def : ROInstAlias(NAME # "roX")>; -} - -multiclass Store128RO sz, bit V, bits<2> opc, RegisterClass regtype, - string asm, ValueType Ty, SDPatternOperator storeop> { - let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in - def roW : LoadStore128RO, - Sched<[WriteSTIdx, ReadAdrBase]> { - let Inst{13} = 0b0; - } - - let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in - def roX : LoadStore128RO, - Sched<[WriteSTIdx, ReadAdrBase]> { - let Inst{13} = 0b1; - } - - def : ROInstAlias(NAME # "roX")>; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in -class BasePrefetchRO sz, bit V, bits<2> opc, dag outs, dag ins, - string asm, list pat> - : I, - Sched<[WriteLD]> { - bits<5> Rt; - bits<5> Rn; - bits<5> Rm; - bits<2> extend; - let Inst{31-30} = sz; - let Inst{29-27} = 0b111; - let Inst{26} = V; - let Inst{25-24} = 0b00; - let Inst{23-22} = opc; - let Inst{21} = 1; - let Inst{20-16} = Rm; - let Inst{15} = extend{1}; // sign extend Rm? - let Inst{14} = 1; - let Inst{12} = extend{0}; // do shift? - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rt; -} - -multiclass PrefetchRO sz, bit V, bits<2> opc, string asm> { - def roW : BasePrefetchRO { - let Inst{13} = 0b0; - } - - def roX : BasePrefetchRO { - let Inst{13} = 0b1; - } - - def : InstAlias<"prfm $Rt, [$Rn, $Rm]", - (!cast(NAME # "roX") prfop:$Rt, - GPR64sp:$Rn, GPR64:$Rm, 0, 0)>; -} - -//--- -// Load/store unscaled immediate -//--- - -def am_unscaled8 : ComplexPattern; -def am_unscaled16 : ComplexPattern; -def am_unscaled32 : ComplexPattern; -def am_unscaled64 : ComplexPattern; -def am_unscaled128 :ComplexPattern; - -class BaseLoadStoreUnscale sz, bit V, bits<2> opc, dag oops, dag iops, - string asm, list pattern> - : I { - bits<5> Rt; - bits<5> Rn; - bits<9> offset; - let Inst{31-30} = sz; - let Inst{29-27} = 0b111; - let Inst{26} = V; - let Inst{25-24} = 0b00; - let Inst{23-22} = opc; - let Inst{21} = 0; - let Inst{20-12} = offset; - let Inst{11-10} = 0b00; - let Inst{9-5} = Rn; - let Inst{4-0} = Rt; - - let DecoderMethod = "DecodeSignedLdStInstruction"; -} - -multiclass LoadUnscaled sz, bit V, bits<2> opc, RegisterClass regtype, - string asm, list pattern> { - let AddedComplexity = 1 in // try this before LoadUI - def i : BaseLoadStoreUnscale, - Sched<[WriteLD]>; - - def : InstAlias(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>; -} - -multiclass StoreUnscaled sz, bit V, bits<2> opc, RegisterClass regtype, - string asm, list pattern> { - let AddedComplexity = 1 in // try this before StoreUI - def i : BaseLoadStoreUnscale, - Sched<[WriteST]>; - - def : InstAlias(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>; -} - -multiclass PrefetchUnscaled sz, bit V, bits<2> opc, string asm, - list pat> { - let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in - def i : BaseLoadStoreUnscale, - Sched<[WriteLD]>; - - def : InstAlias(NAME # "i") prfop:$Rt, GPR64sp:$Rn, 0)>; -} - -//--- -// Load/store unscaled immediate, unprivileged -//--- - -class BaseLoadStoreUnprivileged 
sz, bit V, bits<2> opc, - dag oops, dag iops, string asm> - : I { - bits<5> Rt; - bits<5> Rn; - bits<9> offset; - let Inst{31-30} = sz; - let Inst{29-27} = 0b111; - let Inst{26} = V; - let Inst{25-24} = 0b00; - let Inst{23-22} = opc; - let Inst{21} = 0; - let Inst{20-12} = offset; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rt; - - let DecoderMethod = "DecodeSignedLdStInstruction"; -} - -multiclass LoadUnprivileged sz, bit V, bits<2> opc, - RegisterClass regtype, string asm> { - let mayStore = 0, mayLoad = 1, hasSideEffects = 0 in - def i : BaseLoadStoreUnprivileged, - Sched<[WriteLD]>; - - def : InstAlias(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>; -} - -multiclass StoreUnprivileged sz, bit V, bits<2> opc, - RegisterClass regtype, string asm> { - let mayStore = 1, mayLoad = 0, hasSideEffects = 0 in - def i : BaseLoadStoreUnprivileged, - Sched<[WriteST]>; - - def : InstAlias(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>; -} - -//--- -// Load/store pre-indexed -//--- - -class BaseLoadStorePreIdx sz, bit V, bits<2> opc, dag oops, dag iops, - string asm, string cstr, list pat> - : I { - bits<5> Rt; - bits<5> Rn; - bits<9> offset; - let Inst{31-30} = sz; - let Inst{29-27} = 0b111; - let Inst{26} = V; - let Inst{25-24} = 0; - let Inst{23-22} = opc; - let Inst{21} = 0; - let Inst{20-12} = offset; - let Inst{11-10} = 0b11; - let Inst{9-5} = Rn; - let Inst{4-0} = Rt; - - let DecoderMethod = "DecodeSignedLdStInstruction"; -} - -let hasSideEffects = 0 in { -let mayStore = 0, mayLoad = 1 in -class LoadPreIdx sz, bit V, bits<2> opc, RegisterClass regtype, - string asm> - : BaseLoadStorePreIdx, - Sched<[WriteLD, WriteAdr]>; - -let mayStore = 1, mayLoad = 0 in -class StorePreIdx sz, bit V, bits<2> opc, RegisterClass regtype, - string asm, SDPatternOperator storeop, ValueType Ty> - : BaseLoadStorePreIdx, - Sched<[WriteAdr, WriteST]>; -} // hasSideEffects = 0 - -//--- -// Load/store post-indexed -//--- - -// (pre-index) load/stores. 
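Both writeback forms share this 9-bit signed offset layout and differ only in when the base register is updated: the pre-indexed classes just defined add the offset before the access and write the new address back, while the post-indexed classes that follow access memory at the unmodified base and apply the offset afterwards. A rough C++ model of the two behaviours, purely to illustrate the semantics:

#include <cassert>
#include <cstdint>

// Toy 64-bit load used only to show the base-update ordering.
static uint64_t load64(const uint64_t *Mem, uint64_t Addr) { return Mem[Addr / 8]; }

// ldr Xt, [Xn, #imm]!   pre-indexed: update the base, then use it.
uint64_t loadPreIdx(const uint64_t *Mem, uint64_t &Base, int64_t Imm) {
  Base += Imm;
  return load64(Mem, Base);
}

// ldr Xt, [Xn], #imm    post-indexed: use the base, then update it.
uint64_t loadPostIdx(const uint64_t *Mem, uint64_t &Base, int64_t Imm) {
  uint64_t V = load64(Mem, Base);
  Base += Imm;
  return V;
}

int main() {
  uint64_t Mem[4] = {10, 11, 12, 13};
  uint64_t Base = 0;
  assert(loadPreIdx(Mem, Base, 8) == 11 && Base == 8);   // reads Mem[1]
  assert(loadPostIdx(Mem, Base, 8) == 11 && Base == 16); // reads Mem[1], then bumps
  return 0;
}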
-class BaseLoadStorePostIdx sz, bit V, bits<2> opc, dag oops, dag iops, - string asm, string cstr, list pat> - : I { - bits<5> Rt; - bits<5> Rn; - bits<9> offset; - let Inst{31-30} = sz; - let Inst{29-27} = 0b111; - let Inst{26} = V; - let Inst{25-24} = 0b00; - let Inst{23-22} = opc; - let Inst{21} = 0b0; - let Inst{20-12} = offset; - let Inst{11-10} = 0b01; - let Inst{9-5} = Rn; - let Inst{4-0} = Rt; - - let DecoderMethod = "DecodeSignedLdStInstruction"; -} - -let hasSideEffects = 0 in { -let mayStore = 0, mayLoad = 1 in -class LoadPostIdx sz, bit V, bits<2> opc, RegisterClass regtype, - string asm> - : BaseLoadStorePostIdx, - Sched<[WriteLD, WriteI]>; - -let mayStore = 1, mayLoad = 0 in -class StorePostIdx sz, bit V, bits<2> opc, RegisterClass regtype, - string asm, SDPatternOperator storeop, ValueType Ty> - : BaseLoadStorePostIdx, - Sched<[WriteAdr, WriteST, ReadAdrBase]>; -} // hasSideEffects = 0 - - -//--- -// Load/store pair -//--- - -// (indexed, offset) - -class BaseLoadStorePairOffset opc, bit V, bit L, dag oops, dag iops, - string asm> - : I { - bits<5> Rt; - bits<5> Rt2; - bits<5> Rn; - bits<7> offset; - let Inst{31-30} = opc; - let Inst{29-27} = 0b101; - let Inst{26} = V; - let Inst{25-23} = 0b010; - let Inst{22} = L; - let Inst{21-15} = offset; - let Inst{14-10} = Rt2; - let Inst{9-5} = Rn; - let Inst{4-0} = Rt; - - let DecoderMethod = "DecodePairLdStInstruction"; -} - -multiclass LoadPairOffset opc, bit V, RegisterClass regtype, - Operand indextype, string asm> { - let hasSideEffects = 0, mayStore = 0, mayLoad = 1 in - def i : BaseLoadStorePairOffset, - Sched<[WriteLD, WriteLDHi]>; - - def : InstAlias(NAME # "i") regtype:$Rt, regtype:$Rt2, - GPR64sp:$Rn, 0)>; -} - - -multiclass StorePairOffset opc, bit V, RegisterClass regtype, - Operand indextype, string asm> { - let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in - def i : BaseLoadStorePairOffset, - Sched<[WriteSTP]>; - - def : InstAlias(NAME # "i") regtype:$Rt, regtype:$Rt2, - GPR64sp:$Rn, 0)>; -} - -// (pre-indexed) -class BaseLoadStorePairPreIdx opc, bit V, bit L, dag oops, dag iops, - string asm> - : I { - bits<5> Rt; - bits<5> Rt2; - bits<5> Rn; - bits<7> offset; - let Inst{31-30} = opc; - let Inst{29-27} = 0b101; - let Inst{26} = V; - let Inst{25-23} = 0b011; - let Inst{22} = L; - let Inst{21-15} = offset; - let Inst{14-10} = Rt2; - let Inst{9-5} = Rn; - let Inst{4-0} = Rt; - - let DecoderMethod = "DecodePairLdStInstruction"; -} - -let hasSideEffects = 0 in { -let mayStore = 0, mayLoad = 1 in -class LoadPairPreIdx opc, bit V, RegisterClass regtype, - Operand indextype, string asm> - : BaseLoadStorePairPreIdx, - Sched<[WriteLD, WriteLDHi, WriteAdr]>; - -let mayStore = 1, mayLoad = 0 in -class StorePairPreIdx opc, bit V, RegisterClass regtype, - Operand indextype, string asm> - : BaseLoadStorePairPreIdx, - Sched<[WriteAdr, WriteSTP]>; -} // hasSideEffects = 0 - -// (post-indexed) - -class BaseLoadStorePairPostIdx opc, bit V, bit L, dag oops, dag iops, - string asm> - : I { - bits<5> Rt; - bits<5> Rt2; - bits<5> Rn; - bits<7> offset; - let Inst{31-30} = opc; - let Inst{29-27} = 0b101; - let Inst{26} = V; - let Inst{25-23} = 0b001; - let Inst{22} = L; - let Inst{21-15} = offset; - let Inst{14-10} = Rt2; - let Inst{9-5} = Rn; - let Inst{4-0} = Rt; - - let DecoderMethod = "DecodePairLdStInstruction"; -} - -let hasSideEffects = 0 in { -let mayStore = 0, mayLoad = 1 in -class LoadPairPostIdx opc, bit V, RegisterClass regtype, - Operand idxtype, string asm> - : BaseLoadStorePairPostIdx, - Sched<[WriteLD, WriteLDHi, 
WriteAdr]>; - -let mayStore = 1, mayLoad = 0 in -class StorePairPostIdx opc, bit V, RegisterClass regtype, - Operand idxtype, string asm> - : BaseLoadStorePairPostIdx, - Sched<[WriteAdr, WriteSTP]>; -} // hasSideEffects = 0 - -// (no-allocate) - -class BaseLoadStorePairNoAlloc opc, bit V, bit L, dag oops, dag iops, - string asm> - : I { - bits<5> Rt; - bits<5> Rt2; - bits<5> Rn; - bits<7> offset; - let Inst{31-30} = opc; - let Inst{29-27} = 0b101; - let Inst{26} = V; - let Inst{25-23} = 0b000; - let Inst{22} = L; - let Inst{21-15} = offset; - let Inst{14-10} = Rt2; - let Inst{9-5} = Rn; - let Inst{4-0} = Rt; - - let DecoderMethod = "DecodePairLdStInstruction"; -} - -multiclass LoadPairNoAlloc opc, bit V, RegisterClass regtype, - Operand indextype, string asm> { - let hasSideEffects = 0, mayStore = 0, mayLoad = 1 in - def i : BaseLoadStorePairNoAlloc, - Sched<[WriteLD, WriteLDHi]>; - - - def : InstAlias(NAME # "i") regtype:$Rt, regtype:$Rt2, - GPR64sp:$Rn, 0)>; -} - -multiclass StorePairNoAlloc opc, bit V, RegisterClass regtype, - Operand indextype, string asm> { - let hasSideEffects = 0, mayStore = 1, mayLoad = 0 in - def i : BaseLoadStorePairNoAlloc, - Sched<[WriteSTP]>; - - def : InstAlias(NAME # "i") regtype:$Rt, regtype:$Rt2, - GPR64sp:$Rn, 0)>; -} - -//--- -// Load/store exclusive -//--- - -// True exclusive operations write to and/or read from the system's exclusive -// monitors, which as far as a compiler is concerned can be modelled as a -// random shared memory address. Hence LoadExclusive mayStore. -// -// Since these instructions have the undefined register bits set to 1 in -// their canonical form, we need a post encoder method to set those bits -// to 1 when encoding these instructions. We do this using the -// fixLoadStoreExclusive function. This function has template parameters: -// -// fixLoadStoreExclusive -// -// hasRs indicates that the instruction uses the Rs field, so we won't set -// it to 1 (and the same for Rt2). We don't need template parameters for -// the other register fields since Rt and Rn are always used. -// -let hasSideEffects = 1, mayLoad = 1, mayStore = 1 in -class BaseLoadStoreExclusive sz, bit o2, bit L, bit o1, bit o0, - dag oops, dag iops, string asm, string operands> - : I { - let Inst{31-30} = sz; - let Inst{29-24} = 0b001000; - let Inst{23} = o2; - let Inst{22} = L; - let Inst{21} = o1; - let Inst{15} = o0; - - let DecoderMethod = "DecodeExclusiveLdStInstruction"; -} - -// Neither Rs nor Rt2 operands. -class LoadStoreExclusiveSimple sz, bit o2, bit L, bit o1, bit o0, - dag oops, dag iops, string asm, string operands> - : BaseLoadStoreExclusive { - bits<5> Rt; - bits<5> Rn; - let Inst{9-5} = Rn; - let Inst{4-0} = Rt; - - let PostEncoderMethod = "fixLoadStoreExclusive<0,0>"; -} - -// Simple load acquires don't set the exclusive monitor -let mayLoad = 1, mayStore = 0 in -class LoadAcquire sz, bit o2, bit L, bit o1, bit o0, - RegisterClass regtype, string asm> - : LoadStoreExclusiveSimple, - Sched<[WriteLD]>; - -class LoadExclusive sz, bit o2, bit L, bit o1, bit o0, - RegisterClass regtype, string asm> - : LoadStoreExclusiveSimple, - Sched<[WriteLD]>; - -class LoadExclusivePair sz, bit o2, bit L, bit o1, bit o0, - RegisterClass regtype, string asm> - : BaseLoadStoreExclusive, - Sched<[WriteLD, WriteLDHi]> { - bits<5> Rt; - bits<5> Rt2; - bits<5> Rn; - let Inst{14-10} = Rt2; - let Inst{9-5} = Rn; - let Inst{4-0} = Rt; - - let PostEncoderMethod = "fixLoadStoreExclusive<0,1>"; -} - -// Simple store release operations do not check the exclusive monitor. 
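The block comment above explains that the canonical encodings set the unused Rs/Rt2 register fields to all ones, and that this is done by a fixLoadStoreExclusive<hasRs, hasRt2> post-encoder. A sketch of what that fix-up amounts to, assuming the usual field positions (Rs in bits 20-16, Rt2 in bits 14-10); the real routine lives in the MC code emitter and may differ in detail:

#include <cassert>
#include <cstdint>

// Force the unused register fields of a load/store-exclusive encoding to
// 0b11111, mirroring a fixLoadStoreExclusive<hasRs, hasRt2> post-encoder.
template <bool HasRs, bool HasRt2>
uint32_t fixLoadStoreExclusive(uint32_t EncodedInst) {
  if (!HasRs)
    EncodedInst |= 0x1fu << 16;  // Rs field, bits 20-16
  if (!HasRt2)
    EncodedInst |= 0x1fu << 10;  // Rt2 field, bits 14-10
  return EncodedInst;
}

int main() {
  // An LDXR-style instruction uses neither Rs nor Rt2.
  assert(fixLoadStoreExclusive<false, false>(0) ==
         ((0x1fu << 16) | (0x1fu << 10)));
  // An LDXP-style instruction uses Rt2 but not Rs.
  assert(fixLoadStoreExclusive<false, true>(0) == (0x1fu << 16));
  return 0;
}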
-let mayLoad = 0, mayStore = 1 in -class StoreRelease sz, bit o2, bit L, bit o1, bit o0, - RegisterClass regtype, string asm> - : LoadStoreExclusiveSimple, - Sched<[WriteST]>; - -let mayLoad = 1, mayStore = 1 in -class StoreExclusive sz, bit o2, bit L, bit o1, bit o0, - RegisterClass regtype, string asm> - : BaseLoadStoreExclusive, - Sched<[WriteSTX]> { - bits<5> Ws; - bits<5> Rt; - bits<5> Rn; - let Inst{20-16} = Ws; - let Inst{9-5} = Rn; - let Inst{4-0} = Rt; - - let Constraints = "@earlyclobber $Ws"; - let PostEncoderMethod = "fixLoadStoreExclusive<1,0>"; -} - -class StoreExclusivePair sz, bit o2, bit L, bit o1, bit o0, - RegisterClass regtype, string asm> - : BaseLoadStoreExclusive, - Sched<[WriteSTX]> { - bits<5> Ws; - bits<5> Rt; - bits<5> Rt2; - bits<5> Rn; - let Inst{20-16} = Ws; - let Inst{14-10} = Rt2; - let Inst{9-5} = Rn; - let Inst{4-0} = Rt; - - let Constraints = "@earlyclobber $Ws"; -} - -//--- -// Exception generation -//--- - -let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in -class ExceptionGeneration op1, bits<2> ll, string asm> - : I<(outs), (ins imm0_65535:$imm), asm, "\t$imm", "", []>, - Sched<[WriteSys]> { - bits<16> imm; - let Inst{31-24} = 0b11010100; - let Inst{23-21} = op1; - let Inst{20-5} = imm; - let Inst{4-2} = 0b000; - let Inst{1-0} = ll; -} - -let Predicates = [HasFPARMv8] in { - -//--- -// Floating point to integer conversion -//--- - -class BaseFPToIntegerUnscaled type, bits<2> rmode, bits<3> opcode, - RegisterClass srcType, RegisterClass dstType, - string asm, list pattern> - : I<(outs dstType:$Rd), (ins srcType:$Rn), - asm, "\t$Rd, $Rn", "", pattern>, - Sched<[WriteFCvt]> { - bits<5> Rd; - bits<5> Rn; - let Inst{30-29} = 0b00; - let Inst{28-24} = 0b11110; - let Inst{23-22} = type; - let Inst{21} = 1; - let Inst{20-19} = rmode; - let Inst{18-16} = opcode; - let Inst{15-10} = 0; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseFPToInteger type, bits<2> rmode, bits<3> opcode, - RegisterClass srcType, RegisterClass dstType, - Operand immType, string asm, list pattern> - : I<(outs dstType:$Rd), (ins srcType:$Rn, immType:$scale), - asm, "\t$Rd, $Rn, $scale", "", pattern>, - Sched<[WriteFCvt]> { - bits<5> Rd; - bits<5> Rn; - bits<6> scale; - let Inst{30-29} = 0b00; - let Inst{28-24} = 0b11110; - let Inst{23-22} = type; - let Inst{21} = 0; - let Inst{20-19} = rmode; - let Inst{18-16} = opcode; - let Inst{15-10} = scale; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass FPToIntegerUnscaled rmode, bits<3> opcode, string asm, - SDPatternOperator OpN> { - // Unscaled single-precision to 32-bit - def UWSr : BaseFPToIntegerUnscaled<0b00, rmode, opcode, FPR32, GPR32, asm, - [(set GPR32:$Rd, (OpN FPR32:$Rn))]> { - let Inst{31} = 0; // 32-bit GPR flag - } - - // Unscaled single-precision to 64-bit - def UXSr : BaseFPToIntegerUnscaled<0b00, rmode, opcode, FPR32, GPR64, asm, - [(set GPR64:$Rd, (OpN FPR32:$Rn))]> { - let Inst{31} = 1; // 64-bit GPR flag - } - - // Unscaled double-precision to 32-bit - def UWDr : BaseFPToIntegerUnscaled<0b01, rmode, opcode, FPR64, GPR32, asm, - [(set GPR32:$Rd, (OpN (f64 FPR64:$Rn)))]> { - let Inst{31} = 0; // 32-bit GPR flag - } - - // Unscaled double-precision to 64-bit - def UXDr : BaseFPToIntegerUnscaled<0b01, rmode, opcode, FPR64, GPR64, asm, - [(set GPR64:$Rd, (OpN (f64 FPR64:$Rn)))]> { - let Inst{31} = 1; // 64-bit GPR flag - } -} - -multiclass FPToIntegerScaled rmode, bits<3> opcode, string asm, - SDPatternOperator OpN> { - // Scaled single-precision to 
32-bit - def SWSri : BaseFPToInteger<0b00, rmode, opcode, FPR32, GPR32, - fixedpoint_f32_i32, asm, - [(set GPR32:$Rd, (OpN (fmul FPR32:$Rn, - fixedpoint_f32_i32:$scale)))]> { - let Inst{31} = 0; // 32-bit GPR flag - let scale{5} = 1; - } - - // Scaled single-precision to 64-bit - def SXSri : BaseFPToInteger<0b00, rmode, opcode, FPR32, GPR64, - fixedpoint_f32_i64, asm, - [(set GPR64:$Rd, (OpN (fmul FPR32:$Rn, - fixedpoint_f32_i64:$scale)))]> { - let Inst{31} = 1; // 64-bit GPR flag - } - - // Scaled double-precision to 32-bit - def SWDri : BaseFPToInteger<0b01, rmode, opcode, FPR64, GPR32, - fixedpoint_f64_i32, asm, - [(set GPR32:$Rd, (OpN (fmul FPR64:$Rn, - fixedpoint_f64_i32:$scale)))]> { - let Inst{31} = 0; // 32-bit GPR flag - let scale{5} = 1; - } - - // Scaled double-precision to 64-bit - def SXDri : BaseFPToInteger<0b01, rmode, opcode, FPR64, GPR64, - fixedpoint_f64_i64, asm, - [(set GPR64:$Rd, (OpN (fmul FPR64:$Rn, - fixedpoint_f64_i64:$scale)))]> { - let Inst{31} = 1; // 64-bit GPR flag - } -} - -//--- -// Integer to floating point conversion -//--- - -let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in -class BaseIntegerToFP pattern> - : I<(outs dstType:$Rd), (ins srcType:$Rn, immType:$scale), - asm, "\t$Rd, $Rn, $scale", "", pattern>, - Sched<[WriteFCvt]> { - bits<5> Rd; - bits<5> Rn; - bits<6> scale; - let Inst{30-23} = 0b00111100; - let Inst{21-17} = 0b00001; - let Inst{16} = isUnsigned; - let Inst{15-10} = scale; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -class BaseIntegerToFPUnscaled - : I<(outs dstType:$Rd), (ins srcType:$Rn), - asm, "\t$Rd, $Rn", "", [(set (dvt dstType:$Rd), (node srcType:$Rn))]>, - Sched<[WriteFCvt]> { - bits<5> Rd; - bits<5> Rn; - bits<6> scale; - let Inst{30-23} = 0b00111100; - let Inst{21-17} = 0b10001; - let Inst{16} = isUnsigned; - let Inst{15-10} = 0b000000; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass IntegerToFP { - // Unscaled - def UWSri: BaseIntegerToFPUnscaled { - let Inst{31} = 0; // 32-bit GPR flag - let Inst{22} = 0; // 32-bit FPR flag - } - - def UWDri: BaseIntegerToFPUnscaled { - let Inst{31} = 0; // 32-bit GPR flag - let Inst{22} = 1; // 64-bit FPR flag - } - - def UXSri: BaseIntegerToFPUnscaled { - let Inst{31} = 1; // 64-bit GPR flag - let Inst{22} = 0; // 32-bit FPR flag - } - - def UXDri: BaseIntegerToFPUnscaled { - let Inst{31} = 1; // 64-bit GPR flag - let Inst{22} = 1; // 64-bit FPR flag - } - - // Scaled - def SWSri: BaseIntegerToFP { - let Inst{31} = 0; // 32-bit GPR flag - let Inst{22} = 0; // 32-bit FPR flag - let scale{5} = 1; - } - - def SWDri: BaseIntegerToFP { - let Inst{31} = 0; // 32-bit GPR flag - let Inst{22} = 1; // 64-bit FPR flag - let scale{5} = 1; - } - - def SXSri: BaseIntegerToFP { - let Inst{31} = 1; // 64-bit GPR flag - let Inst{22} = 0; // 32-bit FPR flag - } - - def SXDri: BaseIntegerToFP { - let Inst{31} = 1; // 64-bit GPR flag - let Inst{22} = 1; // 64-bit FPR flag - } -} - -//--- -// Unscaled integer <-> floating point conversion (i.e. FMOV) -//--- - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseUnscaledConversion rmode, bits<3> opcode, - RegisterClass srcType, RegisterClass dstType, - string asm> - : I<(outs dstType:$Rd), (ins srcType:$Rn), asm, "\t$Rd, $Rn", "", - // We use COPY_TO_REGCLASS for these bitconvert operations. - // copyPhysReg() expands the resultant COPY instructions after - // regalloc is done. This gives greater freedom for the allocator - // and related passes (coalescing, copy propagation, et. al.) to - // be more effective. 
- [/*(set (dvt dstType:$Rd), (bitconvert (svt srcType:$Rn)))*/]>, - Sched<[WriteFCopy]> { - bits<5> Rd; - bits<5> Rn; - let Inst{30-23} = 0b00111100; - let Inst{21} = 1; - let Inst{20-19} = rmode; - let Inst{18-16} = opcode; - let Inst{15-10} = 0b000000; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseUnscaledConversionToHigh rmode, bits<3> opcode, - RegisterClass srcType, RegisterOperand dstType, string asm, - string kind> - : I<(outs dstType:$Rd), (ins srcType:$Rn, VectorIndex1:$idx), asm, - "{\t$Rd"#kind#"$idx, $Rn|"#kind#"\t$Rd$idx, $Rn}", "", []>, - Sched<[WriteFCopy]> { - bits<5> Rd; - bits<5> Rn; - let Inst{30-23} = 0b00111101; - let Inst{21} = 1; - let Inst{20-19} = rmode; - let Inst{18-16} = opcode; - let Inst{15-10} = 0b000000; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; - - let DecoderMethod = "DecodeFMOVLaneInstruction"; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseUnscaledConversionFromHigh rmode, bits<3> opcode, - RegisterOperand srcType, RegisterClass dstType, string asm, - string kind> - : I<(outs dstType:$Rd), (ins srcType:$Rn, VectorIndex1:$idx), asm, - "{\t$Rd, $Rn"#kind#"$idx|"#kind#"\t$Rd, $Rn$idx}", "", []>, - Sched<[WriteFCopy]> { - bits<5> Rd; - bits<5> Rn; - let Inst{30-23} = 0b00111101; - let Inst{21} = 1; - let Inst{20-19} = rmode; - let Inst{18-16} = opcode; - let Inst{15-10} = 0b000000; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; - - let DecoderMethod = "DecodeFMOVLaneInstruction"; -} - - - -multiclass UnscaledConversion { - def WSr : BaseUnscaledConversion<0b00, 0b111, GPR32, FPR32, asm> { - let Inst{31} = 0; // 32-bit GPR flag - let Inst{22} = 0; // 32-bit FPR flag - } - - def XDr : BaseUnscaledConversion<0b00, 0b111, GPR64, FPR64, asm> { - let Inst{31} = 1; // 64-bit GPR flag - let Inst{22} = 1; // 64-bit FPR flag - } - - def SWr : BaseUnscaledConversion<0b00, 0b110, FPR32, GPR32, asm> { - let Inst{31} = 0; // 32-bit GPR flag - let Inst{22} = 0; // 32-bit FPR flag - } - - def DXr : BaseUnscaledConversion<0b00, 0b110, FPR64, GPR64, asm> { - let Inst{31} = 1; // 64-bit GPR flag - let Inst{22} = 1; // 64-bit FPR flag - } - - def XDHighr : BaseUnscaledConversionToHigh<0b01, 0b111, GPR64, V128, - asm, ".d"> { - let Inst{31} = 1; - let Inst{22} = 0; - } - - def DXHighr : BaseUnscaledConversionFromHigh<0b01, 0b110, V128, GPR64, - asm, ".d"> { - let Inst{31} = 1; - let Inst{22} = 0; - } -} - -//--- -// Floating point conversion -//--- - -class BaseFPConversion type, bits<2> opcode, RegisterClass dstType, - RegisterClass srcType, string asm, list pattern> - : I<(outs dstType:$Rd), (ins srcType:$Rn), asm, "\t$Rd, $Rn", "", pattern>, - Sched<[WriteFCvt]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31-24} = 0b00011110; - let Inst{23-22} = type; - let Inst{21-17} = 0b10001; - let Inst{16-15} = opcode; - let Inst{14-10} = 0b10000; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass FPConversion { - // Double-precision to Half-precision - def HDr : BaseFPConversion<0b01, 0b11, FPR16, FPR64, asm, - [(set FPR16:$Rd, (fround FPR64:$Rn))]>; - - // Double-precision to Single-precision - def SDr : BaseFPConversion<0b01, 0b00, FPR32, FPR64, asm, - [(set FPR32:$Rd, (fround FPR64:$Rn))]>; - - // Half-precision to Double-precision - def DHr : BaseFPConversion<0b11, 0b01, FPR64, FPR16, asm, - [(set FPR64:$Rd, (fextend FPR16:$Rn))]>; - - // Half-precision to Single-precision - def SHr : BaseFPConversion<0b11, 0b00, FPR32, FPR16, asm, - [(set FPR32:$Rd, (fextend FPR16:$Rn))]>; - - // 
Single-precision to Double-precision - def DSr : BaseFPConversion<0b00, 0b01, FPR64, FPR32, asm, - [(set FPR64:$Rd, (fextend FPR32:$Rn))]>; - - // Single-precision to Half-precision - def HSr : BaseFPConversion<0b00, 0b11, FPR16, FPR32, asm, - [(set FPR16:$Rd, (fround FPR32:$Rn))]>; -} - -//--- -// Single operand floating point data processing -//--- - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSingleOperandFPData opcode, RegisterClass regtype, - ValueType vt, string asm, SDPatternOperator node> - : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, "\t$Rd, $Rn", "", - [(set (vt regtype:$Rd), (node (vt regtype:$Rn)))]>, - Sched<[WriteF]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31-23} = 0b000111100; - let Inst{21-19} = 0b100; - let Inst{18-15} = opcode; - let Inst{14-10} = 0b10000; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass SingleOperandFPData opcode, string asm, - SDPatternOperator node = null_frag> { - def Sr : BaseSingleOperandFPData { - let Inst{22} = 0; // 32-bit size flag - } - - def Dr : BaseSingleOperandFPData { - let Inst{22} = 1; // 64-bit size flag - } -} - -//--- -// Two operand floating point data processing -//--- - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseTwoOperandFPData opcode, RegisterClass regtype, - string asm, list pat> - : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), - asm, "\t$Rd, $Rn, $Rm", "", pat>, - Sched<[WriteF]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - let Inst{31-23} = 0b000111100; - let Inst{21} = 1; - let Inst{20-16} = Rm; - let Inst{15-12} = opcode; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass TwoOperandFPData opcode, string asm, - SDPatternOperator node = null_frag> { - def Srr : BaseTwoOperandFPData { - let Inst{22} = 0; // 32-bit size flag - } - - def Drr : BaseTwoOperandFPData { - let Inst{22} = 1; // 64-bit size flag - } -} - -multiclass TwoOperandFPDataNeg opcode, string asm, SDNode node> { - def Srr : BaseTwoOperandFPData { - let Inst{22} = 0; // 32-bit size flag - } - - def Drr : BaseTwoOperandFPData { - let Inst{22} = 1; // 64-bit size flag - } -} - - -//--- -// Three operand floating point data processing -//--- - -class BaseThreeOperandFPData pat> - : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, regtype: $Ra), - asm, "\t$Rd, $Rn, $Rm, $Ra", "", pat>, - Sched<[WriteFMul]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - bits<5> Ra; - let Inst{31-23} = 0b000111110; - let Inst{21} = isNegated; - let Inst{20-16} = Rm; - let Inst{15} = isSub; - let Inst{14-10} = Ra; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass ThreeOperandFPData { - def Srrr : BaseThreeOperandFPData { - let Inst{22} = 0; // 32-bit size flag - } - - def Drrr : BaseThreeOperandFPData { - let Inst{22} = 1; // 64-bit size flag - } -} - -//--- -// Floating point data comparisons -//--- - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseOneOperandFPComparison pat> - : I<(outs), (ins regtype:$Rn), asm, "\t$Rn, #0.0", "", pat>, - Sched<[WriteFCmp]> { - bits<5> Rn; - let Inst{31-23} = 0b000111100; - let Inst{21} = 1; - - let Inst{15-10} = 0b001000; - let Inst{9-5} = Rn; - let Inst{4} = signalAllNans; - let Inst{3-0} = 0b1000; - - // Rm should be 0b00000 canonically, but we need to accept any value. 
- let PostEncoderMethod = "fixOneOperandFPComparison"; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseTwoOperandFPComparison pat> - : I<(outs), (ins regtype:$Rn, regtype:$Rm), asm, "\t$Rn, $Rm", "", pat>, - Sched<[WriteFCmp]> { - bits<5> Rm; - bits<5> Rn; - let Inst{31-23} = 0b000111100; - let Inst{21} = 1; - let Inst{20-16} = Rm; - let Inst{15-10} = 0b001000; - let Inst{9-5} = Rn; - let Inst{4} = signalAllNans; - let Inst{3-0} = 0b0000; -} - -multiclass FPComparison { - let Defs = [NZCV] in { - def Srr : BaseTwoOperandFPComparison { - let Inst{22} = 0; - } - - def Sri : BaseOneOperandFPComparison { - let Inst{22} = 0; - } - - def Drr : BaseTwoOperandFPComparison { - let Inst{22} = 1; - } - - def Dri : BaseOneOperandFPComparison { - let Inst{22} = 1; - } - } // Defs = [NZCV] -} - -//--- -// Floating point conditional comparisons -//--- - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseFPCondComparison - : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm0_15:$nzcv, ccode:$cond), - asm, "\t$Rn, $Rm, $nzcv, $cond", "", []>, - Sched<[WriteFCmp]> { - bits<5> Rn; - bits<5> Rm; - bits<4> nzcv; - bits<4> cond; - - let Inst{31-23} = 0b000111100; - let Inst{21} = 1; - let Inst{20-16} = Rm; - let Inst{15-12} = cond; - let Inst{11-10} = 0b01; - let Inst{9-5} = Rn; - let Inst{4} = signalAllNans; - let Inst{3-0} = nzcv; -} - -multiclass FPCondComparison { - let Defs = [NZCV], Uses = [NZCV] in { - def Srr : BaseFPCondComparison { - let Inst{22} = 0; - } - - def Drr : BaseFPCondComparison { - let Inst{22} = 1; - } - } // Defs = [NZCV], Uses = [NZCV] -} - -//--- -// Floating point conditional select -//--- - -class BaseFPCondSelect - : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, ccode:$cond), - asm, "\t$Rd, $Rn, $Rm, $cond", "", - [(set regtype:$Rd, - (ARM64csel (vt regtype:$Rn), regtype:$Rm, - (i32 imm:$cond), NZCV))]>, - Sched<[WriteF]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - bits<4> cond; - - let Inst{31-23} = 0b000111100; - let Inst{21} = 1; - let Inst{20-16} = Rm; - let Inst{15-12} = cond; - let Inst{11-10} = 0b11; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass FPCondSelect { - let Uses = [NZCV] in { - def Srrr : BaseFPCondSelect { - let Inst{22} = 0; - } - - def Drrr : BaseFPCondSelect { - let Inst{22} = 1; - } - } // Uses = [NZCV] -} - -//--- -// Floating move immediate -//--- - -class BaseFPMoveImmediate - : I<(outs regtype:$Rd), (ins fpimmtype:$imm), asm, "\t$Rd, $imm", "", - [(set regtype:$Rd, fpimmtype:$imm)]>, - Sched<[WriteFImm]> { - bits<5> Rd; - bits<8> imm; - let Inst{31-23} = 0b000111100; - let Inst{21} = 1; - let Inst{20-13} = imm; - let Inst{12-5} = 0b10000000; - let Inst{4-0} = Rd; -} - -multiclass FPMoveImmediate { - def Si : BaseFPMoveImmediate { - let Inst{22} = 0; - } - - def Di : BaseFPMoveImmediate { - let Inst{22} = 1; - } -} -} // end of 'let Predicates = [HasFPARMv8]' - -//---------------------------------------------------------------------------- -// AdvSIMD -//---------------------------------------------------------------------------- - -let Predicates = [HasNEON] in { - -//---------------------------------------------------------------------------- -// AdvSIMD three register vector instructions -//---------------------------------------------------------------------------- - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDThreeSameVector size, bits<5> opcode, - RegisterOperand regtype, string asm, string kind, - list pattern> - : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm, 
- "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind # - "|" # kind # "\t$Rd, $Rn, $Rm|}", "", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29} = U; - let Inst{28-24} = 0b01110; - let Inst{23-22} = size; - let Inst{21} = 1; - let Inst{20-16} = Rm; - let Inst{15-11} = opcode; - let Inst{10} = 1; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDThreeSameVectorTied size, bits<5> opcode, - RegisterOperand regtype, string asm, string kind, - list pattern> - : I<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn, regtype:$Rm), asm, - "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind # - "|" # kind # "\t$Rd, $Rn, $Rm}", "$Rd = $dst", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29} = U; - let Inst{28-24} = 0b01110; - let Inst{23-22} = size; - let Inst{21} = 1; - let Inst{20-16} = Rm; - let Inst{15-11} = opcode; - let Inst{10} = 1; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -// All operand sizes distinguished in the encoding. -multiclass SIMDThreeSameVector opc, string asm, - SDPatternOperator OpNode> { - def v8i8 : BaseSIMDThreeSameVector<0, U, 0b00, opc, V64, - asm, ".8b", - [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>; - def v16i8 : BaseSIMDThreeSameVector<1, U, 0b00, opc, V128, - asm, ".16b", - [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>; - def v4i16 : BaseSIMDThreeSameVector<0, U, 0b01, opc, V64, - asm, ".4h", - [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>; - def v8i16 : BaseSIMDThreeSameVector<1, U, 0b01, opc, V128, - asm, ".8h", - [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>; - def v2i32 : BaseSIMDThreeSameVector<0, U, 0b10, opc, V64, - asm, ".2s", - [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>; - def v4i32 : BaseSIMDThreeSameVector<1, U, 0b10, opc, V128, - asm, ".4s", - [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>; - def v2i64 : BaseSIMDThreeSameVector<1, U, 0b11, opc, V128, - asm, ".2d", - [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn), (v2i64 V128:$Rm)))]>; -} - -// As above, but D sized elements unsupported. 
-multiclass SIMDThreeSameVectorBHS opc, string asm, - SDPatternOperator OpNode> { - def v8i8 : BaseSIMDThreeSameVector<0, U, 0b00, opc, V64, - asm, ".8b", - [(set V64:$Rd, (v8i8 (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm))))]>; - def v16i8 : BaseSIMDThreeSameVector<1, U, 0b00, opc, V128, - asm, ".16b", - [(set V128:$Rd, (v16i8 (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm))))]>; - def v4i16 : BaseSIMDThreeSameVector<0, U, 0b01, opc, V64, - asm, ".4h", - [(set V64:$Rd, (v4i16 (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm))))]>; - def v8i16 : BaseSIMDThreeSameVector<1, U, 0b01, opc, V128, - asm, ".8h", - [(set V128:$Rd, (v8i16 (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm))))]>; - def v2i32 : BaseSIMDThreeSameVector<0, U, 0b10, opc, V64, - asm, ".2s", - [(set V64:$Rd, (v2i32 (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm))))]>; - def v4i32 : BaseSIMDThreeSameVector<1, U, 0b10, opc, V128, - asm, ".4s", - [(set V128:$Rd, (v4i32 (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm))))]>; -} - -multiclass SIMDThreeSameVectorBHSTied opc, string asm, - SDPatternOperator OpNode> { - def v8i8 : BaseSIMDThreeSameVectorTied<0, U, 0b00, opc, V64, - asm, ".8b", - [(set (v8i8 V64:$dst), - (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>; - def v16i8 : BaseSIMDThreeSameVectorTied<1, U, 0b00, opc, V128, - asm, ".16b", - [(set (v16i8 V128:$dst), - (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>; - def v4i16 : BaseSIMDThreeSameVectorTied<0, U, 0b01, opc, V64, - asm, ".4h", - [(set (v4i16 V64:$dst), - (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>; - def v8i16 : BaseSIMDThreeSameVectorTied<1, U, 0b01, opc, V128, - asm, ".8h", - [(set (v8i16 V128:$dst), - (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>; - def v2i32 : BaseSIMDThreeSameVectorTied<0, U, 0b10, opc, V64, - asm, ".2s", - [(set (v2i32 V64:$dst), - (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>; - def v4i32 : BaseSIMDThreeSameVectorTied<1, U, 0b10, opc, V128, - asm, ".4s", - [(set (v4i32 V128:$dst), - (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>; -} - -// As above, but only B sized elements supported. -multiclass SIMDThreeSameVectorB opc, string asm, - SDPatternOperator OpNode> { - def v8i8 : BaseSIMDThreeSameVector<0, U, 0b00, opc, V64, - asm, ".8b", - [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>; - def v16i8 : BaseSIMDThreeSameVector<1, U, 0b00, opc, V128, - asm, ".16b", - [(set (v16i8 V128:$Rd), - (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>; -} - -// As above, but only S and D sized floating point elements supported. 
-multiclass SIMDThreeSameVectorFP opc, - string asm, SDPatternOperator OpNode> { - def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0}, opc, V64, - asm, ".2s", - [(set (v2f32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>; - def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0}, opc, V128, - asm, ".4s", - [(set (v4f32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>; - def v2f64 : BaseSIMDThreeSameVector<1, U, {S,1}, opc, V128, - asm, ".2d", - [(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>; -} - -multiclass SIMDThreeSameVectorFPCmp opc, - string asm, - SDPatternOperator OpNode> { - def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0}, opc, V64, - asm, ".2s", - [(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>; - def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0}, opc, V128, - asm, ".4s", - [(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>; - def v2f64 : BaseSIMDThreeSameVector<1, U, {S,1}, opc, V128, - asm, ".2d", - [(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>; -} - -multiclass SIMDThreeSameVectorFPTied opc, - string asm, SDPatternOperator OpNode> { - def v2f32 : BaseSIMDThreeSameVectorTied<0, U, {S,0}, opc, V64, - asm, ".2s", - [(set (v2f32 V64:$dst), - (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>; - def v4f32 : BaseSIMDThreeSameVectorTied<1, U, {S,0}, opc, V128, - asm, ".4s", - [(set (v4f32 V128:$dst), - (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>; - def v2f64 : BaseSIMDThreeSameVectorTied<1, U, {S,1}, opc, V128, - asm, ".2d", - [(set (v2f64 V128:$dst), - (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>; -} - -// As above, but D and B sized elements unsupported. -multiclass SIMDThreeSameVectorHS opc, string asm, - SDPatternOperator OpNode> { - def v4i16 : BaseSIMDThreeSameVector<0, U, 0b01, opc, V64, - asm, ".4h", - [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>; - def v8i16 : BaseSIMDThreeSameVector<1, U, 0b01, opc, V128, - asm, ".8h", - [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>; - def v2i32 : BaseSIMDThreeSameVector<0, U, 0b10, opc, V64, - asm, ".2s", - [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>; - def v4i32 : BaseSIMDThreeSameVector<1, U, 0b10, opc, V128, - asm, ".4s", - [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>; -} - -// Logical three vector ops share opcode bits, and only use B sized elements. 
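Because bitwise operations do not care about element width, the logical multiclasses below define only the .8b and .16b encodings and add extra patterns that funnel v4i16, v2i32, v1i64 and their 128-bit counterparts onto those byte-sized instructions. The underlying identity is simply that reinterpreting the lanes cannot change a bitwise result; a tiny C++ illustration:

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  // AND the same 64 bits once as four 16-bit lanes...
  uint16_t A[4] = {0x00ff, 0xff00, 0x1234, 0xffff};
  uint16_t B[4] = {0x0f0f, 0x0f0f, 0xff00, 0x0000};
  uint16_t R16[4];
  for (int i = 0; i < 4; ++i) R16[i] = A[i] & B[i];

  // ...and once as eight 8-bit lanes: the bit pattern is identical.
  uint8_t A8[8], B8[8], R8[8];
  std::memcpy(A8, A, 8); std::memcpy(B8, B, 8);
  for (int i = 0; i < 8; ++i) R8[i] = A8[i] & B8[i];

  assert(std::memcmp(R16, R8, 8) == 0);
  return 0;
}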
-multiclass SIMDLogicalThreeVector size, string asm, - SDPatternOperator OpNode = null_frag> { - def v8i8 : BaseSIMDThreeSameVector<0, U, size, 0b00011, V64, - asm, ".8b", - [(set (v8i8 V64:$Rd), (OpNode V64:$Rn, V64:$Rm))]>; - def v16i8 : BaseSIMDThreeSameVector<1, U, size, 0b00011, V128, - asm, ".16b", - [(set (v16i8 V128:$Rd), (OpNode V128:$Rn, V128:$Rm))]>; - - def : Pat<(v4i16 (OpNode V64:$LHS, V64:$RHS)), - (!cast(NAME#"v8i8") V64:$LHS, V64:$RHS)>; - def : Pat<(v2i32 (OpNode V64:$LHS, V64:$RHS)), - (!cast(NAME#"v8i8") V64:$LHS, V64:$RHS)>; - def : Pat<(v1i64 (OpNode V64:$LHS, V64:$RHS)), - (!cast(NAME#"v8i8") V64:$LHS, V64:$RHS)>; - - def : Pat<(v8i16 (OpNode V128:$LHS, V128:$RHS)), - (!cast(NAME#"v16i8") V128:$LHS, V128:$RHS)>; - def : Pat<(v4i32 (OpNode V128:$LHS, V128:$RHS)), - (!cast(NAME#"v16i8") V128:$LHS, V128:$RHS)>; - def : Pat<(v2i64 (OpNode V128:$LHS, V128:$RHS)), - (!cast(NAME#"v16i8") V128:$LHS, V128:$RHS)>; -} - -multiclass SIMDLogicalThreeVectorTied size, - string asm, SDPatternOperator OpNode> { - def v8i8 : BaseSIMDThreeSameVectorTied<0, U, size, 0b00011, V64, - asm, ".8b", - [(set (v8i8 V64:$dst), - (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>; - def v16i8 : BaseSIMDThreeSameVectorTied<1, U, size, 0b00011, V128, - asm, ".16b", - [(set (v16i8 V128:$dst), - (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn), - (v16i8 V128:$Rm)))]>; - - def : Pat<(v4i16 (OpNode (v4i16 V64:$LHS), (v4i16 V64:$MHS), - (v4i16 V64:$RHS))), - (!cast(NAME#"v8i8") - V64:$LHS, V64:$MHS, V64:$RHS)>; - def : Pat<(v2i32 (OpNode (v2i32 V64:$LHS), (v2i32 V64:$MHS), - (v2i32 V64:$RHS))), - (!cast(NAME#"v8i8") - V64:$LHS, V64:$MHS, V64:$RHS)>; - def : Pat<(v1i64 (OpNode (v1i64 V64:$LHS), (v1i64 V64:$MHS), - (v1i64 V64:$RHS))), - (!cast(NAME#"v8i8") - V64:$LHS, V64:$MHS, V64:$RHS)>; - - def : Pat<(v8i16 (OpNode (v8i16 V128:$LHS), (v8i16 V128:$MHS), - (v8i16 V128:$RHS))), - (!cast(NAME#"v16i8") - V128:$LHS, V128:$MHS, V128:$RHS)>; - def : Pat<(v4i32 (OpNode (v4i32 V128:$LHS), (v4i32 V128:$MHS), - (v4i32 V128:$RHS))), - (!cast(NAME#"v16i8") - V128:$LHS, V128:$MHS, V128:$RHS)>; - def : Pat<(v2i64 (OpNode (v2i64 V128:$LHS), (v2i64 V128:$MHS), - (v2i64 V128:$RHS))), - (!cast(NAME#"v16i8") - V128:$LHS, V128:$MHS, V128:$RHS)>; -} - - -//---------------------------------------------------------------------------- -// AdvSIMD two register vector instructions. 
-//---------------------------------------------------------------------------- - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDTwoSameVector size, bits<5> opcode, - RegisterOperand regtype, string asm, string dstkind, - string srckind, list pattern> - : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, - "{\t$Rd" # dstkind # ", $Rn" # srckind # - "|" # dstkind # "\t$Rd, $Rn}", "", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29} = U; - let Inst{28-24} = 0b01110; - let Inst{23-22} = size; - let Inst{21-17} = 0b10000; - let Inst{16-12} = opcode; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDTwoSameVectorTied size, bits<5> opcode, - RegisterOperand regtype, string asm, string dstkind, - string srckind, list pattern> - : I<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn), asm, - "{\t$Rd" # dstkind # ", $Rn" # srckind # - "|" # dstkind # "\t$Rd, $Rn}", "$Rd = $dst", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29} = U; - let Inst{28-24} = 0b01110; - let Inst{23-22} = size; - let Inst{21-17} = 0b10000; - let Inst{16-12} = opcode; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -// Supports B, H, and S element sizes. -multiclass SIMDTwoVectorBHS opc, string asm, - SDPatternOperator OpNode> { - def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64, - asm, ".8b", ".8b", - [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>; - def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128, - asm, ".16b", ".16b", - [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>; - def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64, - asm, ".4h", ".4h", - [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>; - def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128, - asm, ".8h", ".8h", - [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>; - def v2i32 : BaseSIMDTwoSameVector<0, U, 0b10, opc, V64, - asm, ".2s", ".2s", - [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; - def v4i32 : BaseSIMDTwoSameVector<1, U, 0b10, opc, V128, - asm, ".4s", ".4s", - [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; -} - -class BaseSIMDVectorLShiftLongBySize size, - RegisterOperand regtype, string asm, string dstkind, - string srckind, string amount> - : I<(outs V128:$Rd), (ins regtype:$Rn), asm, - "{\t$Rd" # dstkind # ", $Rn" # srckind # ", #" # amount # - "|" # dstkind # "\t$Rd, $Rn, #" # amount # "}", "", []>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29-24} = 0b101110; - let Inst{23-22} = size; - let Inst{21-10} = 0b100001001110; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass SIMDVectorLShiftLongBySizeBHS { - let neverHasSideEffects = 1 in { - def v8i8 : BaseSIMDVectorLShiftLongBySize<0, 0b00, V64, - "shll", ".8h", ".8b", "8">; - def v16i8 : BaseSIMDVectorLShiftLongBySize<1, 0b00, V128, - "shll2", ".8h", ".16b", "8">; - def v4i16 : BaseSIMDVectorLShiftLongBySize<0, 0b01, V64, - "shll", ".4s", ".4h", "16">; - def v8i16 : BaseSIMDVectorLShiftLongBySize<1, 0b01, V128, - "shll2", ".4s", ".8h", "16">; - def v2i32 : BaseSIMDVectorLShiftLongBySize<0, 0b10, V64, - "shll", ".2d", ".2s", "32">; - def v4i32 : BaseSIMDVectorLShiftLongBySize<1, 0b10, V128, - "shll2", ".2d", ".4s", "32">; - } -} - -// Supports all element sizes. 
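For reference, a Python sketch (not part of the patch; encode_two_same_vector is a hypothetical helper) that packs the fixed and variable fields assigned by BaseSIMDTwoSameVector above into the 32-bit instruction word:

def encode_two_same_vector(Q, U, size, opcode, Rn, Rd):
    """Pack the fields exactly as the 'let Inst{...}' lines in the class list them."""
    assert Q in (0, 1) and U in (0, 1)
    assert 0 <= size < 4 and 0 <= opcode < 32
    assert 0 <= Rn < 32 and 0 <= Rd < 32
    word = 0                      # Inst{31} = 0
    word |= Q       << 30         # Inst{30} = Q
    word |= U       << 29         # Inst{29} = U
    word |= 0b01110 << 24         # Inst{28-24}
    word |= size    << 22         # Inst{23-22}
    word |= 0b10000 << 17         # Inst{21-17}
    word |= opcode  << 12         # Inst{16-12}
    word |= 0b10    << 10         # Inst{11-10}
    word |= Rn      << 5          # Inst{9-5}
    word |= Rd                    # Inst{4-0}
    return word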
-multiclass SIMDLongTwoVector opc, string asm, - SDPatternOperator OpNode> { - def v8i8_v4i16 : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64, - asm, ".4h", ".8b", - [(set (v4i16 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>; - def v16i8_v8i16 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128, - asm, ".8h", ".16b", - [(set (v8i16 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>; - def v4i16_v2i32 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64, - asm, ".2s", ".4h", - [(set (v2i32 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>; - def v8i16_v4i32 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128, - asm, ".4s", ".8h", - [(set (v4i32 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>; - def v2i32_v1i64 : BaseSIMDTwoSameVector<0, U, 0b10, opc, V64, - asm, ".1d", ".2s", - [(set (v1i64 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; - def v4i32_v2i64 : BaseSIMDTwoSameVector<1, U, 0b10, opc, V128, - asm, ".2d", ".4s", - [(set (v2i64 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; -} - -multiclass SIMDLongTwoVectorTied opc, string asm, - SDPatternOperator OpNode> { - def v8i8_v4i16 : BaseSIMDTwoSameVectorTied<0, U, 0b00, opc, V64, - asm, ".4h", ".8b", - [(set (v4i16 V64:$dst), (OpNode (v4i16 V64:$Rd), - (v8i8 V64:$Rn)))]>; - def v16i8_v8i16 : BaseSIMDTwoSameVectorTied<1, U, 0b00, opc, V128, - asm, ".8h", ".16b", - [(set (v8i16 V128:$dst), (OpNode (v8i16 V128:$Rd), - (v16i8 V128:$Rn)))]>; - def v4i16_v2i32 : BaseSIMDTwoSameVectorTied<0, U, 0b01, opc, V64, - asm, ".2s", ".4h", - [(set (v2i32 V64:$dst), (OpNode (v2i32 V64:$Rd), - (v4i16 V64:$Rn)))]>; - def v8i16_v4i32 : BaseSIMDTwoSameVectorTied<1, U, 0b01, opc, V128, - asm, ".4s", ".8h", - [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd), - (v8i16 V128:$Rn)))]>; - def v2i32_v1i64 : BaseSIMDTwoSameVectorTied<0, U, 0b10, opc, V64, - asm, ".1d", ".2s", - [(set (v1i64 V64:$dst), (OpNode (v1i64 V64:$Rd), - (v2i32 V64:$Rn)))]>; - def v4i32_v2i64 : BaseSIMDTwoSameVectorTied<1, U, 0b10, opc, V128, - asm, ".2d", ".4s", - [(set (v2i64 V128:$dst), (OpNode (v2i64 V128:$Rd), - (v4i32 V128:$Rn)))]>; -} - -// Supports all element sizes, except 1xD. 
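The long two-vector multiclasses above pair each source arrangement with a destination that has half as many lanes of twice the width. A Python sketch of that mapping (not part of the patch):

# Source -> destination arrangements used by SIMDLongTwoVector/...Tied.
LONG_TWO_VECTOR = {
    ".8b":  ".4h",  ".16b": ".8h",
    ".4h":  ".2s",  ".8h":  ".4s",
    ".2s":  ".1d",  ".4s":  ".2d",
}

def widened(src):
    """Halve the lane count and double the lane width of an arrangement string."""
    count, kind = int(src[1:-1]), src[-1]
    wider = {"b": "h", "h": "s", "s": "d"}[kind]
    return ".%d%s" % (count // 2, wider)

assert all(widened(s) == d for s, d in LONG_TWO_VECTOR.items())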
-multiclass SIMDTwoVectorBHSDTied opc, string asm, - SDPatternOperator OpNode> { - def v8i8 : BaseSIMDTwoSameVectorTied<0, U, 0b00, opc, V64, - asm, ".8b", ".8b", - [(set (v8i8 V64:$dst), (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn)))]>; - def v16i8 : BaseSIMDTwoSameVectorTied<1, U, 0b00, opc, V128, - asm, ".16b", ".16b", - [(set (v16i8 V128:$dst), (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn)))]>; - def v4i16 : BaseSIMDTwoSameVectorTied<0, U, 0b01, opc, V64, - asm, ".4h", ".4h", - [(set (v4i16 V64:$dst), (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn)))]>; - def v8i16 : BaseSIMDTwoSameVectorTied<1, U, 0b01, opc, V128, - asm, ".8h", ".8h", - [(set (v8i16 V128:$dst), (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn)))]>; - def v2i32 : BaseSIMDTwoSameVectorTied<0, U, 0b10, opc, V64, - asm, ".2s", ".2s", - [(set (v2i32 V64:$dst), (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn)))]>; - def v4i32 : BaseSIMDTwoSameVectorTied<1, U, 0b10, opc, V128, - asm, ".4s", ".4s", - [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn)))]>; - def v2i64 : BaseSIMDTwoSameVectorTied<1, U, 0b11, opc, V128, - asm, ".2d", ".2d", - [(set (v2i64 V128:$dst), (OpNode (v2i64 V128:$Rd), (v2i64 V128:$Rn)))]>; -} - -multiclass SIMDTwoVectorBHSD opc, string asm, - SDPatternOperator OpNode = null_frag> { - def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64, - asm, ".8b", ".8b", - [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>; - def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128, - asm, ".16b", ".16b", - [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>; - def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64, - asm, ".4h", ".4h", - [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>; - def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128, - asm, ".8h", ".8h", - [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>; - def v2i32 : BaseSIMDTwoSameVector<0, U, 0b10, opc, V64, - asm, ".2s", ".2s", - [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; - def v4i32 : BaseSIMDTwoSameVector<1, U, 0b10, opc, V128, - asm, ".4s", ".4s", - [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; - def v2i64 : BaseSIMDTwoSameVector<1, U, 0b11, opc, V128, - asm, ".2d", ".2d", - [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn)))]>; -} - - -// Supports only B element sizes. -multiclass SIMDTwoVectorB size, bits<5> opc, string asm, - SDPatternOperator OpNode> { - def v8i8 : BaseSIMDTwoSameVector<0, U, size, opc, V64, - asm, ".8b", ".8b", - [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>; - def v16i8 : BaseSIMDTwoSameVector<1, U, size, opc, V128, - asm, ".16b", ".16b", - [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>; - -} - -// Supports only B and H element sizes. -multiclass SIMDTwoVectorBH opc, string asm, - SDPatternOperator OpNode> { - def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64, - asm, ".8b", ".8b", - [(set (v8i8 V64:$Rd), (OpNode V64:$Rn))]>; - def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128, - asm, ".16b", ".16b", - [(set (v16i8 V128:$Rd), (OpNode V128:$Rn))]>; - def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64, - asm, ".4h", ".4h", - [(set (v4i16 V64:$Rd), (OpNode V64:$Rn))]>; - def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128, - asm, ".8h", ".8h", - [(set (v8i16 V128:$Rd), (OpNode V128:$Rn))]>; -} - -// Supports only S and D element sizes, uses high bit of the size field -// as an extra opcode bit. 
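Concretely, the floating-point multiclasses below pass {S,0} for the single-precision arrangements and {S,1} for .2d, so the high bit of the size field carries the extra S opcode bit and the low bit selects the precision. A Python sketch (not part of the patch; fp_size_field is a hypothetical helper):

def fp_size_field(S, double_precision):
    """Build the two-bit size value {S,0} or {S,1} passed to the base class."""
    assert S in (0, 1)
    return (S << 1) | (1 if double_precision else 0)

assert fp_size_field(1, False) == 0b10   # .2s / .4s variants
assert fp_size_field(1, True)  == 0b11   # .2d variant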
-multiclass SIMDTwoVectorFP opc, string asm, - SDPatternOperator OpNode> { - def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64, - asm, ".2s", ".2s", - [(set (v2f32 V64:$Rd), (OpNode (v2f32 V64:$Rn)))]>; - def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128, - asm, ".4s", ".4s", - [(set (v4f32 V128:$Rd), (OpNode (v4f32 V128:$Rn)))]>; - def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, V128, - asm, ".2d", ".2d", - [(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn)))]>; -} - -// Supports only S element size. -multiclass SIMDTwoVectorS opc, string asm, - SDPatternOperator OpNode> { - def v2i32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64, - asm, ".2s", ".2s", - [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; - def v4i32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128, - asm, ".4s", ".4s", - [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; -} - - -multiclass SIMDTwoVectorFPToInt opc, string asm, - SDPatternOperator OpNode> { - def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64, - asm, ".2s", ".2s", - [(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn)))]>; - def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128, - asm, ".4s", ".4s", - [(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn)))]>; - def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, V128, - asm, ".2d", ".2d", - [(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn)))]>; -} - -multiclass SIMDTwoVectorIntToFP opc, string asm, - SDPatternOperator OpNode> { - def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64, - asm, ".2s", ".2s", - [(set (v2f32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; - def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128, - asm, ".4s", ".4s", - [(set (v4f32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; - def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, V128, - asm, ".2d", ".2d", - [(set (v2f64 V128:$Rd), (OpNode (v2i64 V128:$Rn)))]>; -} - - -class BaseSIMDMixedTwoVector size, bits<5> opcode, - RegisterOperand inreg, RegisterOperand outreg, - string asm, string outkind, string inkind, - list pattern> - : I<(outs outreg:$Rd), (ins inreg:$Rn), asm, - "{\t$Rd" # outkind # ", $Rn" # inkind # - "|" # outkind # "\t$Rd, $Rn}", "", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29} = U; - let Inst{28-24} = 0b01110; - let Inst{23-22} = size; - let Inst{21-17} = 0b10000; - let Inst{16-12} = opcode; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -class BaseSIMDMixedTwoVectorTied size, bits<5> opcode, - RegisterOperand inreg, RegisterOperand outreg, - string asm, string outkind, string inkind, - list pattern> - : I<(outs outreg:$dst), (ins outreg:$Rd, inreg:$Rn), asm, - "{\t$Rd" # outkind # ", $Rn" # inkind # - "|" # outkind # "\t$Rd, $Rn}", "$Rd = $dst", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29} = U; - let Inst{28-24} = 0b01110; - let Inst{23-22} = size; - let Inst{21-17} = 0b10000; - let Inst{16-12} = opcode; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass SIMDMixedTwoVector opc, string asm, - SDPatternOperator OpNode> { - def v8i8 : BaseSIMDMixedTwoVector<0, U, 0b00, opc, V128, V64, - asm, ".8b", ".8h", - [(set (v8i8 V64:$Rd), (OpNode (v8i16 V128:$Rn)))]>; - def v16i8 : BaseSIMDMixedTwoVectorTied<1, U, 0b00, opc, V128, V128, - asm#"2", ".16b", ".8h", []>; - def v4i16 : BaseSIMDMixedTwoVector<0, U, 0b01, opc, V128, V64, - asm, ".4h", ".4s", - [(set (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn)))]>; - def v8i16 : 
BaseSIMDMixedTwoVectorTied<1, U, 0b01, opc, V128, V128, - asm#"2", ".8h", ".4s", []>; - def v2i32 : BaseSIMDMixedTwoVector<0, U, 0b10, opc, V128, V64, - asm, ".2s", ".2d", - [(set (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn)))]>; - def v4i32 : BaseSIMDMixedTwoVectorTied<1, U, 0b10, opc, V128, V128, - asm#"2", ".4s", ".2d", []>; - - def : Pat<(concat_vectors (v8i8 V64:$Rd), (OpNode (v8i16 V128:$Rn))), - (!cast(NAME # "v16i8") - (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; - def : Pat<(concat_vectors (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn))), - (!cast(NAME # "v8i16") - (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; - def : Pat<(concat_vectors (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn))), - (!cast(NAME # "v4i32") - (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; -} - -class BaseSIMDCmpTwoVector size, bits<5> opcode, - RegisterOperand regtype, - string asm, string kind, string zero, - ValueType dty, ValueType sty, SDNode OpNode> - : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, - "{\t$Rd" # kind # ", $Rn" # kind # ", #" # zero # - "|" # kind # "\t$Rd, $Rn, #" # zero # "}", "", - [(set (dty regtype:$Rd), (OpNode (sty regtype:$Rn)))]>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29} = U; - let Inst{28-24} = 0b01110; - let Inst{23-22} = size; - let Inst{21-17} = 0b10000; - let Inst{16-12} = opcode; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -// Comparisons support all element sizes, except 1xD. -multiclass SIMDCmpTwoVector opc, string asm, - SDNode OpNode> { - def v8i8rz : BaseSIMDCmpTwoVector<0, U, 0b00, opc, V64, - asm, ".8b", "0", - v8i8, v8i8, OpNode>; - def v16i8rz : BaseSIMDCmpTwoVector<1, U, 0b00, opc, V128, - asm, ".16b", "0", - v16i8, v16i8, OpNode>; - def v4i16rz : BaseSIMDCmpTwoVector<0, U, 0b01, opc, V64, - asm, ".4h", "0", - v4i16, v4i16, OpNode>; - def v8i16rz : BaseSIMDCmpTwoVector<1, U, 0b01, opc, V128, - asm, ".8h", "0", - v8i16, v8i16, OpNode>; - def v2i32rz : BaseSIMDCmpTwoVector<0, U, 0b10, opc, V64, - asm, ".2s", "0", - v2i32, v2i32, OpNode>; - def v4i32rz : BaseSIMDCmpTwoVector<1, U, 0b10, opc, V128, - asm, ".4s", "0", - v4i32, v4i32, OpNode>; - def v2i64rz : BaseSIMDCmpTwoVector<1, U, 0b11, opc, V128, - asm, ".2d", "0", - v2i64, v2i64, OpNode>; -} - -// FP Comparisons support only S and D element sizes. 
-multiclass SIMDFPCmpTwoVector opc, - string asm, SDNode OpNode> { - - def v2i32rz : BaseSIMDCmpTwoVector<0, U, {S,0}, opc, V64, - asm, ".2s", "0.0", - v2i32, v2f32, OpNode>; - def v4i32rz : BaseSIMDCmpTwoVector<1, U, {S,0}, opc, V128, - asm, ".4s", "0.0", - v4i32, v4f32, OpNode>; - def v2i64rz : BaseSIMDCmpTwoVector<1, U, {S,1}, opc, V128, - asm, ".2d", "0.0", - v2i64, v2f64, OpNode>; - - def : InstAlias(NAME # v2i32rz) V64:$Vd, V64:$Vn), 0>; - def : InstAlias(NAME # v4i32rz) V128:$Vd, V128:$Vn), 0>; - def : InstAlias(NAME # v2i64rz) V128:$Vd, V128:$Vn), 0>; - def : InstAlias(NAME # v2i32rz) V64:$Vd, V64:$Vn), 0>; - def : InstAlias(NAME # v4i32rz) V128:$Vd, V128:$Vn), 0>; - def : InstAlias(NAME # v2i64rz) V128:$Vd, V128:$Vn), 0>; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDFPCvtTwoVector size, bits<5> opcode, - RegisterOperand outtype, RegisterOperand intype, - string asm, string VdTy, string VnTy, - list pattern> - : I<(outs outtype:$Rd), (ins intype:$Rn), asm, - !strconcat("\t$Rd", VdTy, ", $Rn", VnTy), "", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29} = U; - let Inst{28-24} = 0b01110; - let Inst{23-22} = size; - let Inst{21-17} = 0b10000; - let Inst{16-12} = opcode; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -class BaseSIMDFPCvtTwoVectorTied size, bits<5> opcode, - RegisterOperand outtype, RegisterOperand intype, - string asm, string VdTy, string VnTy, - list pattern> - : I<(outs outtype:$dst), (ins outtype:$Rd, intype:$Rn), asm, - !strconcat("\t$Rd", VdTy, ", $Rn", VnTy), "$Rd = $dst", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29} = U; - let Inst{28-24} = 0b01110; - let Inst{23-22} = size; - let Inst{21-17} = 0b10000; - let Inst{16-12} = opcode; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass SIMDFPWidenTwoVector opc, string asm> { - def v4i16 : BaseSIMDFPCvtTwoVector<0, U, {S,0}, opc, V128, V64, - asm, ".4s", ".4h", []>; - def v8i16 : BaseSIMDFPCvtTwoVector<1, U, {S,0}, opc, V128, V128, - asm#"2", ".4s", ".8h", []>; - def v2i32 : BaseSIMDFPCvtTwoVector<0, U, {S,1}, opc, V128, V64, - asm, ".2d", ".2s", []>; - def v4i32 : BaseSIMDFPCvtTwoVector<1, U, {S,1}, opc, V128, V128, - asm#"2", ".2d", ".4s", []>; -} - -multiclass SIMDFPNarrowTwoVector opc, string asm> { - def v4i16 : BaseSIMDFPCvtTwoVector<0, U, {S,0}, opc, V64, V128, - asm, ".4h", ".4s", []>; - def v8i16 : BaseSIMDFPCvtTwoVectorTied<1, U, {S,0}, opc, V128, V128, - asm#"2", ".8h", ".4s", []>; - def v2i32 : BaseSIMDFPCvtTwoVector<0, U, {S,1}, opc, V64, V128, - asm, ".2s", ".2d", []>; - def v4i32 : BaseSIMDFPCvtTwoVectorTied<1, U, {S,1}, opc, V128, V128, - asm#"2", ".4s", ".2d", []>; -} - -multiclass SIMDFPInexactCvtTwoVector opc, string asm, - Intrinsic OpNode> { - def v2f32 : BaseSIMDFPCvtTwoVector<0, U, {S,1}, opc, V64, V128, - asm, ".2s", ".2d", - [(set (v2f32 V64:$Rd), (OpNode (v2f64 V128:$Rn)))]>; - def v4f32 : BaseSIMDFPCvtTwoVectorTied<1, U, {S,1}, opc, V128, V128, - asm#"2", ".4s", ".2d", []>; - - def : Pat<(concat_vectors (v2f32 V64:$Rd), (OpNode (v2f64 V128:$Rn))), - (!cast(NAME # "v4f32") - (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; -} - -//---------------------------------------------------------------------------- -// AdvSIMD three register different-size vector instructions. 
-//---------------------------------------------------------------------------- - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDDifferentThreeVector size, bits<4> opcode, - RegisterOperand outtype, RegisterOperand intype1, - RegisterOperand intype2, string asm, - string outkind, string inkind1, string inkind2, - list pattern> - : I<(outs outtype:$Rd), (ins intype1:$Rn, intype2:$Rm), asm, - "{\t$Rd" # outkind # ", $Rn" # inkind1 # ", $Rm" # inkind2 # - "|" # outkind # "\t$Rd, $Rn, $Rm}", "", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - let Inst{31} = 0; - let Inst{30} = size{0}; - let Inst{29} = U; - let Inst{28-24} = 0b01110; - let Inst{23-22} = size{2-1}; - let Inst{21} = 1; - let Inst{20-16} = Rm; - let Inst{15-12} = opcode; - let Inst{11-10} = 0b00; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDDifferentThreeVectorTied size, bits<4> opcode, - RegisterOperand outtype, RegisterOperand intype1, - RegisterOperand intype2, string asm, - string outkind, string inkind1, string inkind2, - list pattern> - : I<(outs outtype:$dst), (ins outtype:$Rd, intype1:$Rn, intype2:$Rm), asm, - "{\t$Rd" # outkind # ", $Rn" # inkind1 # ", $Rm" # inkind2 # - "|" # outkind # "\t$Rd, $Rn, $Rm}", "$Rd = $dst", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - let Inst{31} = 0; - let Inst{30} = size{0}; - let Inst{29} = U; - let Inst{28-24} = 0b01110; - let Inst{23-22} = size{2-1}; - let Inst{21} = 1; - let Inst{20-16} = Rm; - let Inst{15-12} = opcode; - let Inst{11-10} = 0b00; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -// FIXME: TableGen doesn't know how to deal with expanded types that also -// change the element count (in this case, placing the results in -// the high elements of the result register rather than the low -// elements). Until that's fixed, we can't code-gen those. -multiclass SIMDNarrowThreeVectorBHS opc, string asm, - Intrinsic IntOp> { - def v8i16_v8i8 : BaseSIMDDifferentThreeVector; - def v8i16_v16i8 : BaseSIMDDifferentThreeVectorTied; - def v4i32_v4i16 : BaseSIMDDifferentThreeVector; - def v4i32_v8i16 : BaseSIMDDifferentThreeVectorTied; - def v2i64_v2i32 : BaseSIMDDifferentThreeVector; - def v2i64_v4i32 : BaseSIMDDifferentThreeVectorTied; - - - // Patterns for the '2' variants involve INSERT_SUBREG, which you can't put in - // a version attached to an instruction. 
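A Python sketch (not part of the patch; the helper name is hypothetical) of the field packing spelled out by BaseSIMDDifferentThreeVector above. Note that the three-bit size parameter is split, with size{0} landing in bit 30, the position the other base classes in this file use for Q:

def encode_different_three_vector(U, size, opcode, Rm, Rn, Rd):
    """Pack the fields as listed in BaseSIMDDifferentThreeVector."""
    assert 0 <= size < 8 and 0 <= opcode < 16
    assert all(0 <= r < 32 for r in (Rm, Rn, Rd))
    word = 0                        # Inst{31} = 0
    word |= (size & 1)  << 30       # Inst{30} = size{0}
    word |= U           << 29       # Inst{29}
    word |= 0b01110     << 24       # Inst{28-24}
    word |= (size >> 1) << 22       # Inst{23-22} = size{2-1}
    word |= 1           << 21       # Inst{21}
    word |= Rm          << 16       # Inst{20-16}
    word |= opcode      << 12       # Inst{15-12}
    # Inst{11-10} = 0b00
    word |= Rn          << 5        # Inst{9-5}
    word |= Rd                      # Inst{4-0}
    return word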
- def : Pat<(concat_vectors (v8i8 V64:$Rd), (IntOp (v8i16 V128:$Rn), - (v8i16 V128:$Rm))), - (!cast(NAME # "v8i16_v16i8") - (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), - V128:$Rn, V128:$Rm)>; - def : Pat<(concat_vectors (v4i16 V64:$Rd), (IntOp (v4i32 V128:$Rn), - (v4i32 V128:$Rm))), - (!cast(NAME # "v4i32_v8i16") - (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), - V128:$Rn, V128:$Rm)>; - def : Pat<(concat_vectors (v2i32 V64:$Rd), (IntOp (v2i64 V128:$Rn), - (v2i64 V128:$Rm))), - (!cast(NAME # "v2i64_v4i32") - (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), - V128:$Rn, V128:$Rm)>; -} - -multiclass SIMDDifferentThreeVectorBD opc, string asm, - Intrinsic IntOp> { - def v8i8 : BaseSIMDDifferentThreeVector; - def v16i8 : BaseSIMDDifferentThreeVector; - let Predicates = [HasCrypto] in { - def v1i64 : BaseSIMDDifferentThreeVector; - def v2i64 : BaseSIMDDifferentThreeVector; - } - - def : Pat<(v8i16 (IntOp (v8i8 (extract_high_v16i8 V128:$Rn)), - (v8i8 (extract_high_v16i8 V128:$Rm)))), - (!cast(NAME#"v16i8") V128:$Rn, V128:$Rm)>; -} - -multiclass SIMDLongThreeVectorHS opc, string asm, - SDPatternOperator OpNode> { - def v4i16_v4i32 : BaseSIMDDifferentThreeVector; - def v8i16_v4i32 : BaseSIMDDifferentThreeVector; - def v2i32_v2i64 : BaseSIMDDifferentThreeVector; - def v4i32_v2i64 : BaseSIMDDifferentThreeVector; -} - -multiclass SIMDLongThreeVectorBHSabdl opc, string asm, - SDPatternOperator OpNode = null_frag> { - def v8i8_v8i16 : BaseSIMDDifferentThreeVector; - def v16i8_v8i16 : BaseSIMDDifferentThreeVector; - def v4i16_v4i32 : BaseSIMDDifferentThreeVector; - def v8i16_v4i32 : BaseSIMDDifferentThreeVector; - def v2i32_v2i64 : BaseSIMDDifferentThreeVector; - def v4i32_v2i64 : BaseSIMDDifferentThreeVector; -} - -multiclass SIMDLongThreeVectorTiedBHSabal opc, - string asm, - SDPatternOperator OpNode> { - def v8i8_v8i16 : BaseSIMDDifferentThreeVectorTied; - def v16i8_v8i16 : BaseSIMDDifferentThreeVectorTied; - def v4i16_v4i32 : BaseSIMDDifferentThreeVectorTied; - def v8i16_v4i32 : BaseSIMDDifferentThreeVectorTied; - def v2i32_v2i64 : BaseSIMDDifferentThreeVectorTied; - def v4i32_v2i64 : BaseSIMDDifferentThreeVectorTied; -} - -multiclass SIMDLongThreeVectorBHS opc, string asm, - SDPatternOperator OpNode = null_frag> { - def v8i8_v8i16 : BaseSIMDDifferentThreeVector; - def v16i8_v8i16 : BaseSIMDDifferentThreeVector; - def v4i16_v4i32 : BaseSIMDDifferentThreeVector; - def v8i16_v4i32 : BaseSIMDDifferentThreeVector; - def v2i32_v2i64 : BaseSIMDDifferentThreeVector; - def v4i32_v2i64 : BaseSIMDDifferentThreeVector; -} - -multiclass SIMDLongThreeVectorTiedBHS opc, - string asm, - SDPatternOperator OpNode> { - def v8i8_v8i16 : BaseSIMDDifferentThreeVectorTied; - def v16i8_v8i16 : BaseSIMDDifferentThreeVectorTied; - def v4i16_v4i32 : BaseSIMDDifferentThreeVectorTied; - def v8i16_v4i32 : BaseSIMDDifferentThreeVectorTied; - def v2i32_v2i64 : BaseSIMDDifferentThreeVectorTied; - def v4i32_v2i64 : BaseSIMDDifferentThreeVectorTied; -} - -multiclass SIMDLongThreeVectorSQDMLXTiedHS opc, string asm, - SDPatternOperator Accum> { - def v4i16_v4i32 : BaseSIMDDifferentThreeVectorTied; - def v8i16_v4i32 : BaseSIMDDifferentThreeVectorTied; - def v2i32_v2i64 : BaseSIMDDifferentThreeVectorTied; - def v4i32_v2i64 : BaseSIMDDifferentThreeVectorTied; -} - -multiclass SIMDWideThreeVectorBHS opc, string asm, - SDPatternOperator OpNode> { - def v8i8_v8i16 : BaseSIMDDifferentThreeVector; - def v16i8_v8i16 : BaseSIMDDifferentThreeVector; - def v4i16_v4i32 : BaseSIMDDifferentThreeVector; - def v8i16_v4i32 : 
BaseSIMDDifferentThreeVector; - def v2i32_v2i64 : BaseSIMDDifferentThreeVector; - def v4i32_v2i64 : BaseSIMDDifferentThreeVector; -} - -//---------------------------------------------------------------------------- -// AdvSIMD bitwise extract from vector -//---------------------------------------------------------------------------- - -class BaseSIMDBitwiseExtract - : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, i32imm:$imm), asm, - "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind # ", $imm" # - "|" # kind # "\t$Rd, $Rn, $Rm, $imm}", "", - [(set (vty regtype:$Rd), - (ARM64ext regtype:$Rn, regtype:$Rm, (i32 imm:$imm)))]>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - bits<4> imm; - let Inst{31} = 0; - let Inst{30} = size; - let Inst{29-21} = 0b101110000; - let Inst{20-16} = Rm; - let Inst{15} = 0; - let Inst{14-11} = imm; - let Inst{10} = 0; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - - -multiclass SIMDBitwiseExtract { - def v8i8 : BaseSIMDBitwiseExtract<0, V64, v8i8, asm, ".8b"> { - let imm{3} = 0; - } - def v16i8 : BaseSIMDBitwiseExtract<1, V128, v16i8, asm, ".16b">; -} - -//---------------------------------------------------------------------------- -// AdvSIMD zip vector -//---------------------------------------------------------------------------- - -class BaseSIMDZipVector size, bits<3> opc, RegisterOperand regtype, - string asm, string kind, SDNode OpNode, ValueType valty> - : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm, - "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind # - "|" # kind # "\t$Rd, $Rn, $Rm}", "", - [(set (valty regtype:$Rd), (OpNode regtype:$Rn, regtype:$Rm))]>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - let Inst{31} = 0; - let Inst{30} = size{0}; - let Inst{29-24} = 0b001110; - let Inst{23-22} = size{2-1}; - let Inst{21} = 0; - let Inst{20-16} = Rm; - let Inst{15} = 0; - let Inst{14-12} = opc; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass SIMDZipVectoropc, string asm, - SDNode OpNode> { - def v8i8 : BaseSIMDZipVector<0b000, opc, V64, - asm, ".8b", OpNode, v8i8>; - def v16i8 : BaseSIMDZipVector<0b001, opc, V128, - asm, ".16b", OpNode, v16i8>; - def v4i16 : BaseSIMDZipVector<0b010, opc, V64, - asm, ".4h", OpNode, v4i16>; - def v8i16 : BaseSIMDZipVector<0b011, opc, V128, - asm, ".8h", OpNode, v8i16>; - def v2i32 : BaseSIMDZipVector<0b100, opc, V64, - asm, ".2s", OpNode, v2i32>; - def v4i32 : BaseSIMDZipVector<0b101, opc, V128, - asm, ".4s", OpNode, v4i32>; - def v2i64 : BaseSIMDZipVector<0b111, opc, V128, - asm, ".2d", OpNode, v2i64>; - - def : Pat<(v2f32 (OpNode V64:$Rn, V64:$Rm)), - (!cast(NAME#"v2i32") V64:$Rn, V64:$Rm)>; - def : Pat<(v4f32 (OpNode V128:$Rn, V128:$Rm)), - (!cast(NAME#"v4i32") V128:$Rn, V128:$Rm)>; - def : Pat<(v2f64 (OpNode V128:$Rn, V128:$Rm)), - (!cast(NAME#"v2i64") V128:$Rn, V128:$Rm)>; -} - -//---------------------------------------------------------------------------- -// AdvSIMD three register scalar instructions -//---------------------------------------------------------------------------- - -let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in -class BaseSIMDThreeScalar size, bits<5> opcode, - RegisterClass regtype, string asm, - list pattern> - : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm, - "\t$Rd, $Rn, $Rm", "", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - let Inst{31-30} = 0b01; - let Inst{29} = U; - let Inst{28-24} = 0b11110; - let Inst{23-22} = size; - let Inst{21} = 1; - let 
Inst{20-16} = Rm; - let Inst{15-11} = opcode; - let Inst{10} = 1; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass SIMDThreeScalarD opc, string asm, - SDPatternOperator OpNode> { - def v1i64 : BaseSIMDThreeScalar; -} - -multiclass SIMDThreeScalarBHSD opc, string asm, - SDPatternOperator OpNode> { - def v1i64 : BaseSIMDThreeScalar; - def v1i32 : BaseSIMDThreeScalar; - def v1i16 : BaseSIMDThreeScalar; - def v1i8 : BaseSIMDThreeScalar; - - def : Pat<(i64 (OpNode (i64 FPR64:$Rn), (i64 FPR64:$Rm))), - (!cast(NAME#"v1i64") FPR64:$Rn, FPR64:$Rm)>; - def : Pat<(i32 (OpNode (i32 FPR32:$Rn), (i32 FPR32:$Rm))), - (!cast(NAME#"v1i32") FPR32:$Rn, FPR32:$Rm)>; -} - -multiclass SIMDThreeScalarHS opc, string asm, - SDPatternOperator OpNode> { - def v1i32 : BaseSIMDThreeScalar; - def v1i16 : BaseSIMDThreeScalar; -} - -multiclass SIMDThreeScalarSD opc, string asm, - SDPatternOperator OpNode = null_frag> { - let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { - def #NAME#64 : BaseSIMDThreeScalar; - def #NAME#32 : BaseSIMDThreeScalar; - } - - def : Pat<(v1f64 (OpNode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), - (!cast(NAME # "64") FPR64:$Rn, FPR64:$Rm)>; -} - -multiclass SIMDThreeScalarFPCmp opc, string asm, - SDPatternOperator OpNode = null_frag> { - let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { - def #NAME#64 : BaseSIMDThreeScalar; - def #NAME#32 : BaseSIMDThreeScalar; - } - - def : Pat<(v1i64 (OpNode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), - (!cast(NAME # "64") FPR64:$Rn, FPR64:$Rm)>; -} - -class BaseSIMDThreeScalarMixed size, bits<5> opcode, - dag oops, dag iops, string asm, string cstr, list pat> - : I, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - let Inst{31-30} = 0b01; - let Inst{29} = U; - let Inst{28-24} = 0b11110; - let Inst{23-22} = size; - let Inst{21} = 1; - let Inst{20-16} = Rm; - let Inst{15-11} = opcode; - let Inst{10} = 0; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -multiclass SIMDThreeScalarMixedHS opc, string asm, - SDPatternOperator OpNode = null_frag> { - def i16 : BaseSIMDThreeScalarMixed; - def i32 : BaseSIMDThreeScalarMixed; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -multiclass SIMDThreeScalarMixedTiedHS opc, string asm, - SDPatternOperator OpNode = null_frag> { - def i16 : BaseSIMDThreeScalarMixed; - def i32 : BaseSIMDThreeScalarMixed; -} - -//---------------------------------------------------------------------------- -// AdvSIMD two register scalar instructions -//---------------------------------------------------------------------------- - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDTwoScalar size, bits<5> opcode, - RegisterClass regtype, RegisterClass regtype2, - string asm, list pat> - : I<(outs regtype:$Rd), (ins regtype2:$Rn), asm, - "\t$Rd, $Rn", "", pat>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31-30} = 0b01; - let Inst{29} = U; - let Inst{28-24} = 0b11110; - let Inst{23-22} = size; - let Inst{21-17} = 0b10000; - let Inst{16-12} = opcode; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDTwoScalarTied size, bits<5> opcode, - RegisterClass regtype, RegisterClass regtype2, - string asm, list pat> - : I<(outs regtype:$dst), (ins regtype:$Rd, regtype2:$Rn), asm, - "\t$Rd, $Rn", "$Rd = $dst", pat>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31-30} = 0b01; - let Inst{29} = U; - let Inst{28-24} = 0b11110; - let 
Inst{23-22} = size; - let Inst{21-17} = 0b10000; - let Inst{16-12} = opcode; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDCmpTwoScalar size, bits<5> opcode, - RegisterClass regtype, string asm, string zero> - : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, - "\t$Rd, $Rn, #" # zero, "", []>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31-30} = 0b01; - let Inst{29} = U; - let Inst{28-24} = 0b11110; - let Inst{23-22} = size; - let Inst{21-17} = 0b10000; - let Inst{16-12} = opcode; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -class SIMDInexactCvtTwoScalar opcode, string asm> - : I<(outs FPR32:$Rd), (ins FPR64:$Rn), asm, "\t$Rd, $Rn", "", - [(set (f32 FPR32:$Rd), (int_arm64_sisd_fcvtxn (f64 FPR64:$Rn)))]>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31-17} = 0b011111100110000; - let Inst{16-12} = opcode; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass SIMDCmpTwoScalarD opc, string asm, - SDPatternOperator OpNode> { - def v1i64rz : BaseSIMDCmpTwoScalar; - - def : Pat<(v1i64 (OpNode FPR64:$Rn)), - (!cast(NAME # v1i64rz) FPR64:$Rn)>; -} - -multiclass SIMDCmpTwoScalarSD opc, string asm, - SDPatternOperator OpNode> { - def v1i64rz : BaseSIMDCmpTwoScalar; - def v1i32rz : BaseSIMDCmpTwoScalar; - - def : InstAlias(NAME # v1i64rz) FPR64:$Rd, FPR64:$Rn), 0>; - def : InstAlias(NAME # v1i32rz) FPR32:$Rd, FPR32:$Rn), 0>; - - def : Pat<(v1i64 (OpNode (v1f64 FPR64:$Rn))), - (!cast(NAME # v1i64rz) FPR64:$Rn)>; -} - -multiclass SIMDTwoScalarD opc, string asm, - SDPatternOperator OpNode = null_frag> { - def v1i64 : BaseSIMDTwoScalar; - - def : Pat<(i64 (OpNode (i64 FPR64:$Rn))), - (!cast(NAME # "v1i64") FPR64:$Rn)>; -} - -multiclass SIMDTwoScalarSD opc, string asm> { - def v1i64 : BaseSIMDTwoScalar; - def v1i32 : BaseSIMDTwoScalar; -} - -multiclass SIMDTwoScalarCVTSD opc, string asm, - SDPatternOperator OpNode> { - def v1i64 : BaseSIMDTwoScalar; - def v1i32 : BaseSIMDTwoScalar; -} - -multiclass SIMDTwoScalarBHSD opc, string asm, - SDPatternOperator OpNode = null_frag> { - let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { - def v1i64 : BaseSIMDTwoScalar; - def v1i32 : BaseSIMDTwoScalar; - def v1i16 : BaseSIMDTwoScalar; - def v1i8 : BaseSIMDTwoScalar; - } - - def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rn))), - (!cast(NAME # v1i64) FPR64:$Rn)>; -} - -multiclass SIMDTwoScalarBHSDTied opc, string asm, - Intrinsic OpNode> { - let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { - def v1i64 : BaseSIMDTwoScalarTied; - def v1i32 : BaseSIMDTwoScalarTied; - def v1i16 : BaseSIMDTwoScalarTied; - def v1i8 : BaseSIMDTwoScalarTied; - } - - def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn))), - (!cast(NAME # v1i64) FPR64:$Rd, FPR64:$Rn)>; -} - - - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -multiclass SIMDTwoScalarMixedBHS opc, string asm, - SDPatternOperator OpNode = null_frag> { - def v1i32 : BaseSIMDTwoScalar; - def v1i16 : BaseSIMDTwoScalar; - def v1i8 : BaseSIMDTwoScalar; -} - -//---------------------------------------------------------------------------- -// AdvSIMD scalar pairwise instructions -//---------------------------------------------------------------------------- - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDPairwiseScalar size, bits<5> opcode, - RegisterOperand regtype, RegisterOperand vectype, - string asm, string kind> - : I<(outs regtype:$Rd), (ins vectype:$Rn), 
asm, - "{\t$Rd, $Rn" # kind # "|" # kind # "\t$Rd, $Rn}", "", []>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31-30} = 0b01; - let Inst{29} = U; - let Inst{28-24} = 0b11110; - let Inst{23-22} = size; - let Inst{21-17} = 0b11000; - let Inst{16-12} = opcode; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass SIMDPairwiseScalarD opc, string asm> { - def v2i64p : BaseSIMDPairwiseScalar; -} - -multiclass SIMDPairwiseScalarSD opc, string asm> { - def v2i32p : BaseSIMDPairwiseScalar; - def v2i64p : BaseSIMDPairwiseScalar; -} - -//---------------------------------------------------------------------------- -// AdvSIMD across lanes instructions -//---------------------------------------------------------------------------- - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDAcrossLanes size, bits<5> opcode, - RegisterClass regtype, RegisterOperand vectype, - string asm, string kind, list pattern> - : I<(outs regtype:$Rd), (ins vectype:$Rn), asm, - "{\t$Rd, $Rn" # kind # "|" # kind # "\t$Rd, $Rn}", "", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29} = U; - let Inst{28-24} = 0b01110; - let Inst{23-22} = size; - let Inst{21-17} = 0b11000; - let Inst{16-12} = opcode; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass SIMDAcrossLanesBHS opcode, - string asm> { - def v8i8v : BaseSIMDAcrossLanes<0, U, 0b00, opcode, FPR8, V64, - asm, ".8b", []>; - def v16i8v : BaseSIMDAcrossLanes<1, U, 0b00, opcode, FPR8, V128, - asm, ".16b", []>; - def v4i16v : BaseSIMDAcrossLanes<0, U, 0b01, opcode, FPR16, V64, - asm, ".4h", []>; - def v8i16v : BaseSIMDAcrossLanes<1, U, 0b01, opcode, FPR16, V128, - asm, ".8h", []>; - def v4i32v : BaseSIMDAcrossLanes<1, U, 0b10, opcode, FPR32, V128, - asm, ".4s", []>; -} - -multiclass SIMDAcrossLanesHSD opcode, string asm> { - def v8i8v : BaseSIMDAcrossLanes<0, U, 0b00, opcode, FPR16, V64, - asm, ".8b", []>; - def v16i8v : BaseSIMDAcrossLanes<1, U, 0b00, opcode, FPR16, V128, - asm, ".16b", []>; - def v4i16v : BaseSIMDAcrossLanes<0, U, 0b01, opcode, FPR32, V64, - asm, ".4h", []>; - def v8i16v : BaseSIMDAcrossLanes<1, U, 0b01, opcode, FPR32, V128, - asm, ".8h", []>; - def v4i32v : BaseSIMDAcrossLanes<1, U, 0b10, opcode, FPR64, V128, - asm, ".4s", []>; -} - -multiclass SIMDAcrossLanesS opcode, bit sz1, string asm, - Intrinsic intOp> { - def v4i32v : BaseSIMDAcrossLanes<1, 1, {sz1, 0}, opcode, FPR32, V128, - asm, ".4s", - [(set FPR32:$Rd, (intOp (v4f32 V128:$Rn)))]>; -} - -//---------------------------------------------------------------------------- -// AdvSIMD INS/DUP instructions -//---------------------------------------------------------------------------- - -// FIXME: There has got to be a better way to factor these. ugh. 
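The per-size index placements written out case by case in the DUP, SMOV/UMOV and INS definitions below all follow one imm5 scheme in bits 20-16: a single set bit marks the element size, and the lane index sits above it. A Python sketch (not part of the patch; ins_dup_imm5 is a hypothetical helper):

def ins_dup_imm5(size_bytes, index):
    """imm5 for a lane of the given element size (1/2/4/8 bytes) and index."""
    shift = {1: 0, 2: 1, 4: 2, 8: 3}[size_bytes]
    assert 0 <= index < (16 >> shift)
    return (index << (shift + 1)) | (1 << shift)

assert ins_dup_imm5(1, 5) == 0b01011   # .b[5]: idx in bits 20-17, bit 16 = 1
assert ins_dup_imm5(4, 2) == 0b10100   # .s[2]: idx in bits 20-19, bits 18-16 = 0b100
assert ins_dup_imm5(8, 1) == 0b11000   # .d[1]: bit 20 = idx, bits 19-16 = 0b1000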
- -class BaseSIMDInsDup pattern> - : I, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29} = op; - let Inst{28-21} = 0b01110000; - let Inst{15} = 0; - let Inst{10} = 1; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -class SIMDDupFromMain imm5, string size, ValueType vectype, - RegisterOperand vecreg, RegisterClass regtype> - : BaseSIMDInsDup { - let Inst{20-16} = imm5; - let Inst{14-11} = 0b0001; -} - -class SIMDDupFromElement - : BaseSIMDInsDup { - let Inst{14-11} = 0b0000; -} - -class SIMDDup64FromElement - : SIMDDupFromElement<1, ".2d", ".d", v2i64, v2i64, V128, - VectorIndexD, i64, ARM64duplane64> { - bits<1> idx; - let Inst{20} = idx; - let Inst{19-16} = 0b1000; -} - -class SIMDDup32FromElement - : SIMDDupFromElement { - bits<2> idx; - let Inst{20-19} = idx; - let Inst{18-16} = 0b100; -} - -class SIMDDup16FromElement - : SIMDDupFromElement { - bits<3> idx; - let Inst{20-18} = idx; - let Inst{17-16} = 0b10; -} - -class SIMDDup8FromElement - : SIMDDupFromElement { - bits<4> idx; - let Inst{20-17} = idx; - let Inst{16} = 1; -} - -class BaseSIMDMov imm4, RegisterClass regtype, - Operand idxtype, string asm, list pattern> - : BaseSIMDInsDup { - let Inst{14-11} = imm4; -} - -class SIMDSMov - : BaseSIMDMov; -class SIMDUMov - : BaseSIMDMov; - -class SIMDMovAlias - : InstAlias; - -multiclass SMov { - def vi8to32 : SIMDSMov<0, ".b", GPR32, VectorIndexB> { - bits<4> idx; - let Inst{20-17} = idx; - let Inst{16} = 1; - } - def vi8to64 : SIMDSMov<1, ".b", GPR64, VectorIndexB> { - bits<4> idx; - let Inst{20-17} = idx; - let Inst{16} = 1; - } - def vi16to32 : SIMDSMov<0, ".h", GPR32, VectorIndexH> { - bits<3> idx; - let Inst{20-18} = idx; - let Inst{17-16} = 0b10; - } - def vi16to64 : SIMDSMov<1, ".h", GPR64, VectorIndexH> { - bits<3> idx; - let Inst{20-18} = idx; - let Inst{17-16} = 0b10; - } - def vi32to64 : SIMDSMov<1, ".s", GPR64, VectorIndexS> { - bits<2> idx; - let Inst{20-19} = idx; - let Inst{18-16} = 0b100; - } -} - -multiclass UMov { - def vi8 : SIMDUMov<0, ".b", v16i8, GPR32, VectorIndexB> { - bits<4> idx; - let Inst{20-17} = idx; - let Inst{16} = 1; - } - def vi16 : SIMDUMov<0, ".h", v8i16, GPR32, VectorIndexH> { - bits<3> idx; - let Inst{20-18} = idx; - let Inst{17-16} = 0b10; - } - def vi32 : SIMDUMov<0, ".s", v4i32, GPR32, VectorIndexS> { - bits<2> idx; - let Inst{20-19} = idx; - let Inst{18-16} = 0b100; - } - def vi64 : SIMDUMov<1, ".d", v2i64, GPR64, VectorIndexD> { - bits<1> idx; - let Inst{20} = idx; - let Inst{19-16} = 0b1000; - } - def : SIMDMovAlias<"mov", ".s", - !cast(NAME#"vi32"), - GPR32, VectorIndexS>; - def : SIMDMovAlias<"mov", ".d", - !cast(NAME#"vi64"), - GPR64, VectorIndexD>; -} - -class SIMDInsFromMain - : BaseSIMDInsDup<1, 0, (outs V128:$dst), - (ins V128:$Rd, idxtype:$idx, regtype:$Rn), "ins", - "{\t$Rd" # size # "$idx, $Rn" # - "|" # size # "\t$Rd$idx, $Rn}", - "$Rd = $dst", - [(set V128:$dst, - (vector_insert (vectype V128:$Rd), regtype:$Rn, idxtype:$idx))]> { - let Inst{14-11} = 0b0011; -} - -class SIMDInsFromElement - : BaseSIMDInsDup<1, 1, (outs V128:$dst), - (ins V128:$Rd, idxtype:$idx, V128:$Rn, idxtype:$idx2), "ins", - "{\t$Rd" # size # "$idx, $Rn" # size # "$idx2" # - "|" # size # "\t$Rd$idx, $Rn$idx2}", - "$Rd = $dst", - [(set V128:$dst, - (vector_insert - (vectype V128:$Rd), - (elttype (vector_extract (vectype V128:$Rn), idxtype:$idx2)), - idxtype:$idx))]>; - -class SIMDInsMainMovAlias - : InstAlias<"mov" # "{\t$dst" # size # "$idx, $src" # - "|" # size #"\t$dst$idx, $src}", - (inst V128:$dst, 
idxtype:$idx, regtype:$src)>; -class SIMDInsElementMovAlias - : InstAlias<"mov" # "{\t$dst" # size # "$idx, $src" # size # "$idx2" # - # "|" # size #" $dst$idx, $src$idx2}", - (inst V128:$dst, idxtype:$idx, V128:$src, idxtype:$idx2)>; - - -multiclass SIMDIns { - def vi8gpr : SIMDInsFromMain<".b", v16i8, GPR32, VectorIndexB> { - bits<4> idx; - let Inst{20-17} = idx; - let Inst{16} = 1; - } - def vi16gpr : SIMDInsFromMain<".h", v8i16, GPR32, VectorIndexH> { - bits<3> idx; - let Inst{20-18} = idx; - let Inst{17-16} = 0b10; - } - def vi32gpr : SIMDInsFromMain<".s", v4i32, GPR32, VectorIndexS> { - bits<2> idx; - let Inst{20-19} = idx; - let Inst{18-16} = 0b100; - } - def vi64gpr : SIMDInsFromMain<".d", v2i64, GPR64, VectorIndexD> { - bits<1> idx; - let Inst{20} = idx; - let Inst{19-16} = 0b1000; - } - - def vi8lane : SIMDInsFromElement<".b", v16i8, i32, VectorIndexB> { - bits<4> idx; - bits<4> idx2; - let Inst{20-17} = idx; - let Inst{16} = 1; - let Inst{14-11} = idx2; - } - def vi16lane : SIMDInsFromElement<".h", v8i16, i32, VectorIndexH> { - bits<3> idx; - bits<3> idx2; - let Inst{20-18} = idx; - let Inst{17-16} = 0b10; - let Inst{14-12} = idx2; - let Inst{11} = 0; - } - def vi32lane : SIMDInsFromElement<".s", v4i32, i32, VectorIndexS> { - bits<2> idx; - bits<2> idx2; - let Inst{20-19} = idx; - let Inst{18-16} = 0b100; - let Inst{14-13} = idx2; - let Inst{12-11} = 0; - } - def vi64lane : SIMDInsFromElement<".d", v2i64, i64, VectorIndexD> { - bits<1> idx; - bits<1> idx2; - let Inst{20} = idx; - let Inst{19-16} = 0b1000; - let Inst{14} = idx2; - let Inst{13-11} = 0; - } - - // For all forms of the INS instruction, the "mov" mnemonic is the - // preferred alias. Why they didn't just call the instruction "mov" in - // the first place is a very good question indeed... 
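For the element-to-element INS forms above, the second (source) index is packed separately into bits 14-11, shifted left by log2 of the element size with the vacated low bits zero. A Python sketch (not part of the patch; the helper is hypothetical):

def ins_elt_imm4(size_bytes, index2):
    """imm4 (bits 14-11) holding the source lane index of an INS element form."""
    shift = {1: 0, 2: 1, 4: 2, 8: 3}[size_bytes]
    assert 0 <= index2 < (16 >> shift)
    return index2 << shift

assert ins_elt_imm4(2, 3) == 0b0110   # .h[3]: idx2 in bits 14-12, bit 11 = 0
assert ins_elt_imm4(8, 1) == 0b1000   # .d[1]: idx2 in bit 14, bits 13-11 = 0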
- def : SIMDInsMainMovAlias<".b", !cast(NAME#"vi8gpr"), - GPR32, VectorIndexB>; - def : SIMDInsMainMovAlias<".h", !cast(NAME#"vi16gpr"), - GPR32, VectorIndexH>; - def : SIMDInsMainMovAlias<".s", !cast(NAME#"vi32gpr"), - GPR32, VectorIndexS>; - def : SIMDInsMainMovAlias<".d", !cast(NAME#"vi64gpr"), - GPR64, VectorIndexD>; - - def : SIMDInsElementMovAlias<".b", !cast(NAME#"vi8lane"), - VectorIndexB>; - def : SIMDInsElementMovAlias<".h", !cast(NAME#"vi16lane"), - VectorIndexH>; - def : SIMDInsElementMovAlias<".s", !cast(NAME#"vi32lane"), - VectorIndexS>; - def : SIMDInsElementMovAlias<".d", !cast(NAME#"vi64lane"), - VectorIndexD>; -} - -//---------------------------------------------------------------------------- -// AdvSIMD TBL/TBX -//---------------------------------------------------------------------------- - -let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in -class BaseSIMDTableLookup len, bit op, RegisterOperand vectype, - RegisterOperand listtype, string asm, string kind> - : I<(outs vectype:$Vd), (ins listtype:$Vn, vectype:$Vm), asm, - "\t$Vd" # kind # ", $Vn, $Vm" # kind, "", []>, - Sched<[WriteV]> { - bits<5> Vd; - bits<5> Vn; - bits<5> Vm; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29-21} = 0b001110000; - let Inst{20-16} = Vm; - let Inst{15} = 0; - let Inst{14-13} = len; - let Inst{12} = op; - let Inst{11-10} = 0b00; - let Inst{9-5} = Vn; - let Inst{4-0} = Vd; -} - -let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in -class BaseSIMDTableLookupTied len, bit op, RegisterOperand vectype, - RegisterOperand listtype, string asm, string kind> - : I<(outs vectype:$dst), (ins vectype:$Vd, listtype:$Vn, vectype:$Vm), asm, - "\t$Vd" # kind # ", $Vn, $Vm" # kind, "$Vd = $dst", []>, - Sched<[WriteV]> { - bits<5> Vd; - bits<5> Vn; - bits<5> Vm; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29-21} = 0b001110000; - let Inst{20-16} = Vm; - let Inst{15} = 0; - let Inst{14-13} = len; - let Inst{12} = op; - let Inst{11-10} = 0b00; - let Inst{9-5} = Vn; - let Inst{4-0} = Vd; -} - -class SIMDTableLookupAlias - : InstAlias; - -multiclass SIMDTableLookup { - def v8i8One : BaseSIMDTableLookup<0, 0b00, op, V64, VecListOne16b, - asm, ".8b">; - def v8i8Two : BaseSIMDTableLookup<0, 0b01, op, V64, VecListTwo16b, - asm, ".8b">; - def v8i8Three : BaseSIMDTableLookup<0, 0b10, op, V64, VecListThree16b, - asm, ".8b">; - def v8i8Four : BaseSIMDTableLookup<0, 0b11, op, V64, VecListFour16b, - asm, ".8b">; - def v16i8One : BaseSIMDTableLookup<1, 0b00, op, V128, VecListOne16b, - asm, ".16b">; - def v16i8Two : BaseSIMDTableLookup<1, 0b01, op, V128, VecListTwo16b, - asm, ".16b">; - def v16i8Three: BaseSIMDTableLookup<1, 0b10, op, V128, VecListThree16b, - asm, ".16b">; - def v16i8Four : BaseSIMDTableLookup<1, 0b11, op, V128, VecListFour16b, - asm, ".16b">; - - def : SIMDTableLookupAlias(NAME#"v8i8One"), - V64, VecListOne128>; - def : SIMDTableLookupAlias(NAME#"v8i8Two"), - V64, VecListTwo128>; - def : SIMDTableLookupAlias(NAME#"v8i8Three"), - V64, VecListThree128>; - def : SIMDTableLookupAlias(NAME#"v8i8Four"), - V64, VecListFour128>; - def : SIMDTableLookupAlias(NAME#"v16i8One"), - V128, VecListOne128>; - def : SIMDTableLookupAlias(NAME#"v16i8Two"), - V128, VecListTwo128>; - def : SIMDTableLookupAlias(NAME#"v16i8Three"), - V128, VecListThree128>; - def : SIMDTableLookupAlias(NAME#"v16i8Four"), - V128, VecListFour128>; -} - -multiclass SIMDTableLookupTied { - def v8i8One : BaseSIMDTableLookupTied<0, 0b00, op, V64, VecListOne16b, - asm, ".8b">; - def v8i8Two : BaseSIMDTableLookupTied<0, 0b01, op, V64, 
VecListTwo16b, - asm, ".8b">; - def v8i8Three : BaseSIMDTableLookupTied<0, 0b10, op, V64, VecListThree16b, - asm, ".8b">; - def v8i8Four : BaseSIMDTableLookupTied<0, 0b11, op, V64, VecListFour16b, - asm, ".8b">; - def v16i8One : BaseSIMDTableLookupTied<1, 0b00, op, V128, VecListOne16b, - asm, ".16b">; - def v16i8Two : BaseSIMDTableLookupTied<1, 0b01, op, V128, VecListTwo16b, - asm, ".16b">; - def v16i8Three: BaseSIMDTableLookupTied<1, 0b10, op, V128, VecListThree16b, - asm, ".16b">; - def v16i8Four : BaseSIMDTableLookupTied<1, 0b11, op, V128, VecListFour16b, - asm, ".16b">; - - def : SIMDTableLookupAlias(NAME#"v8i8One"), - V64, VecListOne128>; - def : SIMDTableLookupAlias(NAME#"v8i8Two"), - V64, VecListTwo128>; - def : SIMDTableLookupAlias(NAME#"v8i8Three"), - V64, VecListThree128>; - def : SIMDTableLookupAlias(NAME#"v8i8Four"), - V64, VecListFour128>; - def : SIMDTableLookupAlias(NAME#"v16i8One"), - V128, VecListOne128>; - def : SIMDTableLookupAlias(NAME#"v16i8Two"), - V128, VecListTwo128>; - def : SIMDTableLookupAlias(NAME#"v16i8Three"), - V128, VecListThree128>; - def : SIMDTableLookupAlias(NAME#"v16i8Four"), - V128, VecListFour128>; -} - - -//---------------------------------------------------------------------------- -// AdvSIMD scalar CPY -//---------------------------------------------------------------------------- -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDScalarCPY - : I<(outs regtype:$dst), (ins vectype:$src, idxtype:$idx), "mov", - "{\t$dst, $src" # kind # "$idx" # - "|\t$dst, $src$idx}", "", []>, - Sched<[WriteV]> { - bits<5> dst; - bits<5> src; - let Inst{31-21} = 0b01011110000; - let Inst{15-10} = 0b000001; - let Inst{9-5} = src; - let Inst{4-0} = dst; -} - -class SIMDScalarCPYAlias - : InstAlias; - - -multiclass SIMDScalarCPY { - def i8 : BaseSIMDScalarCPY { - bits<4> idx; - let Inst{20-17} = idx; - let Inst{16} = 1; - } - def i16 : BaseSIMDScalarCPY { - bits<3> idx; - let Inst{20-18} = idx; - let Inst{17-16} = 0b10; - } - def i32 : BaseSIMDScalarCPY { - bits<2> idx; - let Inst{20-19} = idx; - let Inst{18-16} = 0b100; - } - def i64 : BaseSIMDScalarCPY { - bits<1> idx; - let Inst{20} = idx; - let Inst{19-16} = 0b1000; - } - - def : Pat<(v1i64 (scalar_to_vector (i64 (vector_extract (v2i64 V128:$src), - VectorIndexD:$idx)))), - (!cast(NAME # i64) V128:$src, VectorIndexD:$idx)>; - - // 'DUP' mnemonic aliases. 
- def : SIMDScalarCPYAlias<"dup", ".b", - !cast(NAME#"i8"), - FPR8, V128, VectorIndexB>; - def : SIMDScalarCPYAlias<"dup", ".h", - !cast(NAME#"i16"), - FPR16, V128, VectorIndexH>; - def : SIMDScalarCPYAlias<"dup", ".s", - !cast(NAME#"i32"), - FPR32, V128, VectorIndexS>; - def : SIMDScalarCPYAlias<"dup", ".d", - !cast(NAME#"i64"), - FPR64, V128, VectorIndexD>; -} - -//---------------------------------------------------------------------------- -// AdvSIMD modified immediate instructions -//---------------------------------------------------------------------------- - -class BaseSIMDModifiedImm pattern> - : I, - Sched<[WriteV]> { - bits<5> Rd; - bits<8> imm8; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29} = op; - let Inst{28-19} = 0b0111100000; - let Inst{18-16} = imm8{7-5}; - let Inst{11-10} = 0b01; - let Inst{9-5} = imm8{4-0}; - let Inst{4-0} = Rd; -} - -class BaseSIMDModifiedImmVector pattern> - : BaseSIMDModifiedImm { - let DecoderMethod = "DecodeModImmInstruction"; -} - -class BaseSIMDModifiedImmVectorTied pattern> - : BaseSIMDModifiedImm { - let DecoderMethod = "DecodeModImmTiedInstruction"; -} - -class BaseSIMDModifiedImmVectorShift b15_b12, - RegisterOperand vectype, string asm, - string kind, list pattern> - : BaseSIMDModifiedImmVector { - bits<2> shift; - let Inst{15} = b15_b12{1}; - let Inst{14-13} = shift; - let Inst{12} = b15_b12{0}; -} - -class BaseSIMDModifiedImmVectorShiftTied b15_b12, - RegisterOperand vectype, string asm, - string kind, list pattern> - : BaseSIMDModifiedImmVectorTied { - bits<2> shift; - let Inst{15} = b15_b12{1}; - let Inst{14-13} = shift; - let Inst{12} = b15_b12{0}; -} - - -class BaseSIMDModifiedImmVectorShiftHalf b15_b12, - RegisterOperand vectype, string asm, - string kind, list pattern> - : BaseSIMDModifiedImmVector { - bits<2> shift; - let Inst{15} = b15_b12{1}; - let Inst{14} = 0; - let Inst{13} = shift{0}; - let Inst{12} = b15_b12{0}; -} - -class BaseSIMDModifiedImmVectorShiftHalfTied b15_b12, - RegisterOperand vectype, string asm, - string kind, list pattern> - : BaseSIMDModifiedImmVectorTied { - bits<2> shift; - let Inst{15} = b15_b12{1}; - let Inst{14} = 0; - let Inst{13} = shift{0}; - let Inst{12} = b15_b12{0}; -} - -multiclass SIMDModifiedImmVectorShift hw_cmode, bits<2> w_cmode, - string asm> { - def v4i16 : BaseSIMDModifiedImmVectorShiftHalf<0, op, hw_cmode, V64, - asm, ".4h", []>; - def v8i16 : BaseSIMDModifiedImmVectorShiftHalf<1, op, hw_cmode, V128, - asm, ".8h", []>; - - def v2i32 : BaseSIMDModifiedImmVectorShift<0, op, w_cmode, V64, - asm, ".2s", []>; - def v4i32 : BaseSIMDModifiedImmVectorShift<1, op, w_cmode, V128, - asm, ".4s", []>; -} - -multiclass SIMDModifiedImmVectorShiftTied hw_cmode, - bits<2> w_cmode, string asm, - SDNode OpNode> { - def v4i16 : BaseSIMDModifiedImmVectorShiftHalfTied<0, op, hw_cmode, V64, - asm, ".4h", - [(set (v4i16 V64:$dst), (OpNode V64:$Rd, - imm0_255:$imm8, - (i32 imm:$shift)))]>; - def v8i16 : BaseSIMDModifiedImmVectorShiftHalfTied<1, op, hw_cmode, V128, - asm, ".8h", - [(set (v8i16 V128:$dst), (OpNode V128:$Rd, - imm0_255:$imm8, - (i32 imm:$shift)))]>; - - def v2i32 : BaseSIMDModifiedImmVectorShiftTied<0, op, w_cmode, V64, - asm, ".2s", - [(set (v2i32 V64:$dst), (OpNode V64:$Rd, - imm0_255:$imm8, - (i32 imm:$shift)))]>; - def v4i32 : BaseSIMDModifiedImmVectorShiftTied<1, op, w_cmode, V128, - asm, ".4s", - [(set (v4i32 V128:$dst), (OpNode V128:$Rd, - imm0_255:$imm8, - (i32 imm:$shift)))]>; -} - -class SIMDModifiedImmMoveMSL cmode, - RegisterOperand vectype, string asm, - string kind, list 
pattern> - : BaseSIMDModifiedImmVector { - bits<1> shift; - let Inst{15-13} = cmode{3-1}; - let Inst{12} = shift; -} - -class SIMDModifiedImmVectorNoShift cmode, - RegisterOperand vectype, - Operand imm_type, string asm, - string kind, list pattern> - : BaseSIMDModifiedImmVector { - let Inst{15-12} = cmode; -} - -class SIMDModifiedImmScalarNoShift cmode, string asm, - list pattern> - : BaseSIMDModifiedImm { - let Inst{15-12} = cmode; - let DecoderMethod = "DecodeModImmInstruction"; -} - -//---------------------------------------------------------------------------- -// AdvSIMD indexed element -//---------------------------------------------------------------------------- - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDIndexed size, bits<4> opc, - RegisterOperand dst_reg, RegisterOperand lhs_reg, - RegisterOperand rhs_reg, Operand vec_idx, string asm, - string apple_kind, string dst_kind, string lhs_kind, - string rhs_kind, list pattern> - : I<(outs dst_reg:$Rd), (ins lhs_reg:$Rn, rhs_reg:$Rm, vec_idx:$idx), - asm, - "{\t$Rd" # dst_kind # ", $Rn" # lhs_kind # ", $Rm" # rhs_kind # "$idx" # - "|" # apple_kind # "\t$Rd, $Rn, $Rm$idx}", "", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29} = U; - let Inst{28} = Scalar; - let Inst{27-24} = 0b1111; - let Inst{23-22} = size; - // Bit 21 must be set by the derived class. - let Inst{20-16} = Rm; - let Inst{15-12} = opc; - // Bit 11 must be set by the derived class. - let Inst{10} = 0; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDIndexedTied size, bits<4> opc, - RegisterOperand dst_reg, RegisterOperand lhs_reg, - RegisterOperand rhs_reg, Operand vec_idx, string asm, - string apple_kind, string dst_kind, string lhs_kind, - string rhs_kind, list pattern> - : I<(outs dst_reg:$dst), - (ins dst_reg:$Rd, lhs_reg:$Rn, rhs_reg:$Rm, vec_idx:$idx), asm, - "{\t$Rd" # dst_kind # ", $Rn" # lhs_kind # ", $Rm" # rhs_kind # "$idx" # - "|" # apple_kind # "\t$Rd, $Rn, $Rm$idx}", "$Rd = $dst", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29} = U; - let Inst{28} = Scalar; - let Inst{27-24} = 0b1111; - let Inst{23-22} = size; - // Bit 21 must be set by the derived class. - let Inst{20-16} = Rm; - let Inst{15-12} = opc; - // Bit 11 must be set by the derived class. 
- let Inst{10} = 0; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass SIMDFPIndexedSD opc, string asm, - SDPatternOperator OpNode> { - def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc, - V64, V64, - V128, VectorIndexS, - asm, ".2s", ".2s", ".2s", ".s", - [(set (v2f32 V64:$Rd), - (OpNode (v2f32 V64:$Rn), - (v2f32 (ARM64duplane32 (v4f32 V128:$Rm), VectorIndexS:$idx))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - - def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc, - V128, V128, - V128, VectorIndexS, - asm, ".4s", ".4s", ".4s", ".s", - [(set (v4f32 V128:$Rd), - (OpNode (v4f32 V128:$Rn), - (v4f32 (ARM64duplane32 (v4f32 V128:$Rm), VectorIndexS:$idx))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - - def v2i64_indexed : BaseSIMDIndexed<1, U, 0, 0b11, opc, - V128, V128, - V128, VectorIndexD, - asm, ".2d", ".2d", ".2d", ".d", - [(set (v2f64 V128:$Rd), - (OpNode (v2f64 V128:$Rn), - (v2f64 (ARM64duplane64 (v2f64 V128:$Rm), VectorIndexD:$idx))))]> { - bits<1> idx; - let Inst{11} = idx{0}; - let Inst{21} = 0; - } - - def v1i32_indexed : BaseSIMDIndexed<1, U, 1, 0b10, opc, - FPR32Op, FPR32Op, V128, VectorIndexS, - asm, ".s", "", "", ".s", - [(set (f32 FPR32Op:$Rd), - (OpNode (f32 FPR32Op:$Rn), - (f32 (vector_extract (v4f32 V128:$Rm), - VectorIndexS:$idx))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - - def v1i64_indexed : BaseSIMDIndexed<1, U, 1, 0b11, opc, - FPR64Op, FPR64Op, V128, VectorIndexD, - asm, ".d", "", "", ".d", - [(set (f64 FPR64Op:$Rd), - (OpNode (f64 FPR64Op:$Rn), - (f64 (vector_extract (v2f64 V128:$Rm), - VectorIndexD:$idx))))]> { - bits<1> idx; - let Inst{11} = idx{0}; - let Inst{21} = 0; - } -} - -multiclass SIMDFPIndexedSDTiedPatterns { - // 2 variants for the .2s version: DUPLANE from 128-bit and DUP scalar. - def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), - (ARM64duplane32 (v4f32 V128:$Rm), - VectorIndexS:$idx))), - (!cast(INST # v2i32_indexed) - V64:$Rd, V64:$Rn, V128:$Rm, VectorIndexS:$idx)>; - def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), - (ARM64dup (f32 FPR32Op:$Rm)))), - (!cast(INST # "v2i32_indexed") V64:$Rd, V64:$Rn, - (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>; - - - // 2 variants for the .4s version: DUPLANE from 128-bit and DUP scalar. - def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), - (ARM64duplane32 (v4f32 V128:$Rm), - VectorIndexS:$idx))), - (!cast(INST # "v4i32_indexed") - V128:$Rd, V128:$Rn, V128:$Rm, VectorIndexS:$idx)>; - def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), - (ARM64dup (f32 FPR32Op:$Rm)))), - (!cast(INST # "v4i32_indexed") V128:$Rd, V128:$Rn, - (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>; - - // 2 variants for the .2d version: DUPLANE from 128-bit and DUP scalar. 
- def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), - (ARM64duplane64 (v2f64 V128:$Rm), - VectorIndexD:$idx))), - (!cast(INST # "v2i64_indexed") - V128:$Rd, V128:$Rn, V128:$Rm, VectorIndexS:$idx)>; - def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), - (ARM64dup (f64 FPR64Op:$Rm)))), - (!cast(INST # "v2i64_indexed") V128:$Rd, V128:$Rn, - (SUBREG_TO_REG (i32 0), FPR64Op:$Rm, dsub), (i64 0))>; - - // 2 variants for 32-bit scalar version: extract from .2s or from .4s - def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn), - (vector_extract (v4f32 V128:$Rm), VectorIndexS:$idx))), - (!cast(INST # "v1i32_indexed") FPR32:$Rd, FPR32:$Rn, - V128:$Rm, VectorIndexS:$idx)>; - def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn), - (vector_extract (v2f32 V64:$Rm), VectorIndexS:$idx))), - (!cast(INST # "v1i32_indexed") FPR32:$Rd, FPR32:$Rn, - (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), VectorIndexS:$idx)>; - - // 1 variant for 64-bit scalar version: extract from .1d or from .2d - def : Pat<(f64 (OpNode (f64 FPR64:$Rd), (f64 FPR64:$Rn), - (vector_extract (v2f64 V128:$Rm), VectorIndexD:$idx))), - (!cast(INST # "v1i64_indexed") FPR64:$Rd, FPR64:$Rn, - V128:$Rm, VectorIndexD:$idx)>; -} - -multiclass SIMDFPIndexedSDTied opc, string asm> { - def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc, V64, V64, - V128, VectorIndexS, - asm, ".2s", ".2s", ".2s", ".s", []> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - - def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc, - V128, V128, - V128, VectorIndexS, - asm, ".4s", ".4s", ".4s", ".s", []> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - - def v2i64_indexed : BaseSIMDIndexedTied<1, U, 0, 0b11, opc, - V128, V128, - V128, VectorIndexD, - asm, ".2d", ".2d", ".2d", ".d", []> { - bits<1> idx; - let Inst{11} = idx{0}; - let Inst{21} = 0; - } - - - def v1i32_indexed : BaseSIMDIndexedTied<1, U, 1, 0b10, opc, - FPR32Op, FPR32Op, V128, VectorIndexS, - asm, ".s", "", "", ".s", []> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - - def v1i64_indexed : BaseSIMDIndexedTied<1, U, 1, 0b11, opc, - FPR64Op, FPR64Op, V128, VectorIndexD, - asm, ".d", "", "", ".d", []> { - bits<1> idx; - let Inst{11} = idx{0}; - let Inst{21} = 0; - } -} - -multiclass SIMDIndexedHS opc, string asm, - SDPatternOperator OpNode> { - def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc, V64, V64, - V128_lo, VectorIndexH, - asm, ".4h", ".4h", ".4h", ".h", - [(set (v4i16 V64:$Rd), - (OpNode (v4i16 V64:$Rn), - (v4i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { - bits<3> idx; - let Inst{11} = idx{2}; - let Inst{21} = idx{1}; - let Inst{20} = idx{0}; - } - - def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc, - V128, V128, - V128_lo, VectorIndexH, - asm, ".8h", ".8h", ".8h", ".h", - [(set (v8i16 V128:$Rd), - (OpNode (v8i16 V128:$Rn), - (v8i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { - bits<3> idx; - let Inst{11} = idx{2}; - let Inst{21} = idx{1}; - let Inst{20} = idx{0}; - } - - def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc, - V64, V64, - V128, VectorIndexS, - asm, ".2s", ".2s", ".2s", ".s", - [(set (v2i32 V64:$Rd), - (OpNode (v2i32 V64:$Rn), - (v2i32 (ARM64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - - def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc, - V128, V128, - V128, VectorIndexS, - asm, ".4s", ".4s", ".4s", ".s", - [(set (v4i32 V128:$Rd), - (OpNode (v4i32 
V128:$Rn), - (v4i32 (ARM64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - - def v1i16_indexed : BaseSIMDIndexed<1, U, 1, 0b01, opc, - FPR16Op, FPR16Op, V128_lo, VectorIndexH, - asm, ".h", "", "", ".h", []> { - bits<3> idx; - let Inst{11} = idx{2}; - let Inst{21} = idx{1}; - let Inst{20} = idx{0}; - } - - def v1i32_indexed : BaseSIMDIndexed<1, U, 1, 0b10, opc, - FPR32Op, FPR32Op, V128, VectorIndexS, - asm, ".s", "", "", ".s", - [(set (i32 FPR32Op:$Rd), - (OpNode FPR32Op:$Rn, - (i32 (vector_extract (v4i32 V128:$Rm), - VectorIndexS:$idx))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } -} - -multiclass SIMDVectorIndexedHS opc, string asm, - SDPatternOperator OpNode> { - def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc, - V64, V64, - V128_lo, VectorIndexH, - asm, ".4h", ".4h", ".4h", ".h", - [(set (v4i16 V64:$Rd), - (OpNode (v4i16 V64:$Rn), - (v4i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { - bits<3> idx; - let Inst{11} = idx{2}; - let Inst{21} = idx{1}; - let Inst{20} = idx{0}; - } - - def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc, - V128, V128, - V128_lo, VectorIndexH, - asm, ".8h", ".8h", ".8h", ".h", - [(set (v8i16 V128:$Rd), - (OpNode (v8i16 V128:$Rn), - (v8i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { - bits<3> idx; - let Inst{11} = idx{2}; - let Inst{21} = idx{1}; - let Inst{20} = idx{0}; - } - - def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc, - V64, V64, - V128, VectorIndexS, - asm, ".2s", ".2s", ".2s", ".s", - [(set (v2i32 V64:$Rd), - (OpNode (v2i32 V64:$Rn), - (v2i32 (ARM64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - - def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc, - V128, V128, - V128, VectorIndexS, - asm, ".4s", ".4s", ".4s", ".s", - [(set (v4i32 V128:$Rd), - (OpNode (v4i32 V128:$Rn), - (v4i32 (ARM64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } -} - -multiclass SIMDVectorIndexedHSTied opc, string asm, - SDPatternOperator OpNode> { - def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc, V64, V64, - V128_lo, VectorIndexH, - asm, ".4h", ".4h", ".4h", ".h", - [(set (v4i16 V64:$dst), - (OpNode (v4i16 V64:$Rd),(v4i16 V64:$Rn), - (v4i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { - bits<3> idx; - let Inst{11} = idx{2}; - let Inst{21} = idx{1}; - let Inst{20} = idx{0}; - } - - def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc, - V128, V128, - V128_lo, VectorIndexH, - asm, ".8h", ".8h", ".8h", ".h", - [(set (v8i16 V128:$dst), - (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn), - (v8i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { - bits<3> idx; - let Inst{11} = idx{2}; - let Inst{21} = idx{1}; - let Inst{20} = idx{0}; - } - - def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc, - V64, V64, - V128, VectorIndexS, - asm, ".2s", ".2s", ".2s", ".s", - [(set (v2i32 V64:$dst), - (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn), - (v2i32 (ARM64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - - def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc, - V128, V128, - V128, VectorIndexS, - asm, ".4s", ".4s", ".4s", ".s", - [(set (v4i32 V128:$dst), - (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn), - (v4i32 (ARM64duplane32 (v4i32 V128:$Rm), 
VectorIndexS:$idx))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } -} - -multiclass SIMDIndexedLongSD opc, string asm, - SDPatternOperator OpNode> { - def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc, - V128, V64, - V128_lo, VectorIndexH, - asm, ".4s", ".4s", ".4h", ".h", - [(set (v4i32 V128:$Rd), - (OpNode (v4i16 V64:$Rn), - (v4i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { - bits<3> idx; - let Inst{11} = idx{2}; - let Inst{21} = idx{1}; - let Inst{20} = idx{0}; - } - - def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc, - V128, V128, - V128_lo, VectorIndexH, - asm#"2", ".4s", ".4s", ".8h", ".h", - [(set (v4i32 V128:$Rd), - (OpNode (extract_high_v8i16 V128:$Rn), - (extract_high_v8i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), - VectorIndexH:$idx))))]> { - - bits<3> idx; - let Inst{11} = idx{2}; - let Inst{21} = idx{1}; - let Inst{20} = idx{0}; - } - - def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc, - V128, V64, - V128, VectorIndexS, - asm, ".2d", ".2d", ".2s", ".s", - [(set (v2i64 V128:$Rd), - (OpNode (v2i32 V64:$Rn), - (v2i32 (ARM64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - - def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc, - V128, V128, - V128, VectorIndexS, - asm#"2", ".2d", ".2d", ".4s", ".s", - [(set (v2i64 V128:$Rd), - (OpNode (extract_high_v4i32 V128:$Rn), - (extract_high_v4i32 (ARM64duplane32 (v4i32 V128:$Rm), - VectorIndexS:$idx))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - - def v1i32_indexed : BaseSIMDIndexed<1, U, 1, 0b01, opc, - FPR32Op, FPR16Op, V128_lo, VectorIndexH, - asm, ".h", "", "", ".h", []> { - bits<3> idx; - let Inst{11} = idx{2}; - let Inst{21} = idx{1}; - let Inst{20} = idx{0}; - } - - def v1i64_indexed : BaseSIMDIndexed<1, U, 1, 0b10, opc, - FPR64Op, FPR32Op, V128, VectorIndexS, - asm, ".s", "", "", ".s", []> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } -} - -multiclass SIMDIndexedLongSQDMLXSDTied opc, string asm, - SDPatternOperator Accum> { - def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc, - V128, V64, - V128_lo, VectorIndexH, - asm, ".4s", ".4s", ".4h", ".h", - [(set (v4i32 V128:$dst), - (Accum (v4i32 V128:$Rd), - (v4i32 (int_arm64_neon_sqdmull - (v4i16 V64:$Rn), - (v4i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), - VectorIndexH:$idx))))))]> { - bits<3> idx; - let Inst{11} = idx{2}; - let Inst{21} = idx{1}; - let Inst{20} = idx{0}; - } - - // FIXME: it would be nice to use the scalar (v1i32) instruction here, but an - // intermediate EXTRACT_SUBREG would be untyped. 
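// Instead, the pattern below widens the scalar accumulator into a vector
// register with SUBREG_TO_REG, performs the accumulation on the
// v4i16_indexed instruction, and takes the low 32 bits (ssub) of the result
// with EXTRACT_SUBREG.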
- def : Pat<(i32 (Accum (i32 FPR32Op:$Rd), - (i32 (vector_extract (v4i32 - (int_arm64_neon_sqdmull (v4i16 V64:$Rn), - (v4i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), - VectorIndexH:$idx)))), - (i64 0))))), - (EXTRACT_SUBREG - (!cast(NAME # v4i16_indexed) - (SUBREG_TO_REG (i32 0), FPR32Op:$Rd, ssub), V64:$Rn, - V128_lo:$Rm, VectorIndexH:$idx), - ssub)>; - - def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc, - V128, V128, - V128_lo, VectorIndexH, - asm#"2", ".4s", ".4s", ".8h", ".h", - [(set (v4i32 V128:$dst), - (Accum (v4i32 V128:$Rd), - (v4i32 (int_arm64_neon_sqdmull - (extract_high_v8i16 V128:$Rn), - (extract_high_v8i16 - (ARM64duplane16 (v8i16 V128_lo:$Rm), - VectorIndexH:$idx))))))]> { - bits<3> idx; - let Inst{11} = idx{2}; - let Inst{21} = idx{1}; - let Inst{20} = idx{0}; - } - - def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc, - V128, V64, - V128, VectorIndexS, - asm, ".2d", ".2d", ".2s", ".s", - [(set (v2i64 V128:$dst), - (Accum (v2i64 V128:$Rd), - (v2i64 (int_arm64_neon_sqdmull - (v2i32 V64:$Rn), - (v2i32 (ARM64duplane32 (v4i32 V128:$Rm), - VectorIndexS:$idx))))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - - def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc, - V128, V128, - V128, VectorIndexS, - asm#"2", ".2d", ".2d", ".4s", ".s", - [(set (v2i64 V128:$dst), - (Accum (v2i64 V128:$Rd), - (v2i64 (int_arm64_neon_sqdmull - (extract_high_v4i32 V128:$Rn), - (extract_high_v4i32 - (ARM64duplane32 (v4i32 V128:$Rm), - VectorIndexS:$idx))))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - - def v1i32_indexed : BaseSIMDIndexedTied<1, U, 1, 0b01, opc, - FPR32Op, FPR16Op, V128_lo, VectorIndexH, - asm, ".h", "", "", ".h", []> { - bits<3> idx; - let Inst{11} = idx{2}; - let Inst{21} = idx{1}; - let Inst{20} = idx{0}; - } - - - def v1i64_indexed : BaseSIMDIndexedTied<1, U, 1, 0b10, opc, - FPR64Op, FPR32Op, V128, VectorIndexS, - asm, ".s", "", "", ".s", - [(set (i64 FPR64Op:$dst), - (Accum (i64 FPR64Op:$Rd), - (i64 (int_arm64_neon_sqdmulls_scalar - (i32 FPR32Op:$Rn), - (i32 (vector_extract (v4i32 V128:$Rm), - VectorIndexS:$idx))))))]> { - - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } -} - -multiclass SIMDVectorIndexedLongSD opc, string asm, - SDPatternOperator OpNode> { - let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { - def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc, - V128, V64, - V128_lo, VectorIndexH, - asm, ".4s", ".4s", ".4h", ".h", - [(set (v4i32 V128:$Rd), - (OpNode (v4i16 V64:$Rn), - (v4i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { - bits<3> idx; - let Inst{11} = idx{2}; - let Inst{21} = idx{1}; - let Inst{20} = idx{0}; - } - - def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc, - V128, V128, - V128_lo, VectorIndexH, - asm#"2", ".4s", ".4s", ".8h", ".h", - [(set (v4i32 V128:$Rd), - (OpNode (extract_high_v8i16 V128:$Rn), - (extract_high_v8i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), - VectorIndexH:$idx))))]> { - - bits<3> idx; - let Inst{11} = idx{2}; - let Inst{21} = idx{1}; - let Inst{20} = idx{0}; - } - - def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc, - V128, V64, - V128, VectorIndexS, - asm, ".2d", ".2d", ".2s", ".s", - [(set (v2i64 V128:$Rd), - (OpNode (v2i32 V64:$Rn), - (v2i32 (ARM64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - - def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc, - V128, V128, - V128, VectorIndexS, - asm#"2", ".2d", ".2d", ".4s", ".s", - 
[(set (v2i64 V128:$Rd), - (OpNode (extract_high_v4i32 V128:$Rn), - (extract_high_v4i32 (ARM64duplane32 (v4i32 V128:$Rm), - VectorIndexS:$idx))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - } -} - -multiclass SIMDVectorIndexedLongSDTied opc, string asm, - SDPatternOperator OpNode> { - let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { - def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc, - V128, V64, - V128_lo, VectorIndexH, - asm, ".4s", ".4s", ".4h", ".h", - [(set (v4i32 V128:$dst), - (OpNode (v4i32 V128:$Rd), (v4i16 V64:$Rn), - (v4i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { - bits<3> idx; - let Inst{11} = idx{2}; - let Inst{21} = idx{1}; - let Inst{20} = idx{0}; - } - - def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc, - V128, V128, - V128_lo, VectorIndexH, - asm#"2", ".4s", ".4s", ".8h", ".h", - [(set (v4i32 V128:$dst), - (OpNode (v4i32 V128:$Rd), - (extract_high_v8i16 V128:$Rn), - (extract_high_v8i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), - VectorIndexH:$idx))))]> { - bits<3> idx; - let Inst{11} = idx{2}; - let Inst{21} = idx{1}; - let Inst{20} = idx{0}; - } - - def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc, - V128, V64, - V128, VectorIndexS, - asm, ".2d", ".2d", ".2s", ".s", - [(set (v2i64 V128:$dst), - (OpNode (v2i64 V128:$Rd), (v2i32 V64:$Rn), - (v2i32 (ARM64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - - def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc, - V128, V128, - V128, VectorIndexS, - asm#"2", ".2d", ".2d", ".4s", ".s", - [(set (v2i64 V128:$dst), - (OpNode (v2i64 V128:$Rd), - (extract_high_v4i32 V128:$Rn), - (extract_high_v4i32 (ARM64duplane32 (v4i32 V128:$Rm), - VectorIndexS:$idx))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - } -} - -//---------------------------------------------------------------------------- -// AdvSIMD scalar shift by immediate -//---------------------------------------------------------------------------- - -let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in -class BaseSIMDScalarShift opc, bits<7> fixed_imm, - RegisterClass regtype1, RegisterClass regtype2, - Operand immtype, string asm, list pattern> - : I<(outs regtype1:$Rd), (ins regtype2:$Rn, immtype:$imm), - asm, "\t$Rd, $Rn, $imm", "", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - bits<7> imm; - let Inst{31-30} = 0b01; - let Inst{29} = U; - let Inst{28-23} = 0b111110; - let Inst{22-16} = fixed_imm; - let Inst{15-11} = opc; - let Inst{10} = 1; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in -class BaseSIMDScalarShiftTied opc, bits<7> fixed_imm, - RegisterClass regtype1, RegisterClass regtype2, - Operand immtype, string asm, list pattern> - : I<(outs regtype1:$dst), (ins regtype1:$Rd, regtype2:$Rn, immtype:$imm), - asm, "\t$Rd, $Rn, $imm", "$Rd = $dst", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - bits<7> imm; - let Inst{31-30} = 0b01; - let Inst{29} = U; - let Inst{28-23} = 0b111110; - let Inst{22-16} = fixed_imm; - let Inst{15-11} = opc; - let Inst{10} = 1; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - - -multiclass SIMDScalarRShiftSD opc, string asm> { - def s : BaseSIMDScalarShift { - let Inst{20-16} = imm{4-0}; - } - - def d : BaseSIMDScalarShift { - let Inst{21-16} = imm{5-0}; - } -} - -multiclass SIMDScalarRShiftD opc, string asm, - SDPatternOperator OpNode> { - def d : BaseSIMDScalarShift { - let 
Inst{21-16} = imm{5-0}; - } - - def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rn), (i32 vecshiftR64:$imm))), - (!cast(NAME # "d") FPR64:$Rn, vecshiftR64:$imm)>; -} - -multiclass SIMDScalarRShiftDTied opc, string asm, - SDPatternOperator OpNode = null_frag> { - def d : BaseSIMDScalarShiftTied { - let Inst{21-16} = imm{5-0}; - } - - def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn), - (i32 vecshiftR64:$imm))), - (!cast(NAME # "d") FPR64:$Rd, FPR64:$Rn, - vecshiftR64:$imm)>; -} - -multiclass SIMDScalarLShiftD opc, string asm, - SDPatternOperator OpNode> { - def d : BaseSIMDScalarShift { - let Inst{21-16} = imm{5-0}; - } -} - -let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in -multiclass SIMDScalarLShiftDTied opc, string asm> { - def d : BaseSIMDScalarShiftTied { - let Inst{21-16} = imm{5-0}; - } -} - -let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in -multiclass SIMDScalarRShiftBHS opc, string asm, - SDPatternOperator OpNode = null_frag> { - def b : BaseSIMDScalarShift { - let Inst{18-16} = imm{2-0}; - } - - def h : BaseSIMDScalarShift { - let Inst{19-16} = imm{3-0}; - } - - def s : BaseSIMDScalarShift { - let Inst{20-16} = imm{4-0}; - } -} - -multiclass SIMDScalarLShiftBHSD opc, string asm, - SDPatternOperator OpNode> { - def b : BaseSIMDScalarShift { - let Inst{18-16} = imm{2-0}; - } - - def h : BaseSIMDScalarShift { - let Inst{19-16} = imm{3-0}; - } - - def s : BaseSIMDScalarShift { - let Inst{20-16} = imm{4-0}; - } - - def d : BaseSIMDScalarShift { - let Inst{21-16} = imm{5-0}; - } - - def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rn), (i32 vecshiftL64:$imm))), - (!cast(NAME # "d") FPR64:$Rn, vecshiftL64:$imm)>; -} - -multiclass SIMDScalarRShiftBHSD opc, string asm> { - def b : BaseSIMDScalarShift { - let Inst{18-16} = imm{2-0}; - } - - def h : BaseSIMDScalarShift { - let Inst{19-16} = imm{3-0}; - } - - def s : BaseSIMDScalarShift { - let Inst{20-16} = imm{4-0}; - } - - def d : BaseSIMDScalarShift { - let Inst{21-16} = imm{5-0}; - } -} - -//---------------------------------------------------------------------------- -// AdvSIMD vector x indexed element -//---------------------------------------------------------------------------- - -let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in -class BaseSIMDVectorShift opc, bits<7> fixed_imm, - RegisterOperand dst_reg, RegisterOperand src_reg, - Operand immtype, - string asm, string dst_kind, string src_kind, - list pattern> - : I<(outs dst_reg:$Rd), (ins src_reg:$Rn, immtype:$imm), - asm, "{\t$Rd" # dst_kind # ", $Rn" # src_kind # ", $imm" # - "|" # dst_kind # "\t$Rd, $Rn, $imm}", "", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29} = U; - let Inst{28-23} = 0b011110; - let Inst{22-16} = fixed_imm; - let Inst{15-11} = opc; - let Inst{10} = 1; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in -class BaseSIMDVectorShiftTied opc, bits<7> fixed_imm, - RegisterOperand vectype1, RegisterOperand vectype2, - Operand immtype, - string asm, string dst_kind, string src_kind, - list pattern> - : I<(outs vectype1:$dst), (ins vectype1:$Rd, vectype2:$Rn, immtype:$imm), - asm, "{\t$Rd" # dst_kind # ", $Rn" # src_kind # ", $imm" # - "|" # dst_kind # "\t$Rd, $Rn, $imm}", "$Rd = $dst", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29} = U; - let Inst{28-23} = 0b011110; - let Inst{22-16} = fixed_imm; - let Inst{15-11} = opc; - let Inst{10} = 1; - let Inst{9-5} = Rn; - let Inst{4-0} = 
Rd; -} - -multiclass SIMDVectorRShiftSD opc, string asm, - Intrinsic OpNode> { - def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?}, - V64, V64, vecshiftR32, - asm, ".2s", ".2s", - [(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (i32 imm:$imm)))]> { - bits<5> imm; - let Inst{20-16} = imm; - } - - def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?}, - V128, V128, vecshiftR32, - asm, ".4s", ".4s", - [(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (i32 imm:$imm)))]> { - bits<5> imm; - let Inst{20-16} = imm; - } - - def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?}, - V128, V128, vecshiftR64, - asm, ".2d", ".2d", - [(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (i32 imm:$imm)))]> { - bits<6> imm; - let Inst{21-16} = imm; - } -} - -multiclass SIMDVectorRShiftSDToFP opc, string asm, - Intrinsic OpNode> { - def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?}, - V64, V64, vecshiftR32, - asm, ".2s", ".2s", - [(set (v2f32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (i32 imm:$imm)))]> { - bits<5> imm; - let Inst{20-16} = imm; - } - - def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?}, - V128, V128, vecshiftR32, - asm, ".4s", ".4s", - [(set (v4f32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (i32 imm:$imm)))]> { - bits<5> imm; - let Inst{20-16} = imm; - } - - def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?}, - V128, V128, vecshiftR64, - asm, ".2d", ".2d", - [(set (v2f64 V128:$Rd), (OpNode (v2i64 V128:$Rn), (i32 imm:$imm)))]> { - bits<6> imm; - let Inst{21-16} = imm; - } -} - -multiclass SIMDVectorRShiftNarrowBHS opc, string asm, - SDPatternOperator OpNode> { - def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?}, - V64, V128, vecshiftR16Narrow, - asm, ".8b", ".8h", - [(set (v8i8 V64:$Rd), (OpNode (v8i16 V128:$Rn), vecshiftR16Narrow:$imm))]> { - bits<3> imm; - let Inst{18-16} = imm; - } - - def v16i8_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,0,1,?,?,?}, - V128, V128, vecshiftR16Narrow, - asm#"2", ".16b", ".8h", []> { - bits<3> imm; - let Inst{18-16} = imm; - let hasSideEffects = 0; - } - - def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?}, - V64, V128, vecshiftR32Narrow, - asm, ".4h", ".4s", - [(set (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn), vecshiftR32Narrow:$imm))]> { - bits<4> imm; - let Inst{19-16} = imm; - } - - def v8i16_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,1,?,?,?,?}, - V128, V128, vecshiftR32Narrow, - asm#"2", ".8h", ".4s", []> { - bits<4> imm; - let Inst{19-16} = imm; - let hasSideEffects = 0; - } - - def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?}, - V64, V128, vecshiftR64Narrow, - asm, ".2s", ".2d", - [(set (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn), vecshiftR64Narrow:$imm))]> { - bits<5> imm; - let Inst{20-16} = imm; - } - - def v4i32_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,1,?,?,?,?,?}, - V128, V128, vecshiftR64Narrow, - asm#"2", ".4s", ".2d", []> { - bits<5> imm; - let Inst{20-16} = imm; - let hasSideEffects = 0; - } - - // TableGen doesn't like patters w/ INSERT_SUBREG on the instructions - // themselves, so put them here instead. - - // Patterns involving what's effectively an insert high and a normal - // intrinsic, represented by CONCAT_VECTORS. 
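// Each concat_vectors of an existing low half ($Rd) with a narrowing shift
// result is matched to the corresponding "2" (insert-high) instruction, with
// $Rd placed in the dsub subregister of the destination via INSERT_SUBREG.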
- def : Pat<(concat_vectors (v8i8 V64:$Rd),(OpNode (v8i16 V128:$Rn), - vecshiftR16Narrow:$imm)), - (!cast(NAME # "v16i8_shift") - (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), - V128:$Rn, vecshiftR16Narrow:$imm)>; - def : Pat<(concat_vectors (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn), - vecshiftR32Narrow:$imm)), - (!cast(NAME # "v8i16_shift") - (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), - V128:$Rn, vecshiftR32Narrow:$imm)>; - def : Pat<(concat_vectors (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn), - vecshiftR64Narrow:$imm)), - (!cast(NAME # "v4i32_shift") - (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), - V128:$Rn, vecshiftR64Narrow:$imm)>; -} - -multiclass SIMDVectorLShiftBHSD opc, string asm, - SDPatternOperator OpNode> { - def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?}, - V64, V64, vecshiftL8, - asm, ".8b", ".8b", - [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), - (i32 vecshiftL8:$imm)))]> { - bits<3> imm; - let Inst{18-16} = imm; - } - - def v16i8_shift : BaseSIMDVectorShift<1, U, opc, {0,0,0,1,?,?,?}, - V128, V128, vecshiftL8, - asm, ".16b", ".16b", - [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn), - (i32 vecshiftL8:$imm)))]> { - bits<3> imm; - let Inst{18-16} = imm; - } - - def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?}, - V64, V64, vecshiftL16, - asm, ".4h", ".4h", - [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), - (i32 vecshiftL16:$imm)))]> { - bits<4> imm; - let Inst{19-16} = imm; - } - - def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?}, - V128, V128, vecshiftL16, - asm, ".8h", ".8h", - [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), - (i32 vecshiftL16:$imm)))]> { - bits<4> imm; - let Inst{19-16} = imm; - } - - def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?}, - V64, V64, vecshiftL32, - asm, ".2s", ".2s", - [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), - (i32 vecshiftL32:$imm)))]> { - bits<5> imm; - let Inst{20-16} = imm; - } - - def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?}, - V128, V128, vecshiftL32, - asm, ".4s", ".4s", - [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), - (i32 vecshiftL32:$imm)))]> { - bits<5> imm; - let Inst{20-16} = imm; - } - - def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?}, - V128, V128, vecshiftL64, - asm, ".2d", ".2d", - [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn), - (i32 vecshiftL64:$imm)))]> { - bits<6> imm; - let Inst{21-16} = imm; - } -} - -multiclass SIMDVectorRShiftBHSD opc, string asm, - SDPatternOperator OpNode> { - def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?}, - V64, V64, vecshiftR8, - asm, ".8b", ".8b", - [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), - (i32 vecshiftR8:$imm)))]> { - bits<3> imm; - let Inst{18-16} = imm; - } - - def v16i8_shift : BaseSIMDVectorShift<1, U, opc, {0,0,0,1,?,?,?}, - V128, V128, vecshiftR8, - asm, ".16b", ".16b", - [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn), - (i32 vecshiftR8:$imm)))]> { - bits<3> imm; - let Inst{18-16} = imm; - } - - def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?}, - V64, V64, vecshiftR16, - asm, ".4h", ".4h", - [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), - (i32 vecshiftR16:$imm)))]> { - bits<4> imm; - let Inst{19-16} = imm; - } - - def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?}, - V128, V128, vecshiftR16, - asm, ".8h", ".8h", - [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), - (i32 vecshiftR16:$imm)))]> { - bits<4> imm; - let Inst{19-16} = imm; - } - - def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?}, - 
V64, V64, vecshiftR32, - asm, ".2s", ".2s", - [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), - (i32 vecshiftR32:$imm)))]> { - bits<5> imm; - let Inst{20-16} = imm; - } - - def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?}, - V128, V128, vecshiftR32, - asm, ".4s", ".4s", - [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), - (i32 vecshiftR32:$imm)))]> { - bits<5> imm; - let Inst{20-16} = imm; - } - - def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?}, - V128, V128, vecshiftR64, - asm, ".2d", ".2d", - [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn), - (i32 vecshiftR64:$imm)))]> { - bits<6> imm; - let Inst{21-16} = imm; - } -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -multiclass SIMDVectorRShiftBHSDTied opc, string asm, - SDPatternOperator OpNode = null_frag> { - def v8i8_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,0,1,?,?,?}, - V64, V64, vecshiftR8, asm, ".8b", ".8b", - [(set (v8i8 V64:$dst), - (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), - (i32 vecshiftR8:$imm)))]> { - bits<3> imm; - let Inst{18-16} = imm; - } - - def v16i8_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,0,1,?,?,?}, - V128, V128, vecshiftR8, asm, ".16b", ".16b", - [(set (v16i8 V128:$dst), - (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn), - (i32 vecshiftR8:$imm)))]> { - bits<3> imm; - let Inst{18-16} = imm; - } - - def v4i16_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,1,?,?,?,?}, - V64, V64, vecshiftR16, asm, ".4h", ".4h", - [(set (v4i16 V64:$dst), - (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn), - (i32 vecshiftR16:$imm)))]> { - bits<4> imm; - let Inst{19-16} = imm; - } - - def v8i16_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,1,?,?,?,?}, - V128, V128, vecshiftR16, asm, ".8h", ".8h", - [(set (v8i16 V128:$dst), - (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn), - (i32 vecshiftR16:$imm)))]> { - bits<4> imm; - let Inst{19-16} = imm; - } - - def v2i32_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,1,?,?,?,?,?}, - V64, V64, vecshiftR32, asm, ".2s", ".2s", - [(set (v2i32 V64:$dst), - (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn), - (i32 vecshiftR32:$imm)))]> { - bits<5> imm; - let Inst{20-16} = imm; - } - - def v4i32_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,1,?,?,?,?,?}, - V128, V128, vecshiftR32, asm, ".4s", ".4s", - [(set (v4i32 V128:$dst), - (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn), - (i32 vecshiftR32:$imm)))]> { - bits<5> imm; - let Inst{20-16} = imm; - } - - def v2i64_shift : BaseSIMDVectorShiftTied<1, U, opc, {1,?,?,?,?,?,?}, - V128, V128, vecshiftR64, - asm, ".2d", ".2d", [(set (v2i64 V128:$dst), - (OpNode (v2i64 V128:$Rd), (v2i64 V128:$Rn), - (i32 vecshiftR64:$imm)))]> { - bits<6> imm; - let Inst{21-16} = imm; - } -} - -multiclass SIMDVectorLShiftBHSDTied opc, string asm, - SDPatternOperator OpNode = null_frag> { - def v8i8_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,0,1,?,?,?}, - V64, V64, vecshiftL8, - asm, ".8b", ".8b", - [(set (v8i8 V64:$dst), - (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), - (i32 vecshiftL8:$imm)))]> { - bits<3> imm; - let Inst{18-16} = imm; - } - - def v16i8_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,0,1,?,?,?}, - V128, V128, vecshiftL8, - asm, ".16b", ".16b", - [(set (v16i8 V128:$dst), - (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn), - (i32 vecshiftL8:$imm)))]> { - bits<3> imm; - let Inst{18-16} = imm; - } - - def v4i16_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,1,?,?,?,?}, - V64, V64, vecshiftL16, - asm, ".4h", ".4h", - [(set (v4i16 V64:$dst), - (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn), - (i32 vecshiftL16:$imm)))]> { - bits<4> imm; - let Inst{19-16} = 
imm; - } - - def v8i16_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,1,?,?,?,?}, - V128, V128, vecshiftL16, - asm, ".8h", ".8h", - [(set (v8i16 V128:$dst), - (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn), - (i32 vecshiftL16:$imm)))]> { - bits<4> imm; - let Inst{19-16} = imm; - } - - def v2i32_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,1,?,?,?,?,?}, - V64, V64, vecshiftL32, - asm, ".2s", ".2s", - [(set (v2i32 V64:$dst), - (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn), - (i32 vecshiftL32:$imm)))]> { - bits<5> imm; - let Inst{20-16} = imm; - } - - def v4i32_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,1,?,?,?,?,?}, - V128, V128, vecshiftL32, - asm, ".4s", ".4s", - [(set (v4i32 V128:$dst), - (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn), - (i32 vecshiftL32:$imm)))]> { - bits<5> imm; - let Inst{20-16} = imm; - } - - def v2i64_shift : BaseSIMDVectorShiftTied<1, U, opc, {1,?,?,?,?,?,?}, - V128, V128, vecshiftL64, - asm, ".2d", ".2d", - [(set (v2i64 V128:$dst), - (OpNode (v2i64 V128:$Rd), (v2i64 V128:$Rn), - (i32 vecshiftL64:$imm)))]> { - bits<6> imm; - let Inst{21-16} = imm; - } -} - -multiclass SIMDVectorLShiftLongBHSD opc, string asm, - SDPatternOperator OpNode> { - def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?}, - V128, V64, vecshiftL8, asm, ".8h", ".8b", - [(set (v8i16 V128:$Rd), (OpNode (v8i8 V64:$Rn), vecshiftL8:$imm))]> { - bits<3> imm; - let Inst{18-16} = imm; - } - - def v16i8_shift : BaseSIMDVectorShift<1, U, opc, {0,0,0,1,?,?,?}, - V128, V128, vecshiftL8, - asm#"2", ".8h", ".16b", - [(set (v8i16 V128:$Rd), - (OpNode (extract_high_v16i8 V128:$Rn), vecshiftL8:$imm))]> { - bits<3> imm; - let Inst{18-16} = imm; - } - - def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?}, - V128, V64, vecshiftL16, asm, ".4s", ".4h", - [(set (v4i32 V128:$Rd), (OpNode (v4i16 V64:$Rn), vecshiftL16:$imm))]> { - bits<4> imm; - let Inst{19-16} = imm; - } - - def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?}, - V128, V128, vecshiftL16, - asm#"2", ".4s", ".8h", - [(set (v4i32 V128:$Rd), - (OpNode (extract_high_v8i16 V128:$Rn), vecshiftL16:$imm))]> { - - bits<4> imm; - let Inst{19-16} = imm; - } - - def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?}, - V128, V64, vecshiftL32, asm, ".2d", ".2s", - [(set (v2i64 V128:$Rd), (OpNode (v2i32 V64:$Rn), vecshiftL32:$imm))]> { - bits<5> imm; - let Inst{20-16} = imm; - } - - def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?}, - V128, V128, vecshiftL32, - asm#"2", ".2d", ".4s", - [(set (v2i64 V128:$Rd), - (OpNode (extract_high_v4i32 V128:$Rn), vecshiftL32:$imm))]> { - bits<5> imm; - let Inst{20-16} = imm; - } -} - - -//--- -// Vector load/store -//--- -// SIMD ldX/stX no-index memory references don't allow the optional -// ", #0" constant and handle post-indexing explicitly, so we use -// a more specialized parse method for them. Otherwise, it's the same as -// the general GPR64sp handling. 
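As the classes and aliases below spell out, the "#<imm>" write-back forms of
these structure loads/stores are simply the register post-indexed encoding
with XZR in the Rm field, and the immediate itself is fixed by the amount of
data transferred (NumRegs * 16 bytes for Q-register lists, NumRegs * 8 bytes
for D-register lists). A minimal C++ sketch of that convention, with
illustrative names only (this is not LLVM's MC layer):

#include <cassert>
#include <cstdint>

// Rm field (bits 20-16) of a post-indexed structure load/store.  The
// "#<imm>" syntax is encoded as register post-indexing from XZR (register
// number 31); otherwise Xm is the post-increment register.
static uint32_t postIndexRmField(bool immForm, uint32_t Xm) {
  return immForm ? 31u : Xm;
}

// Immediate implied by the immediate write-back form: the full transfer size.
static uint32_t impliedImmOffset(unsigned NumRegs, bool QForm) {
  return NumRegs * (QForm ? 16u : 8u);
}

int main() {
  // "ld1 { v0.16b, v1.16b }, [x1], #32"  ->  Rm = 31 (XZR), #32 implied.
  assert(postIndexRmField(/*immForm=*/true, /*Xm=*/0) == 31);
  assert(impliedImmOffset(/*NumRegs=*/2, /*QForm=*/true) == 32);
  // "ld1 { v0.8b, v1.8b }, [x1], x2"     ->  Rm = 2, no implied immediate.
  assert(postIndexRmField(false, 2) == 2);
  assert(impliedImmOffset(2, /*QForm=*/false) == 16);
  return 0;
}

This is why the InstAlias definitions for the immediate forms below simply
pass XZR as the post-index register operand.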
- -class BaseSIMDLdSt opcode, bits<2> size, - string asm, dag oops, dag iops, list pattern> - : I { - bits<5> Vt; - bits<5> Rn; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29-23} = 0b0011000; - let Inst{22} = L; - let Inst{21-16} = 0b000000; - let Inst{15-12} = opcode; - let Inst{11-10} = size; - let Inst{9-5} = Rn; - let Inst{4-0} = Vt; -} - -class BaseSIMDLdStPost opcode, bits<2> size, - string asm, dag oops, dag iops> - : I { - bits<5> Vt; - bits<5> Rn; - bits<5> Xm; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29-23} = 0b0011001; - let Inst{22} = L; - let Inst{21} = 0; - let Inst{20-16} = Xm; - let Inst{15-12} = opcode; - let Inst{11-10} = size; - let Inst{9-5} = Rn; - let Inst{4-0} = Vt; -} - -// The immediate form of AdvSIMD post-indexed addressing is encoded with -// register post-index addressing from the zero register. -multiclass SIMDLdStAliases { - // E.g. "ld1 { v0.8b, v1.8b }, [x1], #16" - // "ld1\t$Vt, [$Rn], #16" - // may get mapped to - // (LD1Twov8b_POST VecListTwo8b:$Vt, GPR64sp:$Rn, XZR) - def : InstAlias(NAME # Count # "v" # layout # "_POST") - GPR64sp:$Rn, - !cast("VecList" # Count # layout):$Vt, - XZR), 1>; - - // E.g. "ld1.8b { v0, v1 }, [x1], #16" - // "ld1.8b\t$Vt, [$Rn], #16" - // may get mapped to - // (LD1Twov8b_POST VecListTwo64:$Vt, GPR64sp:$Rn, XZR) - def : InstAlias(NAME # Count # "v" # layout # "_POST") - GPR64sp:$Rn, - !cast("VecList" # Count # Size):$Vt, - XZR), 0>; - - // E.g. "ld1.8b { v0, v1 }, [x1]" - // "ld1\t$Vt, [$Rn]" - // may get mapped to - // (LD1Twov8b VecListTwo64:$Vt, GPR64sp:$Rn) - def : InstAlias(NAME # Count # "v" # layout) - !cast("VecList" # Count # Size):$Vt, - GPR64sp:$Rn), 0>; - - // E.g. "ld1.8b { v0, v1 }, [x1], x2" - // "ld1\t$Vt, [$Rn], $Xm" - // may get mapped to - // (LD1Twov8b_POST VecListTwo64:$Vt, GPR64sp:$Rn, GPR64pi8:$Xm) - def : InstAlias(NAME # Count # "v" # layout # "_POST") - GPR64sp:$Rn, - !cast("VecList" # Count # Size):$Vt, - !cast("GPR64pi" # Offset):$Xm), 0>; -} - -multiclass BaseSIMDLdN opcode> { - let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in { - def v16b: BaseSIMDLdSt<1, 1, opcode, 0b00, asm, - (outs !cast(veclist # "16b"):$Vt), - (ins GPR64sp:$Rn), []>; - def v8h : BaseSIMDLdSt<1, 1, opcode, 0b01, asm, - (outs !cast(veclist # "8h"):$Vt), - (ins GPR64sp:$Rn), []>; - def v4s : BaseSIMDLdSt<1, 1, opcode, 0b10, asm, - (outs !cast(veclist # "4s"):$Vt), - (ins GPR64sp:$Rn), []>; - def v2d : BaseSIMDLdSt<1, 1, opcode, 0b11, asm, - (outs !cast(veclist # "2d"):$Vt), - (ins GPR64sp:$Rn), []>; - def v8b : BaseSIMDLdSt<0, 1, opcode, 0b00, asm, - (outs !cast(veclist # "8b"):$Vt), - (ins GPR64sp:$Rn), []>; - def v4h : BaseSIMDLdSt<0, 1, opcode, 0b01, asm, - (outs !cast(veclist # "4h"):$Vt), - (ins GPR64sp:$Rn), []>; - def v2s : BaseSIMDLdSt<0, 1, opcode, 0b10, asm, - (outs !cast(veclist # "2s"):$Vt), - (ins GPR64sp:$Rn), []>; - - - def v16b_POST: BaseSIMDLdStPost<1, 1, opcode, 0b00, asm, - (outs GPR64sp:$wback, - !cast(veclist # "16b"):$Vt), - (ins GPR64sp:$Rn, - !cast("GPR64pi" # Offset128):$Xm)>; - def v8h_POST : BaseSIMDLdStPost<1, 1, opcode, 0b01, asm, - (outs GPR64sp:$wback, - !cast(veclist # "8h"):$Vt), - (ins GPR64sp:$Rn, - !cast("GPR64pi" # Offset128):$Xm)>; - def v4s_POST : BaseSIMDLdStPost<1, 1, opcode, 0b10, asm, - (outs GPR64sp:$wback, - !cast(veclist # "4s"):$Vt), - (ins GPR64sp:$Rn, - !cast("GPR64pi" # Offset128):$Xm)>; - def v2d_POST : BaseSIMDLdStPost<1, 1, opcode, 0b11, asm, - (outs GPR64sp:$wback, - !cast(veclist # "2d"):$Vt), - (ins GPR64sp:$Rn, - !cast("GPR64pi" # Offset128):$Xm)>; - 
def v8b_POST : BaseSIMDLdStPost<0, 1, opcode, 0b00, asm, - (outs GPR64sp:$wback, - !cast(veclist # "8b"):$Vt), - (ins GPR64sp:$Rn, - !cast("GPR64pi" # Offset64):$Xm)>; - def v4h_POST : BaseSIMDLdStPost<0, 1, opcode, 0b01, asm, - (outs GPR64sp:$wback, - !cast(veclist # "4h"):$Vt), - (ins GPR64sp:$Rn, - !cast("GPR64pi" # Offset64):$Xm)>; - def v2s_POST : BaseSIMDLdStPost<0, 1, opcode, 0b10, asm, - (outs GPR64sp:$wback, - !cast(veclist # "2s"):$Vt), - (ins GPR64sp:$Rn, - !cast("GPR64pi" # Offset64):$Xm)>; - } - - defm : SIMDLdStAliases; - defm : SIMDLdStAliases; - defm : SIMDLdStAliases; - defm : SIMDLdStAliases; - defm : SIMDLdStAliases; - defm : SIMDLdStAliases; - defm : SIMDLdStAliases; -} - -// Only ld1/st1 has a v1d version. -multiclass BaseSIMDStN opcode> { - let hasSideEffects = 0, mayStore = 1, mayLoad = 0 in { - def v16b : BaseSIMDLdSt<1, 0, opcode, 0b00, asm, (outs), - (ins !cast(veclist # "16b"):$Vt, - GPR64sp:$Rn), []>; - def v8h : BaseSIMDLdSt<1, 0, opcode, 0b01, asm, (outs), - (ins !cast(veclist # "8h"):$Vt, - GPR64sp:$Rn), []>; - def v4s : BaseSIMDLdSt<1, 0, opcode, 0b10, asm, (outs), - (ins !cast(veclist # "4s"):$Vt, - GPR64sp:$Rn), []>; - def v2d : BaseSIMDLdSt<1, 0, opcode, 0b11, asm, (outs), - (ins !cast(veclist # "2d"):$Vt, - GPR64sp:$Rn), []>; - def v8b : BaseSIMDLdSt<0, 0, opcode, 0b00, asm, (outs), - (ins !cast(veclist # "8b"):$Vt, - GPR64sp:$Rn), []>; - def v4h : BaseSIMDLdSt<0, 0, opcode, 0b01, asm, (outs), - (ins !cast(veclist # "4h"):$Vt, - GPR64sp:$Rn), []>; - def v2s : BaseSIMDLdSt<0, 0, opcode, 0b10, asm, (outs), - (ins !cast(veclist # "2s"):$Vt, - GPR64sp:$Rn), []>; - - def v16b_POST : BaseSIMDLdStPost<1, 0, opcode, 0b00, asm, - (outs GPR64sp:$wback), - (ins !cast(veclist # "16b"):$Vt, - GPR64sp:$Rn, - !cast("GPR64pi" # Offset128):$Xm)>; - def v8h_POST : BaseSIMDLdStPost<1, 0, opcode, 0b01, asm, - (outs GPR64sp:$wback), - (ins !cast(veclist # "8h"):$Vt, - GPR64sp:$Rn, - !cast("GPR64pi" # Offset128):$Xm)>; - def v4s_POST : BaseSIMDLdStPost<1, 0, opcode, 0b10, asm, - (outs GPR64sp:$wback), - (ins !cast(veclist # "4s"):$Vt, - GPR64sp:$Rn, - !cast("GPR64pi" # Offset128):$Xm)>; - def v2d_POST : BaseSIMDLdStPost<1, 0, opcode, 0b11, asm, - (outs GPR64sp:$wback), - (ins !cast(veclist # "2d"):$Vt, - GPR64sp:$Rn, - !cast("GPR64pi" # Offset128):$Xm)>; - def v8b_POST : BaseSIMDLdStPost<0, 0, opcode, 0b00, asm, - (outs GPR64sp:$wback), - (ins !cast(veclist # "8b"):$Vt, - GPR64sp:$Rn, - !cast("GPR64pi" # Offset64):$Xm)>; - def v4h_POST : BaseSIMDLdStPost<0, 0, opcode, 0b01, asm, - (outs GPR64sp:$wback), - (ins !cast(veclist # "4h"):$Vt, - GPR64sp:$Rn, - !cast("GPR64pi" # Offset64):$Xm)>; - def v2s_POST : BaseSIMDLdStPost<0, 0, opcode, 0b10, asm, - (outs GPR64sp:$wback), - (ins !cast(veclist # "2s"):$Vt, - GPR64sp:$Rn, - !cast("GPR64pi" # Offset64):$Xm)>; - } - - defm : SIMDLdStAliases; - defm : SIMDLdStAliases; - defm : SIMDLdStAliases; - defm : SIMDLdStAliases; - defm : SIMDLdStAliases; - defm : SIMDLdStAliases; - defm : SIMDLdStAliases; -} - -multiclass BaseSIMDLd1 opcode> - : BaseSIMDLdN { - - // LD1 instructions have extra "1d" variants. 
- let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in { - def v1d : BaseSIMDLdSt<0, 1, opcode, 0b11, asm, - (outs !cast(veclist # "1d"):$Vt), - (ins GPR64sp:$Rn), []>; - - def v1d_POST : BaseSIMDLdStPost<0, 1, opcode, 0b11, asm, - (outs GPR64sp:$wback, - !cast(veclist # "1d"):$Vt), - (ins GPR64sp:$Rn, - !cast("GPR64pi" # Offset64):$Xm)>; - } - - defm : SIMDLdStAliases; -} - -multiclass BaseSIMDSt1 opcode> - : BaseSIMDStN { - - // ST1 instructions have extra "1d" variants. - let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in { - def v1d : BaseSIMDLdSt<0, 0, opcode, 0b11, asm, (outs), - (ins !cast(veclist # "1d"):$Vt, - GPR64sp:$Rn), []>; - - def v1d_POST : BaseSIMDLdStPost<0, 0, opcode, 0b11, asm, - (outs GPR64sp:$wback), - (ins !cast(veclist # "1d"):$Vt, - GPR64sp:$Rn, - !cast("GPR64pi" # Offset64):$Xm)>; - } - - defm : SIMDLdStAliases; -} - -multiclass SIMDLd1Multiple { - defm One : BaseSIMDLd1<"One", asm, "VecListOne", 16, 8, 0b0111>; - defm Two : BaseSIMDLd1<"Two", asm, "VecListTwo", 32, 16, 0b1010>; - defm Three : BaseSIMDLd1<"Three", asm, "VecListThree", 48, 24, 0b0110>; - defm Four : BaseSIMDLd1<"Four", asm, "VecListFour", 64, 32, 0b0010>; -} - -multiclass SIMDSt1Multiple { - defm One : BaseSIMDSt1<"One", asm, "VecListOne", 16, 8, 0b0111>; - defm Two : BaseSIMDSt1<"Two", asm, "VecListTwo", 32, 16, 0b1010>; - defm Three : BaseSIMDSt1<"Three", asm, "VecListThree", 48, 24, 0b0110>; - defm Four : BaseSIMDSt1<"Four", asm, "VecListFour", 64, 32, 0b0010>; -} - -multiclass SIMDLd2Multiple { - defm Two : BaseSIMDLdN<"Two", asm, "VecListTwo", 32, 16, 0b1000>; -} - -multiclass SIMDSt2Multiple { - defm Two : BaseSIMDStN<"Two", asm, "VecListTwo", 32, 16, 0b1000>; -} - -multiclass SIMDLd3Multiple { - defm Three : BaseSIMDLdN<"Three", asm, "VecListThree", 48, 24, 0b0100>; -} - -multiclass SIMDSt3Multiple { - defm Three : BaseSIMDStN<"Three", asm, "VecListThree", 48, 24, 0b0100>; -} - -multiclass SIMDLd4Multiple { - defm Four : BaseSIMDLdN<"Four", asm, "VecListFour", 64, 32, 0b0000>; -} - -multiclass SIMDSt4Multiple { - defm Four : BaseSIMDStN<"Four", asm, "VecListFour", 64, 32, 0b0000>; -} - -//--- -// AdvSIMD Load/store single-element -//--- - -class BaseSIMDLdStSingle opcode, - string asm, string operands, string cst, - dag oops, dag iops, list pattern> - : I { - bits<5> Vt; - bits<5> Rn; - let Inst{31} = 0; - let Inst{29-24} = 0b001101; - let Inst{22} = L; - let Inst{21} = R; - let Inst{15-13} = opcode; - let Inst{9-5} = Rn; - let Inst{4-0} = Vt; -} - -class BaseSIMDLdStSingleTied opcode, - string asm, string operands, string cst, - dag oops, dag iops, list pattern> - : I { - bits<5> Vt; - bits<5> Rn; - let Inst{31} = 0; - let Inst{29-24} = 0b001101; - let Inst{22} = L; - let Inst{21} = R; - let Inst{15-13} = opcode; - let Inst{9-5} = Rn; - let Inst{4-0} = Vt; -} - - -let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDLdR opcode, bit S, bits<2> size, string asm, - Operand listtype> - : BaseSIMDLdStSingle<1, R, opcode, asm, "\t$Vt, [$Rn]", "", - (outs listtype:$Vt), (ins GPR64sp:$Rn), - []> { - let Inst{30} = Q; - let Inst{23} = 0; - let Inst{20-16} = 0b00000; - let Inst{12} = S; - let Inst{11-10} = size; -} -let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDLdRPost opcode, bit S, bits<2> size, - string asm, Operand listtype, Operand GPR64pi> - : BaseSIMDLdStSingle<1, R, opcode, asm, "\t$Vt, [$Rn], $Xm", - "$Rn = $wback", - (outs GPR64sp:$wback, listtype:$Vt), - (ins GPR64sp:$Rn, GPR64pi:$Xm), []> { - bits<5> Xm; - let Inst{30} = Q; - let Inst{23} = 1; - let 
Inst{20-16} = Xm; - let Inst{12} = S; - let Inst{11-10} = size; -} - -multiclass SIMDLdrAliases { - // E.g. "ld1r { v0.8b }, [x1], #1" - // "ld1r.8b\t$Vt, [$Rn], #1" - // may get mapped to - // (LD1Rv8b_POST VecListOne8b:$Vt, GPR64sp:$Rn, XZR) - def : InstAlias(NAME # "v" # layout # "_POST") - GPR64sp:$Rn, - !cast("VecList" # Count # layout):$Vt, - XZR), 1>; - - // E.g. "ld1r.8b { v0 }, [x1], #1" - // "ld1r.8b\t$Vt, [$Rn], #1" - // may get mapped to - // (LD1Rv8b_POST VecListOne64:$Vt, GPR64sp:$Rn, XZR) - def : InstAlias(NAME # "v" # layout # "_POST") - GPR64sp:$Rn, - !cast("VecList" # Count # Size):$Vt, - XZR), 0>; - - // E.g. "ld1r.8b { v0 }, [x1]" - // "ld1r.8b\t$Vt, [$Rn]" - // may get mapped to - // (LD1Rv8b VecListOne64:$Vt, GPR64sp:$Rn) - def : InstAlias(NAME # "v" # layout) - !cast("VecList" # Count # Size):$Vt, - GPR64sp:$Rn), 0>; - - // E.g. "ld1r.8b { v0 }, [x1], x2" - // "ld1r.8b\t$Vt, [$Rn], $Xm" - // may get mapped to - // (LD1Rv8b_POST VecListOne64:$Vt, GPR64sp:$Rn, GPR64pi1:$Xm) - def : InstAlias(NAME # "v" # layout # "_POST") - GPR64sp:$Rn, - !cast("VecList" # Count # Size):$Vt, - !cast("GPR64pi" # Offset):$Xm), 0>; -} - -multiclass SIMDLdR opcode, bit S, string asm, string Count, - int Offset1, int Offset2, int Offset4, int Offset8> { - def v8b : BaseSIMDLdR<0, R, opcode, S, 0b00, asm, - !cast("VecList" # Count # "8b")>; - def v16b: BaseSIMDLdR<1, R, opcode, S, 0b00, asm, - !cast("VecList" # Count #"16b")>; - def v4h : BaseSIMDLdR<0, R, opcode, S, 0b01, asm, - !cast("VecList" # Count #"4h")>; - def v8h : BaseSIMDLdR<1, R, opcode, S, 0b01, asm, - !cast("VecList" # Count #"8h")>; - def v2s : BaseSIMDLdR<0, R, opcode, S, 0b10, asm, - !cast("VecList" # Count #"2s")>; - def v4s : BaseSIMDLdR<1, R, opcode, S, 0b10, asm, - !cast("VecList" # Count #"4s")>; - def v1d : BaseSIMDLdR<0, R, opcode, S, 0b11, asm, - !cast("VecList" # Count #"1d")>; - def v2d : BaseSIMDLdR<1, R, opcode, S, 0b11, asm, - !cast("VecList" # Count #"2d")>; - - def v8b_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b00, asm, - !cast("VecList" # Count # "8b"), - !cast("GPR64pi" # Offset1)>; - def v16b_POST: BaseSIMDLdRPost<1, R, opcode, S, 0b00, asm, - !cast("VecList" # Count # "16b"), - !cast("GPR64pi" # Offset1)>; - def v4h_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b01, asm, - !cast("VecList" # Count # "4h"), - !cast("GPR64pi" # Offset2)>; - def v8h_POST : BaseSIMDLdRPost<1, R, opcode, S, 0b01, asm, - !cast("VecList" # Count # "8h"), - !cast("GPR64pi" # Offset2)>; - def v2s_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b10, asm, - !cast("VecList" # Count # "2s"), - !cast("GPR64pi" # Offset4)>; - def v4s_POST : BaseSIMDLdRPost<1, R, opcode, S, 0b10, asm, - !cast("VecList" # Count # "4s"), - !cast("GPR64pi" # Offset4)>; - def v1d_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b11, asm, - !cast("VecList" # Count # "1d"), - !cast("GPR64pi" # Offset8)>; - def v2d_POST : BaseSIMDLdRPost<1, R, opcode, S, 0b11, asm, - !cast("VecList" # Count # "2d"), - !cast("GPR64pi" # Offset8)>; - - defm : SIMDLdrAliases; - defm : SIMDLdrAliases; - defm : SIMDLdrAliases; - defm : SIMDLdrAliases; - defm : SIMDLdrAliases; - defm : SIMDLdrAliases; - defm : SIMDLdrAliases; - defm : SIMDLdrAliases; -} - -class SIMDLdStSingleB opcode, string asm, - dag oops, dag iops, list pattern> - : BaseSIMDLdStSingle { - // idx encoded in Q:S:size fields. 
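// For the byte variants below, the 4-bit lane index maps directly onto those
// fields: idx{3} -> Inst{30} (Q), idx{2} -> Inst{12} (S),
// idx{1-0} -> Inst{11-10} (size).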
- bits<4> idx; - let Inst{30} = idx{3}; - let Inst{23} = 0; - let Inst{20-16} = 0b00000; - let Inst{12} = idx{2}; - let Inst{11-10} = idx{1-0}; -} -class SIMDLdStSingleBTied opcode, string asm, - dag oops, dag iops, list pattern> - : BaseSIMDLdStSingleTied { - // idx encoded in Q:S:size fields. - bits<4> idx; - let Inst{30} = idx{3}; - let Inst{23} = 0; - let Inst{20-16} = 0b00000; - let Inst{12} = idx{2}; - let Inst{11-10} = idx{1-0}; -} -class SIMDLdStSingleBPost opcode, string asm, - dag oops, dag iops> - : BaseSIMDLdStSingle { - // idx encoded in Q:S:size fields. - bits<4> idx; - bits<5> Xm; - let Inst{30} = idx{3}; - let Inst{23} = 1; - let Inst{20-16} = Xm; - let Inst{12} = idx{2}; - let Inst{11-10} = idx{1-0}; -} -class SIMDLdStSingleBTiedPost opcode, string asm, - dag oops, dag iops> - : BaseSIMDLdStSingleTied { - // idx encoded in Q:S:size fields. - bits<4> idx; - bits<5> Xm; - let Inst{30} = idx{3}; - let Inst{23} = 1; - let Inst{20-16} = Xm; - let Inst{12} = idx{2}; - let Inst{11-10} = idx{1-0}; -} - -class SIMDLdStSingleH opcode, bit size, string asm, - dag oops, dag iops, list pattern> - : BaseSIMDLdStSingle { - // idx encoded in Q:S:size<1> fields. - bits<3> idx; - let Inst{30} = idx{2}; - let Inst{23} = 0; - let Inst{20-16} = 0b00000; - let Inst{12} = idx{1}; - let Inst{11} = idx{0}; - let Inst{10} = size; -} -class SIMDLdStSingleHTied opcode, bit size, string asm, - dag oops, dag iops, list pattern> - : BaseSIMDLdStSingleTied { - // idx encoded in Q:S:size<1> fields. - bits<3> idx; - let Inst{30} = idx{2}; - let Inst{23} = 0; - let Inst{20-16} = 0b00000; - let Inst{12} = idx{1}; - let Inst{11} = idx{0}; - let Inst{10} = size; -} - -class SIMDLdStSingleHPost opcode, bit size, string asm, - dag oops, dag iops> - : BaseSIMDLdStSingle { - // idx encoded in Q:S:size<1> fields. - bits<3> idx; - bits<5> Xm; - let Inst{30} = idx{2}; - let Inst{23} = 1; - let Inst{20-16} = Xm; - let Inst{12} = idx{1}; - let Inst{11} = idx{0}; - let Inst{10} = size; -} -class SIMDLdStSingleHTiedPost opcode, bit size, string asm, - dag oops, dag iops> - : BaseSIMDLdStSingleTied { - // idx encoded in Q:S:size<1> fields. - bits<3> idx; - bits<5> Xm; - let Inst{30} = idx{2}; - let Inst{23} = 1; - let Inst{20-16} = Xm; - let Inst{12} = idx{1}; - let Inst{11} = idx{0}; - let Inst{10} = size; -} -class SIMDLdStSingleS opcode, bits<2> size, string asm, - dag oops, dag iops, list pattern> - : BaseSIMDLdStSingle { - // idx encoded in Q:S fields. - bits<2> idx; - let Inst{30} = idx{1}; - let Inst{23} = 0; - let Inst{20-16} = 0b00000; - let Inst{12} = idx{0}; - let Inst{11-10} = size; -} -class SIMDLdStSingleSTied opcode, bits<2> size, string asm, - dag oops, dag iops, list pattern> - : BaseSIMDLdStSingleTied { - // idx encoded in Q:S fields. - bits<2> idx; - let Inst{30} = idx{1}; - let Inst{23} = 0; - let Inst{20-16} = 0b00000; - let Inst{12} = idx{0}; - let Inst{11-10} = size; -} -class SIMDLdStSingleSPost opcode, bits<2> size, - string asm, dag oops, dag iops> - : BaseSIMDLdStSingle { - // idx encoded in Q:S fields. - bits<2> idx; - bits<5> Xm; - let Inst{30} = idx{1}; - let Inst{23} = 1; - let Inst{20-16} = Xm; - let Inst{12} = idx{0}; - let Inst{11-10} = size; -} -class SIMDLdStSingleSTiedPost opcode, bits<2> size, - string asm, dag oops, dag iops> - : BaseSIMDLdStSingleTied { - // idx encoded in Q:S fields. 
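// For the 32-bit lane variants, only Q and S are needed for the index:
// idx{1} -> Inst{30} (Q), idx{0} -> Inst{12} (S), while Inst{11-10} hold the
// size field.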
- bits<2> idx; - bits<5> Xm; - let Inst{30} = idx{1}; - let Inst{23} = 1; - let Inst{20-16} = Xm; - let Inst{12} = idx{0}; - let Inst{11-10} = size; -} -class SIMDLdStSingleD opcode, bits<2> size, string asm, - dag oops, dag iops, list pattern> - : BaseSIMDLdStSingle { - // idx encoded in Q field. - bits<1> idx; - let Inst{30} = idx; - let Inst{23} = 0; - let Inst{20-16} = 0b00000; - let Inst{12} = 0; - let Inst{11-10} = size; -} -class SIMDLdStSingleDTied opcode, bits<2> size, string asm, - dag oops, dag iops, list pattern> - : BaseSIMDLdStSingleTied { - // idx encoded in Q field. - bits<1> idx; - let Inst{30} = idx; - let Inst{23} = 0; - let Inst{20-16} = 0b00000; - let Inst{12} = 0; - let Inst{11-10} = size; -} -class SIMDLdStSingleDPost opcode, bits<2> size, - string asm, dag oops, dag iops> - : BaseSIMDLdStSingle { - // idx encoded in Q field. - bits<1> idx; - bits<5> Xm; - let Inst{30} = idx; - let Inst{23} = 1; - let Inst{20-16} = Xm; - let Inst{12} = 0; - let Inst{11-10} = size; -} -class SIMDLdStSingleDTiedPost opcode, bits<2> size, - string asm, dag oops, dag iops> - : BaseSIMDLdStSingleTied { - // idx encoded in Q field. - bits<1> idx; - bits<5> Xm; - let Inst{30} = idx; - let Inst{23} = 1; - let Inst{20-16} = Xm; - let Inst{12} = 0; - let Inst{11-10} = size; -} - -let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in -multiclass SIMDLdSingleBTied opcode, string asm, - RegisterOperand listtype, - RegisterOperand GPR64pi> { - def i8 : SIMDLdStSingleBTied<1, R, opcode, asm, - (outs listtype:$dst), - (ins listtype:$Vt, VectorIndexB:$idx, - GPR64sp:$Rn), []>; - - def i8_POST : SIMDLdStSingleBTiedPost<1, R, opcode, asm, - (outs GPR64sp:$wback, listtype:$dst), - (ins listtype:$Vt, VectorIndexB:$idx, - GPR64sp:$Rn, GPR64pi:$Xm)>; -} -let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in -multiclass SIMDLdSingleHTied opcode, bit size, string asm, - RegisterOperand listtype, - RegisterOperand GPR64pi> { - def i16 : SIMDLdStSingleHTied<1, R, opcode, size, asm, - (outs listtype:$dst), - (ins listtype:$Vt, VectorIndexH:$idx, - GPR64sp:$Rn), []>; - - def i16_POST : SIMDLdStSingleHTiedPost<1, R, opcode, size, asm, - (outs GPR64sp:$wback, listtype:$dst), - (ins listtype:$Vt, VectorIndexH:$idx, - GPR64sp:$Rn, GPR64pi:$Xm)>; -} -let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in -multiclass SIMDLdSingleSTied opcode, bits<2> size,string asm, - RegisterOperand listtype, - RegisterOperand GPR64pi> { - def i32 : SIMDLdStSingleSTied<1, R, opcode, size, asm, - (outs listtype:$dst), - (ins listtype:$Vt, VectorIndexS:$idx, - GPR64sp:$Rn), []>; - - def i32_POST : SIMDLdStSingleSTiedPost<1, R, opcode, size, asm, - (outs GPR64sp:$wback, listtype:$dst), - (ins listtype:$Vt, VectorIndexS:$idx, - GPR64sp:$Rn, GPR64pi:$Xm)>; -} -let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in -multiclass SIMDLdSingleDTied opcode, bits<2> size, string asm, - RegisterOperand listtype, RegisterOperand GPR64pi> { - def i64 : SIMDLdStSingleDTied<1, R, opcode, size, asm, - (outs listtype:$dst), - (ins listtype:$Vt, VectorIndexD:$idx, - GPR64sp:$Rn), []>; - - def i64_POST : SIMDLdStSingleDTiedPost<1, R, opcode, size, asm, - (outs GPR64sp:$wback, listtype:$dst), - (ins listtype:$Vt, VectorIndexD:$idx, - GPR64sp:$Rn, GPR64pi:$Xm)>; -} -let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in -multiclass SIMDStSingleB opcode, string asm, - RegisterOperand listtype, RegisterOperand GPR64pi> { - def i8 : SIMDLdStSingleB<0, R, opcode, asm, - (outs), (ins listtype:$Vt, VectorIndexB:$idx, - GPR64sp:$Rn), []>; - - def i8_POST : 
SIMDLdStSingleBPost<0, R, opcode, asm, - (outs GPR64sp:$wback), - (ins listtype:$Vt, VectorIndexB:$idx, - GPR64sp:$Rn, GPR64pi:$Xm)>; -} -let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in -multiclass SIMDStSingleH opcode, bit size, string asm, - RegisterOperand listtype, RegisterOperand GPR64pi> { - def i16 : SIMDLdStSingleH<0, R, opcode, size, asm, - (outs), (ins listtype:$Vt, VectorIndexH:$idx, - GPR64sp:$Rn), []>; - - def i16_POST : SIMDLdStSingleHPost<0, R, opcode, size, asm, - (outs GPR64sp:$wback), - (ins listtype:$Vt, VectorIndexH:$idx, - GPR64sp:$Rn, GPR64pi:$Xm)>; -} -let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in -multiclass SIMDStSingleS opcode, bits<2> size,string asm, - RegisterOperand listtype, RegisterOperand GPR64pi> { - def i32 : SIMDLdStSingleS<0, R, opcode, size, asm, - (outs), (ins listtype:$Vt, VectorIndexS:$idx, - GPR64sp:$Rn), []>; - - def i32_POST : SIMDLdStSingleSPost<0, R, opcode, size, asm, - (outs GPR64sp:$wback), - (ins listtype:$Vt, VectorIndexS:$idx, - GPR64sp:$Rn, GPR64pi:$Xm)>; -} -let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in -multiclass SIMDStSingleD opcode, bits<2> size, string asm, - RegisterOperand listtype, RegisterOperand GPR64pi> { - def i64 : SIMDLdStSingleD<0, R, opcode, size, asm, - (outs), (ins listtype:$Vt, VectorIndexD:$idx, - GPR64sp:$Rn), []>; - - def i64_POST : SIMDLdStSingleDPost<0, R, opcode, size, asm, - (outs GPR64sp:$wback), - (ins listtype:$Vt, VectorIndexD:$idx, - GPR64sp:$Rn, GPR64pi:$Xm)>; -} - -multiclass SIMDLdStSingleAliases { - // E.g. "ld1 { v0.8b }[0], [x1], #1" - // "ld1\t$Vt, [$Rn], #1" - // may get mapped to - // (LD1Rv8b_POST VecListOne8b:$Vt, GPR64sp:$Rn, XZR) - def : InstAlias(NAME # Type # "_POST") - GPR64sp:$Rn, - !cast("VecList" # Count # layout):$Vt, - idxtype:$idx, XZR), 1>; - - // E.g. "ld1.8b { v0 }[0], [x1], #1" - // "ld1.8b\t$Vt, [$Rn], #1" - // may get mapped to - // (LD1Rv8b_POST VecListOne64:$Vt, GPR64sp:$Rn, XZR) - def : InstAlias(NAME # Type # "_POST") - GPR64sp:$Rn, - !cast("VecList" # Count # "128"):$Vt, - idxtype:$idx, XZR), 0>; - - // E.g. "ld1.8b { v0 }[0], [x1]" - // "ld1.8b\t$Vt, [$Rn]" - // may get mapped to - // (LD1Rv8b VecListOne64:$Vt, GPR64sp:$Rn) - def : InstAlias(NAME # Type) - !cast("VecList" # Count # "128"):$Vt, - idxtype:$idx, GPR64sp:$Rn), 0>; - - // E.g. 
"ld1.8b { v0 }[0], [x1], x2" - // "ld1.8b\t$Vt, [$Rn], $Xm" - // may get mapped to - // (LD1Rv8b_POST VecListOne64:$Vt, GPR64sp:$Rn, GPR64pi1:$Xm) - def : InstAlias(NAME # Type # "_POST") - GPR64sp:$Rn, - !cast("VecList" # Count # "128"):$Vt, - idxtype:$idx, - !cast("GPR64pi" # Offset):$Xm), 0>; -} - -multiclass SIMDLdSt1SingleAliases { - defm : SIMDLdStSingleAliases; - defm : SIMDLdStSingleAliases; - defm : SIMDLdStSingleAliases; - defm : SIMDLdStSingleAliases; -} - -multiclass SIMDLdSt2SingleAliases { - defm : SIMDLdStSingleAliases; - defm : SIMDLdStSingleAliases; - defm : SIMDLdStSingleAliases; - defm : SIMDLdStSingleAliases; -} - -multiclass SIMDLdSt3SingleAliases { - defm : SIMDLdStSingleAliases; - defm : SIMDLdStSingleAliases; - defm : SIMDLdStSingleAliases; - defm : SIMDLdStSingleAliases; -} - -multiclass SIMDLdSt4SingleAliases { - defm : SIMDLdStSingleAliases; - defm : SIMDLdStSingleAliases; - defm : SIMDLdStSingleAliases; - defm : SIMDLdStSingleAliases; -} -} // end of 'let Predicates = [HasNEON]' - -//---------------------------------------------------------------------------- -// Crypto extensions -//---------------------------------------------------------------------------- - -let Predicates = [HasCrypto] in { -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class AESBase opc, string asm, dag outs, dag ins, string cstr, - list pat> - : I, - Sched<[WriteV]>{ - bits<5> Rd; - bits<5> Rn; - let Inst{31-16} = 0b0100111000101000; - let Inst{15-12} = opc; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -class AESInst opc, string asm, Intrinsic OpNode> - : AESBase; - -class AESTiedInst opc, string asm, Intrinsic OpNode> - : AESBase; - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class SHA3OpTiedInst opc, string asm, string dst_lhs_kind, - dag oops, dag iops, list pat> - : I, - Sched<[WriteV]>{ - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - let Inst{31-21} = 0b01011110000; - let Inst{20-16} = Rm; - let Inst{15} = 0; - let Inst{14-12} = opc; - let Inst{11-10} = 0b00; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -class SHATiedInstQSV opc, string asm, Intrinsic OpNode> - : SHA3OpTiedInst; - -class SHATiedInstVVV opc, string asm, Intrinsic OpNode> - : SHA3OpTiedInst; - -class SHATiedInstQQV opc, string asm, Intrinsic OpNode> - : SHA3OpTiedInst; - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class SHA2OpInst opc, string asm, string kind, - string cstr, dag oops, dag iops, - list pat> - : I, - Sched<[WriteV]>{ - bits<5> Rd; - bits<5> Rn; - let Inst{31-16} = 0b0101111000101000; - let Inst{15-12} = opc; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -class SHATiedInstVV opc, string asm, Intrinsic OpNode> - : SHA2OpInst; - -class SHAInstSS opc, string asm, Intrinsic OpNode> - : SHA2OpInst; -} // end of 'let Predicates = [HasCrypto]' - -// Allow the size specifier tokens to be upper case, not just lower. 
-def : TokenAlias<".8B", ".8b">; -def : TokenAlias<".4H", ".4h">; -def : TokenAlias<".2S", ".2s">; -def : TokenAlias<".1D", ".1d">; -def : TokenAlias<".16B", ".16b">; -def : TokenAlias<".8H", ".8h">; -def : TokenAlias<".4S", ".4s">; -def : TokenAlias<".2D", ".2d">; -def : TokenAlias<".1Q", ".1q">; -def : TokenAlias<".B", ".b">; -def : TokenAlias<".H", ".h">; -def : TokenAlias<".S", ".s">; -def : TokenAlias<".D", ".d">; -def : TokenAlias<".Q", ".q">; diff --git a/lib/Target/ARM64/ARM64InstrInfo.cpp b/lib/Target/ARM64/ARM64InstrInfo.cpp deleted file mode 100644 index fbbddd56660..00000000000 --- a/lib/Target/ARM64/ARM64InstrInfo.cpp +++ /dev/null @@ -1,2059 +0,0 @@ -//===- ARM64InstrInfo.cpp - ARM64 Instruction Information -----------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains the ARM64 implementation of the TargetInstrInfo class. -// -//===----------------------------------------------------------------------===// - -#include "ARM64InstrInfo.h" -#include "ARM64Subtarget.h" -#include "MCTargetDesc/ARM64AddressingModes.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineMemOperand.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/PseudoSourceValue.h" -#include "llvm/MC/MCInst.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/TargetRegistry.h" - -using namespace llvm; - -#define GET_INSTRINFO_CTOR_DTOR -#include "ARM64GenInstrInfo.inc" - -ARM64InstrInfo::ARM64InstrInfo(const ARM64Subtarget &STI) - : ARM64GenInstrInfo(ARM64::ADJCALLSTACKDOWN, ARM64::ADJCALLSTACKUP), - RI(this, &STI), Subtarget(STI) {} - -/// GetInstSize - Return the number of bytes of code the specified -/// instruction may be. This returns the maximum number of bytes. -unsigned ARM64InstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const { - const MCInstrDesc &Desc = MI->getDesc(); - - switch (Desc.getOpcode()) { - default: - // Anything not explicitly designated otherwise is a nomal 4-byte insn. - return 4; - case TargetOpcode::DBG_VALUE: - case TargetOpcode::EH_LABEL: - case TargetOpcode::IMPLICIT_DEF: - case TargetOpcode::KILL: - return 0; - } - - llvm_unreachable("GetInstSizeInBytes()- Unable to determin insn size"); -} - -static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target, - SmallVectorImpl &Cond) { - // Block ends with fall-through condbranch. - switch (LastInst->getOpcode()) { - default: - llvm_unreachable("Unknown branch instruction?"); - case ARM64::Bcc: - Target = LastInst->getOperand(1).getMBB(); - Cond.push_back(LastInst->getOperand(0)); - break; - case ARM64::CBZW: - case ARM64::CBZX: - case ARM64::CBNZW: - case ARM64::CBNZX: - Target = LastInst->getOperand(1).getMBB(); - Cond.push_back(MachineOperand::CreateImm(-1)); - Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode())); - Cond.push_back(LastInst->getOperand(0)); - break; - case ARM64::TBZW: - case ARM64::TBZX: - case ARM64::TBNZW: - case ARM64::TBNZX: - Target = LastInst->getOperand(2).getMBB(); - Cond.push_back(MachineOperand::CreateImm(-1)); - Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode())); - Cond.push_back(LastInst->getOperand(0)); - Cond.push_back(LastInst->getOperand(1)); - } -} - -// Branch analysis. 
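For readers following the branch-analysis hooks in the deleted code below, the Cond vector written by parseCondBranch above uses a small ad-hoc layout that AnalyzeBranch, ReverseBranchCondition and insertSelect all share. The following is a standalone C++ sketch of just that convention; the enum and function names are invented for illustration and are not LLVM API.

    #include <cassert>
    #include <cstdint>
    #include <vector>

    // Mirror of the layout parseCondBranch stores into Cond:
    //   b.cc                    -> { cc }
    //   cbz/cbnz  Rt, label     -> { -1, opcode, Rt }
    //   tbz/tbnz  Rt, #b, label -> { -1, opcode, Rt, b }
    // A leading -1 marks the folded compare-and-branch forms; a plain Bcc
    // keeps only its condition code.
    enum class CondKind { Bcc, CompareAndBranch, TestAndBranch };

    static CondKind classifyCond(const std::vector<int64_t> &Cond) {
      if (Cond.size() == 1)
        return CondKind::Bcc;                 // Cond[0] is the condition code
      return Cond.size() == 4 ? CondKind::TestAndBranch     // reg + bit number
                              : CondKind::CompareAndBranch; // reg only
    }

    int main() {
      assert(classifyCond({/*cc=*/0}) == CondKind::Bcc);
      assert(classifyCond({-1, /*opc*/1, /*reg*/2}) == CondKind::CompareAndBranch);
      assert(classifyCond({-1, 1, 2, /*bit*/3}) == CondKind::TestAndBranch);
      return 0;
    }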
-bool ARM64InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, - MachineBasicBlock *&TBB, - MachineBasicBlock *&FBB, - SmallVectorImpl &Cond, - bool AllowModify) const { - // If the block has no terminators, it just falls into the block after it. - MachineBasicBlock::iterator I = MBB.end(); - if (I == MBB.begin()) - return false; - --I; - while (I->isDebugValue()) { - if (I == MBB.begin()) - return false; - --I; - } - if (!isUnpredicatedTerminator(I)) - return false; - - // Get the last instruction in the block. - MachineInstr *LastInst = I; - - // If there is only one terminator instruction, process it. - unsigned LastOpc = LastInst->getOpcode(); - if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) { - if (isUncondBranchOpcode(LastOpc)) { - TBB = LastInst->getOperand(0).getMBB(); - return false; - } - if (isCondBranchOpcode(LastOpc)) { - // Block ends with fall-through condbranch. - parseCondBranch(LastInst, TBB, Cond); - return false; - } - return true; // Can't handle indirect branch. - } - - // Get the instruction before it if it is a terminator. - MachineInstr *SecondLastInst = I; - unsigned SecondLastOpc = SecondLastInst->getOpcode(); - - // If AllowModify is true and the block ends with two or more unconditional - // branches, delete all but the first unconditional branch. - if (AllowModify && isUncondBranchOpcode(LastOpc)) { - while (isUncondBranchOpcode(SecondLastOpc)) { - LastInst->eraseFromParent(); - LastInst = SecondLastInst; - LastOpc = LastInst->getOpcode(); - if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) { - // Return now the only terminator is an unconditional branch. - TBB = LastInst->getOperand(0).getMBB(); - return false; - } else { - SecondLastInst = I; - SecondLastOpc = SecondLastInst->getOpcode(); - } - } - } - - // If there are three terminators, we don't know what sort of block this is. - if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(--I)) - return true; - - // If the block ends with a B and a Bcc, handle it. - if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) { - parseCondBranch(SecondLastInst, TBB, Cond); - FBB = LastInst->getOperand(0).getMBB(); - return false; - } - - // If the block ends with two unconditional branches, handle it. The second - // one is not executed, so remove it. - if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) { - TBB = SecondLastInst->getOperand(0).getMBB(); - I = LastInst; - if (AllowModify) - I->eraseFromParent(); - return false; - } - - // ...likewise if it ends with an indirect branch followed by an unconditional - // branch. - if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) { - I = LastInst; - if (AllowModify) - I->eraseFromParent(); - return true; - } - - // Otherwise, can't handle this. 
- return true; -} - -bool ARM64InstrInfo::ReverseBranchCondition( - SmallVectorImpl &Cond) const { - if (Cond[0].getImm() != -1) { - // Regular Bcc - ARM64CC::CondCode CC = (ARM64CC::CondCode)(int)Cond[0].getImm(); - Cond[0].setImm(ARM64CC::getInvertedCondCode(CC)); - } else { - // Folded compare-and-branch - switch (Cond[1].getImm()) { - default: - llvm_unreachable("Unknown conditional branch!"); - case ARM64::CBZW: - Cond[1].setImm(ARM64::CBNZW); - break; - case ARM64::CBNZW: - Cond[1].setImm(ARM64::CBZW); - break; - case ARM64::CBZX: - Cond[1].setImm(ARM64::CBNZX); - break; - case ARM64::CBNZX: - Cond[1].setImm(ARM64::CBZX); - break; - case ARM64::TBZW: - Cond[1].setImm(ARM64::TBNZW); - break; - case ARM64::TBNZW: - Cond[1].setImm(ARM64::TBZW); - break; - case ARM64::TBZX: - Cond[1].setImm(ARM64::TBNZX); - break; - case ARM64::TBNZX: - Cond[1].setImm(ARM64::TBZX); - break; - } - } - - return false; -} - -unsigned ARM64InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { - MachineBasicBlock::iterator I = MBB.end(); - if (I == MBB.begin()) - return 0; - --I; - while (I->isDebugValue()) { - if (I == MBB.begin()) - return 0; - --I; - } - if (!isUncondBranchOpcode(I->getOpcode()) && - !isCondBranchOpcode(I->getOpcode())) - return 0; - - // Remove the branch. - I->eraseFromParent(); - - I = MBB.end(); - - if (I == MBB.begin()) - return 1; - --I; - if (!isCondBranchOpcode(I->getOpcode())) - return 1; - - // Remove the branch. - I->eraseFromParent(); - return 2; -} - -void ARM64InstrInfo::instantiateCondBranch( - MachineBasicBlock &MBB, DebugLoc DL, MachineBasicBlock *TBB, - const SmallVectorImpl &Cond) const { - if (Cond[0].getImm() != -1) { - // Regular Bcc - BuildMI(&MBB, DL, get(ARM64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB); - } else { - // Folded compare-and-branch - const MachineInstrBuilder MIB = - BuildMI(&MBB, DL, get(Cond[1].getImm())).addReg(Cond[2].getReg()); - if (Cond.size() > 3) - MIB.addImm(Cond[3].getImm()); - MIB.addMBB(TBB); - } -} - -unsigned ARM64InstrInfo::InsertBranch( - MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, - const SmallVectorImpl &Cond, DebugLoc DL) const { - // Shouldn't be a fall through. - assert(TBB && "InsertBranch must not be told to insert a fallthrough"); - - if (!FBB) { - if (Cond.empty()) // Unconditional branch? - BuildMI(&MBB, DL, get(ARM64::B)).addMBB(TBB); - else - instantiateCondBranch(MBB, DL, TBB, Cond); - return 1; - } - - // Two-way conditional branch. - instantiateCondBranch(MBB, DL, TBB, Cond); - BuildMI(&MBB, DL, get(ARM64::B)).addMBB(FBB); - return 2; -} - -// Find the original register that VReg is copied from. -static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) { - while (TargetRegisterInfo::isVirtualRegister(VReg)) { - const MachineInstr *DefMI = MRI.getVRegDef(VReg); - if (!DefMI->isFullCopy()) - return VReg; - VReg = DefMI->getOperand(1).getReg(); - } - return VReg; -} - -// Determine if VReg is defined by an instruction that can be folded into a -// csel instruction. If so, return the folded opcode, and the replacement -// register. 
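The folds this helper performs rest on three scalar identities: x + 1 can ride on a csinc, ~x (written as orn dst, xzr, src) on a csinv, and -x (sub dst, xzr, src) on a csneg. A minimal, self-contained C++ check of the same arithmetic, with no LLVM types involved:

    #include <cassert>
    #include <cstdint>

    // csinc picks b+1 when the condition is false, csinv picks ~b, csneg
    // picks -b; canFoldIntoCSel below recognises the defining instruction of
    // one select input as one of these forms so the extra operation is free.
    int64_t csincLike(bool cond, int64_t a, int64_t b) { return cond ? a : b + 1; }
    int64_t csinvLike(bool cond, int64_t a, int64_t b) { return cond ? a : ~b; }
    int64_t csnegLike(bool cond, int64_t a, int64_t b) { return cond ? a : -b; }

    int main() {
      assert(csincLike(false, 0, 41) == 42); // "add x, 1" folded into the select
      assert(csinvLike(false, 0, 0) == -1);  // "orn dst, xzr, src" folded
      assert(csnegLike(false, 0, 7) == -7);  // "sub dst, xzr, src" folded
      return 0;
    }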
-static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg, - unsigned *NewVReg = nullptr) { - VReg = removeCopies(MRI, VReg); - if (!TargetRegisterInfo::isVirtualRegister(VReg)) - return 0; - - bool Is64Bit = ARM64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg)); - const MachineInstr *DefMI = MRI.getVRegDef(VReg); - unsigned Opc = 0; - unsigned SrcOpNum = 0; - switch (DefMI->getOpcode()) { - case ARM64::ADDSXri: - case ARM64::ADDSWri: - // if NZCV is used, do not fold. - if (DefMI->findRegisterDefOperandIdx(ARM64::NZCV, true) == -1) - return 0; - // fall-through to ADDXri and ADDWri. - case ARM64::ADDXri: - case ARM64::ADDWri: - // add x, 1 -> csinc. - if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 || - DefMI->getOperand(3).getImm() != 0) - return 0; - SrcOpNum = 1; - Opc = Is64Bit ? ARM64::CSINCXr : ARM64::CSINCWr; - break; - - case ARM64::ORNXrr: - case ARM64::ORNWrr: { - // not x -> csinv, represented as orn dst, xzr, src. - unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg()); - if (ZReg != ARM64::XZR && ZReg != ARM64::WZR) - return 0; - SrcOpNum = 2; - Opc = Is64Bit ? ARM64::CSINVXr : ARM64::CSINVWr; - break; - } - - case ARM64::SUBSXrr: - case ARM64::SUBSWrr: - // if NZCV is used, do not fold. - if (DefMI->findRegisterDefOperandIdx(ARM64::NZCV, true) == -1) - return 0; - // fall-through to SUBXrr and SUBWrr. - case ARM64::SUBXrr: - case ARM64::SUBWrr: { - // neg x -> csneg, represented as sub dst, xzr, src. - unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg()); - if (ZReg != ARM64::XZR && ZReg != ARM64::WZR) - return 0; - SrcOpNum = 2; - Opc = Is64Bit ? ARM64::CSNEGXr : ARM64::CSNEGWr; - break; - } - default: - return 0; - } - assert(Opc && SrcOpNum && "Missing parameters"); - - if (NewVReg) - *NewVReg = DefMI->getOperand(SrcOpNum).getReg(); - return Opc; -} - -bool ARM64InstrInfo::canInsertSelect( - const MachineBasicBlock &MBB, const SmallVectorImpl &Cond, - unsigned TrueReg, unsigned FalseReg, int &CondCycles, int &TrueCycles, - int &FalseCycles) const { - // Check register classes. - const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); - const TargetRegisterClass *RC = - RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg)); - if (!RC) - return false; - - // Expanding cbz/tbz requires an extra cycle of latency on the condition. - unsigned ExtraCondLat = Cond.size() != 1; - - // GPRs are handled by csel. - // FIXME: Fold in x+1, -x, and ~x when applicable. - if (ARM64::GPR64allRegClass.hasSubClassEq(RC) || - ARM64::GPR32allRegClass.hasSubClassEq(RC)) { - // Single-cycle csel, csinc, csinv, and csneg. - CondCycles = 1 + ExtraCondLat; - TrueCycles = FalseCycles = 1; - if (canFoldIntoCSel(MRI, TrueReg)) - TrueCycles = 0; - else if (canFoldIntoCSel(MRI, FalseReg)) - FalseCycles = 0; - return true; - } - - // Scalar floating point is handled by fcsel. - // FIXME: Form fabs, fmin, and fmax when applicable. - if (ARM64::FPR64RegClass.hasSubClassEq(RC) || - ARM64::FPR32RegClass.hasSubClassEq(RC)) { - CondCycles = 5 + ExtraCondLat; - TrueCycles = FalseCycles = 2; - return true; - } - - // Can't do vectors. - return false; -} - -void ARM64InstrInfo::insertSelect(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, DebugLoc DL, - unsigned DstReg, - const SmallVectorImpl &Cond, - unsigned TrueReg, unsigned FalseReg) const { - MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); - - // Parse the condition code, see parseCondBranch() above. 
- ARM64CC::CondCode CC; - switch (Cond.size()) { - default: - llvm_unreachable("Unknown condition opcode in Cond"); - case 1: // b.cc - CC = ARM64CC::CondCode(Cond[0].getImm()); - break; - case 3: { // cbz/cbnz - // We must insert a compare against 0. - bool Is64Bit; - switch (Cond[1].getImm()) { - default: - llvm_unreachable("Unknown branch opcode in Cond"); - case ARM64::CBZW: - Is64Bit = 0; - CC = ARM64CC::EQ; - break; - case ARM64::CBZX: - Is64Bit = 1; - CC = ARM64CC::EQ; - break; - case ARM64::CBNZW: - Is64Bit = 0; - CC = ARM64CC::NE; - break; - case ARM64::CBNZX: - Is64Bit = 1; - CC = ARM64CC::NE; - break; - } - unsigned SrcReg = Cond[2].getReg(); - if (Is64Bit) { - // cmp reg, #0 is actually subs xzr, reg, #0. - MRI.constrainRegClass(SrcReg, &ARM64::GPR64spRegClass); - BuildMI(MBB, I, DL, get(ARM64::SUBSXri), ARM64::XZR) - .addReg(SrcReg) - .addImm(0) - .addImm(0); - } else { - MRI.constrainRegClass(SrcReg, &ARM64::GPR32spRegClass); - BuildMI(MBB, I, DL, get(ARM64::SUBSWri), ARM64::WZR) - .addReg(SrcReg) - .addImm(0) - .addImm(0); - } - break; - } - case 4: { // tbz/tbnz - // We must insert a tst instruction. - switch (Cond[1].getImm()) { - default: - llvm_unreachable("Unknown branch opcode in Cond"); - case ARM64::TBZW: - case ARM64::TBZX: - CC = ARM64CC::EQ; - break; - case ARM64::TBNZW: - case ARM64::TBNZX: - CC = ARM64CC::NE; - break; - } - // cmp reg, #foo is actually ands xzr, reg, #1< 64 bit extension case, these instructions can do - // much more. - if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31) - return false; - // This is a signed or unsigned 32 -> 64 bit extension. - SrcReg = MI.getOperand(1).getReg(); - DstReg = MI.getOperand(0).getReg(); - SubIdx = ARM64::sub_32; - return true; - } -} - -/// analyzeCompare - For a comparison instruction, return the source registers -/// in SrcReg and SrcReg2, and the value it compares against in CmpValue. -/// Return true if the comparison instruction can be analyzed. -bool ARM64InstrInfo::analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, - unsigned &SrcReg2, int &CmpMask, - int &CmpValue) const { - switch (MI->getOpcode()) { - default: - break; - case ARM64::SUBSWrr: - case ARM64::SUBSWrs: - case ARM64::SUBSWrx: - case ARM64::SUBSXrr: - case ARM64::SUBSXrs: - case ARM64::SUBSXrx: - case ARM64::ADDSWrr: - case ARM64::ADDSWrs: - case ARM64::ADDSWrx: - case ARM64::ADDSXrr: - case ARM64::ADDSXrs: - case ARM64::ADDSXrx: - // Replace SUBSWrr with SUBWrr if NZCV is not used. - SrcReg = MI->getOperand(1).getReg(); - SrcReg2 = MI->getOperand(2).getReg(); - CmpMask = ~0; - CmpValue = 0; - return true; - case ARM64::SUBSWri: - case ARM64::ADDSWri: - case ARM64::SUBSXri: - case ARM64::ADDSXri: - SrcReg = MI->getOperand(1).getReg(); - SrcReg2 = 0; - CmpMask = ~0; - CmpValue = MI->getOperand(2).getImm(); - return true; - case ARM64::ANDSWri: - case ARM64::ANDSXri: - // ANDS does not use the same encoding scheme as the others xxxS - // instructions. - SrcReg = MI->getOperand(1).getReg(); - SrcReg2 = 0; - CmpMask = ~0; - CmpValue = ARM64_AM::decodeLogicalImmediate( - MI->getOperand(2).getImm(), - MI->getOpcode() == ARM64::ANDSWri ? 
32 : 64); - return true; - } - - return false; -} - -static bool UpdateOperandRegClass(MachineInstr *Instr) { - MachineBasicBlock *MBB = Instr->getParent(); - assert(MBB && "Can't get MachineBasicBlock here"); - MachineFunction *MF = MBB->getParent(); - assert(MF && "Can't get MachineFunction here"); - const TargetMachine *TM = &MF->getTarget(); - const TargetInstrInfo *TII = TM->getInstrInfo(); - const TargetRegisterInfo *TRI = TM->getRegisterInfo(); - MachineRegisterInfo *MRI = &MF->getRegInfo(); - - for (unsigned OpIdx = 0, EndIdx = Instr->getNumOperands(); OpIdx < EndIdx; - ++OpIdx) { - MachineOperand &MO = Instr->getOperand(OpIdx); - const TargetRegisterClass *OpRegCstraints = - Instr->getRegClassConstraint(OpIdx, TII, TRI); - - // If there's no constraint, there's nothing to do. - if (!OpRegCstraints) - continue; - // If the operand is a frame index, there's nothing to do here. - // A frame index operand will resolve correctly during PEI. - if (MO.isFI()) - continue; - - assert(MO.isReg() && - "Operand has register constraints without being a register!"); - - unsigned Reg = MO.getReg(); - if (TargetRegisterInfo::isPhysicalRegister(Reg)) { - if (!OpRegCstraints->contains(Reg)) - return false; - } else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) && - !MRI->constrainRegClass(Reg, OpRegCstraints)) - return false; - } - - return true; -} - -/// optimizeCompareInstr - Convert the instruction supplying the argument to the -/// comparison into one that sets the zero bit in the flags register. -bool ARM64InstrInfo::optimizeCompareInstr( - MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, int CmpMask, - int CmpValue, const MachineRegisterInfo *MRI) const { - - // Replace SUBSWrr with SUBWrr if NZCV is not used. - int Cmp_NZCV = CmpInstr->findRegisterDefOperandIdx(ARM64::NZCV, true); - if (Cmp_NZCV != -1) { - unsigned NewOpc; - switch (CmpInstr->getOpcode()) { - default: - return false; - case ARM64::ADDSWrr: NewOpc = ARM64::ADDWrr; break; - case ARM64::ADDSWri: NewOpc = ARM64::ADDWri; break; - case ARM64::ADDSWrs: NewOpc = ARM64::ADDWrs; break; - case ARM64::ADDSWrx: NewOpc = ARM64::ADDWrx; break; - case ARM64::ADDSXrr: NewOpc = ARM64::ADDXrr; break; - case ARM64::ADDSXri: NewOpc = ARM64::ADDXri; break; - case ARM64::ADDSXrs: NewOpc = ARM64::ADDXrs; break; - case ARM64::ADDSXrx: NewOpc = ARM64::ADDXrx; break; - case ARM64::SUBSWrr: NewOpc = ARM64::SUBWrr; break; - case ARM64::SUBSWri: NewOpc = ARM64::SUBWri; break; - case ARM64::SUBSWrs: NewOpc = ARM64::SUBWrs; break; - case ARM64::SUBSWrx: NewOpc = ARM64::SUBWrx; break; - case ARM64::SUBSXrr: NewOpc = ARM64::SUBXrr; break; - case ARM64::SUBSXri: NewOpc = ARM64::SUBXri; break; - case ARM64::SUBSXrs: NewOpc = ARM64::SUBXrs; break; - case ARM64::SUBSXrx: NewOpc = ARM64::SUBXrx; break; - } - - const MCInstrDesc &MCID = get(NewOpc); - CmpInstr->setDesc(MCID); - CmpInstr->RemoveOperand(Cmp_NZCV); - bool succeeded = UpdateOperandRegClass(CmpInstr); - (void)succeeded; - assert(succeeded && "Some operands reg class are incompatible!"); - return true; - } - - // Continue only if we have a "ri" where immediate is zero. - if (CmpValue != 0 || SrcReg2 != 0) - return false; - - // CmpInstr is a Compare instruction if destination register is not used. - if (!MRI->use_nodbg_empty(CmpInstr->getOperand(0).getReg())) - return false; - - // Get the unique definition of SrcReg. 
- MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg); - if (!MI) - return false; - - // We iterate backward, starting from the instruction before CmpInstr and - // stop when reaching the definition of the source register or done with the - // basic block, to check whether NZCV is used or modified in between. - MachineBasicBlock::iterator I = CmpInstr, E = MI, - B = CmpInstr->getParent()->begin(); - - // Early exit if CmpInstr is at the beginning of the BB. - if (I == B) - return false; - - // Check whether the definition of SrcReg is in the same basic block as - // Compare. If not, we can't optimize away the Compare. - if (MI->getParent() != CmpInstr->getParent()) - return false; - - // Check that NZCV isn't set between the comparison instruction and the one we - // want to change. - const TargetRegisterInfo *TRI = &getRegisterInfo(); - for (--I; I != E; --I) { - const MachineInstr &Instr = *I; - - if (Instr.modifiesRegister(ARM64::NZCV, TRI) || - Instr.readsRegister(ARM64::NZCV, TRI)) - // This instruction modifies or uses NZCV after the one we want to - // change. We can't do this transformation. - return false; - if (I == B) - // The 'and' is below the comparison instruction. - return false; - } - - unsigned NewOpc = MI->getOpcode(); - switch (MI->getOpcode()) { - default: - return false; - case ARM64::ADDSWrr: - case ARM64::ADDSWri: - case ARM64::ADDSXrr: - case ARM64::ADDSXri: - case ARM64::SUBSWrr: - case ARM64::SUBSWri: - case ARM64::SUBSXrr: - case ARM64::SUBSXri: - break; - case ARM64::ADDWrr: NewOpc = ARM64::ADDSWrr; break; - case ARM64::ADDWri: NewOpc = ARM64::ADDSWri; break; - case ARM64::ADDXrr: NewOpc = ARM64::ADDSXrr; break; - case ARM64::ADDXri: NewOpc = ARM64::ADDSXri; break; - case ARM64::ADCWr: NewOpc = ARM64::ADCSWr; break; - case ARM64::ADCXr: NewOpc = ARM64::ADCSXr; break; - case ARM64::SUBWrr: NewOpc = ARM64::SUBSWrr; break; - case ARM64::SUBWri: NewOpc = ARM64::SUBSWri; break; - case ARM64::SUBXrr: NewOpc = ARM64::SUBSXrr; break; - case ARM64::SUBXri: NewOpc = ARM64::SUBSXri; break; - case ARM64::SBCWr: NewOpc = ARM64::SBCSWr; break; - case ARM64::SBCXr: NewOpc = ARM64::SBCSXr; break; - case ARM64::ANDWri: NewOpc = ARM64::ANDSWri; break; - case ARM64::ANDXri: NewOpc = ARM64::ANDSXri; break; - } - - // Scan forward for the use of NZCV. - // When checking against MI: if it's a conditional code requires - // checking of V bit, then this is not safe to do. - // It is safe to remove CmpInstr if NZCV is redefined or killed. - // If we are done with the basic block, we need to check whether NZCV is - // live-out. - bool IsSafe = false; - for (MachineBasicBlock::iterator I = CmpInstr, - E = CmpInstr->getParent()->end(); - !IsSafe && ++I != E;) { - const MachineInstr &Instr = *I; - for (unsigned IO = 0, EO = Instr.getNumOperands(); !IsSafe && IO != EO; - ++IO) { - const MachineOperand &MO = Instr.getOperand(IO); - if (MO.isRegMask() && MO.clobbersPhysReg(ARM64::NZCV)) { - IsSafe = true; - break; - } - if (!MO.isReg() || MO.getReg() != ARM64::NZCV) - continue; - if (MO.isDef()) { - IsSafe = true; - break; - } - - // Decode the condition code. 
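The safety check a few lines further down rejects any user of a V-reading condition: replacing "cmp reg, #0" with a flag-setting form of reg's defining instruction keeps N and Z (both are derived from the value of reg) but does not necessarily reproduce V. A tiny standalone table of which AArch64 conditions consult the overflow flag; the helper name is invented for illustration:

    #include <cassert>

    // Conditions whose truth depends on V cannot survive removing the compare.
    enum Cond { EQ, NE, HS, LO, MI, PL, VS, VC, HI, LS, GE, LT, GT, LE };

    static bool readsOverflowFlag(Cond C) {
      switch (C) {
      case VS: case VC:                   // explicit V tests
      case GE: case LT: case GT: case LE: // signed comparisons use N == V
        return true;
      default:
        return false;                     // EQ/NE/MI/PL/HS/LO/HI/LS ignore V
      }
    }

    int main() {
      assert(readsOverflowFlag(GE) && !readsOverflowFlag(EQ));
      return 0;
    }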
- unsigned Opc = Instr.getOpcode(); - ARM64CC::CondCode CC; - switch (Opc) { - default: - return false; - case ARM64::Bcc: - CC = (ARM64CC::CondCode)Instr.getOperand(IO - 2).getImm(); - break; - case ARM64::CSINVWr: - case ARM64::CSINVXr: - case ARM64::CSINCWr: - case ARM64::CSINCXr: - case ARM64::CSELWr: - case ARM64::CSELXr: - case ARM64::CSNEGWr: - case ARM64::CSNEGXr: - case ARM64::FCSELSrrr: - case ARM64::FCSELDrrr: - CC = (ARM64CC::CondCode)Instr.getOperand(IO - 1).getImm(); - break; - } - - // It is not safe to remove Compare instruction if Overflow(V) is used. - switch (CC) { - default: - // NZCV can be used multiple times, we should continue. - break; - case ARM64CC::VS: - case ARM64CC::VC: - case ARM64CC::GE: - case ARM64CC::LT: - case ARM64CC::GT: - case ARM64CC::LE: - return false; - } - } - } - - // If NZCV is not killed nor re-defined, we should check whether it is - // live-out. If it is live-out, do not optimize. - if (!IsSafe) { - MachineBasicBlock *ParentBlock = CmpInstr->getParent(); - for (auto *MBB : ParentBlock->successors()) - if (MBB->isLiveIn(ARM64::NZCV)) - return false; - } - - // Update the instruction to set NZCV. - MI->setDesc(get(NewOpc)); - CmpInstr->eraseFromParent(); - bool succeeded = UpdateOperandRegClass(MI); - (void)succeeded; - assert(succeeded && "Some operands reg class are incompatible!"); - MI->addRegisterDefined(ARM64::NZCV, TRI); - return true; -} - -/// Return true if this is this instruction has a non-zero immediate -bool ARM64InstrInfo::hasShiftedReg(const MachineInstr *MI) const { - switch (MI->getOpcode()) { - default: - break; - case ARM64::ADDSWrs: - case ARM64::ADDSXrs: - case ARM64::ADDWrs: - case ARM64::ADDXrs: - case ARM64::ANDSWrs: - case ARM64::ANDSXrs: - case ARM64::ANDWrs: - case ARM64::ANDXrs: - case ARM64::BICSWrs: - case ARM64::BICSXrs: - case ARM64::BICWrs: - case ARM64::BICXrs: - case ARM64::CRC32Brr: - case ARM64::CRC32CBrr: - case ARM64::CRC32CHrr: - case ARM64::CRC32CWrr: - case ARM64::CRC32CXrr: - case ARM64::CRC32Hrr: - case ARM64::CRC32Wrr: - case ARM64::CRC32Xrr: - case ARM64::EONWrs: - case ARM64::EONXrs: - case ARM64::EORWrs: - case ARM64::EORXrs: - case ARM64::ORNWrs: - case ARM64::ORNXrs: - case ARM64::ORRWrs: - case ARM64::ORRXrs: - case ARM64::SUBSWrs: - case ARM64::SUBSXrs: - case ARM64::SUBWrs: - case ARM64::SUBXrs: - if (MI->getOperand(3).isImm()) { - unsigned val = MI->getOperand(3).getImm(); - return (val != 0); - } - break; - } - return false; -} - -/// Return true if this is this instruction has a non-zero immediate -bool ARM64InstrInfo::hasExtendedReg(const MachineInstr *MI) const { - switch (MI->getOpcode()) { - default: - break; - case ARM64::ADDSWrx: - case ARM64::ADDSXrx: - case ARM64::ADDSXrx64: - case ARM64::ADDWrx: - case ARM64::ADDXrx: - case ARM64::ADDXrx64: - case ARM64::SUBSWrx: - case ARM64::SUBSXrx: - case ARM64::SUBSXrx64: - case ARM64::SUBWrx: - case ARM64::SUBXrx: - case ARM64::SUBXrx64: - if (MI->getOperand(3).isImm()) { - unsigned val = MI->getOperand(3).getImm(); - return (val != 0); - } - break; - } - - return false; -} - -// Return true if this instruction simply sets its single destination register -// to zero. This is equivalent to a register rename of the zero-register. 
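Since WZR/XZR always read as zero, each idiom recognised by the next function really is just "write 0"; a plain-arithmetic sketch of the three recognised forms (values illustrative only):

    #include <cassert>
    #include <cstdint>

    //   movz Rd, #0, lsl #0   ->  0 << 0   == 0
    //   and  Rd, wzr, #imm    ->  0 & imm  == 0
    //   copy Rd, wzr          ->  0
    uint64_t movz(uint16_t imm16, unsigned shift) { return uint64_t(imm16) << shift; }

    int main() {
      assert(movz(0, 0) == 0);                   // the MOVZWi/MOVZXi case
      assert((UINT64_C(0) & 0xff00ffull) == 0);  // the ANDWri/ANDXri-with-zero-reg case
      return 0;
    }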
-bool ARM64InstrInfo::isGPRZero(const MachineInstr *MI) const { - switch (MI->getOpcode()) { - default: - break; - case ARM64::MOVZWi: - case ARM64::MOVZXi: // movz Rd, #0 (LSL #0) - if (MI->getOperand(1).isImm() && MI->getOperand(1).getImm() == 0) { - assert(MI->getDesc().getNumOperands() == 3 && - MI->getOperand(2).getImm() == 0 && "invalid MOVZi operands"); - return true; - } - break; - case ARM64::ANDWri: // and Rd, Rzr, #imm - return MI->getOperand(1).getReg() == ARM64::WZR; - case ARM64::ANDXri: - return MI->getOperand(1).getReg() == ARM64::XZR; - case TargetOpcode::COPY: - return MI->getOperand(1).getReg() == ARM64::WZR; - } - return false; -} - -// Return true if this instruction simply renames a general register without -// modifying bits. -bool ARM64InstrInfo::isGPRCopy(const MachineInstr *MI) const { - switch (MI->getOpcode()) { - default: - break; - case TargetOpcode::COPY: { - // GPR32 copies will by lowered to ORRXrs - unsigned DstReg = MI->getOperand(0).getReg(); - return (ARM64::GPR32RegClass.contains(DstReg) || - ARM64::GPR64RegClass.contains(DstReg)); - } - case ARM64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0) - if (MI->getOperand(1).getReg() == ARM64::XZR) { - assert(MI->getDesc().getNumOperands() == 4 && - MI->getOperand(3).getImm() == 0 && "invalid ORRrs operands"); - return true; - } - case ARM64::ADDXri: // add Xd, Xn, #0 (LSL #0) - if (MI->getOperand(2).getImm() == 0) { - assert(MI->getDesc().getNumOperands() == 4 && - MI->getOperand(3).getImm() == 0 && "invalid ADDXri operands"); - return true; - } - } - return false; -} - -// Return true if this instruction simply renames a general register without -// modifying bits. -bool ARM64InstrInfo::isFPRCopy(const MachineInstr *MI) const { - switch (MI->getOpcode()) { - default: - break; - case TargetOpcode::COPY: { - // FPR64 copies will by lowered to ORR.16b - unsigned DstReg = MI->getOperand(0).getReg(); - return (ARM64::FPR64RegClass.contains(DstReg) || - ARM64::FPR128RegClass.contains(DstReg)); - } - case ARM64::ORRv16i8: - if (MI->getOperand(1).getReg() == MI->getOperand(2).getReg()) { - assert(MI->getDesc().getNumOperands() == 3 && MI->getOperand(0).isReg() && - "invalid ORRv16i8 operands"); - return true; - } - } - return false; -} - -unsigned ARM64InstrInfo::isLoadFromStackSlot(const MachineInstr *MI, - int &FrameIndex) const { - switch (MI->getOpcode()) { - default: - break; - case ARM64::LDRWui: - case ARM64::LDRXui: - case ARM64::LDRBui: - case ARM64::LDRHui: - case ARM64::LDRSui: - case ARM64::LDRDui: - case ARM64::LDRQui: - if (MI->getOperand(0).getSubReg() == 0 && MI->getOperand(1).isFI() && - MI->getOperand(2).isImm() && MI->getOperand(2).getImm() == 0) { - FrameIndex = MI->getOperand(1).getIndex(); - return MI->getOperand(0).getReg(); - } - break; - } - - return 0; -} - -unsigned ARM64InstrInfo::isStoreToStackSlot(const MachineInstr *MI, - int &FrameIndex) const { - switch (MI->getOpcode()) { - default: - break; - case ARM64::STRWui: - case ARM64::STRXui: - case ARM64::STRBui: - case ARM64::STRHui: - case ARM64::STRSui: - case ARM64::STRDui: - case ARM64::STRQui: - if (MI->getOperand(0).getSubReg() == 0 && MI->getOperand(1).isFI() && - MI->getOperand(2).isImm() && MI->getOperand(2).getImm() == 0) { - FrameIndex = MI->getOperand(1).getIndex(); - return MI->getOperand(0).getReg(); - } - break; - } - return 0; -} - -/// Return true if this is load/store scales or extends its register offset. -/// This refers to scaling a dynamic index as opposed to scaled immediates. 
-/// MI should be a memory op that allows scaled addressing. -bool ARM64InstrInfo::isScaledAddr(const MachineInstr *MI) const { - switch (MI->getOpcode()) { - default: - break; - case ARM64::LDRBBroW: - case ARM64::LDRBroW: - case ARM64::LDRDroW: - case ARM64::LDRHHroW: - case ARM64::LDRHroW: - case ARM64::LDRQroW: - case ARM64::LDRSBWroW: - case ARM64::LDRSBXroW: - case ARM64::LDRSHWroW: - case ARM64::LDRSHXroW: - case ARM64::LDRSWroW: - case ARM64::LDRSroW: - case ARM64::LDRWroW: - case ARM64::LDRXroW: - case ARM64::STRBBroW: - case ARM64::STRBroW: - case ARM64::STRDroW: - case ARM64::STRHHroW: - case ARM64::STRHroW: - case ARM64::STRQroW: - case ARM64::STRSroW: - case ARM64::STRWroW: - case ARM64::STRXroW: - case ARM64::LDRBBroX: - case ARM64::LDRBroX: - case ARM64::LDRDroX: - case ARM64::LDRHHroX: - case ARM64::LDRHroX: - case ARM64::LDRQroX: - case ARM64::LDRSBWroX: - case ARM64::LDRSBXroX: - case ARM64::LDRSHWroX: - case ARM64::LDRSHXroX: - case ARM64::LDRSWroX: - case ARM64::LDRSroX: - case ARM64::LDRWroX: - case ARM64::LDRXroX: - case ARM64::STRBBroX: - case ARM64::STRBroX: - case ARM64::STRDroX: - case ARM64::STRHHroX: - case ARM64::STRHroX: - case ARM64::STRQroX: - case ARM64::STRSroX: - case ARM64::STRWroX: - case ARM64::STRXroX: - - unsigned Val = MI->getOperand(3).getImm(); - ARM64_AM::ShiftExtendType ExtType = ARM64_AM::getMemExtendType(Val); - return (ExtType != ARM64_AM::UXTX) || ARM64_AM::getMemDoShift(Val); - } - return false; -} - -/// Check all MachineMemOperands for a hint to suppress pairing. -bool ARM64InstrInfo::isLdStPairSuppressed(const MachineInstr *MI) const { - assert(MOSuppressPair < (1 << MachineMemOperand::MOTargetNumBits) && - "Too many target MO flags"); - for (auto *MM : MI->memoperands()) { - if (MM->getFlags() & - (MOSuppressPair << MachineMemOperand::MOTargetStartBit)) { - return true; - } - } - return false; -} - -/// Set a flag on the first MachineMemOperand to suppress pairing. -void ARM64InstrInfo::suppressLdStPair(MachineInstr *MI) const { - if (MI->memoperands_empty()) - return; - - assert(MOSuppressPair < (1 << MachineMemOperand::MOTargetNumBits) && - "Too many target MO flags"); - (*MI->memoperands_begin()) - ->setFlags(MOSuppressPair << MachineMemOperand::MOTargetStartBit); -} - -bool ARM64InstrInfo::getLdStBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, - unsigned &Offset, - const TargetRegisterInfo *TRI) const { - switch (LdSt->getOpcode()) { - default: - return false; - case ARM64::STRSui: - case ARM64::STRDui: - case ARM64::STRQui: - case ARM64::STRXui: - case ARM64::STRWui: - case ARM64::LDRSui: - case ARM64::LDRDui: - case ARM64::LDRQui: - case ARM64::LDRXui: - case ARM64::LDRWui: - if (!LdSt->getOperand(1).isReg() || !LdSt->getOperand(2).isImm()) - return false; - BaseReg = LdSt->getOperand(1).getReg(); - MachineFunction &MF = *LdSt->getParent()->getParent(); - unsigned Width = getRegClass(LdSt->getDesc(), 0, TRI, MF)->getSize(); - Offset = LdSt->getOperand(2).getImm() * Width; - return true; - }; -} - -/// Detect opportunities for ldp/stp formation. -/// -/// Only called for LdSt for which getLdStBaseRegImmOfs returns true. -bool ARM64InstrInfo::shouldClusterLoads(MachineInstr *FirstLdSt, - MachineInstr *SecondLdSt, - unsigned NumLoads) const { - // Only cluster up to a single pair. - if (NumLoads > 1) - return false; - if (FirstLdSt->getOpcode() != SecondLdSt->getOpcode()) - return false; - // getLdStBaseRegImmOfs guarantees that oper 2 isImm. 
- unsigned Ofs1 = FirstLdSt->getOperand(2).getImm(); - // Allow 6 bits of positive range. - if (Ofs1 > 64) - return false; - // The caller should already have ordered First/SecondLdSt by offset. - unsigned Ofs2 = SecondLdSt->getOperand(2).getImm(); - return Ofs1 + 1 == Ofs2; -} - -bool ARM64InstrInfo::shouldScheduleAdjacent(MachineInstr *First, - MachineInstr *Second) const { - // Cyclone can fuse CMN, CMP followed by Bcc. - - // FIXME: B0 can also fuse: - // AND, BIC, ORN, ORR, or EOR (optional S) followed by Bcc or CBZ or CBNZ. - if (Second->getOpcode() != ARM64::Bcc) - return false; - switch (First->getOpcode()) { - default: - return false; - case ARM64::SUBSWri: - case ARM64::ADDSWri: - case ARM64::ANDSWri: - case ARM64::SUBSXri: - case ARM64::ADDSXri: - case ARM64::ANDSXri: - return true; - } -} - -MachineInstr *ARM64InstrInfo::emitFrameIndexDebugValue(MachineFunction &MF, - int FrameIx, - uint64_t Offset, - const MDNode *MDPtr, - DebugLoc DL) const { - MachineInstrBuilder MIB = BuildMI(MF, DL, get(ARM64::DBG_VALUE)) - .addFrameIndex(FrameIx) - .addImm(0) - .addImm(Offset) - .addMetadata(MDPtr); - return &*MIB; -} - -static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB, - unsigned Reg, unsigned SubIdx, - unsigned State, - const TargetRegisterInfo *TRI) { - if (!SubIdx) - return MIB.addReg(Reg, State); - - if (TargetRegisterInfo::isPhysicalRegister(Reg)) - return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State); - return MIB.addReg(Reg, State, SubIdx); -} - -static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg, - unsigned NumRegs) { - // We really want the positive remainder mod 32 here, that happens to be - // easily obtainable with a mask. - return ((DestReg - SrcReg) & 0x1f) < NumRegs; -} - -void ARM64InstrInfo::copyPhysRegTuple(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - DebugLoc DL, unsigned DestReg, - unsigned SrcReg, bool KillSrc, - unsigned Opcode, - llvm::ArrayRef Indices) const { - assert(getSubTarget().hasNEON() && - "Unexpected register copy without NEON"); - const TargetRegisterInfo *TRI = &getRegisterInfo(); - uint16_t DestEncoding = TRI->getEncodingValue(DestReg); - uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg); - unsigned NumRegs = Indices.size(); - - int SubReg = 0, End = NumRegs, Incr = 1; - if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) { - SubReg = NumRegs - 1; - End = -1; - Incr = -1; - } - - for (; SubReg != End; SubReg += Incr) { - const MachineInstrBuilder &MIB = BuildMI(MBB, I, DL, get(Opcode)); - AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI); - AddSubReg(MIB, SrcReg, Indices[SubReg], 0, TRI); - AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI); - } -} - -void ARM64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, DebugLoc DL, - unsigned DestReg, unsigned SrcReg, - bool KillSrc) const { - if (ARM64::GPR32spRegClass.contains(DestReg) && - (ARM64::GPR32spRegClass.contains(SrcReg) || SrcReg == ARM64::WZR)) { - const TargetRegisterInfo *TRI = &getRegisterInfo(); - - if (DestReg == ARM64::WSP || SrcReg == ARM64::WSP) { - // If either operand is WSP, expand to ADD #0. - if (Subtarget.hasZeroCycleRegMove()) { - // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move. 
- unsigned DestRegX = TRI->getMatchingSuperReg(DestReg, ARM64::sub_32, - &ARM64::GPR64spRegClass); - unsigned SrcRegX = TRI->getMatchingSuperReg(SrcReg, ARM64::sub_32, - &ARM64::GPR64spRegClass); - // This instruction is reading and writing X registers. This may upset - // the register scavenger and machine verifier, so we need to indicate - // that we are reading an undefined value from SrcRegX, but a proper - // value from SrcReg. - BuildMI(MBB, I, DL, get(ARM64::ADDXri), DestRegX) - .addReg(SrcRegX, RegState::Undef) - .addImm(0) - .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, 0)) - .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc)); - } else { - BuildMI(MBB, I, DL, get(ARM64::ADDWri), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)) - .addImm(0) - .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, 0)); - } - } else if (SrcReg == ARM64::WZR && Subtarget.hasZeroCycleZeroing()) { - BuildMI(MBB, I, DL, get(ARM64::MOVZWi), DestReg).addImm(0).addImm( - ARM64_AM::getShifterImm(ARM64_AM::LSL, 0)); - } else { - if (Subtarget.hasZeroCycleRegMove()) { - // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move. - unsigned DestRegX = TRI->getMatchingSuperReg(DestReg, ARM64::sub_32, - &ARM64::GPR64spRegClass); - unsigned SrcRegX = TRI->getMatchingSuperReg(SrcReg, ARM64::sub_32, - &ARM64::GPR64spRegClass); - // This instruction is reading and writing X registers. This may upset - // the register scavenger and machine verifier, so we need to indicate - // that we are reading an undefined value from SrcRegX, but a proper - // value from SrcReg. - BuildMI(MBB, I, DL, get(ARM64::ORRXrr), DestRegX) - .addReg(ARM64::XZR) - .addReg(SrcRegX, RegState::Undef) - .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc)); - } else { - // Otherwise, expand to ORR WZR. - BuildMI(MBB, I, DL, get(ARM64::ORRWrr), DestReg) - .addReg(ARM64::WZR) - .addReg(SrcReg, getKillRegState(KillSrc)); - } - } - return; - } - - if (ARM64::GPR64spRegClass.contains(DestReg) && - (ARM64::GPR64spRegClass.contains(SrcReg) || SrcReg == ARM64::XZR)) { - if (DestReg == ARM64::SP || SrcReg == ARM64::SP) { - // If either operand is SP, expand to ADD #0. - BuildMI(MBB, I, DL, get(ARM64::ADDXri), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)) - .addImm(0) - .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, 0)); - } else if (SrcReg == ARM64::XZR && Subtarget.hasZeroCycleZeroing()) { - BuildMI(MBB, I, DL, get(ARM64::MOVZXi), DestReg).addImm(0).addImm( - ARM64_AM::getShifterImm(ARM64_AM::LSL, 0)); - } else { - // Otherwise, expand to ORR XZR. - BuildMI(MBB, I, DL, get(ARM64::ORRXrr), DestReg) - .addReg(ARM64::XZR) - .addReg(SrcReg, getKillRegState(KillSrc)); - } - return; - } - - // Copy a DDDD register quad by copying the individual sub-registers. - if (ARM64::DDDDRegClass.contains(DestReg) && - ARM64::DDDDRegClass.contains(SrcReg)) { - static const unsigned Indices[] = { ARM64::dsub0, ARM64::dsub1, - ARM64::dsub2, ARM64::dsub3 }; - copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, ARM64::ORRv8i8, - Indices); - return; - } - - // Copy a DDD register triple by copying the individual sub-registers. - if (ARM64::DDDRegClass.contains(DestReg) && - ARM64::DDDRegClass.contains(SrcReg)) { - static const unsigned Indices[] = { ARM64::dsub0, ARM64::dsub1, - ARM64::dsub2 }; - copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, ARM64::ORRv8i8, - Indices); - return; - } - - // Copy a DD register pair by copying the individual sub-registers. 
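These tuple copies are expanded one sub-register at a time, so copyPhysRegTuple above has to pick a direction: forwardCopyWillClobberTuple uses the positive remainder mod 32 (obtained with the mask, as its comment notes) to detect when the destination tuple starts inside the source tuple, in which case the walk runs backwards, as with memmove. A standalone sketch of that check, with illustrative register encodings:

    #include <cassert>

    // True when a front-to-back copy would overwrite a source sub-register
    // before it has been read; D/Q register numbers wrap modulo 32.
    static bool forwardWillClobber(unsigned DestEnc, unsigned SrcEnc, unsigned N) {
      return ((DestEnc - SrcEnc) & 0x1f) < N;
    }

    int main() {
      // Copying {d1,d2} into {d2,d3} forwards would clobber d2 first.
      assert(forwardWillClobber(/*Dest*/2, /*Src*/1, /*NumRegs*/2));
      // Copying {d2,d3} into {d1,d2} is safe front to back.
      assert(!forwardWillClobber(1, 2, 2));
      return 0;
    }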
- if (ARM64::DDRegClass.contains(DestReg) && - ARM64::DDRegClass.contains(SrcReg)) { - static const unsigned Indices[] = { ARM64::dsub0, ARM64::dsub1 }; - copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, ARM64::ORRv8i8, - Indices); - return; - } - - // Copy a QQQQ register quad by copying the individual sub-registers. - if (ARM64::QQQQRegClass.contains(DestReg) && - ARM64::QQQQRegClass.contains(SrcReg)) { - static const unsigned Indices[] = { ARM64::qsub0, ARM64::qsub1, - ARM64::qsub2, ARM64::qsub3 }; - copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, ARM64::ORRv16i8, - Indices); - return; - } - - // Copy a QQQ register triple by copying the individual sub-registers. - if (ARM64::QQQRegClass.contains(DestReg) && - ARM64::QQQRegClass.contains(SrcReg)) { - static const unsigned Indices[] = { ARM64::qsub0, ARM64::qsub1, - ARM64::qsub2 }; - copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, ARM64::ORRv16i8, - Indices); - return; - } - - // Copy a QQ register pair by copying the individual sub-registers. - if (ARM64::QQRegClass.contains(DestReg) && - ARM64::QQRegClass.contains(SrcReg)) { - static const unsigned Indices[] = { ARM64::qsub0, ARM64::qsub1 }; - copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, ARM64::ORRv16i8, - Indices); - return; - } - - if (ARM64::FPR128RegClass.contains(DestReg) && - ARM64::FPR128RegClass.contains(SrcReg)) { - if(getSubTarget().hasNEON()) { - BuildMI(MBB, I, DL, get(ARM64::ORRv16i8), DestReg).addReg(SrcReg).addReg( - SrcReg, getKillRegState(KillSrc)); - } else { - BuildMI(MBB, I, DL, get(ARM64::STRQpre)) - .addReg(ARM64::SP, RegState::Define) - .addReg(SrcReg, getKillRegState(KillSrc)) - .addReg(ARM64::SP) - .addImm(-16); - BuildMI(MBB, I, DL, get(ARM64::LDRQpre)) - .addReg(ARM64::SP, RegState::Define) - .addReg(DestReg, RegState::Define) - .addReg(ARM64::SP) - .addImm(16); - } - return; - } - - if (ARM64::FPR64RegClass.contains(DestReg) && - ARM64::FPR64RegClass.contains(SrcReg)) { - if(getSubTarget().hasNEON()) { - DestReg = - RI.getMatchingSuperReg(DestReg, ARM64::dsub, &ARM64::FPR128RegClass); - SrcReg = - RI.getMatchingSuperReg(SrcReg, ARM64::dsub, &ARM64::FPR128RegClass); - BuildMI(MBB, I, DL, get(ARM64::ORRv16i8), DestReg).addReg(SrcReg).addReg( - SrcReg, getKillRegState(KillSrc)); - } else { - BuildMI(MBB, I, DL, get(ARM64::FMOVDr), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - } - return; - } - - if (ARM64::FPR32RegClass.contains(DestReg) && - ARM64::FPR32RegClass.contains(SrcReg)) { - if(getSubTarget().hasNEON()) { - DestReg = - RI.getMatchingSuperReg(DestReg, ARM64::ssub, &ARM64::FPR128RegClass); - SrcReg = - RI.getMatchingSuperReg(SrcReg, ARM64::ssub, &ARM64::FPR128RegClass); - BuildMI(MBB, I, DL, get(ARM64::ORRv16i8), DestReg).addReg(SrcReg).addReg( - SrcReg, getKillRegState(KillSrc)); - } else { - BuildMI(MBB, I, DL, get(ARM64::FMOVSr), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - } - return; - } - - if (ARM64::FPR16RegClass.contains(DestReg) && - ARM64::FPR16RegClass.contains(SrcReg)) { - if(getSubTarget().hasNEON()) { - DestReg = - RI.getMatchingSuperReg(DestReg, ARM64::hsub, &ARM64::FPR128RegClass); - SrcReg = - RI.getMatchingSuperReg(SrcReg, ARM64::hsub, &ARM64::FPR128RegClass); - BuildMI(MBB, I, DL, get(ARM64::ORRv16i8), DestReg).addReg(SrcReg).addReg( - SrcReg, getKillRegState(KillSrc)); - } else { - DestReg = - RI.getMatchingSuperReg(DestReg, ARM64::hsub, &ARM64::FPR32RegClass); - SrcReg = - RI.getMatchingSuperReg(SrcReg, ARM64::hsub, &ARM64::FPR32RegClass); - BuildMI(MBB, I, DL, get(ARM64::FMOVSr), 
DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - } - return; - } - - if (ARM64::FPR8RegClass.contains(DestReg) && - ARM64::FPR8RegClass.contains(SrcReg)) { - if(getSubTarget().hasNEON()) { - DestReg = - RI.getMatchingSuperReg(DestReg, ARM64::bsub, &ARM64::FPR128RegClass); - SrcReg = - RI.getMatchingSuperReg(SrcReg, ARM64::bsub, &ARM64::FPR128RegClass); - BuildMI(MBB, I, DL, get(ARM64::ORRv16i8), DestReg).addReg(SrcReg).addReg( - SrcReg, getKillRegState(KillSrc)); - } else { - DestReg = - RI.getMatchingSuperReg(DestReg, ARM64::bsub, &ARM64::FPR32RegClass); - SrcReg = - RI.getMatchingSuperReg(SrcReg, ARM64::bsub, &ARM64::FPR32RegClass); - BuildMI(MBB, I, DL, get(ARM64::FMOVSr), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - } - return; - } - - // Copies between GPR64 and FPR64. - if (ARM64::FPR64RegClass.contains(DestReg) && - ARM64::GPR64RegClass.contains(SrcReg)) { - BuildMI(MBB, I, DL, get(ARM64::FMOVXDr), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - return; - } - if (ARM64::GPR64RegClass.contains(DestReg) && - ARM64::FPR64RegClass.contains(SrcReg)) { - BuildMI(MBB, I, DL, get(ARM64::FMOVDXr), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - return; - } - // Copies between GPR32 and FPR32. - if (ARM64::FPR32RegClass.contains(DestReg) && - ARM64::GPR32RegClass.contains(SrcReg)) { - BuildMI(MBB, I, DL, get(ARM64::FMOVWSr), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - return; - } - if (ARM64::GPR32RegClass.contains(DestReg) && - ARM64::FPR32RegClass.contains(SrcReg)) { - BuildMI(MBB, I, DL, get(ARM64::FMOVSWr), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - return; - } - - assert(0 && "unimplemented reg-to-reg copy"); -} - -void ARM64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - unsigned SrcReg, bool isKill, int FI, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const { - DebugLoc DL; - if (MBBI != MBB.end()) - DL = MBBI->getDebugLoc(); - MachineFunction &MF = *MBB.getParent(); - MachineFrameInfo &MFI = *MF.getFrameInfo(); - unsigned Align = MFI.getObjectAlignment(FI); - - MachinePointerInfo PtrInfo(PseudoSourceValue::getFixedStack(FI)); - MachineMemOperand *MMO = MF.getMachineMemOperand( - PtrInfo, MachineMemOperand::MOStore, MFI.getObjectSize(FI), Align); - unsigned Opc = 0; - bool Offset = true; - switch (RC->getSize()) { - case 1: - if (ARM64::FPR8RegClass.hasSubClassEq(RC)) - Opc = ARM64::STRBui; - break; - case 2: - if (ARM64::FPR16RegClass.hasSubClassEq(RC)) - Opc = ARM64::STRHui; - break; - case 4: - if (ARM64::GPR32allRegClass.hasSubClassEq(RC)) { - Opc = ARM64::STRWui; - if (TargetRegisterInfo::isVirtualRegister(SrcReg)) - MF.getRegInfo().constrainRegClass(SrcReg, &ARM64::GPR32RegClass); - else - assert(SrcReg != ARM64::WSP); - } else if (ARM64::FPR32RegClass.hasSubClassEq(RC)) - Opc = ARM64::STRSui; - break; - case 8: - if (ARM64::GPR64allRegClass.hasSubClassEq(RC)) { - Opc = ARM64::STRXui; - if (TargetRegisterInfo::isVirtualRegister(SrcReg)) - MF.getRegInfo().constrainRegClass(SrcReg, &ARM64::GPR64RegClass); - else - assert(SrcReg != ARM64::SP); - } else if (ARM64::FPR64RegClass.hasSubClassEq(RC)) - Opc = ARM64::STRDui; - break; - case 16: - if (ARM64::FPR128RegClass.hasSubClassEq(RC)) - Opc = ARM64::STRQui; - else if (ARM64::DDRegClass.hasSubClassEq(RC)) { - assert(getSubTarget().hasNEON() && - "Unexpected register store without NEON"); - Opc = ARM64::ST1Twov1d, Offset = false; - } - break; - case 24: - if (ARM64::DDDRegClass.hasSubClassEq(RC)) { - 
assert(getSubTarget().hasNEON() && - "Unexpected register store without NEON"); - Opc = ARM64::ST1Threev1d, Offset = false; - } - break; - case 32: - if (ARM64::DDDDRegClass.hasSubClassEq(RC)) { - assert(getSubTarget().hasNEON() && - "Unexpected register store without NEON"); - Opc = ARM64::ST1Fourv1d, Offset = false; - } else if (ARM64::QQRegClass.hasSubClassEq(RC)) { - assert(getSubTarget().hasNEON() && - "Unexpected register store without NEON"); - Opc = ARM64::ST1Twov2d, Offset = false; - } - break; - case 48: - if (ARM64::QQQRegClass.hasSubClassEq(RC)) { - assert(getSubTarget().hasNEON() && - "Unexpected register store without NEON"); - Opc = ARM64::ST1Threev2d, Offset = false; - } - break; - case 64: - if (ARM64::QQQQRegClass.hasSubClassEq(RC)) { - assert(getSubTarget().hasNEON() && - "Unexpected register store without NEON"); - Opc = ARM64::ST1Fourv2d, Offset = false; - } - break; - } - assert(Opc && "Unknown register class"); - - const MachineInstrBuilder &MI = BuildMI(MBB, MBBI, DL, get(Opc)) - .addReg(SrcReg, getKillRegState(isKill)) - .addFrameIndex(FI); - - if (Offset) - MI.addImm(0); - MI.addMemOperand(MMO); -} - -void ARM64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - unsigned DestReg, int FI, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const { - DebugLoc DL; - if (MBBI != MBB.end()) - DL = MBBI->getDebugLoc(); - MachineFunction &MF = *MBB.getParent(); - MachineFrameInfo &MFI = *MF.getFrameInfo(); - unsigned Align = MFI.getObjectAlignment(FI); - MachinePointerInfo PtrInfo(PseudoSourceValue::getFixedStack(FI)); - MachineMemOperand *MMO = MF.getMachineMemOperand( - PtrInfo, MachineMemOperand::MOLoad, MFI.getObjectSize(FI), Align); - - unsigned Opc = 0; - bool Offset = true; - switch (RC->getSize()) { - case 1: - if (ARM64::FPR8RegClass.hasSubClassEq(RC)) - Opc = ARM64::LDRBui; - break; - case 2: - if (ARM64::FPR16RegClass.hasSubClassEq(RC)) - Opc = ARM64::LDRHui; - break; - case 4: - if (ARM64::GPR32allRegClass.hasSubClassEq(RC)) { - Opc = ARM64::LDRWui; - if (TargetRegisterInfo::isVirtualRegister(DestReg)) - MF.getRegInfo().constrainRegClass(DestReg, &ARM64::GPR32RegClass); - else - assert(DestReg != ARM64::WSP); - } else if (ARM64::FPR32RegClass.hasSubClassEq(RC)) - Opc = ARM64::LDRSui; - break; - case 8: - if (ARM64::GPR64allRegClass.hasSubClassEq(RC)) { - Opc = ARM64::LDRXui; - if (TargetRegisterInfo::isVirtualRegister(DestReg)) - MF.getRegInfo().constrainRegClass(DestReg, &ARM64::GPR64RegClass); - else - assert(DestReg != ARM64::SP); - } else if (ARM64::FPR64RegClass.hasSubClassEq(RC)) - Opc = ARM64::LDRDui; - break; - case 16: - if (ARM64::FPR128RegClass.hasSubClassEq(RC)) - Opc = ARM64::LDRQui; - else if (ARM64::DDRegClass.hasSubClassEq(RC)) { - assert(getSubTarget().hasNEON() && - "Unexpected register load without NEON"); - Opc = ARM64::LD1Twov1d, Offset = false; - } - break; - case 24: - if (ARM64::DDDRegClass.hasSubClassEq(RC)) { - assert(getSubTarget().hasNEON() && - "Unexpected register load without NEON"); - Opc = ARM64::LD1Threev1d, Offset = false; - } - break; - case 32: - if (ARM64::DDDDRegClass.hasSubClassEq(RC)) { - assert(getSubTarget().hasNEON() && - "Unexpected register load without NEON"); - Opc = ARM64::LD1Fourv1d, Offset = false; - } else if (ARM64::QQRegClass.hasSubClassEq(RC)) { - assert(getSubTarget().hasNEON() && - "Unexpected register load without NEON"); - Opc = ARM64::LD1Twov2d, Offset = false; - } - break; - case 48: - if (ARM64::QQQRegClass.hasSubClassEq(RC)) { - 
assert(getSubTarget().hasNEON() && - "Unexpected register load without NEON"); - Opc = ARM64::LD1Threev2d, Offset = false; - } - break; - case 64: - if (ARM64::QQQQRegClass.hasSubClassEq(RC)) { - assert(getSubTarget().hasNEON() && - "Unexpected register load without NEON"); - Opc = ARM64::LD1Fourv2d, Offset = false; - } - break; - } - assert(Opc && "Unknown register class"); - - const MachineInstrBuilder &MI = BuildMI(MBB, MBBI, DL, get(Opc)) - .addReg(DestReg, getDefRegState(true)) - .addFrameIndex(FI); - if (Offset) - MI.addImm(0); - MI.addMemOperand(MMO); -} - -void llvm::emitFrameOffset(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, DebugLoc DL, - unsigned DestReg, unsigned SrcReg, int Offset, - const ARM64InstrInfo *TII, MachineInstr::MIFlag Flag, - bool SetNZCV) { - if (DestReg == SrcReg && Offset == 0) - return; - - bool isSub = Offset < 0; - if (isSub) - Offset = -Offset; - - // FIXME: If the offset won't fit in 24-bits, compute the offset into a - // scratch register. If DestReg is a virtual register, use it as the - // scratch register; otherwise, create a new virtual register (to be - // replaced by the scavenger at the end of PEI). That case can be optimized - // slightly if DestReg is SP which is always 16-byte aligned, so the scratch - // register can be loaded with offset%8 and the add/sub can use an extending - // instruction with LSL#3. - // Currently the function handles any offsets but generates a poor sequence - // of code. - // assert(Offset < (1 << 24) && "unimplemented reg plus immediate"); - - unsigned Opc; - if (SetNZCV) - Opc = isSub ? ARM64::SUBSXri : ARM64::ADDSXri; - else - Opc = isSub ? ARM64::SUBXri : ARM64::ADDXri; - const unsigned MaxEncoding = 0xfff; - const unsigned ShiftSize = 12; - const unsigned MaxEncodableValue = MaxEncoding << ShiftSize; - while (((unsigned)Offset) >= (1 << ShiftSize)) { - unsigned ThisVal; - if (((unsigned)Offset) > MaxEncodableValue) { - ThisVal = MaxEncodableValue; - } else { - ThisVal = Offset & MaxEncodableValue; - } - assert((ThisVal >> ShiftSize) <= MaxEncoding && - "Encoding cannot handle value that big"); - BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg) - .addReg(SrcReg) - .addImm(ThisVal >> ShiftSize) - .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, ShiftSize)) - .setMIFlag(Flag); - - SrcReg = DestReg; - Offset -= ThisVal; - if (Offset == 0) - return; - } - BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg) - .addReg(SrcReg) - .addImm(Offset) - .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, 0)) - .setMIFlag(Flag); -} - -MachineInstr * -ARM64InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, - const SmallVectorImpl &Ops, - int FrameIndex) const { - // This is a bit of a hack. Consider this instruction: - // - // %vreg0 = COPY %SP; GPR64all:%vreg0 - // - // We explicitly chose GPR64all for the virtual register so such a copy might - // be eliminated by RegisterCoalescer. However, that may not be possible, and - // %vreg0 may even spill. We can't spill %SP, and since it is in the GPR64all - // register class, TargetInstrInfo::foldMemoryOperand() is going to try. - // - // To prevent that, we are going to constrain the %vreg0 register class here. 
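As an aside on the emitFrameOffset helper defined just above: an ADD/SUB immediate encodes 12 bits, optionally shifted left by 12, so its loop peels a large offset into chunks of at most 0xfff << 12 and finishes with one unshifted add. A standalone sketch of that splitting, with an illustrative offset and invented function name:

    #include <cassert>
    #include <cstdint>
    #include <vector>

    // Returns the chunk emitted by each add/sub; their sum is the original
    // offset (each nonzero chunk except possibly the last is a multiple of
    // 0x1000 and becomes "add dst, src, #(chunk >> 12), lsl #12").
    std::vector<uint64_t> splitFrameOffset(uint64_t Offset) {
      const uint64_t MaxEncoding = 0xfff, ShiftSize = 12;
      const uint64_t MaxEncodableValue = MaxEncoding << ShiftSize; // 0xfff000
      std::vector<uint64_t> Chunks;
      while (Offset >= (uint64_t(1) << ShiftSize)) {
        uint64_t ThisVal = Offset > MaxEncodableValue
                               ? MaxEncodableValue
                               : (Offset & MaxEncodableValue);
        Chunks.push_back(ThisVal);
        Offset -= ThisVal;
        if (Offset == 0)
          return Chunks;
      }
      Chunks.push_back(Offset); // final "add dst, src, #Offset, lsl #0"
      return Chunks;
    }

    int main() {
      uint64_t Sum = 0;
      for (uint64_t V : splitFrameOffset(0x1234567))
        Sum += V;
      assert(Sum == 0x1234567); // the emitted adds reconstruct the full offset
      return 0;
    }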
- // - // - // - if (MI->isCopy()) { - unsigned DstReg = MI->getOperand(0).getReg(); - unsigned SrcReg = MI->getOperand(1).getReg(); - if (SrcReg == ARM64::SP && TargetRegisterInfo::isVirtualRegister(DstReg)) { - MF.getRegInfo().constrainRegClass(DstReg, &ARM64::GPR64RegClass); - return nullptr; - } - if (DstReg == ARM64::SP && TargetRegisterInfo::isVirtualRegister(SrcReg)) { - MF.getRegInfo().constrainRegClass(SrcReg, &ARM64::GPR64RegClass); - return nullptr; - } - } - - // Cannot fold. - return nullptr; -} - -int llvm::isARM64FrameOffsetLegal(const MachineInstr &MI, int &Offset, - bool *OutUseUnscaledOp, - unsigned *OutUnscaledOp, - int *EmittableOffset) { - int Scale = 1; - bool IsSigned = false; - // The ImmIdx should be changed case by case if it is not 2. - unsigned ImmIdx = 2; - unsigned UnscaledOp = 0; - // Set output values in case of early exit. - if (EmittableOffset) - *EmittableOffset = 0; - if (OutUseUnscaledOp) - *OutUseUnscaledOp = false; - if (OutUnscaledOp) - *OutUnscaledOp = 0; - switch (MI.getOpcode()) { - default: - assert(0 && "unhandled opcode in rewriteARM64FrameIndex"); - // Vector spills/fills can't take an immediate offset. - case ARM64::LD1Twov2d: - case ARM64::LD1Threev2d: - case ARM64::LD1Fourv2d: - case ARM64::LD1Twov1d: - case ARM64::LD1Threev1d: - case ARM64::LD1Fourv1d: - case ARM64::ST1Twov2d: - case ARM64::ST1Threev2d: - case ARM64::ST1Fourv2d: - case ARM64::ST1Twov1d: - case ARM64::ST1Threev1d: - case ARM64::ST1Fourv1d: - return ARM64FrameOffsetCannotUpdate; - case ARM64::PRFMui: - Scale = 8; - UnscaledOp = ARM64::PRFUMi; - break; - case ARM64::LDRXui: - Scale = 8; - UnscaledOp = ARM64::LDURXi; - break; - case ARM64::LDRWui: - Scale = 4; - UnscaledOp = ARM64::LDURWi; - break; - case ARM64::LDRBui: - Scale = 1; - UnscaledOp = ARM64::LDURBi; - break; - case ARM64::LDRHui: - Scale = 2; - UnscaledOp = ARM64::LDURHi; - break; - case ARM64::LDRSui: - Scale = 4; - UnscaledOp = ARM64::LDURSi; - break; - case ARM64::LDRDui: - Scale = 8; - UnscaledOp = ARM64::LDURDi; - break; - case ARM64::LDRQui: - Scale = 16; - UnscaledOp = ARM64::LDURQi; - break; - case ARM64::LDRBBui: - Scale = 1; - UnscaledOp = ARM64::LDURBBi; - break; - case ARM64::LDRHHui: - Scale = 2; - UnscaledOp = ARM64::LDURHHi; - break; - case ARM64::LDRSBXui: - Scale = 1; - UnscaledOp = ARM64::LDURSBXi; - break; - case ARM64::LDRSBWui: - Scale = 1; - UnscaledOp = ARM64::LDURSBWi; - break; - case ARM64::LDRSHXui: - Scale = 2; - UnscaledOp = ARM64::LDURSHXi; - break; - case ARM64::LDRSHWui: - Scale = 2; - UnscaledOp = ARM64::LDURSHWi; - break; - case ARM64::LDRSWui: - Scale = 4; - UnscaledOp = ARM64::LDURSWi; - break; - - case ARM64::STRXui: - Scale = 8; - UnscaledOp = ARM64::STURXi; - break; - case ARM64::STRWui: - Scale = 4; - UnscaledOp = ARM64::STURWi; - break; - case ARM64::STRBui: - Scale = 1; - UnscaledOp = ARM64::STURBi; - break; - case ARM64::STRHui: - Scale = 2; - UnscaledOp = ARM64::STURHi; - break; - case ARM64::STRSui: - Scale = 4; - UnscaledOp = ARM64::STURSi; - break; - case ARM64::STRDui: - Scale = 8; - UnscaledOp = ARM64::STURDi; - break; - case ARM64::STRQui: - Scale = 16; - UnscaledOp = ARM64::STURQi; - break; - case ARM64::STRBBui: - Scale = 1; - UnscaledOp = ARM64::STURBBi; - break; - case ARM64::STRHHui: - Scale = 2; - UnscaledOp = ARM64::STURHHi; - break; - - case ARM64::LDPXi: - case ARM64::LDPDi: - case ARM64::STPXi: - case ARM64::STPDi: - IsSigned = true; - Scale = 8; - break; - case ARM64::LDPQi: - case ARM64::STPQi: - IsSigned = true; - Scale = 16; - break; - case 
ARM64::LDPWi: - case ARM64::LDPSi: - case ARM64::STPWi: - case ARM64::STPSi: - IsSigned = true; - Scale = 4; - break; - - case ARM64::LDURXi: - case ARM64::LDURWi: - case ARM64::LDURBi: - case ARM64::LDURHi: - case ARM64::LDURSi: - case ARM64::LDURDi: - case ARM64::LDURQi: - case ARM64::LDURHHi: - case ARM64::LDURBBi: - case ARM64::LDURSBXi: - case ARM64::LDURSBWi: - case ARM64::LDURSHXi: - case ARM64::LDURSHWi: - case ARM64::LDURSWi: - case ARM64::STURXi: - case ARM64::STURWi: - case ARM64::STURBi: - case ARM64::STURHi: - case ARM64::STURSi: - case ARM64::STURDi: - case ARM64::STURQi: - case ARM64::STURBBi: - case ARM64::STURHHi: - Scale = 1; - break; - } - - Offset += MI.getOperand(ImmIdx).getImm() * Scale; - - bool useUnscaledOp = false; - // If the offset doesn't match the scale, we rewrite the instruction to - // use the unscaled instruction instead. Likewise, if we have a negative - // offset (and have an unscaled op to use). - if ((Offset & (Scale - 1)) != 0 || (Offset < 0 && UnscaledOp != 0)) - useUnscaledOp = true; - - // Use an unscaled addressing mode if the instruction has a negative offset - // (or if the instruction is already using an unscaled addressing mode). - unsigned MaskBits; - if (IsSigned) { - // ldp/stp instructions. - MaskBits = 7; - Offset /= Scale; - } else if (UnscaledOp == 0 || useUnscaledOp) { - MaskBits = 9; - IsSigned = true; - Scale = 1; - } else { - MaskBits = 12; - IsSigned = false; - Offset /= Scale; - } - - // Attempt to fold address computation. - int MaxOff = (1 << (MaskBits - IsSigned)) - 1; - int MinOff = (IsSigned ? (-MaxOff - 1) : 0); - if (Offset >= MinOff && Offset <= MaxOff) { - if (EmittableOffset) - *EmittableOffset = Offset; - Offset = 0; - } else { - int NewOff = Offset < 0 ? MinOff : MaxOff; - if (EmittableOffset) - *EmittableOffset = NewOff; - Offset = (Offset - NewOff) * Scale; - } - if (OutUseUnscaledOp) - *OutUseUnscaledOp = useUnscaledOp; - if (OutUnscaledOp) - *OutUnscaledOp = UnscaledOp; - return ARM64FrameOffsetCanUpdate | - (Offset == 0 ? ARM64FrameOffsetIsLegal : 0); -} - -bool llvm::rewriteARM64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, - unsigned FrameReg, int &Offset, - const ARM64InstrInfo *TII) { - unsigned Opcode = MI.getOpcode(); - unsigned ImmIdx = FrameRegIdx + 1; - - if (Opcode == ARM64::ADDSXri || Opcode == ARM64::ADDXri) { - Offset += MI.getOperand(ImmIdx).getImm(); - emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(), - MI.getOperand(0).getReg(), FrameReg, Offset, TII, - MachineInstr::NoFlags, (Opcode == ARM64::ADDSXri)); - MI.eraseFromParent(); - Offset = 0; - return true; - } - - int NewOffset; - unsigned UnscaledOp; - bool UseUnscaledOp; - int Status = isARM64FrameOffsetLegal(MI, Offset, &UseUnscaledOp, &UnscaledOp, - &NewOffset); - if (Status & ARM64FrameOffsetCanUpdate) { - if (Status & ARM64FrameOffsetIsLegal) - // Replace the FrameIndex with FrameReg. 
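The tail of isARM64FrameOffsetLegal above clamps the scaled offset to whatever the chosen addressing form can encode and reports the remainder. A small standalone model of just that clamping step (not LLVM code; the field widths follow the hunk, the driver values are invented):

#include <cassert>
#include <cstdio>

struct SplitOffset {
  int Emittable;   // value that fits in the instruction's immediate field
  int Leftover;    // rest, to be folded into the base register separately
};

// MaskBits/IsSigned mirror the hunk: 12-bit unsigned for scaled loads/stores,
// 9-bit signed for the unscaled LDUR/STUR forms (Scale == 1), 7-bit signed for
// the LDP/STP pair forms. Offsets that do not match the scale are handled in
// the real code by switching to the unscaled form first.
static SplitOffset splitOffset(int Offset, int Scale, unsigned MaskBits,
                               bool IsSigned) {
  assert(Offset % Scale == 0 && "offset must already match the scale");
  int Scaled = Offset / Scale;
  int MaxOff = (1 << (MaskBits - (IsSigned ? 1 : 0))) - 1;
  int MinOff = IsSigned ? -MaxOff - 1 : 0;
  if (Scaled >= MinOff && Scaled <= MaxOff)
    return {Scaled, 0};
  int Clamped = Scaled < 0 ? MinOff : MaxOff;
  return {Clamped, (Scaled - Clamped) * Scale};
}

int main() {
  // A hypothetical 8-byte scaled load 40000 bytes from the base register:
  // only #4095 (scaled) fits, leaving 7240 bytes for an explicit add.
  SplitOffset S = splitOffset(40000, /*Scale=*/8, /*MaskBits=*/12, /*IsSigned=*/false);
  std::printf("imm=%d leftover=%d\n", S.Emittable, S.Leftover);
  return 0;
}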
- MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); - if (UseUnscaledOp) - MI.setDesc(TII->get(UnscaledOp)); - - MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset); - return Offset == 0; - } - - return false; -} - -void ARM64InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { - NopInst.setOpcode(ARM64::HINT); - NopInst.addOperand(MCOperand::CreateImm(0)); -} diff --git a/lib/Target/ARM64/ARM64InstrInfo.h b/lib/Target/ARM64/ARM64InstrInfo.h deleted file mode 100644 index ce195e763b2..00000000000 --- a/lib/Target/ARM64/ARM64InstrInfo.h +++ /dev/null @@ -1,231 +0,0 @@ -//===- ARM64InstrInfo.h - ARM64 Instruction Information ---------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains the ARM64 implementation of the TargetInstrInfo class. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_TARGET_ARM64INSTRINFO_H -#define LLVM_TARGET_ARM64INSTRINFO_H - -#include "ARM64.h" -#include "ARM64RegisterInfo.h" -#include "llvm/Target/TargetInstrInfo.h" - -#define GET_INSTRINFO_HEADER -#include "ARM64GenInstrInfo.inc" - -namespace llvm { - -class ARM64Subtarget; -class ARM64TargetMachine; - -class ARM64InstrInfo : public ARM64GenInstrInfo { - // Reserve bits in the MachineMemOperand target hint flags, starting at 1. - // They will be shifted into MOTargetHintStart when accessed. - enum TargetMemOperandFlags { - MOSuppressPair = 1 - }; - - const ARM64RegisterInfo RI; - const ARM64Subtarget &Subtarget; - -public: - explicit ARM64InstrInfo(const ARM64Subtarget &STI); - - /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As - /// such, whenever a client has an instance of instruction info, it should - /// always be able to get register info as well (through this method). - const ARM64RegisterInfo &getRegisterInfo() const { return RI; } - - const ARM64Subtarget &getSubTarget() const { return Subtarget; } - - unsigned GetInstSizeInBytes(const MachineInstr *MI) const; - - bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg, - unsigned &DstReg, unsigned &SubIdx) const override; - - unsigned isLoadFromStackSlot(const MachineInstr *MI, - int &FrameIndex) const override; - unsigned isStoreToStackSlot(const MachineInstr *MI, - int &FrameIndex) const override; - - /// Returns true if there is a shiftable register and that the shift value - /// is non-zero. - bool hasShiftedReg(const MachineInstr *MI) const; - - /// Returns true if there is an extendable register and that the extending value - /// is non-zero. - bool hasExtendedReg(const MachineInstr *MI) const; - - /// \brief Does this instruction set its full destination register to zero? - bool isGPRZero(const MachineInstr *MI) const; - - /// \brief Does this instruction rename a GPR without modifying bits? - bool isGPRCopy(const MachineInstr *MI) const; - - /// \brief Does this instruction rename an FPR without modifying bits? - bool isFPRCopy(const MachineInstr *MI) const; - - /// Return true if this is load/store scales or extends its register offset. - /// This refers to scaling a dynamic index as opposed to scaled immediates. - /// MI should be a memory op that allows scaled addressing. 
- bool isScaledAddr(const MachineInstr *MI) const; - - /// Return true if pairing the given load or store is hinted to be - /// unprofitable. - bool isLdStPairSuppressed(const MachineInstr *MI) const; - - /// Hint that pairing the given load or store is unprofitable. - void suppressLdStPair(MachineInstr *MI) const; - - bool getLdStBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, - unsigned &Offset, - const TargetRegisterInfo *TRI) const override; - - bool enableClusterLoads() const override { return true; } - - bool shouldClusterLoads(MachineInstr *FirstLdSt, MachineInstr *SecondLdSt, - unsigned NumLoads) const override; - - bool shouldScheduleAdjacent(MachineInstr *First, - MachineInstr *Second) const override; - - MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF, int FrameIx, - uint64_t Offset, const MDNode *MDPtr, - DebugLoc DL) const; - void copyPhysRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - DebugLoc DL, unsigned DestReg, unsigned SrcReg, - bool KillSrc, unsigned Opcode, - llvm::ArrayRef Indices) const; - void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - DebugLoc DL, unsigned DestReg, unsigned SrcReg, - bool KillSrc) const override; - - void storeRegToStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, unsigned SrcReg, - bool isKill, int FrameIndex, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const override; - - void loadRegFromStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, unsigned DestReg, - int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const override; - - MachineInstr * - foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, - const SmallVectorImpl &Ops, - int FrameIndex) const override; - - bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, - MachineBasicBlock *&FBB, - SmallVectorImpl &Cond, - bool AllowModify = false) const override; - unsigned RemoveBranch(MachineBasicBlock &MBB) const override; - unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, - MachineBasicBlock *FBB, - const SmallVectorImpl &Cond, - DebugLoc DL) const override; - bool - ReverseBranchCondition(SmallVectorImpl &Cond) const override; - bool canInsertSelect(const MachineBasicBlock &, - const SmallVectorImpl &Cond, unsigned, - unsigned, int &, int &, int &) const override; - void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - DebugLoc DL, unsigned DstReg, - const SmallVectorImpl &Cond, - unsigned TrueReg, unsigned FalseReg) const override; - void getNoopForMachoTarget(MCInst &NopInst) const override; - - /// analyzeCompare - For a comparison instruction, return the source registers - /// in SrcReg and SrcReg2, and the value it compares against in CmpValue. - /// Return true if the comparison instruction can be analyzed. - bool analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, - unsigned &SrcReg2, int &CmpMask, - int &CmpValue) const override; - /// optimizeCompareInstr - Convert the instruction supplying the argument to - /// the comparison into one that sets the zero bit in the flags register. 
- bool optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, - unsigned SrcReg2, int CmpMask, int CmpValue, - const MachineRegisterInfo *MRI) const override; - -private: - void instantiateCondBranch(MachineBasicBlock &MBB, DebugLoc DL, - MachineBasicBlock *TBB, - const SmallVectorImpl &Cond) const; -}; - -/// emitFrameOffset - Emit instructions as needed to set DestReg to SrcReg -/// plus Offset. This is intended to be used from within the prolog/epilog -/// insertion (PEI) pass, where a virtual scratch register may be allocated -/// if necessary, to be replaced by the scavenger at the end of PEI. -void emitFrameOffset(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - DebugLoc DL, unsigned DestReg, unsigned SrcReg, int Offset, - const ARM64InstrInfo *TII, - MachineInstr::MIFlag = MachineInstr::NoFlags, - bool SetNZCV = false); - -/// rewriteARM64FrameIndex - Rewrite MI to access 'Offset' bytes from the -/// FP. Return false if the offset could not be handled directly in MI, and -/// return the left-over portion by reference. -bool rewriteARM64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, - unsigned FrameReg, int &Offset, - const ARM64InstrInfo *TII); - -/// \brief Use to report the frame offset status in isARM64FrameOffsetLegal. -enum ARM64FrameOffsetStatus { - ARM64FrameOffsetCannotUpdate = 0x0, ///< Offset cannot apply. - ARM64FrameOffsetIsLegal = 0x1, ///< Offset is legal. - ARM64FrameOffsetCanUpdate = 0x2 ///< Offset can apply, at least partly. -}; - -/// \brief Check if the @p Offset is a valid frame offset for @p MI. -/// The returned value reports the validity of the frame offset for @p MI. -/// It uses the values defined by ARM64FrameOffsetStatus for that. -/// If result == ARM64FrameOffsetCannotUpdate, @p MI cannot be updated to -/// use an offset.eq -/// If result & ARM64FrameOffsetIsLegal, @p Offset can completely be -/// rewriten in @p MI. -/// If result & ARM64FrameOffsetCanUpdate, @p Offset contains the -/// amount that is off the limit of the legal offset. -/// If set, @p OutUseUnscaledOp will contain the whether @p MI should be -/// turned into an unscaled operator, which opcode is in @p OutUnscaledOp. -/// If set, @p EmittableOffset contains the amount that can be set in @p MI -/// (possibly with @p OutUnscaledOp if OutUseUnscaledOp is true) and that -/// is a legal offset. -int isARM64FrameOffsetLegal(const MachineInstr &MI, int &Offset, - bool *OutUseUnscaledOp = nullptr, - unsigned *OutUnscaledOp = nullptr, - int *EmittableOffset = nullptr); - -static inline bool isUncondBranchOpcode(int Opc) { return Opc == ARM64::B; } - -static inline bool isCondBranchOpcode(int Opc) { - switch (Opc) { - case ARM64::Bcc: - case ARM64::CBZW: - case ARM64::CBZX: - case ARM64::CBNZW: - case ARM64::CBNZX: - case ARM64::TBZW: - case ARM64::TBZX: - case ARM64::TBNZW: - case ARM64::TBNZX: - return true; - default: - return false; - } -} - -static inline bool isIndirectBranchOpcode(int Opc) { return Opc == ARM64::BR; } - -} // end namespace llvm - -#endif diff --git a/lib/Target/ARM64/ARM64InstrInfo.td b/lib/Target/ARM64/ARM64InstrInfo.td deleted file mode 100644 index e68980c83c5..00000000000 --- a/lib/Target/ARM64/ARM64InstrInfo.td +++ /dev/null @@ -1,5282 +0,0 @@ -//===- ARM64InstrInfo.td - Describe the ARM64 Instructions -*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// -// -// ARM64 Instruction definitions. -// -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -// ARM Instruction Predicate Definitions. -// -def HasFPARMv8 : Predicate<"Subtarget->hasFPARMv8()">, - AssemblerPredicate<"FeatureFPARMv8", "fp-armv8">; -def HasNEON : Predicate<"Subtarget->hasNEON()">, - AssemblerPredicate<"FeatureNEON", "neon">; -def HasCrypto : Predicate<"Subtarget->hasCrypto()">, - AssemblerPredicate<"FeatureCrypto", "crypto">; -def HasCRC : Predicate<"Subtarget->hasCRC()">, - AssemblerPredicate<"FeatureCRC", "crc">; -def IsLE : Predicate<"Subtarget->isLittleEndian()">; -def IsBE : Predicate<"!Subtarget->isLittleEndian()">; - -//===----------------------------------------------------------------------===// -// ARM64-specific DAG Nodes. -// - -// SDTBinaryArithWithFlagsOut - RES1, FLAGS = op LHS, RHS -def SDTBinaryArithWithFlagsOut : SDTypeProfile<2, 2, - [SDTCisSameAs<0, 2>, - SDTCisSameAs<0, 3>, - SDTCisInt<0>, SDTCisVT<1, i32>]>; - -// SDTBinaryArithWithFlagsIn - RES1, FLAGS = op LHS, RHS, FLAGS -def SDTBinaryArithWithFlagsIn : SDTypeProfile<1, 3, - [SDTCisSameAs<0, 1>, - SDTCisSameAs<0, 2>, - SDTCisInt<0>, - SDTCisVT<3, i32>]>; - -// SDTBinaryArithWithFlagsInOut - RES1, FLAGS = op LHS, RHS, FLAGS -def SDTBinaryArithWithFlagsInOut : SDTypeProfile<2, 3, - [SDTCisSameAs<0, 2>, - SDTCisSameAs<0, 3>, - SDTCisInt<0>, - SDTCisVT<1, i32>, - SDTCisVT<4, i32>]>; - -def SDT_ARM64Brcond : SDTypeProfile<0, 3, - [SDTCisVT<0, OtherVT>, SDTCisVT<1, i32>, - SDTCisVT<2, i32>]>; -def SDT_ARM64cbz : SDTypeProfile<0, 2, [SDTCisInt<0>, SDTCisVT<1, OtherVT>]>; -def SDT_ARM64tbz : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, - SDTCisVT<2, OtherVT>]>; - - -def SDT_ARM64CSel : SDTypeProfile<1, 4, - [SDTCisSameAs<0, 1>, - SDTCisSameAs<0, 2>, - SDTCisInt<3>, - SDTCisVT<4, i32>]>; -def SDT_ARM64FCmp : SDTypeProfile<0, 2, - [SDTCisFP<0>, - SDTCisSameAs<0, 1>]>; -def SDT_ARM64Dup : SDTypeProfile<1, 1, [SDTCisVec<0>]>; -def SDT_ARM64DupLane : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisInt<2>]>; -def SDT_ARM64Zip : SDTypeProfile<1, 2, [SDTCisVec<0>, - SDTCisSameAs<0, 1>, - SDTCisSameAs<0, 2>]>; -def SDT_ARM64MOVIedit : SDTypeProfile<1, 1, [SDTCisInt<1>]>; -def SDT_ARM64MOVIshift : SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>; -def SDT_ARM64vecimm : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, - SDTCisInt<2>, SDTCisInt<3>]>; -def SDT_ARM64UnaryVec: SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>; -def SDT_ARM64ExtVec: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, - SDTCisSameAs<0,2>, SDTCisInt<3>]>; -def SDT_ARM64vshift : SDTypeProfile<1, 2, [SDTCisSameAs<0,1>, SDTCisInt<2>]>; - -def SDT_ARM64unvec : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>; -def SDT_ARM64fcmpz : SDTypeProfile<1, 1, []>; -def SDT_ARM64fcmp : SDTypeProfile<1, 2, [SDTCisSameAs<1,2>]>; -def SDT_ARM64binvec : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, - SDTCisSameAs<0,2>]>; -def SDT_ARM64trivec : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, - SDTCisSameAs<0,2>, - SDTCisSameAs<0,3>]>; -def SDT_ARM64TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>]>; -def SDT_ARM64PREFETCH : SDTypeProfile<0, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<1>]>; - -def SDT_ARM64ITOF : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>; - -def SDT_ARM64TLSDescCall : SDTypeProfile<0, -2, [SDTCisPtrTy<0>, - 
SDTCisPtrTy<1>]>; -def SDT_ARM64WrapperLarge : SDTypeProfile<1, 4, - [SDTCisVT<0, i64>, SDTCisVT<1, i32>, - SDTCisSameAs<1, 2>, SDTCisSameAs<1, 3>, - SDTCisSameAs<1, 4>]>; - - -// Node definitions. -def ARM64adrp : SDNode<"ARM64ISD::ADRP", SDTIntUnaryOp, []>; -def ARM64addlow : SDNode<"ARM64ISD::ADDlow", SDTIntBinOp, []>; -def ARM64LOADgot : SDNode<"ARM64ISD::LOADgot", SDTIntUnaryOp>; -def ARM64callseq_start : SDNode<"ISD::CALLSEQ_START", - SDCallSeqStart<[ SDTCisVT<0, i32> ]>, - [SDNPHasChain, SDNPOutGlue]>; -def ARM64callseq_end : SDNode<"ISD::CALLSEQ_END", - SDCallSeqEnd<[ SDTCisVT<0, i32>, - SDTCisVT<1, i32> ]>, - [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; -def ARM64call : SDNode<"ARM64ISD::CALL", - SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>, - [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, - SDNPVariadic]>; -def ARM64brcond : SDNode<"ARM64ISD::BRCOND", SDT_ARM64Brcond, - [SDNPHasChain]>; -def ARM64cbz : SDNode<"ARM64ISD::CBZ", SDT_ARM64cbz, - [SDNPHasChain]>; -def ARM64cbnz : SDNode<"ARM64ISD::CBNZ", SDT_ARM64cbz, - [SDNPHasChain]>; -def ARM64tbz : SDNode<"ARM64ISD::TBZ", SDT_ARM64tbz, - [SDNPHasChain]>; -def ARM64tbnz : SDNode<"ARM64ISD::TBNZ", SDT_ARM64tbz, - [SDNPHasChain]>; - - -def ARM64csel : SDNode<"ARM64ISD::CSEL", SDT_ARM64CSel>; -def ARM64csinv : SDNode<"ARM64ISD::CSINV", SDT_ARM64CSel>; -def ARM64csneg : SDNode<"ARM64ISD::CSNEG", SDT_ARM64CSel>; -def ARM64csinc : SDNode<"ARM64ISD::CSINC", SDT_ARM64CSel>; -def ARM64retflag : SDNode<"ARM64ISD::RET_FLAG", SDTNone, - [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; -def ARM64adc : SDNode<"ARM64ISD::ADC", SDTBinaryArithWithFlagsIn >; -def ARM64sbc : SDNode<"ARM64ISD::SBC", SDTBinaryArithWithFlagsIn>; -def ARM64add_flag : SDNode<"ARM64ISD::ADDS", SDTBinaryArithWithFlagsOut, - [SDNPCommutative]>; -def ARM64sub_flag : SDNode<"ARM64ISD::SUBS", SDTBinaryArithWithFlagsOut>; -def ARM64and_flag : SDNode<"ARM64ISD::ANDS", SDTBinaryArithWithFlagsOut, - [SDNPCommutative]>; -def ARM64adc_flag : SDNode<"ARM64ISD::ADCS", SDTBinaryArithWithFlagsInOut>; -def ARM64sbc_flag : SDNode<"ARM64ISD::SBCS", SDTBinaryArithWithFlagsInOut>; - -def ARM64threadpointer : SDNode<"ARM64ISD::THREAD_POINTER", SDTPtrLeaf>; - -def ARM64fcmp : SDNode<"ARM64ISD::FCMP", SDT_ARM64FCmp>; - -def ARM64fmax : SDNode<"ARM64ISD::FMAX", SDTFPBinOp>; -def ARM64fmin : SDNode<"ARM64ISD::FMIN", SDTFPBinOp>; - -def ARM64dup : SDNode<"ARM64ISD::DUP", SDT_ARM64Dup>; -def ARM64duplane8 : SDNode<"ARM64ISD::DUPLANE8", SDT_ARM64DupLane>; -def ARM64duplane16 : SDNode<"ARM64ISD::DUPLANE16", SDT_ARM64DupLane>; -def ARM64duplane32 : SDNode<"ARM64ISD::DUPLANE32", SDT_ARM64DupLane>; -def ARM64duplane64 : SDNode<"ARM64ISD::DUPLANE64", SDT_ARM64DupLane>; - -def ARM64zip1 : SDNode<"ARM64ISD::ZIP1", SDT_ARM64Zip>; -def ARM64zip2 : SDNode<"ARM64ISD::ZIP2", SDT_ARM64Zip>; -def ARM64uzp1 : SDNode<"ARM64ISD::UZP1", SDT_ARM64Zip>; -def ARM64uzp2 : SDNode<"ARM64ISD::UZP2", SDT_ARM64Zip>; -def ARM64trn1 : SDNode<"ARM64ISD::TRN1", SDT_ARM64Zip>; -def ARM64trn2 : SDNode<"ARM64ISD::TRN2", SDT_ARM64Zip>; - -def ARM64movi_edit : SDNode<"ARM64ISD::MOVIedit", SDT_ARM64MOVIedit>; -def ARM64movi_shift : SDNode<"ARM64ISD::MOVIshift", SDT_ARM64MOVIshift>; -def ARM64movi_msl : SDNode<"ARM64ISD::MOVImsl", SDT_ARM64MOVIshift>; -def ARM64mvni_shift : SDNode<"ARM64ISD::MVNIshift", SDT_ARM64MOVIshift>; -def ARM64mvni_msl : SDNode<"ARM64ISD::MVNImsl", SDT_ARM64MOVIshift>; -def ARM64movi : SDNode<"ARM64ISD::MOVI", SDT_ARM64MOVIedit>; -def ARM64fmov : SDNode<"ARM64ISD::FMOV", SDT_ARM64MOVIedit>; - -def ARM64rev16 : 
SDNode<"ARM64ISD::REV16", SDT_ARM64UnaryVec>; -def ARM64rev32 : SDNode<"ARM64ISD::REV32", SDT_ARM64UnaryVec>; -def ARM64rev64 : SDNode<"ARM64ISD::REV64", SDT_ARM64UnaryVec>; -def ARM64ext : SDNode<"ARM64ISD::EXT", SDT_ARM64ExtVec>; - -def ARM64vashr : SDNode<"ARM64ISD::VASHR", SDT_ARM64vshift>; -def ARM64vlshr : SDNode<"ARM64ISD::VLSHR", SDT_ARM64vshift>; -def ARM64vshl : SDNode<"ARM64ISD::VSHL", SDT_ARM64vshift>; -def ARM64sqshli : SDNode<"ARM64ISD::SQSHL_I", SDT_ARM64vshift>; -def ARM64uqshli : SDNode<"ARM64ISD::UQSHL_I", SDT_ARM64vshift>; -def ARM64sqshlui : SDNode<"ARM64ISD::SQSHLU_I", SDT_ARM64vshift>; -def ARM64srshri : SDNode<"ARM64ISD::SRSHR_I", SDT_ARM64vshift>; -def ARM64urshri : SDNode<"ARM64ISD::URSHR_I", SDT_ARM64vshift>; - -def ARM64not: SDNode<"ARM64ISD::NOT", SDT_ARM64unvec>; -def ARM64bit: SDNode<"ARM64ISD::BIT", SDT_ARM64trivec>; -def ARM64bsl: SDNode<"ARM64ISD::BSL", SDT_ARM64trivec>; - -def ARM64cmeq: SDNode<"ARM64ISD::CMEQ", SDT_ARM64binvec>; -def ARM64cmge: SDNode<"ARM64ISD::CMGE", SDT_ARM64binvec>; -def ARM64cmgt: SDNode<"ARM64ISD::CMGT", SDT_ARM64binvec>; -def ARM64cmhi: SDNode<"ARM64ISD::CMHI", SDT_ARM64binvec>; -def ARM64cmhs: SDNode<"ARM64ISD::CMHS", SDT_ARM64binvec>; - -def ARM64fcmeq: SDNode<"ARM64ISD::FCMEQ", SDT_ARM64fcmp>; -def ARM64fcmge: SDNode<"ARM64ISD::FCMGE", SDT_ARM64fcmp>; -def ARM64fcmgt: SDNode<"ARM64ISD::FCMGT", SDT_ARM64fcmp>; - -def ARM64cmeqz: SDNode<"ARM64ISD::CMEQz", SDT_ARM64unvec>; -def ARM64cmgez: SDNode<"ARM64ISD::CMGEz", SDT_ARM64unvec>; -def ARM64cmgtz: SDNode<"ARM64ISD::CMGTz", SDT_ARM64unvec>; -def ARM64cmlez: SDNode<"ARM64ISD::CMLEz", SDT_ARM64unvec>; -def ARM64cmltz: SDNode<"ARM64ISD::CMLTz", SDT_ARM64unvec>; -def ARM64cmtst : PatFrag<(ops node:$LHS, node:$RHS), - (ARM64not (ARM64cmeqz (and node:$LHS, node:$RHS)))>; - -def ARM64fcmeqz: SDNode<"ARM64ISD::FCMEQz", SDT_ARM64fcmpz>; -def ARM64fcmgez: SDNode<"ARM64ISD::FCMGEz", SDT_ARM64fcmpz>; -def ARM64fcmgtz: SDNode<"ARM64ISD::FCMGTz", SDT_ARM64fcmpz>; -def ARM64fcmlez: SDNode<"ARM64ISD::FCMLEz", SDT_ARM64fcmpz>; -def ARM64fcmltz: SDNode<"ARM64ISD::FCMLTz", SDT_ARM64fcmpz>; - -def ARM64bici: SDNode<"ARM64ISD::BICi", SDT_ARM64vecimm>; -def ARM64orri: SDNode<"ARM64ISD::ORRi", SDT_ARM64vecimm>; - -def ARM64neg : SDNode<"ARM64ISD::NEG", SDT_ARM64unvec>; - -def ARM64tcret: SDNode<"ARM64ISD::TC_RETURN", SDT_ARM64TCRET, - [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; - -def ARM64Prefetch : SDNode<"ARM64ISD::PREFETCH", SDT_ARM64PREFETCH, - [SDNPHasChain, SDNPSideEffect]>; - -def ARM64sitof: SDNode<"ARM64ISD::SITOF", SDT_ARM64ITOF>; -def ARM64uitof: SDNode<"ARM64ISD::UITOF", SDT_ARM64ITOF>; - -def ARM64tlsdesc_call : SDNode<"ARM64ISD::TLSDESC_CALL", SDT_ARM64TLSDescCall, - [SDNPInGlue, SDNPOutGlue, SDNPHasChain, - SDNPVariadic]>; - -def ARM64WrapperLarge : SDNode<"ARM64ISD::WrapperLarge", SDT_ARM64WrapperLarge>; - - -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// - -// ARM64 Instruction Predicate Definitions. 
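The ARM64cmtst PatFrag above expresses a vector bit-test compare as NOT(CMEQ-zero(AND(a, b))). A tiny standalone model of that identity on a single 64-bit lane (not LLVM code):

#include <cassert>
#include <cstdint>

// CMEQ against zero: an all-ones lane when the input lane is zero.
static uint64_t cmeqz(uint64_t lane) { return lane == 0 ? ~0ULL : 0; }

// CMTST per the PatFrag: a lane is all-ones exactly when a & b is non-zero.
static uint64_t cmtst(uint64_t a, uint64_t b) { return ~cmeqz(a & b); }

int main() {
  assert(cmtst(0b1010, 0b0101) == 0);      // no common set bits
  assert(cmtst(0b1010, 0b0010) == ~0ULL);  // at least one common set bit
  return 0;
}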
-// -def HasZCZ : Predicate<"Subtarget->hasZeroCycleZeroing()">; -def NoZCZ : Predicate<"!Subtarget->hasZeroCycleZeroing()">; -def IsDarwin : Predicate<"Subtarget->isTargetDarwin()">; -def IsNotDarwin: Predicate<"!Subtarget->isTargetDarwin()">; -def ForCodeSize : Predicate<"ForCodeSize">; -def NotForCodeSize : Predicate<"!ForCodeSize">; - -include "ARM64InstrFormats.td" - -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -// Miscellaneous instructions. -//===----------------------------------------------------------------------===// - -let Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1 in { -def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt), - [(ARM64callseq_start timm:$amt)]>; -def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), - [(ARM64callseq_end timm:$amt1, timm:$amt2)]>; -} // Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1 - -let isReMaterializable = 1, isCodeGenOnly = 1 in { -// FIXME: The following pseudo instructions are only needed because remat -// cannot handle multiple instructions. When that changes, they can be -// removed, along with the ARM64Wrapper node. - -let AddedComplexity = 10 in -def LOADgot : Pseudo<(outs GPR64:$dst), (ins i64imm:$addr), - [(set GPR64:$dst, (ARM64LOADgot tglobaladdr:$addr))]>, - Sched<[WriteLDAdr]>; - -// The MOVaddr instruction should match only when the add is not folded -// into a load or store address. -def MOVaddr - : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low), - [(set GPR64:$dst, (ARM64addlow (ARM64adrp tglobaladdr:$hi), - tglobaladdr:$low))]>, - Sched<[WriteAdrAdr]>; -def MOVaddrJT - : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low), - [(set GPR64:$dst, (ARM64addlow (ARM64adrp tjumptable:$hi), - tjumptable:$low))]>, - Sched<[WriteAdrAdr]>; -def MOVaddrCP - : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low), - [(set GPR64:$dst, (ARM64addlow (ARM64adrp tconstpool:$hi), - tconstpool:$low))]>, - Sched<[WriteAdrAdr]>; -def MOVaddrBA - : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low), - [(set GPR64:$dst, (ARM64addlow (ARM64adrp tblockaddress:$hi), - tblockaddress:$low))]>, - Sched<[WriteAdrAdr]>; -def MOVaddrTLS - : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low), - [(set GPR64:$dst, (ARM64addlow (ARM64adrp tglobaltlsaddr:$hi), - tglobaltlsaddr:$low))]>, - Sched<[WriteAdrAdr]>; -def MOVaddrEXT - : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low), - [(set GPR64:$dst, (ARM64addlow (ARM64adrp texternalsym:$hi), - texternalsym:$low))]>, - Sched<[WriteAdrAdr]>; - -} // isReMaterializable, isCodeGenOnly - -def : Pat<(ARM64LOADgot tglobaltlsaddr:$addr), - (LOADgot tglobaltlsaddr:$addr)>; - -def : Pat<(ARM64LOADgot texternalsym:$addr), - (LOADgot texternalsym:$addr)>; - -def : Pat<(ARM64LOADgot tconstpool:$addr), - (LOADgot tconstpool:$addr)>; - -//===----------------------------------------------------------------------===// -// System instructions. -//===----------------------------------------------------------------------===// - -def HINT : HintI<"hint">; -def : InstAlias<"nop", (HINT 0b000)>; -def : InstAlias<"yield",(HINT 0b001)>; -def : InstAlias<"wfe", (HINT 0b010)>; -def : InstAlias<"wfi", (HINT 0b011)>; -def : InstAlias<"sev", (HINT 0b100)>; -def : InstAlias<"sevl", (HINT 0b101)>; - - // As far as LLVM is concerned this writes to the system's exclusive monitors. 
-let mayLoad = 1, mayStore = 1 in -def CLREX : CRmSystemI; - -def DMB : CRmSystemI; -def DSB : CRmSystemI; -def ISB : CRmSystemI; -def : InstAlias<"clrex", (CLREX 0xf)>; -def : InstAlias<"isb", (ISB 0xf)>; - -def MRS : MRSI; -def MSR : MSRI; -def MSRpstate: MSRpstateI; - -// The thread pointer (on Linux, at least, where this has been implemented) is -// TPIDR_EL0. -def : Pat<(ARM64threadpointer), (MRS 0xde82)>; - -// Generic system instructions -def SYSxt : SystemXtI<0, "sys">; -def SYSLxt : SystemLXtI<1, "sysl">; - -def : InstAlias<"sys $op1, $Cn, $Cm, $op2", - (SYSxt imm0_7:$op1, sys_cr_op:$Cn, - sys_cr_op:$Cm, imm0_7:$op2, XZR)>; - -//===----------------------------------------------------------------------===// -// Move immediate instructions. -//===----------------------------------------------------------------------===// - -defm MOVK : InsertImmediate<0b11, "movk">; -defm MOVN : MoveImmediate<0b00, "movn">; - -let PostEncoderMethod = "fixMOVZ" in -defm MOVZ : MoveImmediate<0b10, "movz">; - -// First group of aliases covers an implicit "lsl #0". -def : InstAlias<"movk $dst, $imm", (MOVKWi GPR32:$dst, imm0_65535:$imm, 0)>; -def : InstAlias<"movk $dst, $imm", (MOVKXi GPR64:$dst, imm0_65535:$imm, 0)>; -def : InstAlias<"movn $dst, $imm", (MOVNWi GPR32:$dst, imm0_65535:$imm, 0)>; -def : InstAlias<"movn $dst, $imm", (MOVNXi GPR64:$dst, imm0_65535:$imm, 0)>; -def : InstAlias<"movz $dst, $imm", (MOVZWi GPR32:$dst, imm0_65535:$imm, 0)>; -def : InstAlias<"movz $dst, $imm", (MOVZXi GPR64:$dst, imm0_65535:$imm, 0)>; - -// Next, we have various ELF relocations with the ":XYZ_g0:sym" syntax. -def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g3:$sym, 48)>; -def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g2:$sym, 32)>; -def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g1:$sym, 16)>; -def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g0:$sym, 0)>; - -def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g3:$sym, 48)>; -def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g2:$sym, 32)>; -def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g1:$sym, 16)>; -def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g0:$sym, 0)>; - -def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g3:$sym, 48)>; -def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g2:$sym, 32)>; -def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g1:$sym, 16)>; -def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g0:$sym, 0)>; - -def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movz_symbol_g1:$sym, 16)>; -def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movz_symbol_g0:$sym, 0)>; - -def : InstAlias<"movn $Rd, $sym", (MOVNWi GPR32:$Rd, movz_symbol_g1:$sym, 16)>; -def : InstAlias<"movn $Rd, $sym", (MOVNWi GPR32:$Rd, movz_symbol_g0:$sym, 0)>; - -def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movk_symbol_g1:$sym, 16)>; -def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movk_symbol_g0:$sym, 0)>; - -// Final group of aliases covers true "mov $Rd, $imm" cases. 
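The thread-pointer pattern above selects MRS with the raw immediate 0xde82. Assuming the operand packs the usual op0:op1:CRn:CRm:op2 system-register fields, a standalone check (not LLVM code) shows it decodes to S3_3_C13_C0_2, i.e. TPIDR_EL0, as the comment says:

#include <cassert>
#include <cstdio>

int main() {
  const unsigned Imm = 0xde82;
  unsigned Op0 = (Imm >> 14) & 0x3;
  unsigned Op1 = (Imm >> 11) & 0x7;
  unsigned CRn = (Imm >> 7) & 0xf;
  unsigned CRm = (Imm >> 3) & 0xf;
  unsigned Op2 = Imm & 0x7;
  std::printf("S%u_%u_C%u_C%u_%u\n", Op0, Op1, CRn, CRm, Op2);
  assert(Op0 == 3 && Op1 == 3 && CRn == 13 && CRm == 0 && Op2 == 2); // TPIDR_EL0
  return 0;
}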
-multiclass movw_mov_alias { - def _asmoperand : AsmOperandClass { - let Name = basename # width # "_lsl" # shift # "MovAlias"; - let PredicateMethod = "is" # basename # "MovAlias<" # width # ", " - # shift # ">"; - let RenderMethod = "add" # basename # "MovAliasOperands<" # shift # ">"; - } - - def _movimm : Operand { - let ParserMatchClass = !cast(NAME # "_asmoperand"); - } - - def : InstAlias<"mov $Rd, $imm", - (INST GPR:$Rd, !cast(NAME # "_movimm"):$imm, shift)>; -} - -defm : movw_mov_alias<"MOVZ", MOVZWi, GPR32, 32, 0>; -defm : movw_mov_alias<"MOVZ", MOVZWi, GPR32, 32, 16>; - -defm : movw_mov_alias<"MOVZ", MOVZXi, GPR64, 64, 0>; -defm : movw_mov_alias<"MOVZ", MOVZXi, GPR64, 64, 16>; -defm : movw_mov_alias<"MOVZ", MOVZXi, GPR64, 64, 32>; -defm : movw_mov_alias<"MOVZ", MOVZXi, GPR64, 64, 48>; - -defm : movw_mov_alias<"MOVN", MOVNWi, GPR32, 32, 0>; -defm : movw_mov_alias<"MOVN", MOVNWi, GPR32, 32, 16>; - -defm : movw_mov_alias<"MOVN", MOVNXi, GPR64, 64, 0>; -defm : movw_mov_alias<"MOVN", MOVNXi, GPR64, 64, 16>; -defm : movw_mov_alias<"MOVN", MOVNXi, GPR64, 64, 32>; -defm : movw_mov_alias<"MOVN", MOVNXi, GPR64, 64, 48>; - -let isReMaterializable = 1, isCodeGenOnly = 1, isMoveImm = 1, - isAsCheapAsAMove = 1 in { -// FIXME: The following pseudo instructions are only needed because remat -// cannot handle multiple instructions. When that changes, we can select -// directly to the real instructions and get rid of these pseudos. - -def MOVi32imm - : Pseudo<(outs GPR32:$dst), (ins i32imm:$src), - [(set GPR32:$dst, imm:$src)]>, - Sched<[WriteImm]>; -def MOVi64imm - : Pseudo<(outs GPR64:$dst), (ins i64imm:$src), - [(set GPR64:$dst, imm:$src)]>, - Sched<[WriteImm]>; -} // isReMaterializable, isCodeGenOnly - -// If possible, we want to use MOVi32imm even for 64-bit moves. This gives the -// eventual expansion code fewer bits to worry about getting right. Marshalling -// the types is a little tricky though: -def i64imm_32bit : ImmLeaf(Imm); -}]>; - -def trunc_imm : SDNodeXFormgetTargetConstant(N->getZExtValue(), MVT::i32); -}]>; - -def : Pat<(i64 i64imm_32bit:$src), - (SUBREG_TO_REG (i64 0), (MOVi32imm (trunc_imm imm:$src)), sub_32)>; - -// Deal with the various forms of (ELF) large addressing with MOVZ/MOVK -// sequences. -def : Pat<(ARM64WrapperLarge tglobaladdr:$g3, tglobaladdr:$g2, - tglobaladdr:$g1, tglobaladdr:$g0), - (MOVKXi (MOVKXi (MOVKXi (MOVZXi tglobaladdr:$g3, 48), - tglobaladdr:$g2, 32), - tglobaladdr:$g1, 16), - tglobaladdr:$g0, 0)>; - -def : Pat<(ARM64WrapperLarge tblockaddress:$g3, tblockaddress:$g2, - tblockaddress:$g1, tblockaddress:$g0), - (MOVKXi (MOVKXi (MOVKXi (MOVZXi tblockaddress:$g3, 48), - tblockaddress:$g2, 32), - tblockaddress:$g1, 16), - tblockaddress:$g0, 0)>; - -def : Pat<(ARM64WrapperLarge tconstpool:$g3, tconstpool:$g2, - tconstpool:$g1, tconstpool:$g0), - (MOVKXi (MOVKXi (MOVKXi (MOVZXi tconstpool:$g3, 48), - tconstpool:$g2, 32), - tconstpool:$g1, 16), - tconstpool:$g0, 0)>; - -def : Pat<(ARM64WrapperLarge tjumptable:$g3, tjumptable:$g2, - tjumptable:$g1, tjumptable:$g0), - (MOVKXi (MOVKXi (MOVKXi (MOVZXi tjumptable:$g3, 48), - tjumptable:$g2, 32), - tjumptable:$g1, 16), - tjumptable:$g0, 0)>; - - -//===----------------------------------------------------------------------===// -// Arithmetic instructions. -//===----------------------------------------------------------------------===// - -// Add/subtract with carry. 
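The ARM64WrapperLarge patterns above rebuild a 64-bit address from four 16-bit pieces with one MOVZ and three MOVKs. A standalone sketch of why that composition is exact (not LLVM code; the constant is arbitrary):

#include <cassert>
#include <cstdint>

// MOVZ Xd, #imm16, LSL #s writes the 16-bit field and zeroes everything else.
static uint64_t movz(uint16_t imm, unsigned shift) {
  return (uint64_t)imm << shift;
}

// MOVK Xd, #imm16, LSL #s replaces only the selected 16-bit field.
static uint64_t movk(uint64_t xd, uint16_t imm, unsigned shift) {
  uint64_t mask = 0xffffULL << shift;
  return (xd & ~mask) | ((uint64_t)imm << shift);
}

int main() {
  uint64_t addr = 0x0123456789abcdefULL;
  uint64_t xd = movz(addr >> 48, 48);            // g3 field (bits 63:48)
  xd = movk(xd, (addr >> 32) & 0xffff, 32);      // g2 field (bits 47:32)
  xd = movk(xd, (addr >> 16) & 0xffff, 16);      // g1 field (bits 31:16)
  xd = movk(xd, addr & 0xffff, 0);               // g0 field (bits 15:0)
  assert(xd == addr);
  return 0;
}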
-defm ADC : AddSubCarry<0, "adc", "adcs", ARM64adc, ARM64adc_flag>; -defm SBC : AddSubCarry<1, "sbc", "sbcs", ARM64sbc, ARM64sbc_flag>; - -def : InstAlias<"ngc $dst, $src", (SBCWr GPR32:$dst, WZR, GPR32:$src)>; -def : InstAlias<"ngc $dst, $src", (SBCXr GPR64:$dst, XZR, GPR64:$src)>; -def : InstAlias<"ngcs $dst, $src", (SBCSWr GPR32:$dst, WZR, GPR32:$src)>; -def : InstAlias<"ngcs $dst, $src", (SBCSXr GPR64:$dst, XZR, GPR64:$src)>; - -// Add/subtract -defm ADD : AddSub<0, "add", add>; -defm SUB : AddSub<1, "sub">; - -def : InstAlias<"mov $dst, $src", - (ADDWri GPR32sponly:$dst, GPR32sp:$src, 0, 0)>; -def : InstAlias<"mov $dst, $src", - (ADDWri GPR32sp:$dst, GPR32sponly:$src, 0, 0)>; -def : InstAlias<"mov $dst, $src", - (ADDXri GPR64sponly:$dst, GPR64sp:$src, 0, 0)>; -def : InstAlias<"mov $dst, $src", - (ADDXri GPR64sp:$dst, GPR64sponly:$src, 0, 0)>; - -defm ADDS : AddSubS<0, "adds", ARM64add_flag, "cmn">; -defm SUBS : AddSubS<1, "subs", ARM64sub_flag, "cmp">; - -// Use SUBS instead of SUB to enable CSE between SUBS and SUB. -def : Pat<(sub GPR32sp:$Rn, addsub_shifted_imm32:$imm), - (SUBSWri GPR32sp:$Rn, addsub_shifted_imm32:$imm)>; -def : Pat<(sub GPR64sp:$Rn, addsub_shifted_imm64:$imm), - (SUBSXri GPR64sp:$Rn, addsub_shifted_imm64:$imm)>; -def : Pat<(sub GPR32:$Rn, GPR32:$Rm), - (SUBSWrr GPR32:$Rn, GPR32:$Rm)>; -def : Pat<(sub GPR64:$Rn, GPR64:$Rm), - (SUBSXrr GPR64:$Rn, GPR64:$Rm)>; -def : Pat<(sub GPR32:$Rn, arith_shifted_reg32:$Rm), - (SUBSWrs GPR32:$Rn, arith_shifted_reg32:$Rm)>; -def : Pat<(sub GPR64:$Rn, arith_shifted_reg64:$Rm), - (SUBSXrs GPR64:$Rn, arith_shifted_reg64:$Rm)>; -def : Pat<(sub GPR32sp:$R2, arith_extended_reg32:$R3), - (SUBSWrx GPR32sp:$R2, arith_extended_reg32:$R3)>; -def : Pat<(sub GPR64sp:$R2, arith_extended_reg32to64:$R3), - (SUBSXrx GPR64sp:$R2, arith_extended_reg32to64:$R3)>; - -// Because of the immediate format for add/sub-imm instructions, the -// expression (add x, -1) must be transformed to (SUB{W,X}ri x, 1). -// These patterns capture that transformation. -let AddedComplexity = 1 in { -def : Pat<(add GPR32:$Rn, neg_addsub_shifted_imm32:$imm), - (SUBSWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>; -def : Pat<(add GPR64:$Rn, neg_addsub_shifted_imm64:$imm), - (SUBSXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>; -def : Pat<(sub GPR32:$Rn, neg_addsub_shifted_imm32:$imm), - (ADDWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>; -def : Pat<(sub GPR64:$Rn, neg_addsub_shifted_imm64:$imm), - (ADDXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>; -} - -// Because of the immediate format for add/sub-imm instructions, the -// expression (add x, -1) must be transformed to (SUB{W,X}ri x, 1). -// These patterns capture that transformation. 
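The comment above notes that (add x, -imm) has to become SUB with the negated immediate because the add/sub immediate field is unsigned. A tiny standalone illustration (not LLVM code):

#include <cassert>
#include <cstdint>

// An ADD/SUB immediate is a 12-bit unsigned value, optionally shifted left by 12.
static bool isLegalAddSubImm(int64_t v) {
  uint64_t u = (uint64_t)v;
  return (u & ~0xfffULL) == 0 || (u & ~0xfff000ULL) == 0;
}

int main() {
  int64_t c = -1;
  assert(!isLegalAddSubImm(c));   // "add x0, x1, #-1" is not encodable...
  assert(isLegalAddSubImm(-c));   // ...but "sub x0, x1, #1" is
  return 0;
}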
-let AddedComplexity = 1 in { -def : Pat<(ARM64add_flag GPR32:$Rn, neg_addsub_shifted_imm32:$imm), - (SUBSWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>; -def : Pat<(ARM64add_flag GPR64:$Rn, neg_addsub_shifted_imm64:$imm), - (SUBSXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>; -def : Pat<(ARM64sub_flag GPR32:$Rn, neg_addsub_shifted_imm32:$imm), - (ADDSWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>; -def : Pat<(ARM64sub_flag GPR64:$Rn, neg_addsub_shifted_imm64:$imm), - (ADDSXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>; -} - -def : InstAlias<"neg $dst, $src", (SUBWrs GPR32:$dst, WZR, GPR32:$src, 0), 3>; -def : InstAlias<"neg $dst, $src", (SUBXrs GPR64:$dst, XZR, GPR64:$src, 0), 3>; -def : InstAlias<"neg $dst, $src$shift", - (SUBWrs GPR32:$dst, WZR, GPR32:$src, arith_shift32:$shift), 2>; -def : InstAlias<"neg $dst, $src$shift", - (SUBXrs GPR64:$dst, XZR, GPR64:$src, arith_shift64:$shift), 2>; - -def : InstAlias<"negs $dst, $src", (SUBSWrs GPR32:$dst, WZR, GPR32:$src, 0), 3>; -def : InstAlias<"negs $dst, $src", (SUBSXrs GPR64:$dst, XZR, GPR64:$src, 0), 3>; -def : InstAlias<"negs $dst, $src$shift", - (SUBSWrs GPR32:$dst, WZR, GPR32:$src, arith_shift32:$shift), 2>; -def : InstAlias<"negs $dst, $src$shift", - (SUBSXrs GPR64:$dst, XZR, GPR64:$src, arith_shift64:$shift), 2>; - - -// Unsigned/Signed divide -defm UDIV : Div<0, "udiv", udiv>; -defm SDIV : Div<1, "sdiv", sdiv>; -let isCodeGenOnly = 1 in { -defm UDIV_Int : Div<0, "udiv", int_arm64_udiv>; -defm SDIV_Int : Div<1, "sdiv", int_arm64_sdiv>; -} - -// Variable shift -defm ASRV : Shift<0b10, "asr", sra>; -defm LSLV : Shift<0b00, "lsl", shl>; -defm LSRV : Shift<0b01, "lsr", srl>; -defm RORV : Shift<0b11, "ror", rotr>; - -def : ShiftAlias<"asrv", ASRVWr, GPR32>; -def : ShiftAlias<"asrv", ASRVXr, GPR64>; -def : ShiftAlias<"lslv", LSLVWr, GPR32>; -def : ShiftAlias<"lslv", LSLVXr, GPR64>; -def : ShiftAlias<"lsrv", LSRVWr, GPR32>; -def : ShiftAlias<"lsrv", LSRVXr, GPR64>; -def : ShiftAlias<"rorv", RORVWr, GPR32>; -def : ShiftAlias<"rorv", RORVXr, GPR64>; - -// Multiply-add -let AddedComplexity = 7 in { -defm MADD : MulAccum<0, "madd", add>; -defm MSUB : MulAccum<1, "msub", sub>; - -def : Pat<(i32 (mul GPR32:$Rn, GPR32:$Rm)), - (MADDWrrr GPR32:$Rn, GPR32:$Rm, WZR)>; -def : Pat<(i64 (mul GPR64:$Rn, GPR64:$Rm)), - (MADDXrrr GPR64:$Rn, GPR64:$Rm, XZR)>; - -def : Pat<(i32 (ineg (mul GPR32:$Rn, GPR32:$Rm))), - (MSUBWrrr GPR32:$Rn, GPR32:$Rm, WZR)>; -def : Pat<(i64 (ineg (mul GPR64:$Rn, GPR64:$Rm))), - (MSUBXrrr GPR64:$Rn, GPR64:$Rm, XZR)>; -} // AddedComplexity = 7 - -let AddedComplexity = 5 in { -def SMADDLrrr : WideMulAccum<0, 0b001, "smaddl", add, sext>; -def SMSUBLrrr : WideMulAccum<1, 0b001, "smsubl", sub, sext>; -def UMADDLrrr : WideMulAccum<0, 0b101, "umaddl", add, zext>; -def UMSUBLrrr : WideMulAccum<1, 0b101, "umsubl", sub, zext>; - -def : Pat<(i64 (mul (sext GPR32:$Rn), (sext GPR32:$Rm))), - (SMADDLrrr GPR32:$Rn, GPR32:$Rm, XZR)>; -def : Pat<(i64 (mul (zext GPR32:$Rn), (zext GPR32:$Rm))), - (UMADDLrrr GPR32:$Rn, GPR32:$Rm, XZR)>; - -def : Pat<(i64 (ineg (mul (sext GPR32:$Rn), (sext GPR32:$Rm)))), - (SMSUBLrrr GPR32:$Rn, GPR32:$Rm, XZR)>; -def : Pat<(i64 (ineg (mul (zext GPR32:$Rn), (zext GPR32:$Rm)))), - (UMSUBLrrr GPR32:$Rn, GPR32:$Rm, XZR)>; -} // AddedComplexity = 5 - -def : MulAccumWAlias<"mul", MADDWrrr>; -def : MulAccumXAlias<"mul", MADDXrrr>; -def : MulAccumWAlias<"mneg", MSUBWrrr>; -def : MulAccumXAlias<"mneg", MSUBXrrr>; -def : WideMulAccumAlias<"smull", SMADDLrrr>; -def : WideMulAccumAlias<"smnegl", SMSUBLrrr>; -def : 
WideMulAccumAlias<"umull", UMADDLrrr>; -def : WideMulAccumAlias<"umnegl", UMSUBLrrr>; - -// Multiply-high -def SMULHrr : MulHi<0b010, "smulh", mulhs>; -def UMULHrr : MulHi<0b110, "umulh", mulhu>; - -// CRC32 -def CRC32Brr : BaseCRC32<0, 0b00, 0, GPR32, int_arm64_crc32b, "crc32b">; -def CRC32Hrr : BaseCRC32<0, 0b01, 0, GPR32, int_arm64_crc32h, "crc32h">; -def CRC32Wrr : BaseCRC32<0, 0b10, 0, GPR32, int_arm64_crc32w, "crc32w">; -def CRC32Xrr : BaseCRC32<1, 0b11, 0, GPR64, int_arm64_crc32x, "crc32x">; - -def CRC32CBrr : BaseCRC32<0, 0b00, 1, GPR32, int_arm64_crc32cb, "crc32cb">; -def CRC32CHrr : BaseCRC32<0, 0b01, 1, GPR32, int_arm64_crc32ch, "crc32ch">; -def CRC32CWrr : BaseCRC32<0, 0b10, 1, GPR32, int_arm64_crc32cw, "crc32cw">; -def CRC32CXrr : BaseCRC32<1, 0b11, 1, GPR64, int_arm64_crc32cx, "crc32cx">; - - -//===----------------------------------------------------------------------===// -// Logical instructions. -//===----------------------------------------------------------------------===// - -// (immediate) -defm ANDS : LogicalImmS<0b11, "ands", ARM64and_flag>; -defm AND : LogicalImm<0b00, "and", and>; -defm EOR : LogicalImm<0b10, "eor", xor>; -defm ORR : LogicalImm<0b01, "orr", or>; - -// FIXME: these aliases *are* canonical sometimes (when movz can't be -// used). Actually, it seems to be working right now, but putting logical_immXX -// here is a bit dodgy on the AsmParser side too. -def : InstAlias<"mov $dst, $imm", (ORRWri GPR32sp:$dst, WZR, - logical_imm32:$imm), 0>; -def : InstAlias<"mov $dst, $imm", (ORRXri GPR64sp:$dst, XZR, - logical_imm64:$imm), 0>; - - -// (register) -defm ANDS : LogicalRegS<0b11, 0, "ands", ARM64and_flag>; -defm BICS : LogicalRegS<0b11, 1, "bics", - BinOpFrag<(ARM64and_flag node:$LHS, (not node:$RHS))>>; -defm AND : LogicalReg<0b00, 0, "and", and>; -defm BIC : LogicalReg<0b00, 1, "bic", - BinOpFrag<(and node:$LHS, (not node:$RHS))>>; -defm EON : LogicalReg<0b10, 1, "eon", - BinOpFrag<(xor node:$LHS, (not node:$RHS))>>; -defm EOR : LogicalReg<0b10, 0, "eor", xor>; -defm ORN : LogicalReg<0b01, 1, "orn", - BinOpFrag<(or node:$LHS, (not node:$RHS))>>; -defm ORR : LogicalReg<0b01, 0, "orr", or>; - -def : InstAlias<"mov $dst, $src", (ORRWrs GPR32:$dst, WZR, GPR32:$src, 0), 2>; -def : InstAlias<"mov $dst, $src", (ORRXrs GPR64:$dst, XZR, GPR64:$src, 0), 2>; - -def : InstAlias<"mvn $Wd, $Wm", (ORNWrs GPR32:$Wd, WZR, GPR32:$Wm, 0), 3>; -def : InstAlias<"mvn $Xd, $Xm", (ORNXrs GPR64:$Xd, XZR, GPR64:$Xm, 0), 3>; - -def : InstAlias<"mvn $Wd, $Wm$sh", - (ORNWrs GPR32:$Wd, WZR, GPR32:$Wm, logical_shift32:$sh), 2>; -def : InstAlias<"mvn $Xd, $Xm$sh", - (ORNXrs GPR64:$Xd, XZR, GPR64:$Xm, logical_shift64:$sh), 2>; - -def : InstAlias<"tst $src1, $src2", - (ANDSWri WZR, GPR32:$src1, logical_imm32:$src2), 2>; -def : InstAlias<"tst $src1, $src2", - (ANDSXri XZR, GPR64:$src1, logical_imm64:$src2), 2>; - -def : InstAlias<"tst $src1, $src2", - (ANDSWrs WZR, GPR32:$src1, GPR32:$src2, 0), 3>; -def : InstAlias<"tst $src1, $src2", - (ANDSXrs XZR, GPR64:$src1, GPR64:$src2, 0), 3>; - -def : InstAlias<"tst $src1, $src2$sh", - (ANDSWrs WZR, GPR32:$src1, GPR32:$src2, logical_shift32:$sh), 2>; -def : InstAlias<"tst $src1, $src2$sh", - (ANDSXrs XZR, GPR64:$src1, GPR64:$src2, logical_shift64:$sh), 2>; - - -def : Pat<(not GPR32:$Wm), (ORNWrr WZR, GPR32:$Wm)>; -def : Pat<(not GPR64:$Xm), (ORNXrr XZR, GPR64:$Xm)>; - - -//===----------------------------------------------------------------------===// -// One operand data processing instructions. 
-//===----------------------------------------------------------------------===// - -defm CLS : OneOperandData<0b101, "cls">; -defm CLZ : OneOperandData<0b100, "clz", ctlz>; -defm RBIT : OneOperandData<0b000, "rbit">; -def REV16Wr : OneWRegData<0b001, "rev16", - UnOpFrag<(rotr (bswap node:$LHS), (i64 16))>>; -def REV16Xr : OneXRegData<0b001, "rev16", null_frag>; - -def : Pat<(cttz GPR32:$Rn), - (CLZWr (RBITWr GPR32:$Rn))>; -def : Pat<(cttz GPR64:$Rn), - (CLZXr (RBITXr GPR64:$Rn))>; -def : Pat<(ctlz (or (shl (xor (sra GPR32:$Rn, (i64 31)), GPR32:$Rn), (i64 1)), - (i32 1))), - (CLSWr GPR32:$Rn)>; -def : Pat<(ctlz (or (shl (xor (sra GPR64:$Rn, (i64 63)), GPR64:$Rn), (i64 1)), - (i64 1))), - (CLSXr GPR64:$Rn)>; - -// Unlike the other one operand instructions, the instructions with the "rev" -// mnemonic do *not* just different in the size bit, but actually use different -// opcode bits for the different sizes. -def REVWr : OneWRegData<0b010, "rev", bswap>; -def REVXr : OneXRegData<0b011, "rev", bswap>; -def REV32Xr : OneXRegData<0b010, "rev32", - UnOpFrag<(rotr (bswap node:$LHS), (i64 32))>>; - -// The bswap commutes with the rotr so we want a pattern for both possible -// orders. -def : Pat<(bswap (rotr GPR32:$Rn, (i64 16))), (REV16Wr GPR32:$Rn)>; -def : Pat<(bswap (rotr GPR64:$Rn, (i64 32))), (REV32Xr GPR64:$Rn)>; - -//===----------------------------------------------------------------------===// -// Bitfield immediate extraction instruction. -//===----------------------------------------------------------------------===// -let neverHasSideEffects = 1 in -defm EXTR : ExtractImm<"extr">; -def : InstAlias<"ror $dst, $src, $shift", - (EXTRWrri GPR32:$dst, GPR32:$src, GPR32:$src, imm0_31:$shift)>; -def : InstAlias<"ror $dst, $src, $shift", - (EXTRXrri GPR64:$dst, GPR64:$src, GPR64:$src, imm0_63:$shift)>; - -def : Pat<(rotr GPR32:$Rn, (i64 imm0_31:$imm)), - (EXTRWrri GPR32:$Rn, GPR32:$Rn, imm0_31:$imm)>; -def : Pat<(rotr GPR64:$Rn, (i64 imm0_63:$imm)), - (EXTRXrri GPR64:$Rn, GPR64:$Rn, imm0_63:$imm)>; - -//===----------------------------------------------------------------------===// -// Other bitfield immediate instructions. -//===----------------------------------------------------------------------===// -let neverHasSideEffects = 1 in { -defm BFM : BitfieldImmWith2RegArgs<0b01, "bfm">; -defm SBFM : BitfieldImm<0b00, "sbfm">; -defm UBFM : BitfieldImm<0b10, "ubfm">; -} - -def i32shift_a : Operand, SDNodeXFormgetZExtValue()) & 0x1f; - return CurDAG->getTargetConstant(enc, MVT::i64); -}]>; - -def i32shift_b : Operand, SDNodeXFormgetZExtValue(); - return CurDAG->getTargetConstant(enc, MVT::i64); -}]>; - -// min(7, 31 - shift_amt) -def i32shift_sext_i8 : Operand, SDNodeXFormgetZExtValue(); - enc = enc > 7 ? 7 : enc; - return CurDAG->getTargetConstant(enc, MVT::i64); -}]>; - -// min(15, 31 - shift_amt) -def i32shift_sext_i16 : Operand, SDNodeXFormgetZExtValue(); - enc = enc > 15 ? 15 : enc; - return CurDAG->getTargetConstant(enc, MVT::i64); -}]>; - -def i64shift_a : Operand, SDNodeXFormgetZExtValue()) & 0x3f; - return CurDAG->getTargetConstant(enc, MVT::i64); -}]>; - -def i64shift_b : Operand, SDNodeXFormgetZExtValue(); - return CurDAG->getTargetConstant(enc, MVT::i64); -}]>; - -// min(7, 63 - shift_amt) -def i64shift_sext_i8 : Operand, SDNodeXFormgetZExtValue(); - enc = enc > 7 ? 7 : enc; - return CurDAG->getTargetConstant(enc, MVT::i64); -}]>; - -// min(15, 63 - shift_amt) -def i64shift_sext_i16 : Operand, SDNodeXFormgetZExtValue(); - enc = enc > 15 ? 
15 : enc; - return CurDAG->getTargetConstant(enc, MVT::i64); -}]>; - -// min(31, 63 - shift_amt) -def i64shift_sext_i32 : Operand, SDNodeXFormgetZExtValue(); - enc = enc > 31 ? 31 : enc; - return CurDAG->getTargetConstant(enc, MVT::i64); -}]>; - -def : Pat<(shl GPR32:$Rn, (i64 imm0_31:$imm)), - (UBFMWri GPR32:$Rn, (i64 (i32shift_a imm0_31:$imm)), - (i64 (i32shift_b imm0_31:$imm)))>; -def : Pat<(shl GPR64:$Rn, (i64 imm0_63:$imm)), - (UBFMXri GPR64:$Rn, (i64 (i64shift_a imm0_63:$imm)), - (i64 (i64shift_b imm0_63:$imm)))>; - -let AddedComplexity = 10 in { -def : Pat<(sra GPR32:$Rn, (i64 imm0_31:$imm)), - (SBFMWri GPR32:$Rn, imm0_31:$imm, 31)>; -def : Pat<(sra GPR64:$Rn, (i64 imm0_63:$imm)), - (SBFMXri GPR64:$Rn, imm0_63:$imm, 63)>; -} - -def : InstAlias<"asr $dst, $src, $shift", - (SBFMWri GPR32:$dst, GPR32:$src, imm0_31:$shift, 31)>; -def : InstAlias<"asr $dst, $src, $shift", - (SBFMXri GPR64:$dst, GPR64:$src, imm0_63:$shift, 63)>; -def : InstAlias<"sxtb $dst, $src", (SBFMWri GPR32:$dst, GPR32:$src, 0, 7)>; -def : InstAlias<"sxtb $dst, $src", (SBFMXri GPR64:$dst, GPR64:$src, 0, 7)>; -def : InstAlias<"sxth $dst, $src", (SBFMWri GPR32:$dst, GPR32:$src, 0, 15)>; -def : InstAlias<"sxth $dst, $src", (SBFMXri GPR64:$dst, GPR64:$src, 0, 15)>; -def : InstAlias<"sxtw $dst, $src", (SBFMXri GPR64:$dst, GPR64:$src, 0, 31)>; - -def : Pat<(srl GPR32:$Rn, (i64 imm0_31:$imm)), - (UBFMWri GPR32:$Rn, imm0_31:$imm, 31)>; -def : Pat<(srl GPR64:$Rn, (i64 imm0_63:$imm)), - (UBFMXri GPR64:$Rn, imm0_63:$imm, 63)>; - -def : InstAlias<"lsr $dst, $src, $shift", - (UBFMWri GPR32:$dst, GPR32:$src, imm0_31:$shift, 31)>; -def : InstAlias<"lsr $dst, $src, $shift", - (UBFMXri GPR64:$dst, GPR64:$src, imm0_63:$shift, 63)>; -def : InstAlias<"uxtb $dst, $src", (UBFMWri GPR32:$dst, GPR32:$src, 0, 7)>; -def : InstAlias<"uxtb $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 7)>; -def : InstAlias<"uxth $dst, $src", (UBFMWri GPR32:$dst, GPR32:$src, 0, 15)>; -def : InstAlias<"uxth $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 15)>; -def : InstAlias<"uxtw $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 31)>; - -//===----------------------------------------------------------------------===// -// Conditionally set flags instructions. -//===----------------------------------------------------------------------===// -defm CCMN : CondSetFlagsImm<0, "ccmn">; -defm CCMP : CondSetFlagsImm<1, "ccmp">; - -defm CCMN : CondSetFlagsReg<0, "ccmn">; -defm CCMP : CondSetFlagsReg<1, "ccmp">; - -//===----------------------------------------------------------------------===// -// Conditional select instructions. 
-//===----------------------------------------------------------------------===// -defm CSEL : CondSelect<0, 0b00, "csel">; - -def inc : PatFrag<(ops node:$in), (add node:$in, 1)>; -defm CSINC : CondSelectOp<0, 0b01, "csinc", inc>; -defm CSINV : CondSelectOp<1, 0b00, "csinv", not>; -defm CSNEG : CondSelectOp<1, 0b01, "csneg", ineg>; - -def : Pat<(ARM64csinv GPR32:$tval, GPR32:$fval, (i32 imm:$cc), NZCV), - (CSINVWr GPR32:$tval, GPR32:$fval, (i32 imm:$cc))>; -def : Pat<(ARM64csinv GPR64:$tval, GPR64:$fval, (i32 imm:$cc), NZCV), - (CSINVXr GPR64:$tval, GPR64:$fval, (i32 imm:$cc))>; -def : Pat<(ARM64csneg GPR32:$tval, GPR32:$fval, (i32 imm:$cc), NZCV), - (CSNEGWr GPR32:$tval, GPR32:$fval, (i32 imm:$cc))>; -def : Pat<(ARM64csneg GPR64:$tval, GPR64:$fval, (i32 imm:$cc), NZCV), - (CSNEGXr GPR64:$tval, GPR64:$fval, (i32 imm:$cc))>; -def : Pat<(ARM64csinc GPR32:$tval, GPR32:$fval, (i32 imm:$cc), NZCV), - (CSINCWr GPR32:$tval, GPR32:$fval, (i32 imm:$cc))>; -def : Pat<(ARM64csinc GPR64:$tval, GPR64:$fval, (i32 imm:$cc), NZCV), - (CSINCXr GPR64:$tval, GPR64:$fval, (i32 imm:$cc))>; - -def : Pat<(ARM64csel (i32 0), (i32 1), (i32 imm:$cc), NZCV), - (CSINCWr WZR, WZR, (i32 imm:$cc))>; -def : Pat<(ARM64csel (i64 0), (i64 1), (i32 imm:$cc), NZCV), - (CSINCXr XZR, XZR, (i32 imm:$cc))>; -def : Pat<(ARM64csel (i32 0), (i32 -1), (i32 imm:$cc), NZCV), - (CSINVWr WZR, WZR, (i32 imm:$cc))>; -def : Pat<(ARM64csel (i64 0), (i64 -1), (i32 imm:$cc), NZCV), - (CSINVXr XZR, XZR, (i32 imm:$cc))>; - -// The inverse of the condition code from the alias instruction is what is used -// in the aliased instruction. The parser all ready inverts the condition code -// for these aliases. -def : InstAlias<"cset $dst, $cc", - (CSINCWr GPR32:$dst, WZR, WZR, inv_ccode:$cc)>; -def : InstAlias<"cset $dst, $cc", - (CSINCXr GPR64:$dst, XZR, XZR, inv_ccode:$cc)>; - -def : InstAlias<"csetm $dst, $cc", - (CSINVWr GPR32:$dst, WZR, WZR, inv_ccode:$cc)>; -def : InstAlias<"csetm $dst, $cc", - (CSINVXr GPR64:$dst, XZR, XZR, inv_ccode:$cc)>; - -def : InstAlias<"cinc $dst, $src, $cc", - (CSINCWr GPR32:$dst, GPR32:$src, GPR32:$src, inv_ccode:$cc)>; -def : InstAlias<"cinc $dst, $src, $cc", - (CSINCXr GPR64:$dst, GPR64:$src, GPR64:$src, inv_ccode:$cc)>; - -def : InstAlias<"cinv $dst, $src, $cc", - (CSINVWr GPR32:$dst, GPR32:$src, GPR32:$src, inv_ccode:$cc)>; -def : InstAlias<"cinv $dst, $src, $cc", - (CSINVXr GPR64:$dst, GPR64:$src, GPR64:$src, inv_ccode:$cc)>; - -def : InstAlias<"cneg $dst, $src, $cc", - (CSNEGWr GPR32:$dst, GPR32:$src, GPR32:$src, inv_ccode:$cc)>; -def : InstAlias<"cneg $dst, $src, $cc", - (CSNEGXr GPR64:$dst, GPR64:$src, GPR64:$src, inv_ccode:$cc)>; - -//===----------------------------------------------------------------------===// -// PC-relative instructions. -//===----------------------------------------------------------------------===// -let isReMaterializable = 1 in { -let neverHasSideEffects = 1, mayStore = 0, mayLoad = 0 in { -def ADR : ADRI<0, "adr", adrlabel, []>; -} // neverHasSideEffects = 1 - -def ADRP : ADRI<1, "adrp", adrplabel, - [(set GPR64:$Xd, (ARM64adrp tglobaladdr:$label))]>; -} // isReMaterializable = 1 - -// page address of a constant pool entry, block address -def : Pat<(ARM64adrp tconstpool:$cp), (ADRP tconstpool:$cp)>; -def : Pat<(ARM64adrp tblockaddress:$cp), (ADRP tblockaddress:$cp)>; - -//===----------------------------------------------------------------------===// -// Unconditional branch (register) instructions. 
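The cset/csetm/cinc/cinv/cneg aliases earlier in this hunk store the inverted condition in the underlying CSINC/CSINV/CSNEG, as the comment explains. A minimal standalone model of the cset case (not LLVM code; CSINC semantics are Rd = cond ? Rn : Rm + 1):

#include <cassert>

static int csinc(int rn, int rm, bool cond) {
  return cond ? rn : rm + 1;
}

// "cset Wd, cc" is the alias for "csinc Wd, wzr, wzr, inv(cc)": it yields 1
// when cc holds (the CSINC condition fails, so WZR + 1) and 0 otherwise.
static int cset(bool cc) {
  return csinc(/*wzr=*/0, /*wzr=*/0, /*cond=*/!cc);
}

int main() {
  assert(cset(true) == 1);
  assert(cset(false) == 0);
  return 0;
}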
-//===----------------------------------------------------------------------===// - -let isReturn = 1, isTerminator = 1, isBarrier = 1 in { -def RET : BranchReg<0b0010, "ret", []>; -def DRPS : SpecialReturn<0b0101, "drps">; -def ERET : SpecialReturn<0b0100, "eret">; -} // isReturn = 1, isTerminator = 1, isBarrier = 1 - -// Default to the LR register. -def : InstAlias<"ret", (RET LR)>; - -let isCall = 1, Defs = [LR], Uses = [SP] in { -def BLR : BranchReg<0b0001, "blr", [(ARM64call GPR64:$Rn)]>; -} // isCall - -let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in { -def BR : BranchReg<0b0000, "br", [(brind GPR64:$Rn)]>; -} // isBranch, isTerminator, isBarrier, isIndirectBranch - -// Create a separate pseudo-instruction for codegen to use so that we don't -// flag lr as used in every function. It'll be restored before the RET by the -// epilogue if it's legitimately used. -def RET_ReallyLR : Pseudo<(outs), (ins), [(ARM64retflag)]> { - let isTerminator = 1; - let isBarrier = 1; - let isReturn = 1; -} - -// This is a directive-like pseudo-instruction. The purpose is to insert an -// R_AARCH64_TLSDESC_CALL relocation at the offset of the following instruction -// (which in the usual case is a BLR). -let hasSideEffects = 1 in -def TLSDESCCALL : Pseudo<(outs), (ins i64imm:$sym), []> { - let AsmString = ".tlsdesccall $sym"; -} - -// Pseudo-instruction representing a BLR with attached TLSDESC relocation. It -// gets expanded to two MCInsts during lowering. -let isCall = 1, Defs = [LR] in -def TLSDESC_BLR - : Pseudo<(outs), (ins GPR64:$dest, i64imm:$sym), - [(ARM64tlsdesc_call GPR64:$dest, tglobaltlsaddr:$sym)]>; - -def : Pat<(ARM64tlsdesc_call GPR64:$dest, texternalsym:$sym), - (TLSDESC_BLR GPR64:$dest, texternalsym:$sym)>; -//===----------------------------------------------------------------------===// -// Conditional branch (immediate) instruction. -//===----------------------------------------------------------------------===// -def Bcc : BranchCond; - -//===----------------------------------------------------------------------===// -// Compare-and-branch instructions. -//===----------------------------------------------------------------------===// -defm CBZ : CmpBranch<0, "cbz", ARM64cbz>; -defm CBNZ : CmpBranch<1, "cbnz", ARM64cbnz>; - -//===----------------------------------------------------------------------===// -// Test-bit-and-branch instructions. -//===----------------------------------------------------------------------===// -defm TBZ : TestBranch<0, "tbz", ARM64tbz>; -defm TBNZ : TestBranch<1, "tbnz", ARM64tbnz>; - -//===----------------------------------------------------------------------===// -// Unconditional branch (immediate) instructions. -//===----------------------------------------------------------------------===// -let isBranch = 1, isTerminator = 1, isBarrier = 1 in { -def B : BranchImm<0, "b", [(br bb:$addr)]>; -} // isBranch, isTerminator, isBarrier - -let isCall = 1, Defs = [LR], Uses = [SP] in { -def BL : CallImm<1, "bl", [(ARM64call tglobaladdr:$addr)]>; -} // isCall -def : Pat<(ARM64call texternalsym:$func), (BL texternalsym:$func)>; - -//===----------------------------------------------------------------------===// -// Exception generation instructions. 
-//===----------------------------------------------------------------------===// -def BRK : ExceptionGeneration<0b001, 0b00, "brk">; -def DCPS1 : ExceptionGeneration<0b101, 0b01, "dcps1">; -def DCPS2 : ExceptionGeneration<0b101, 0b10, "dcps2">; -def DCPS3 : ExceptionGeneration<0b101, 0b11, "dcps3">; -def HLT : ExceptionGeneration<0b010, 0b00, "hlt">; -def HVC : ExceptionGeneration<0b000, 0b10, "hvc">; -def SMC : ExceptionGeneration<0b000, 0b11, "smc">; -def SVC : ExceptionGeneration<0b000, 0b01, "svc">; - -// DCPSn defaults to an immediate operand of zero if unspecified. -def : InstAlias<"dcps1", (DCPS1 0)>; -def : InstAlias<"dcps2", (DCPS2 0)>; -def : InstAlias<"dcps3", (DCPS3 0)>; - -//===----------------------------------------------------------------------===// -// Load instructions. -//===----------------------------------------------------------------------===// - -// Pair (indexed, offset) -defm LDPW : LoadPairOffset<0b00, 0, GPR32, simm7s4, "ldp">; -defm LDPX : LoadPairOffset<0b10, 0, GPR64, simm7s8, "ldp">; -defm LDPS : LoadPairOffset<0b00, 1, FPR32, simm7s4, "ldp">; -defm LDPD : LoadPairOffset<0b01, 1, FPR64, simm7s8, "ldp">; -defm LDPQ : LoadPairOffset<0b10, 1, FPR128, simm7s16, "ldp">; - -defm LDPSW : LoadPairOffset<0b01, 0, GPR64, simm7s4, "ldpsw">; - -// Pair (pre-indexed) -def LDPWpre : LoadPairPreIdx<0b00, 0, GPR32, simm7s4, "ldp">; -def LDPXpre : LoadPairPreIdx<0b10, 0, GPR64, simm7s8, "ldp">; -def LDPSpre : LoadPairPreIdx<0b00, 1, FPR32, simm7s4, "ldp">; -def LDPDpre : LoadPairPreIdx<0b01, 1, FPR64, simm7s8, "ldp">; -def LDPQpre : LoadPairPreIdx<0b10, 1, FPR128, simm7s16, "ldp">; - -def LDPSWpre : LoadPairPreIdx<0b01, 0, GPR64, simm7s4, "ldpsw">; - -// Pair (post-indexed) -def LDPWpost : LoadPairPostIdx<0b00, 0, GPR32, simm7s4, "ldp">; -def LDPXpost : LoadPairPostIdx<0b10, 0, GPR64, simm7s8, "ldp">; -def LDPSpost : LoadPairPostIdx<0b00, 1, FPR32, simm7s4, "ldp">; -def LDPDpost : LoadPairPostIdx<0b01, 1, FPR64, simm7s8, "ldp">; -def LDPQpost : LoadPairPostIdx<0b10, 1, FPR128, simm7s16, "ldp">; - -def LDPSWpost : LoadPairPostIdx<0b01, 0, GPR64, simm7s4, "ldpsw">; - - -// Pair (no allocate) -defm LDNPW : LoadPairNoAlloc<0b00, 0, GPR32, simm7s4, "ldnp">; -defm LDNPX : LoadPairNoAlloc<0b10, 0, GPR64, simm7s8, "ldnp">; -defm LDNPS : LoadPairNoAlloc<0b00, 1, FPR32, simm7s4, "ldnp">; -defm LDNPD : LoadPairNoAlloc<0b01, 1, FPR64, simm7s8, "ldnp">; -defm LDNPQ : LoadPairNoAlloc<0b10, 1, FPR128, simm7s16, "ldnp">; - -//--- -// (register offset) -//--- - -// Integer -defm LDRBB : Load8RO<0b00, 0, 0b01, GPR32, "ldrb", i32, zextloadi8>; -defm LDRHH : Load16RO<0b01, 0, 0b01, GPR32, "ldrh", i32, zextloadi16>; -defm LDRW : Load32RO<0b10, 0, 0b01, GPR32, "ldr", i32, load>; -defm LDRX : Load64RO<0b11, 0, 0b01, GPR64, "ldr", i64, load>; - -// Floating-point -defm LDRB : Load8RO<0b00, 1, 0b01, FPR8, "ldr", untyped, load>; -defm LDRH : Load16RO<0b01, 1, 0b01, FPR16, "ldr", f16, load>; -defm LDRS : Load32RO<0b10, 1, 0b01, FPR32, "ldr", f32, load>; -defm LDRD : Load64RO<0b11, 1, 0b01, FPR64, "ldr", f64, load>; -defm LDRQ : Load128RO<0b00, 1, 0b11, FPR128, "ldr", f128, load>; - -// Load sign-extended half-word -defm LDRSHW : Load16RO<0b01, 0, 0b11, GPR32, "ldrsh", i32, sextloadi16>; -defm LDRSHX : Load16RO<0b01, 0, 0b10, GPR64, "ldrsh", i64, sextloadi16>; - -// Load sign-extended byte -defm LDRSBW : Load8RO<0b00, 0, 0b11, GPR32, "ldrsb", i32, sextloadi8>; -defm LDRSBX : Load8RO<0b00, 0, 0b10, GPR64, "ldrsb", i64, sextloadi8>; - -// Load sign-extended word -defm LDRSW : Load32RO<0b10, 0, 0b10, 
GPR64, "ldrsw", i64, sextloadi32>; - -// Pre-fetch. -defm PRFM : PrefetchRO<0b11, 0, 0b10, "prfm">; - -// For regular load, we do not have any alignment requirement. -// Thus, it is safe to directly map the vector loads with interesting -// addressing modes. -// FIXME: We could do the same for bitconvert to floating point vectors. -multiclass ScalToVecROLoadPat { - def : Pat<(VecTy (scalar_to_vector (ScalTy - (loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$offset))))), - (INSERT_SUBREG (VecTy (IMPLICIT_DEF)), - (LOADW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$offset), - sub)>; - - def : Pat<(VecTy (scalar_to_vector (ScalTy - (loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$offset))))), - (INSERT_SUBREG (VecTy (IMPLICIT_DEF)), - (LOADX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$offset), - sub)>; -} - -let AddedComplexity = 10 in { -defm : ScalToVecROLoadPat; -defm : ScalToVecROLoadPat; - -defm : ScalToVecROLoadPat; -defm : ScalToVecROLoadPat; - -defm : ScalToVecROLoadPat; -defm : ScalToVecROLoadPat; - -defm : ScalToVecROLoadPat; -defm : ScalToVecROLoadPat; - -defm : ScalToVecROLoadPat; - -defm : ScalToVecROLoadPat; - - -def : Pat <(v1i64 (scalar_to_vector (i64 - (load (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm, - ro_Wextend64:$extend))))), - (LDRDroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend)>; - -def : Pat <(v1i64 (scalar_to_vector (i64 - (load (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm, - ro_Xextend64:$extend))))), - (LDRDroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend)>; -} - -// Match all load 64 bits width whose type is compatible with FPR64 -multiclass VecROLoadPat { - - def : Pat<(VecTy (load (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend))), - (LOADW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>; - - def : Pat<(VecTy (load (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend))), - (LOADX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>; -} - -let AddedComplexity = 10 in { -let Predicates = [IsLE] in { - // We must do vector loads with LD1 in big-endian. - defm : VecROLoadPat; - defm : VecROLoadPat; - defm : VecROLoadPat; - defm : VecROLoadPat; -} - -defm : VecROLoadPat; -defm : VecROLoadPat; - -// Match all load 128 bits width whose type is compatible with FPR128 -let Predicates = [IsLE] in { - // We must do vector loads with LD1 in big-endian. 
- defm : VecROLoadPat; - defm : VecROLoadPat; - defm : VecROLoadPat; - defm : VecROLoadPat; - defm : VecROLoadPat; - defm : VecROLoadPat; -} -} // AddedComplexity = 10 - -// zextload -> i64 -multiclass ExtLoadTo64ROPat { - def : Pat<(i64 (loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend))), - (SUBREG_TO_REG (i64 0), - (INSTW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend), - sub_32)>; - - def : Pat<(i64 (loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend))), - (SUBREG_TO_REG (i64 0), - (INSTX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend), - sub_32)>; -} - -let AddedComplexity = 10 in { - defm : ExtLoadTo64ROPat; - defm : ExtLoadTo64ROPat; - defm : ExtLoadTo64ROPat; - - // zextloadi1 -> zextloadi8 - defm : ExtLoadTo64ROPat; - - // extload -> zextload - defm : ExtLoadTo64ROPat; - defm : ExtLoadTo64ROPat; - defm : ExtLoadTo64ROPat; - - // extloadi1 -> zextloadi8 - defm : ExtLoadTo64ROPat; -} - - -// zextload -> i64 -multiclass ExtLoadTo32ROPat { - def : Pat<(i32 (loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend))), - (INSTW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>; - - def : Pat<(i32 (loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend))), - (INSTX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>; - -} - -let AddedComplexity = 10 in { - // extload -> zextload - defm : ExtLoadTo32ROPat; - defm : ExtLoadTo32ROPat; - defm : ExtLoadTo32ROPat; - - // zextloadi1 -> zextloadi8 - defm : ExtLoadTo32ROPat; -} - -//--- -// (unsigned immediate) -//--- -defm LDRX : LoadUI<0b11, 0, 0b01, GPR64, uimm12s8, "ldr", - [(set GPR64:$Rt, - (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)))]>; -defm LDRW : LoadUI<0b10, 0, 0b01, GPR32, uimm12s4, "ldr", - [(set GPR32:$Rt, - (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset)))]>; -defm LDRB : LoadUI<0b00, 1, 0b01, FPR8, uimm12s1, "ldr", - [(set FPR8:$Rt, - (load (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset)))]>; -defm LDRH : LoadUI<0b01, 1, 0b01, FPR16, uimm12s2, "ldr", - [(set (f16 FPR16:$Rt), - (load (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset)))]>; -defm LDRS : LoadUI<0b10, 1, 0b01, FPR32, uimm12s4, "ldr", - [(set (f32 FPR32:$Rt), - (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset)))]>; -defm LDRD : LoadUI<0b11, 1, 0b01, FPR64, uimm12s8, "ldr", - [(set (f64 FPR64:$Rt), - (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)))]>; -defm LDRQ : LoadUI<0b00, 1, 0b11, FPR128, uimm12s16, "ldr", - [(set (f128 FPR128:$Rt), - (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)))]>; - -// For regular load, we do not have any alignment requirement. -// Thus, it is safe to directly map the vector loads with interesting -// addressing modes. -// FIXME: We could do the same for bitconvert to floating point vectors. 
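The scalar_to_vector patterns here rely on the register file layout rather than on any extra instruction: the B/H/S/D FP registers are sub-registers of the 128-bit V registers, so a plain scalar FP/SIMD load already leaves its value in lane 0. A hypothetical assembly sketch of the effect (illustration only, not part of this patch):

    ldr     s0, [x0, #4]          // illustration: a 32-bit FP/SIMD load puts the value in bits [31:0]
                                  // of v0, i.e. lane 0 of v0.4s, and zeroes the remaining lanes
    add     v1.4s, v1.4s, v0.4s   // v0 is immediately usable as a vector operand, no INS/DUP needed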
-def : Pat <(v8i8 (scalar_to_vector (i32 - (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))), - (INSERT_SUBREG (v8i8 (IMPLICIT_DEF)), - (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>; -def : Pat <(v16i8 (scalar_to_vector (i32 - (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))), - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>; -def : Pat <(v4i16 (scalar_to_vector (i32 - (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))), - (INSERT_SUBREG (v4i16 (IMPLICIT_DEF)), - (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>; -def : Pat <(v8i16 (scalar_to_vector (i32 - (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))), - (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), - (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>; -def : Pat <(v2i32 (scalar_to_vector (i32 - (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))), - (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)), - (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>; -def : Pat <(v4i32 (scalar_to_vector (i32 - (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))), - (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), - (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>; -def : Pat <(v1i64 (scalar_to_vector (i64 - (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))))), - (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; -def : Pat <(v2i64 (scalar_to_vector (i64 - (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))))), - (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), - (LDRDui GPR64sp:$Rn, uimm12s8:$offset), dsub)>; - -// Match all load 64 bits width whose type is compatible with FPR64 -let Predicates = [IsLE] in { - // We must use LD1 to perform vector loads in big-endian. - def : Pat<(v2f32 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))), - (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; - def : Pat<(v8i8 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))), - (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; - def : Pat<(v4i16 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))), - (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; - def : Pat<(v2i32 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))), - (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; -} -def : Pat<(v1f64 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))), - (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; -def : Pat<(v1i64 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))), - (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; - -// Match all load 128 bits width whose type is compatible with FPR128 -let Predicates = [IsLE] in { - // We must use LD1 to perform vector loads in big-endian. 
- def : Pat<(v4f32 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))), - (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; - def : Pat<(v2f64 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))), - (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; - def : Pat<(v16i8 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))), - (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; - def : Pat<(v8i16 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))), - (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; - def : Pat<(v4i32 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))), - (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; - def : Pat<(v2i64 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))), - (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; -} -def : Pat<(f128 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))), - (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; - -defm LDRHH : LoadUI<0b01, 0, 0b01, GPR32, uimm12s2, "ldrh", - [(set GPR32:$Rt, - (zextloadi16 (am_indexed16 GPR64sp:$Rn, - uimm12s2:$offset)))]>; -defm LDRBB : LoadUI<0b00, 0, 0b01, GPR32, uimm12s1, "ldrb", - [(set GPR32:$Rt, - (zextloadi8 (am_indexed8 GPR64sp:$Rn, - uimm12s1:$offset)))]>; -// zextload -> i64 -def : Pat<(i64 (zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))), - (SUBREG_TO_REG (i64 0), (LDRBBui GPR64sp:$Rn, uimm12s1:$offset), sub_32)>; -def : Pat<(i64 (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))), - (SUBREG_TO_REG (i64 0), (LDRHHui GPR64sp:$Rn, uimm12s2:$offset), sub_32)>; - -// zextloadi1 -> zextloadi8 -def : Pat<(i32 (zextloadi1 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))), - (LDRBBui GPR64sp:$Rn, uimm12s1:$offset)>; -def : Pat<(i64 (zextloadi1 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))), - (SUBREG_TO_REG (i64 0), (LDRBBui GPR64sp:$Rn, uimm12s1:$offset), sub_32)>; - -// extload -> zextload -def : Pat<(i32 (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))), - (LDRHHui GPR64sp:$Rn, uimm12s2:$offset)>; -def : Pat<(i32 (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))), - (LDRBBui GPR64sp:$Rn, uimm12s1:$offset)>; -def : Pat<(i32 (extloadi1 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))), - (LDRBBui GPR64sp:$Rn, uimm12s1:$offset)>; -def : Pat<(i64 (extloadi32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))), - (SUBREG_TO_REG (i64 0), (LDRWui GPR64sp:$Rn, uimm12s4:$offset), sub_32)>; -def : Pat<(i64 (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))), - (SUBREG_TO_REG (i64 0), (LDRHHui GPR64sp:$Rn, uimm12s2:$offset), sub_32)>; -def : Pat<(i64 (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))), - (SUBREG_TO_REG (i64 0), (LDRBBui GPR64sp:$Rn, uimm12s1:$offset), sub_32)>; -def : Pat<(i64 (extloadi1 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))), - (SUBREG_TO_REG (i64 0), (LDRBBui GPR64sp:$Rn, uimm12s1:$offset), sub_32)>; - -// load sign-extended half-word -defm LDRSHW : LoadUI<0b01, 0, 0b11, GPR32, uimm12s2, "ldrsh", - [(set GPR32:$Rt, - (sextloadi16 (am_indexed16 GPR64sp:$Rn, - uimm12s2:$offset)))]>; -defm LDRSHX : LoadUI<0b01, 0, 0b10, GPR64, uimm12s2, "ldrsh", - [(set GPR64:$Rt, - (sextloadi16 (am_indexed16 GPR64sp:$Rn, - uimm12s2:$offset)))]>; - -// load sign-extended byte -defm LDRSBW : LoadUI<0b00, 0, 0b11, GPR32, uimm12s1, "ldrsb", - [(set GPR32:$Rt, - (sextloadi8 (am_indexed8 GPR64sp:$Rn, - uimm12s1:$offset)))]>; -defm LDRSBX : LoadUI<0b00, 0, 0b10, GPR64, uimm12s1, "ldrsb", - [(set GPR64:$Rt, - (sextloadi8 (am_indexed8 GPR64sp:$Rn, - uimm12s1:$offset)))]>; - -// load sign-extended word -defm LDRSW : LoadUI<0b10, 0, 0b10, GPR64, uimm12s4, "ldrsw", - [(set GPR64:$Rt, - (sextloadi32 (am_indexed32 GPR64sp:$Rn, 
- uimm12s4:$offset)))]>; - -// load zero-extended word -def : Pat<(i64 (zextloadi32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))), - (SUBREG_TO_REG (i64 0), (LDRWui GPR64sp:$Rn, uimm12s4:$offset), sub_32)>; - -// Pre-fetch. -def PRFMui : PrefetchUI<0b11, 0, 0b10, "prfm", - [(ARM64Prefetch imm:$Rt, - (am_indexed64 GPR64sp:$Rn, - uimm12s8:$offset))]>; - -def : InstAlias<"prfm $Rt, [$Rn]", (PRFMui prfop:$Rt, GPR64sp:$Rn, 0)>; - -//--- -// (literal) -def LDRWl : LoadLiteral<0b00, 0, GPR32, "ldr">; -def LDRXl : LoadLiteral<0b01, 0, GPR64, "ldr">; -def LDRSl : LoadLiteral<0b00, 1, FPR32, "ldr">; -def LDRDl : LoadLiteral<0b01, 1, FPR64, "ldr">; -def LDRQl : LoadLiteral<0b10, 1, FPR128, "ldr">; - -// load sign-extended word -def LDRSWl : LoadLiteral<0b10, 0, GPR64, "ldrsw">; - -// prefetch -def PRFMl : PrefetchLiteral<0b11, 0, "prfm", []>; -// [(ARM64Prefetch imm:$Rt, tglobaladdr:$label)]>; - -//--- -// (unscaled immediate) -defm LDURX : LoadUnscaled<0b11, 0, 0b01, GPR64, "ldur", - [(set GPR64:$Rt, - (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset)))]>; -defm LDURW : LoadUnscaled<0b10, 0, 0b01, GPR32, "ldur", - [(set GPR32:$Rt, - (load (am_unscaled32 GPR64sp:$Rn, simm9:$offset)))]>; -defm LDURB : LoadUnscaled<0b00, 1, 0b01, FPR8, "ldur", - [(set FPR8:$Rt, - (load (am_unscaled8 GPR64sp:$Rn, simm9:$offset)))]>; -defm LDURH : LoadUnscaled<0b01, 1, 0b01, FPR16, "ldur", - [(set FPR16:$Rt, - (load (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>; -defm LDURS : LoadUnscaled<0b10, 1, 0b01, FPR32, "ldur", - [(set (f32 FPR32:$Rt), - (load (am_unscaled32 GPR64sp:$Rn, simm9:$offset)))]>; -defm LDURD : LoadUnscaled<0b11, 1, 0b01, FPR64, "ldur", - [(set (f64 FPR64:$Rt), - (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset)))]>; -defm LDURQ : LoadUnscaled<0b00, 1, 0b11, FPR128, "ldur", - [(set (f128 FPR128:$Rt), - (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset)))]>; - -defm LDURHH - : LoadUnscaled<0b01, 0, 0b01, GPR32, "ldurh", - [(set GPR32:$Rt, - (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>; -defm LDURBB - : LoadUnscaled<0b00, 0, 0b01, GPR32, "ldurb", - [(set GPR32:$Rt, - (zextloadi8 (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>; - -// Match all load 64 bits width whose type is compatible with FPR64 -let Predicates = [IsLE] in { - def : Pat<(v2f32 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))), - (LDURDi GPR64sp:$Rn, simm9:$offset)>; - def : Pat<(v2i32 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))), - (LDURDi GPR64sp:$Rn, simm9:$offset)>; - def : Pat<(v4i16 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))), - (LDURDi GPR64sp:$Rn, simm9:$offset)>; - def : Pat<(v8i8 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))), - (LDURDi GPR64sp:$Rn, simm9:$offset)>; -} -def : Pat<(v1f64 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))), - (LDURDi GPR64sp:$Rn, simm9:$offset)>; -def : Pat<(v1i64 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))), - (LDURDi GPR64sp:$Rn, simm9:$offset)>; - -// Match all load 128 bits width whose type is compatible with FPR128 -let Predicates = [IsLE] in { - def : Pat<(v2f64 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))), - (LDURQi GPR64sp:$Rn, simm9:$offset)>; - def : Pat<(v2i64 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))), - (LDURQi GPR64sp:$Rn, simm9:$offset)>; - def : Pat<(v4f32 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))), - (LDURQi GPR64sp:$Rn, simm9:$offset)>; - def : Pat<(v4i32 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))), - (LDURQi GPR64sp:$Rn, simm9:$offset)>; - def : Pat<(v8i16 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))), - 
(LDURQi GPR64sp:$Rn, simm9:$offset)>; - def : Pat<(v16i8 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))), - (LDURQi GPR64sp:$Rn, simm9:$offset)>; -} - -// anyext -> zext -def : Pat<(i32 (extloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))), - (LDURHHi GPR64sp:$Rn, simm9:$offset)>; -def : Pat<(i32 (extloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))), - (LDURBBi GPR64sp:$Rn, simm9:$offset)>; -def : Pat<(i32 (extloadi1 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))), - (LDURBBi GPR64sp:$Rn, simm9:$offset)>; -def : Pat<(i64 (extloadi32 (am_unscaled32 GPR64sp:$Rn, simm9:$offset))), - (SUBREG_TO_REG (i64 0), (LDURWi GPR64sp:$Rn, simm9:$offset), sub_32)>; -def : Pat<(i64 (extloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))), - (SUBREG_TO_REG (i64 0), (LDURHHi GPR64sp:$Rn, simm9:$offset), sub_32)>; -def : Pat<(i64 (extloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))), - (SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>; -def : Pat<(i64 (extloadi1 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))), - (SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>; -// unscaled zext -def : Pat<(i32 (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))), - (LDURHHi GPR64sp:$Rn, simm9:$offset)>; -def : Pat<(i32 (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))), - (LDURBBi GPR64sp:$Rn, simm9:$offset)>; -def : Pat<(i32 (zextloadi1 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))), - (LDURBBi GPR64sp:$Rn, simm9:$offset)>; -def : Pat<(i64 (zextloadi32 (am_unscaled32 GPR64sp:$Rn, simm9:$offset))), - (SUBREG_TO_REG (i64 0), (LDURWi GPR64sp:$Rn, simm9:$offset), sub_32)>; -def : Pat<(i64 (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))), - (SUBREG_TO_REG (i64 0), (LDURHHi GPR64sp:$Rn, simm9:$offset), sub_32)>; -def : Pat<(i64 (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))), - (SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>; -def : Pat<(i64 (zextloadi1 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))), - (SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>; - - -//--- -// LDR mnemonics fall back to LDUR for negative or unaligned offsets. - -// Define new assembler match classes as we want to only match these when -// the don't otherwise match the scaled addressing mode for LDR/STR. Don't -// associate a DiagnosticType either, as we want the diagnostic for the -// canonical form (the scaled operand) to take precedence. 
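From the assembler's point of view the fall-back behaves as sketched below (hypothetical input, not taken from the patch or its tests): an offset that fits the scaled unsigned-immediate form keeps the LDR encoding, while a negative or unaligned offset is accepted through these aliases and emitted as LDUR.

    ldr     x0, [x1, #8]          // illustration: positive, 8-byte-aligned offset -> scaled LDR (unsigned immediate)
    ldr     x0, [x1, #-8]         // negative offset -> matched by the fall-back alias, encoded as ldur x0, [x1, #-8]
    ldr     w2, [x1, #3]          // offset not a multiple of the access size -> encoded as ldur w2, [x1, #3]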
-class SImm9OffsetOperand : AsmOperandClass { - let Name = "SImm9OffsetFB" # Width; - let PredicateMethod = "isSImm9OffsetFB<" # Width # ">"; - let RenderMethod = "addImmOperands"; -} - -def SImm9OffsetFB8Operand : SImm9OffsetOperand<8>; -def SImm9OffsetFB16Operand : SImm9OffsetOperand<16>; -def SImm9OffsetFB32Operand : SImm9OffsetOperand<32>; -def SImm9OffsetFB64Operand : SImm9OffsetOperand<64>; -def SImm9OffsetFB128Operand : SImm9OffsetOperand<128>; - -def simm9_offset_fb8 : Operand { - let ParserMatchClass = SImm9OffsetFB8Operand; -} -def simm9_offset_fb16 : Operand { - let ParserMatchClass = SImm9OffsetFB16Operand; -} -def simm9_offset_fb32 : Operand { - let ParserMatchClass = SImm9OffsetFB32Operand; -} -def simm9_offset_fb64 : Operand { - let ParserMatchClass = SImm9OffsetFB64Operand; -} -def simm9_offset_fb128 : Operand { - let ParserMatchClass = SImm9OffsetFB128Operand; -} - -def : InstAlias<"ldr $Rt, [$Rn, $offset]", - (LDURXi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>; -def : InstAlias<"ldr $Rt, [$Rn, $offset]", - (LDURWi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>; -def : InstAlias<"ldr $Rt, [$Rn, $offset]", - (LDURBi FPR8:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>; -def : InstAlias<"ldr $Rt, [$Rn, $offset]", - (LDURHi FPR16:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>; -def : InstAlias<"ldr $Rt, [$Rn, $offset]", - (LDURSi FPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>; -def : InstAlias<"ldr $Rt, [$Rn, $offset]", - (LDURDi FPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>; -def : InstAlias<"ldr $Rt, [$Rn, $offset]", - (LDURQi FPR128:$Rt, GPR64sp:$Rn, simm9_offset_fb128:$offset), 0>; - -// zextload -> i64 -def : Pat<(i64 (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))), - (SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>; -def : Pat<(i64 (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))), - (SUBREG_TO_REG (i64 0), (LDURHHi GPR64sp:$Rn, simm9:$offset), sub_32)>; - -// load sign-extended half-word -defm LDURSHW - : LoadUnscaled<0b01, 0, 0b11, GPR32, "ldursh", - [(set GPR32:$Rt, - (sextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>; -defm LDURSHX - : LoadUnscaled<0b01, 0, 0b10, GPR64, "ldursh", - [(set GPR64:$Rt, - (sextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>; - -// load sign-extended byte -defm LDURSBW - : LoadUnscaled<0b00, 0, 0b11, GPR32, "ldursb", - [(set GPR32:$Rt, - (sextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset)))]>; -defm LDURSBX - : LoadUnscaled<0b00, 0, 0b10, GPR64, "ldursb", - [(set GPR64:$Rt, - (sextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset)))]>; - -// load sign-extended word -defm LDURSW - : LoadUnscaled<0b10, 0, 0b10, GPR64, "ldursw", - [(set GPR64:$Rt, - (sextloadi32 (am_unscaled32 GPR64sp:$Rn, simm9:$offset)))]>; - -// zero and sign extending aliases from generic LDR* mnemonics to LDUR*. 
-def : InstAlias<"ldrb $Rt, [$Rn, $offset]", - (LDURBBi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>; -def : InstAlias<"ldrh $Rt, [$Rn, $offset]", - (LDURHHi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>; -def : InstAlias<"ldrsb $Rt, [$Rn, $offset]", - (LDURSBWi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>; -def : InstAlias<"ldrsb $Rt, [$Rn, $offset]", - (LDURSBXi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>; -def : InstAlias<"ldrsh $Rt, [$Rn, $offset]", - (LDURSHWi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>; -def : InstAlias<"ldrsh $Rt, [$Rn, $offset]", - (LDURSHXi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>; -def : InstAlias<"ldrsw $Rt, [$Rn, $offset]", - (LDURSWi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>; - -// Pre-fetch. -defm PRFUM : PrefetchUnscaled<0b11, 0, 0b10, "prfum", - [(ARM64Prefetch imm:$Rt, - (am_unscaled64 GPR64sp:$Rn, simm9:$offset))]>; - -//--- -// (unscaled immediate, unprivileged) -defm LDTRX : LoadUnprivileged<0b11, 0, 0b01, GPR64, "ldtr">; -defm LDTRW : LoadUnprivileged<0b10, 0, 0b01, GPR32, "ldtr">; - -defm LDTRH : LoadUnprivileged<0b01, 0, 0b01, GPR32, "ldtrh">; -defm LDTRB : LoadUnprivileged<0b00, 0, 0b01, GPR32, "ldtrb">; - -// load sign-extended half-word -defm LDTRSHW : LoadUnprivileged<0b01, 0, 0b11, GPR32, "ldtrsh">; -defm LDTRSHX : LoadUnprivileged<0b01, 0, 0b10, GPR64, "ldtrsh">; - -// load sign-extended byte -defm LDTRSBW : LoadUnprivileged<0b00, 0, 0b11, GPR32, "ldtrsb">; -defm LDTRSBX : LoadUnprivileged<0b00, 0, 0b10, GPR64, "ldtrsb">; - -// load sign-extended word -defm LDTRSW : LoadUnprivileged<0b10, 0, 0b10, GPR64, "ldtrsw">; - -//--- -// (immediate pre-indexed) -def LDRWpre : LoadPreIdx<0b10, 0, 0b01, GPR32, "ldr">; -def LDRXpre : LoadPreIdx<0b11, 0, 0b01, GPR64, "ldr">; -def LDRBpre : LoadPreIdx<0b00, 1, 0b01, FPR8, "ldr">; -def LDRHpre : LoadPreIdx<0b01, 1, 0b01, FPR16, "ldr">; -def LDRSpre : LoadPreIdx<0b10, 1, 0b01, FPR32, "ldr">; -def LDRDpre : LoadPreIdx<0b11, 1, 0b01, FPR64, "ldr">; -def LDRQpre : LoadPreIdx<0b00, 1, 0b11, FPR128, "ldr">; - -// load sign-extended half-word -def LDRSHWpre : LoadPreIdx<0b01, 0, 0b11, GPR32, "ldrsh">; -def LDRSHXpre : LoadPreIdx<0b01, 0, 0b10, GPR64, "ldrsh">; - -// load sign-extended byte -def LDRSBWpre : LoadPreIdx<0b00, 0, 0b11, GPR32, "ldrsb">; -def LDRSBXpre : LoadPreIdx<0b00, 0, 0b10, GPR64, "ldrsb">; - -// load zero-extended byte -def LDRBBpre : LoadPreIdx<0b00, 0, 0b01, GPR32, "ldrb">; -def LDRHHpre : LoadPreIdx<0b01, 0, 0b01, GPR32, "ldrh">; - -// load sign-extended word -def LDRSWpre : LoadPreIdx<0b10, 0, 0b10, GPR64, "ldrsw">; - -//--- -// (immediate post-indexed) -def LDRWpost : LoadPostIdx<0b10, 0, 0b01, GPR32, "ldr">; -def LDRXpost : LoadPostIdx<0b11, 0, 0b01, GPR64, "ldr">; -def LDRBpost : LoadPostIdx<0b00, 1, 0b01, FPR8, "ldr">; -def LDRHpost : LoadPostIdx<0b01, 1, 0b01, FPR16, "ldr">; -def LDRSpost : LoadPostIdx<0b10, 1, 0b01, FPR32, "ldr">; -def LDRDpost : LoadPostIdx<0b11, 1, 0b01, FPR64, "ldr">; -def LDRQpost : LoadPostIdx<0b00, 1, 0b11, FPR128, "ldr">; - -// load sign-extended half-word -def LDRSHWpost : LoadPostIdx<0b01, 0, 0b11, GPR32, "ldrsh">; -def LDRSHXpost : LoadPostIdx<0b01, 0, 0b10, GPR64, "ldrsh">; - -// load sign-extended byte -def LDRSBWpost : LoadPostIdx<0b00, 0, 0b11, GPR32, "ldrsb">; -def LDRSBXpost : LoadPostIdx<0b00, 0, 0b10, GPR64, "ldrsb">; - -// load zero-extended byte -def LDRBBpost : LoadPostIdx<0b00, 0, 0b01, GPR32, "ldrb">; -def LDRHHpost : LoadPostIdx<0b01, 0, 0b01, GPR32, "ldrh">; - -// load 
sign-extended word -def LDRSWpost : LoadPostIdx<0b10, 0, 0b10, GPR64, "ldrsw">; - -//===----------------------------------------------------------------------===// -// Store instructions. -//===----------------------------------------------------------------------===// - -// Pair (indexed, offset) -// FIXME: Use dedicated range-checked addressing mode operand here. -defm STPW : StorePairOffset<0b00, 0, GPR32, simm7s4, "stp">; -defm STPX : StorePairOffset<0b10, 0, GPR64, simm7s8, "stp">; -defm STPS : StorePairOffset<0b00, 1, FPR32, simm7s4, "stp">; -defm STPD : StorePairOffset<0b01, 1, FPR64, simm7s8, "stp">; -defm STPQ : StorePairOffset<0b10, 1, FPR128, simm7s16, "stp">; - -// Pair (pre-indexed) -def STPWpre : StorePairPreIdx<0b00, 0, GPR32, simm7s4, "stp">; -def STPXpre : StorePairPreIdx<0b10, 0, GPR64, simm7s8, "stp">; -def STPSpre : StorePairPreIdx<0b00, 1, FPR32, simm7s4, "stp">; -def STPDpre : StorePairPreIdx<0b01, 1, FPR64, simm7s8, "stp">; -def STPQpre : StorePairPreIdx<0b10, 1, FPR128, simm7s16, "stp">; - -// Pair (pre-indexed) -def STPWpost : StorePairPostIdx<0b00, 0, GPR32, simm7s4, "stp">; -def STPXpost : StorePairPostIdx<0b10, 0, GPR64, simm7s8, "stp">; -def STPSpost : StorePairPostIdx<0b00, 1, FPR32, simm7s4, "stp">; -def STPDpost : StorePairPostIdx<0b01, 1, FPR64, simm7s8, "stp">; -def STPQpost : StorePairPostIdx<0b10, 1, FPR128, simm7s16, "stp">; - -// Pair (no allocate) -defm STNPW : StorePairNoAlloc<0b00, 0, GPR32, simm7s4, "stnp">; -defm STNPX : StorePairNoAlloc<0b10, 0, GPR64, simm7s8, "stnp">; -defm STNPS : StorePairNoAlloc<0b00, 1, FPR32, simm7s4, "stnp">; -defm STNPD : StorePairNoAlloc<0b01, 1, FPR64, simm7s8, "stnp">; -defm STNPQ : StorePairNoAlloc<0b10, 1, FPR128, simm7s16, "stnp">; - -//--- -// (Register offset) - -// Integer -defm STRBB : Store8RO< 0b00, 0, 0b00, GPR32, "strb", i32, truncstorei8>; -defm STRHH : Store16RO<0b01, 0, 0b00, GPR32, "strh", i32, truncstorei16>; -defm STRW : Store32RO<0b10, 0, 0b00, GPR32, "str", i32, store>; -defm STRX : Store64RO<0b11, 0, 0b00, GPR64, "str", i64, store>; - - -// Floating-point -defm STRB : Store8RO< 0b00, 1, 0b00, FPR8, "str", untyped, store>; -defm STRH : Store16RO<0b01, 1, 0b00, FPR16, "str", f16, store>; -defm STRS : Store32RO<0b10, 1, 0b00, FPR32, "str", f32, store>; -defm STRD : Store64RO<0b11, 1, 0b00, FPR64, "str", f64, store>; -defm STRQ : Store128RO<0b00, 1, 0b10, FPR128, "str", f128, store>; - -multiclass TruncStoreFrom64ROPat { - - def : Pat<(storeop GPR64:$Rt, - (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)), - (STRW (EXTRACT_SUBREG GPR64:$Rt, sub_32), - GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>; - - def : Pat<(storeop GPR64:$Rt, - (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)), - (STRX (EXTRACT_SUBREG GPR64:$Rt, sub_32), - GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>; -} - -let AddedComplexity = 10 in { - // truncstore i64 - defm : TruncStoreFrom64ROPat; - defm : TruncStoreFrom64ROPat; - defm : TruncStoreFrom64ROPat; -} - -multiclass VecROStorePat { - def : Pat<(store (VecTy FPR:$Rt), - (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)), - (STRW FPR:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>; - - def : Pat<(store (VecTy FPR:$Rt), - (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)), - (STRX FPR:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>; -} - -let AddedComplexity = 10 in { -// Match all store 64 bits width whose type is compatible with FPR64 -let Predicates = [IsLE] in { - // We must use ST1 to store vectors in big-endian. 
- defm : VecROStorePat; - defm : VecROStorePat; - defm : VecROStorePat; - defm : VecROStorePat; -} - -defm : VecROStorePat; -defm : VecROStorePat; - -// Match all store 128 bits width whose type is compatible with FPR128 -let Predicates = [IsLE] in { - // We must use ST1 to store vectors in big-endian. - defm : VecROStorePat; - defm : VecROStorePat; - defm : VecROStorePat; - defm : VecROStorePat; - defm : VecROStorePat; - defm : VecROStorePat; -} -} // AddedComplexity = 10 - -//--- -// (unsigned immediate) -defm STRX : StoreUI<0b11, 0, 0b00, GPR64, uimm12s8, "str", - [(store GPR64:$Rt, - (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))]>; -defm STRW : StoreUI<0b10, 0, 0b00, GPR32, uimm12s4, "str", - [(store GPR32:$Rt, - (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))]>; -defm STRB : StoreUI<0b00, 1, 0b00, FPR8, uimm12s1, "str", - [(store FPR8:$Rt, - (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))]>; -defm STRH : StoreUI<0b01, 1, 0b00, FPR16, uimm12s2, "str", - [(store (f16 FPR16:$Rt), - (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))]>; -defm STRS : StoreUI<0b10, 1, 0b00, FPR32, uimm12s4, "str", - [(store (f32 FPR32:$Rt), - (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))]>; -defm STRD : StoreUI<0b11, 1, 0b00, FPR64, uimm12s8, "str", - [(store (f64 FPR64:$Rt), - (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))]>; -defm STRQ : StoreUI<0b00, 1, 0b10, FPR128, uimm12s16, "str", []>; - -defm STRHH : StoreUI<0b01, 0, 0b00, GPR32, uimm12s2, "strh", - [(truncstorei16 GPR32:$Rt, - (am_indexed16 GPR64sp:$Rn, - uimm12s2:$offset))]>; -defm STRBB : StoreUI<0b00, 0, 0b00, GPR32, uimm12s1, "strb", - [(truncstorei8 GPR32:$Rt, - (am_indexed8 GPR64sp:$Rn, - uimm12s1:$offset))]>; - -// Match all store 64 bits width whose type is compatible with FPR64 -let AddedComplexity = 10 in { -let Predicates = [IsLE] in { - // We must use ST1 to store vectors in big-endian. - def : Pat<(store (v2f32 FPR64:$Rt), - (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)), - (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>; - def : Pat<(store (v8i8 FPR64:$Rt), - (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)), - (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>; - def : Pat<(store (v4i16 FPR64:$Rt), - (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)), - (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>; - def : Pat<(store (v2i32 FPR64:$Rt), - (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)), - (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>; -} -def : Pat<(store (v1f64 FPR64:$Rt), - (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)), - (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>; -def : Pat<(store (v1i64 FPR64:$Rt), - (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)), - (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>; - -// Match all store 128 bits width whose type is compatible with FPR128 -let Predicates = [IsLE] in { - // We must use ST1 to store vectors in big-endian. 
- def : Pat<(store (v4f32 FPR128:$Rt), - (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)), - (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>; - def : Pat<(store (v2f64 FPR128:$Rt), - (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)), - (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>; - def : Pat<(store (v16i8 FPR128:$Rt), - (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)), - (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>; - def : Pat<(store (v8i16 FPR128:$Rt), - (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)), - (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>; - def : Pat<(store (v4i32 FPR128:$Rt), - (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)), - (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>; - def : Pat<(store (v2i64 FPR128:$Rt), - (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)), - (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>; -} -def : Pat<(store (f128 FPR128:$Rt), - (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)), - (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>; - -// truncstore i64 -def : Pat<(truncstorei32 GPR64:$Rt, - (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset)), - (STRWui (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, uimm12s4:$offset)>; -def : Pat<(truncstorei16 GPR64:$Rt, - (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset)), - (STRHHui (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, uimm12s2:$offset)>; -def : Pat<(truncstorei8 GPR64:$Rt, (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset)), - (STRBBui (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, uimm12s1:$offset)>; - -} // AddedComplexity = 10 - -//--- -// (unscaled immediate) -defm STURX : StoreUnscaled<0b11, 0, 0b00, GPR64, "stur", - [(store GPR64:$Rt, - (am_unscaled64 GPR64sp:$Rn, simm9:$offset))]>; -defm STURW : StoreUnscaled<0b10, 0, 0b00, GPR32, "stur", - [(store GPR32:$Rt, - (am_unscaled32 GPR64sp:$Rn, simm9:$offset))]>; -defm STURB : StoreUnscaled<0b00, 1, 0b00, FPR8, "stur", - [(store FPR8:$Rt, - (am_unscaled8 GPR64sp:$Rn, simm9:$offset))]>; -defm STURH : StoreUnscaled<0b01, 1, 0b00, FPR16, "stur", - [(store (f16 FPR16:$Rt), - (am_unscaled16 GPR64sp:$Rn, simm9:$offset))]>; -defm STURS : StoreUnscaled<0b10, 1, 0b00, FPR32, "stur", - [(store (f32 FPR32:$Rt), - (am_unscaled32 GPR64sp:$Rn, simm9:$offset))]>; -defm STURD : StoreUnscaled<0b11, 1, 0b00, FPR64, "stur", - [(store (f64 FPR64:$Rt), - (am_unscaled64 GPR64sp:$Rn, simm9:$offset))]>; -defm STURQ : StoreUnscaled<0b00, 1, 0b10, FPR128, "stur", - [(store (f128 FPR128:$Rt), - (am_unscaled128 GPR64sp:$Rn, simm9:$offset))]>; -defm STURHH : StoreUnscaled<0b01, 0, 0b00, GPR32, "sturh", - [(truncstorei16 GPR32:$Rt, - (am_unscaled16 GPR64sp:$Rn, simm9:$offset))]>; -defm STURBB : StoreUnscaled<0b00, 0, 0b00, GPR32, "sturb", - [(truncstorei8 GPR32:$Rt, - (am_unscaled8 GPR64sp:$Rn, simm9:$offset))]>; - -// Match all store 64 bits width whose type is compatible with FPR64 -let Predicates = [IsLE] in { - // We must use ST1 to store vectors in big-endian. 
- def : Pat<(store (v2f32 FPR64:$Rt), - (am_unscaled64 GPR64sp:$Rn, simm9:$offset)), - (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>; - def : Pat<(store (v8i8 FPR64:$Rt), - (am_unscaled64 GPR64sp:$Rn, simm9:$offset)), - (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>; - def : Pat<(store (v4i16 FPR64:$Rt), - (am_unscaled64 GPR64sp:$Rn, simm9:$offset)), - (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>; - def : Pat<(store (v2i32 FPR64:$Rt), - (am_unscaled64 GPR64sp:$Rn, simm9:$offset)), - (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>; -} -def : Pat<(store (v1f64 FPR64:$Rt), (am_unscaled64 GPR64sp:$Rn, simm9:$offset)), - (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>; -def : Pat<(store (v1i64 FPR64:$Rt), (am_unscaled64 GPR64sp:$Rn, simm9:$offset)), - (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>; - -// Match all store 128 bits width whose type is compatible with FPR128 -let Predicates = [IsLE] in { - // We must use ST1 to store vectors in big-endian. - def : Pat<(store (v4f32 FPR128:$Rt), - (am_unscaled128 GPR64sp:$Rn, simm9:$offset)), - (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>; - def : Pat<(store (v2f64 FPR128:$Rt), - (am_unscaled128 GPR64sp:$Rn, simm9:$offset)), - (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>; - def : Pat<(store (v16i8 FPR128:$Rt), - (am_unscaled128 GPR64sp:$Rn, simm9:$offset)), - (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>; - def : Pat<(store (v8i16 FPR128:$Rt), - (am_unscaled128 GPR64sp:$Rn, simm9:$offset)), - (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>; - def : Pat<(store (v4i32 FPR128:$Rt), - (am_unscaled128 GPR64sp:$Rn, simm9:$offset)), - (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>; - def : Pat<(store (v2i64 FPR128:$Rt), - (am_unscaled128 GPR64sp:$Rn, simm9:$offset)), - (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>; - def : Pat<(store (v2f64 FPR128:$Rt), - (am_unscaled128 GPR64sp:$Rn, simm9:$offset)), - (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>; -} - -// unscaled i64 truncating stores -def : Pat<(truncstorei32 GPR64:$Rt, (am_unscaled32 GPR64sp:$Rn, simm9:$offset)), - (STURWi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>; -def : Pat<(truncstorei16 GPR64:$Rt, (am_unscaled16 GPR64sp:$Rn, simm9:$offset)), - (STURHHi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>; -def : Pat<(truncstorei8 GPR64:$Rt, (am_unscaled8 GPR64sp:$Rn, simm9:$offset)), - (STURBBi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>; - -//--- -// STR mnemonics fall back to STUR for negative or unaligned offsets. 
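A hypothetical snippet (not from the patch) showing both points at the assembly level: a truncating store of an i64 just stores the W (or halfword/byte) view of the same register, and the unscaled STUR forms accept the negative offsets that the scaled STR encoding cannot.

    stur    w0, [x1, #-4]         // illustration: stores bits [31:0] of x0; the truncation is simply using w0
    sturh   w0, [x1, #-2]         // stores bits [15:0] of x0
    stur    q0, [x1, #-16]        // the unscaled form is also available for the FP/SIMD registers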
-def : InstAlias<"str $Rt, [$Rn, $offset]", - (STURXi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>; -def : InstAlias<"str $Rt, [$Rn, $offset]", - (STURWi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>; -def : InstAlias<"str $Rt, [$Rn, $offset]", - (STURBi FPR8:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>; -def : InstAlias<"str $Rt, [$Rn, $offset]", - (STURHi FPR16:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>; -def : InstAlias<"str $Rt, [$Rn, $offset]", - (STURSi FPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>; -def : InstAlias<"str $Rt, [$Rn, $offset]", - (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>; -def : InstAlias<"str $Rt, [$Rn, $offset]", - (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9_offset_fb128:$offset), 0>; - -def : InstAlias<"strb $Rt, [$Rn, $offset]", - (STURBBi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>; -def : InstAlias<"strh $Rt, [$Rn, $offset]", - (STURHHi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>; - -//--- -// (unscaled immediate, unprivileged) -defm STTRW : StoreUnprivileged<0b10, 0, 0b00, GPR32, "sttr">; -defm STTRX : StoreUnprivileged<0b11, 0, 0b00, GPR64, "sttr">; - -defm STTRH : StoreUnprivileged<0b01, 0, 0b00, GPR32, "sttrh">; -defm STTRB : StoreUnprivileged<0b00, 0, 0b00, GPR32, "sttrb">; - -//--- -// (immediate pre-indexed) -def STRWpre : StorePreIdx<0b10, 0, 0b00, GPR32, "str", pre_store, i32>; -def STRXpre : StorePreIdx<0b11, 0, 0b00, GPR64, "str", pre_store, i64>; -def STRBpre : StorePreIdx<0b00, 1, 0b00, FPR8, "str", pre_store, untyped>; -def STRHpre : StorePreIdx<0b01, 1, 0b00, FPR16, "str", pre_store, f16>; -def STRSpre : StorePreIdx<0b10, 1, 0b00, FPR32, "str", pre_store, f32>; -def STRDpre : StorePreIdx<0b11, 1, 0b00, FPR64, "str", pre_store, f64>; -def STRQpre : StorePreIdx<0b00, 1, 0b10, FPR128, "str", pre_store, f128>; - -def STRBBpre : StorePreIdx<0b00, 0, 0b00, GPR32, "strb", pre_truncsti8, i32>; -def STRHHpre : StorePreIdx<0b01, 0, 0b00, GPR32, "strh", pre_truncsti16, i32>; - -// truncstore i64 -def : Pat<(pre_truncsti32 GPR64:$Rt, GPR64sp:$addr, simm9:$off), - (STRWpre (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr, - simm9:$off)>; -def : Pat<(pre_truncsti16 GPR64:$Rt, GPR64sp:$addr, simm9:$off), - (STRHHpre (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr, - simm9:$off)>; -def : Pat<(pre_truncsti8 GPR64:$Rt, GPR64sp:$addr, simm9:$off), - (STRBBpre (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr, - simm9:$off)>; - -def : Pat<(pre_store (v8i8 FPR64:$Rt), GPR64sp:$addr, simm9:$off), - (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; -def : Pat<(pre_store (v4i16 FPR64:$Rt), GPR64sp:$addr, simm9:$off), - (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; -def : Pat<(pre_store (v2i32 FPR64:$Rt), GPR64sp:$addr, simm9:$off), - (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; -def : Pat<(pre_store (v2f32 FPR64:$Rt), GPR64sp:$addr, simm9:$off), - (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; -def : Pat<(pre_store (v1i64 FPR64:$Rt), GPR64sp:$addr, simm9:$off), - (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; -def : Pat<(pre_store (v1f64 FPR64:$Rt), GPR64sp:$addr, simm9:$off), - (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; - -def : Pat<(pre_store (v16i8 FPR128:$Rt), GPR64sp:$addr, simm9:$off), - (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; -def : Pat<(pre_store (v8i16 FPR128:$Rt), GPR64sp:$addr, simm9:$off), - (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; -def : Pat<(pre_store (v4i32 FPR128:$Rt), GPR64sp:$addr, simm9:$off), - (STRQpre FPR128:$Rt, GPR64sp:$addr, 
simm9:$off)>; -def : Pat<(pre_store (v4f32 FPR128:$Rt), GPR64sp:$addr, simm9:$off), - (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; -def : Pat<(pre_store (v2i64 FPR128:$Rt), GPR64sp:$addr, simm9:$off), - (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; -def : Pat<(pre_store (v2f64 FPR128:$Rt), GPR64sp:$addr, simm9:$off), - (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; - -//--- -// (immediate post-indexed) -def STRWpost : StorePostIdx<0b10, 0, 0b00, GPR32, "str", post_store, i32>; -def STRXpost : StorePostIdx<0b11, 0, 0b00, GPR64, "str", post_store, i64>; -def STRBpost : StorePostIdx<0b00, 1, 0b00, FPR8, "str", post_store, untyped>; -def STRHpost : StorePostIdx<0b01, 1, 0b00, FPR16, "str", post_store, f16>; -def STRSpost : StorePostIdx<0b10, 1, 0b00, FPR32, "str", post_store, f32>; -def STRDpost : StorePostIdx<0b11, 1, 0b00, FPR64, "str", post_store, f64>; -def STRQpost : StorePostIdx<0b00, 1, 0b10, FPR128, "str", post_store, f128>; - -def STRBBpost : StorePostIdx<0b00, 0, 0b00, GPR32, "strb", post_truncsti8, i32>; -def STRHHpost : StorePostIdx<0b01, 0, 0b00, GPR32, "strh", post_truncsti16, i32>; - -// truncstore i64 -def : Pat<(post_truncsti32 GPR64:$Rt, GPR64sp:$addr, simm9:$off), - (STRWpost (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr, - simm9:$off)>; -def : Pat<(post_truncsti16 GPR64:$Rt, GPR64sp:$addr, simm9:$off), - (STRHHpost (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr, - simm9:$off)>; -def : Pat<(post_truncsti8 GPR64:$Rt, GPR64sp:$addr, simm9:$off), - (STRBBpost (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr, - simm9:$off)>; - -def : Pat<(post_store (v8i8 FPR64:$Rt), GPR64sp:$addr, simm9:$off), - (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; -def : Pat<(post_store (v4i16 FPR64:$Rt), GPR64sp:$addr, simm9:$off), - (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; -def : Pat<(post_store (v2i32 FPR64:$Rt), GPR64sp:$addr, simm9:$off), - (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; -def : Pat<(post_store (v2f32 FPR64:$Rt), GPR64sp:$addr, simm9:$off), - (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; -def : Pat<(post_store (v1i64 FPR64:$Rt), GPR64sp:$addr, simm9:$off), - (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; -def : Pat<(post_store (v1f64 FPR64:$Rt), GPR64sp:$addr, simm9:$off), - (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; - -def : Pat<(post_store (v16i8 FPR128:$Rt), GPR64sp:$addr, simm9:$off), - (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; -def : Pat<(post_store (v8i16 FPR128:$Rt), GPR64sp:$addr, simm9:$off), - (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; -def : Pat<(post_store (v4i32 FPR128:$Rt), GPR64sp:$addr, simm9:$off), - (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; -def : Pat<(post_store (v4f32 FPR128:$Rt), GPR64sp:$addr, simm9:$off), - (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; -def : Pat<(post_store (v2i64 FPR128:$Rt), GPR64sp:$addr, simm9:$off), - (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; -def : Pat<(post_store (v2f64 FPR128:$Rt), GPR64sp:$addr, simm9:$off), - (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; - -//===----------------------------------------------------------------------===// -// Load/store exclusive instructions. 
-//===----------------------------------------------------------------------===// - -def LDARW : LoadAcquire <0b10, 1, 1, 0, 1, GPR32, "ldar">; -def LDARX : LoadAcquire <0b11, 1, 1, 0, 1, GPR64, "ldar">; -def LDARB : LoadAcquire <0b00, 1, 1, 0, 1, GPR32, "ldarb">; -def LDARH : LoadAcquire <0b01, 1, 1, 0, 1, GPR32, "ldarh">; - -def LDAXRW : LoadExclusive <0b10, 0, 1, 0, 1, GPR32, "ldaxr">; -def LDAXRX : LoadExclusive <0b11, 0, 1, 0, 1, GPR64, "ldaxr">; -def LDAXRB : LoadExclusive <0b00, 0, 1, 0, 1, GPR32, "ldaxrb">; -def LDAXRH : LoadExclusive <0b01, 0, 1, 0, 1, GPR32, "ldaxrh">; - -def LDXRW : LoadExclusive <0b10, 0, 1, 0, 0, GPR32, "ldxr">; -def LDXRX : LoadExclusive <0b11, 0, 1, 0, 0, GPR64, "ldxr">; -def LDXRB : LoadExclusive <0b00, 0, 1, 0, 0, GPR32, "ldxrb">; -def LDXRH : LoadExclusive <0b01, 0, 1, 0, 0, GPR32, "ldxrh">; - -def STLRW : StoreRelease <0b10, 1, 0, 0, 1, GPR32, "stlr">; -def STLRX : StoreRelease <0b11, 1, 0, 0, 1, GPR64, "stlr">; -def STLRB : StoreRelease <0b00, 1, 0, 0, 1, GPR32, "stlrb">; -def STLRH : StoreRelease <0b01, 1, 0, 0, 1, GPR32, "stlrh">; - -def STLXRW : StoreExclusive<0b10, 0, 0, 0, 1, GPR32, "stlxr">; -def STLXRX : StoreExclusive<0b11, 0, 0, 0, 1, GPR64, "stlxr">; -def STLXRB : StoreExclusive<0b00, 0, 0, 0, 1, GPR32, "stlxrb">; -def STLXRH : StoreExclusive<0b01, 0, 0, 0, 1, GPR32, "stlxrh">; - -def STXRW : StoreExclusive<0b10, 0, 0, 0, 0, GPR32, "stxr">; -def STXRX : StoreExclusive<0b11, 0, 0, 0, 0, GPR64, "stxr">; -def STXRB : StoreExclusive<0b00, 0, 0, 0, 0, GPR32, "stxrb">; -def STXRH : StoreExclusive<0b01, 0, 0, 0, 0, GPR32, "stxrh">; - -def LDAXPW : LoadExclusivePair<0b10, 0, 1, 1, 1, GPR32, "ldaxp">; -def LDAXPX : LoadExclusivePair<0b11, 0, 1, 1, 1, GPR64, "ldaxp">; - -def LDXPW : LoadExclusivePair<0b10, 0, 1, 1, 0, GPR32, "ldxp">; -def LDXPX : LoadExclusivePair<0b11, 0, 1, 1, 0, GPR64, "ldxp">; - -def STLXPW : StoreExclusivePair<0b10, 0, 0, 1, 1, GPR32, "stlxp">; -def STLXPX : StoreExclusivePair<0b11, 0, 0, 1, 1, GPR64, "stlxp">; - -def STXPW : StoreExclusivePair<0b10, 0, 0, 1, 0, GPR32, "stxp">; -def STXPX : StoreExclusivePair<0b11, 0, 0, 1, 0, GPR64, "stxp">; - -//===----------------------------------------------------------------------===// -// Scaled floating point to integer conversion instructions. 
-//===----------------------------------------------------------------------===// - -defm FCVTAS : FPToIntegerUnscaled<0b00, 0b100, "fcvtas", int_arm64_neon_fcvtas>; -defm FCVTAU : FPToIntegerUnscaled<0b00, 0b101, "fcvtau", int_arm64_neon_fcvtau>; -defm FCVTMS : FPToIntegerUnscaled<0b10, 0b000, "fcvtms", int_arm64_neon_fcvtms>; -defm FCVTMU : FPToIntegerUnscaled<0b10, 0b001, "fcvtmu", int_arm64_neon_fcvtmu>; -defm FCVTNS : FPToIntegerUnscaled<0b00, 0b000, "fcvtns", int_arm64_neon_fcvtns>; -defm FCVTNU : FPToIntegerUnscaled<0b00, 0b001, "fcvtnu", int_arm64_neon_fcvtnu>; -defm FCVTPS : FPToIntegerUnscaled<0b01, 0b000, "fcvtps", int_arm64_neon_fcvtps>; -defm FCVTPU : FPToIntegerUnscaled<0b01, 0b001, "fcvtpu", int_arm64_neon_fcvtpu>; -defm FCVTZS : FPToIntegerUnscaled<0b11, 0b000, "fcvtzs", fp_to_sint>; -defm FCVTZU : FPToIntegerUnscaled<0b11, 0b001, "fcvtzu", fp_to_uint>; -defm FCVTZS : FPToIntegerScaled<0b11, 0b000, "fcvtzs", fp_to_sint>; -defm FCVTZU : FPToIntegerScaled<0b11, 0b001, "fcvtzu", fp_to_uint>; -let isCodeGenOnly = 1 in { -defm FCVTZS_Int : FPToIntegerUnscaled<0b11, 0b000, "fcvtzs", int_arm64_neon_fcvtzs>; -defm FCVTZU_Int : FPToIntegerUnscaled<0b11, 0b001, "fcvtzu", int_arm64_neon_fcvtzu>; -defm FCVTZS_Int : FPToIntegerScaled<0b11, 0b000, "fcvtzs", int_arm64_neon_fcvtzs>; -defm FCVTZU_Int : FPToIntegerScaled<0b11, 0b001, "fcvtzu", int_arm64_neon_fcvtzu>; -} - -//===----------------------------------------------------------------------===// -// Scaled integer to floating point conversion instructions. -//===----------------------------------------------------------------------===// - -defm SCVTF : IntegerToFP<0, "scvtf", sint_to_fp>; -defm UCVTF : IntegerToFP<1, "ucvtf", uint_to_fp>; - -//===----------------------------------------------------------------------===// -// Unscaled integer to floating point conversion instruction. -//===----------------------------------------------------------------------===// - -defm FMOV : UnscaledConversion<"fmov">; - -def : Pat<(f32 (fpimm0)), (FMOVWSr WZR)>, Requires<[NoZCZ]>; -def : Pat<(f64 (fpimm0)), (FMOVXDr XZR)>, Requires<[NoZCZ]>; - -//===----------------------------------------------------------------------===// -// Floating point conversion instruction. -//===----------------------------------------------------------------------===// - -defm FCVT : FPConversion<"fcvt">; - -def : Pat<(f32_to_f16 FPR32:$Rn), - (i32 (COPY_TO_REGCLASS - (f32 (SUBREG_TO_REG (i32 0), (FCVTHSr FPR32:$Rn), hsub)), - GPR32))>; - -def FCVTSHpseudo : Pseudo<(outs FPR32:$Rd), (ins FPR32:$Rn), - [(set (f32 FPR32:$Rd), (f16_to_f32 i32:$Rn))]>; - -//===----------------------------------------------------------------------===// -// Floating point single operand instructions. -//===----------------------------------------------------------------------===// - -defm FABS : SingleOperandFPData<0b0001, "fabs", fabs>; -defm FMOV : SingleOperandFPData<0b0000, "fmov">; -defm FNEG : SingleOperandFPData<0b0010, "fneg", fneg>; -defm FRINTA : SingleOperandFPData<0b1100, "frinta", frnd>; -defm FRINTI : SingleOperandFPData<0b1111, "frinti", fnearbyint>; -defm FRINTM : SingleOperandFPData<0b1010, "frintm", ffloor>; -defm FRINTN : SingleOperandFPData<0b1000, "frintn", int_arm64_neon_frintn>; -defm FRINTP : SingleOperandFPData<0b1001, "frintp", fceil>; - -def : Pat<(v1f64 (int_arm64_neon_frintn (v1f64 FPR64:$Rn))), - (FRINTNDr FPR64:$Rn)>; - -// FRINTX is inserted to set the flags as required by FENV_ACCESS ON behavior -// in the C spec. 
Setting hasSideEffects ensures it is not DCE'd. -// -// TODO: We should really model the FPSR flags correctly. This is really ugly. -let hasSideEffects = 1 in { -defm FRINTX : SingleOperandFPData<0b1110, "frintx", frint>; -} - -defm FRINTZ : SingleOperandFPData<0b1011, "frintz", ftrunc>; - -let SchedRW = [WriteFDiv] in { -defm FSQRT : SingleOperandFPData<0b0011, "fsqrt", fsqrt>; -} - -//===----------------------------------------------------------------------===// -// Floating point two operand instructions. -//===----------------------------------------------------------------------===// - -defm FADD : TwoOperandFPData<0b0010, "fadd", fadd>; -let SchedRW = [WriteFDiv] in { -defm FDIV : TwoOperandFPData<0b0001, "fdiv", fdiv>; -} -defm FMAXNM : TwoOperandFPData<0b0110, "fmaxnm", int_arm64_neon_fmaxnm>; -defm FMAX : TwoOperandFPData<0b0100, "fmax", ARM64fmax>; -defm FMINNM : TwoOperandFPData<0b0111, "fminnm", int_arm64_neon_fminnm>; -defm FMIN : TwoOperandFPData<0b0101, "fmin", ARM64fmin>; -let SchedRW = [WriteFMul] in { -defm FMUL : TwoOperandFPData<0b0000, "fmul", fmul>; -defm FNMUL : TwoOperandFPDataNeg<0b1000, "fnmul", fmul>; -} -defm FSUB : TwoOperandFPData<0b0011, "fsub", fsub>; - -def : Pat<(v1f64 (ARM64fmax (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), - (FMAXDrr FPR64:$Rn, FPR64:$Rm)>; -def : Pat<(v1f64 (ARM64fmin (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), - (FMINDrr FPR64:$Rn, FPR64:$Rm)>; -def : Pat<(v1f64 (int_arm64_neon_fmaxnm (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), - (FMAXNMDrr FPR64:$Rn, FPR64:$Rm)>; -def : Pat<(v1f64 (int_arm64_neon_fminnm (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), - (FMINNMDrr FPR64:$Rn, FPR64:$Rm)>; - -//===----------------------------------------------------------------------===// -// Floating point three operand instructions. -//===----------------------------------------------------------------------===// - -defm FMADD : ThreeOperandFPData<0, 0, "fmadd", fma>; -defm FMSUB : ThreeOperandFPData<0, 1, "fmsub", - TriOpFrag<(fma node:$LHS, (fneg node:$MHS), node:$RHS)> >; -defm FNMADD : ThreeOperandFPData<1, 0, "fnmadd", - TriOpFrag<(fneg (fma node:$LHS, node:$MHS, node:$RHS))> >; -defm FNMSUB : ThreeOperandFPData<1, 1, "fnmsub", - TriOpFrag<(fma node:$LHS, node:$MHS, (fneg node:$RHS))> >; - -// The following def pats catch the case where the LHS of an FMA is negated. -// The TriOpFrag above catches the case where the middle operand is negated. - -// N.b. FMSUB etc have the accumulator at the *end* of (outs), unlike -// the NEON variant. -def : Pat<(f32 (fma (fneg FPR32:$Rn), FPR32:$Rm, FPR32:$Ra)), - (FMSUBSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>; - -def : Pat<(f64 (fma (fneg FPR64:$Rn), FPR64:$Rm, FPR64:$Ra)), - (FMSUBDrrr FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>; - -// We handled -(a + b*c) for FNMADD above, now it's time for "(-a) + (-b)*c" and -// "(-a) + b*(-c)". -def : Pat<(f32 (fma (fneg FPR32:$Rn), FPR32:$Rm, (fneg FPR32:$Ra))), - (FNMADDSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>; - -def : Pat<(f64 (fma (fneg FPR64:$Rn), FPR64:$Rm, (fneg FPR64:$Ra))), - (FNMADDDrrr FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>; - -def : Pat<(f32 (fma FPR32:$Rn, (fneg FPR32:$Rm), (fneg FPR32:$Ra))), - (FNMADDSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>; - -def : Pat<(f64 (fma FPR64:$Rn, (fneg FPR64:$Rm), (fneg FPR64:$Ra))), - (FNMADDDrrr FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>; - -//===----------------------------------------------------------------------===// -// Floating point comparison instructions. 
-//===----------------------------------------------------------------------===// - -defm FCMPE : FPComparison<1, "fcmpe">; -defm FCMP : FPComparison<0, "fcmp", ARM64fcmp>; - -//===----------------------------------------------------------------------===// -// Floating point conditional comparison instructions. -//===----------------------------------------------------------------------===// - -defm FCCMPE : FPCondComparison<1, "fccmpe">; -defm FCCMP : FPCondComparison<0, "fccmp">; - -//===----------------------------------------------------------------------===// -// Floating point conditional select instruction. -//===----------------------------------------------------------------------===// - -defm FCSEL : FPCondSelect<"fcsel">; - -// CSEL instructions providing f128 types need to be handled by a -// pseudo-instruction since the eventual code will need to introduce basic -// blocks and control flow. -def F128CSEL : Pseudo<(outs FPR128:$Rd), - (ins FPR128:$Rn, FPR128:$Rm, ccode:$cond), - [(set (f128 FPR128:$Rd), - (ARM64csel FPR128:$Rn, FPR128:$Rm, - (i32 imm:$cond), NZCV))]> { - let Uses = [NZCV]; - let usesCustomInserter = 1; -} - - -//===----------------------------------------------------------------------===// -// Floating point immediate move. -//===----------------------------------------------------------------------===// - -let isReMaterializable = 1 in { -defm FMOV : FPMoveImmediate<"fmov">; -} - -//===----------------------------------------------------------------------===// -// Advanced SIMD two vector instructions. -//===----------------------------------------------------------------------===// - -defm ABS : SIMDTwoVectorBHSD<0, 0b01011, "abs", int_arm64_neon_abs>; -defm CLS : SIMDTwoVectorBHS<0, 0b00100, "cls", int_arm64_neon_cls>; -defm CLZ : SIMDTwoVectorBHS<1, 0b00100, "clz", ctlz>; -defm CMEQ : SIMDCmpTwoVector<0, 0b01001, "cmeq", ARM64cmeqz>; -defm CMGE : SIMDCmpTwoVector<1, 0b01000, "cmge", ARM64cmgez>; -defm CMGT : SIMDCmpTwoVector<0, 0b01000, "cmgt", ARM64cmgtz>; -defm CMLE : SIMDCmpTwoVector<1, 0b01001, "cmle", ARM64cmlez>; -defm CMLT : SIMDCmpTwoVector<0, 0b01010, "cmlt", ARM64cmltz>; -defm CNT : SIMDTwoVectorB<0, 0b00, 0b00101, "cnt", ctpop>; -defm FABS : SIMDTwoVectorFP<0, 1, 0b01111, "fabs", fabs>; - -defm FCMEQ : SIMDFPCmpTwoVector<0, 1, 0b01101, "fcmeq", ARM64fcmeqz>; -defm FCMGE : SIMDFPCmpTwoVector<1, 1, 0b01100, "fcmge", ARM64fcmgez>; -defm FCMGT : SIMDFPCmpTwoVector<0, 1, 0b01100, "fcmgt", ARM64fcmgtz>; -defm FCMLE : SIMDFPCmpTwoVector<1, 1, 0b01101, "fcmle", ARM64fcmlez>; -defm FCMLT : SIMDFPCmpTwoVector<0, 1, 0b01110, "fcmlt", ARM64fcmltz>; -defm FCVTAS : SIMDTwoVectorFPToInt<0,0,0b11100, "fcvtas",int_arm64_neon_fcvtas>; -defm FCVTAU : SIMDTwoVectorFPToInt<1,0,0b11100, "fcvtau",int_arm64_neon_fcvtau>; -defm FCVTL : SIMDFPWidenTwoVector<0, 0, 0b10111, "fcvtl">; -def : Pat<(v4f32 (int_arm64_neon_vcvthf2fp (v4i16 V64:$Rn))), - (FCVTLv4i16 V64:$Rn)>; -def : Pat<(v4f32 (int_arm64_neon_vcvthf2fp (extract_subvector (v8i16 V128:$Rn), - (i64 4)))), - (FCVTLv8i16 V128:$Rn)>; -def : Pat<(v2f64 (fextend (v2f32 V64:$Rn))), (FCVTLv2i32 V64:$Rn)>; -def : Pat<(v2f64 (fextend (v2f32 (extract_subvector (v4f32 V128:$Rn), - (i64 2))))), - (FCVTLv4i32 V128:$Rn)>; - -defm FCVTMS : SIMDTwoVectorFPToInt<0,0,0b11011, "fcvtms",int_arm64_neon_fcvtms>; -defm FCVTMU : SIMDTwoVectorFPToInt<1,0,0b11011, "fcvtmu",int_arm64_neon_fcvtmu>; -defm FCVTNS : SIMDTwoVectorFPToInt<0,0,0b11010, "fcvtns",int_arm64_neon_fcvtns>; -defm FCVTNU : SIMDTwoVectorFPToInt<1,0,0b11010, 
"fcvtnu",int_arm64_neon_fcvtnu>; -defm FCVTN : SIMDFPNarrowTwoVector<0, 0, 0b10110, "fcvtn">; -def : Pat<(v4i16 (int_arm64_neon_vcvtfp2hf (v4f32 V128:$Rn))), - (FCVTNv4i16 V128:$Rn)>; -def : Pat<(concat_vectors V64:$Rd, - (v4i16 (int_arm64_neon_vcvtfp2hf (v4f32 V128:$Rn)))), - (FCVTNv8i16 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; -def : Pat<(v2f32 (fround (v2f64 V128:$Rn))), (FCVTNv2i32 V128:$Rn)>; -def : Pat<(concat_vectors V64:$Rd, (v2f32 (fround (v2f64 V128:$Rn)))), - (FCVTNv4i32 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; -defm FCVTPS : SIMDTwoVectorFPToInt<0,1,0b11010, "fcvtps",int_arm64_neon_fcvtps>; -defm FCVTPU : SIMDTwoVectorFPToInt<1,1,0b11010, "fcvtpu",int_arm64_neon_fcvtpu>; -defm FCVTXN : SIMDFPInexactCvtTwoVector<1, 0, 0b10110, "fcvtxn", - int_arm64_neon_fcvtxn>; -defm FCVTZS : SIMDTwoVectorFPToInt<0, 1, 0b11011, "fcvtzs", fp_to_sint>; -defm FCVTZU : SIMDTwoVectorFPToInt<1, 1, 0b11011, "fcvtzu", fp_to_uint>; -let isCodeGenOnly = 1 in { -defm FCVTZS_Int : SIMDTwoVectorFPToInt<0, 1, 0b11011, "fcvtzs", - int_arm64_neon_fcvtzs>; -defm FCVTZU_Int : SIMDTwoVectorFPToInt<1, 1, 0b11011, "fcvtzu", - int_arm64_neon_fcvtzu>; -} -defm FNEG : SIMDTwoVectorFP<1, 1, 0b01111, "fneg", fneg>; -defm FRECPE : SIMDTwoVectorFP<0, 1, 0b11101, "frecpe", int_arm64_neon_frecpe>; -defm FRINTA : SIMDTwoVectorFP<1, 0, 0b11000, "frinta", frnd>; -defm FRINTI : SIMDTwoVectorFP<1, 1, 0b11001, "frinti", fnearbyint>; -defm FRINTM : SIMDTwoVectorFP<0, 0, 0b11001, "frintm", ffloor>; -defm FRINTN : SIMDTwoVectorFP<0, 0, 0b11000, "frintn", int_arm64_neon_frintn>; -defm FRINTP : SIMDTwoVectorFP<0, 1, 0b11000, "frintp", fceil>; -defm FRINTX : SIMDTwoVectorFP<1, 0, 0b11001, "frintx", frint>; -defm FRINTZ : SIMDTwoVectorFP<0, 1, 0b11001, "frintz", ftrunc>; -defm FRSQRTE: SIMDTwoVectorFP<1, 1, 0b11101, "frsqrte", int_arm64_neon_frsqrte>; -defm FSQRT : SIMDTwoVectorFP<1, 1, 0b11111, "fsqrt", fsqrt>; -defm NEG : SIMDTwoVectorBHSD<1, 0b01011, "neg", - UnOpFrag<(sub immAllZerosV, node:$LHS)> >; -defm NOT : SIMDTwoVectorB<1, 0b00, 0b00101, "not", vnot>; -// Aliases for MVN -> NOT. 
-def : InstAlias<"mvn{ $Vd.8b, $Vn.8b|.8b $Vd, $Vn}", - (NOTv8i8 V64:$Vd, V64:$Vn)>; -def : InstAlias<"mvn{ $Vd.16b, $Vn.16b|.16b $Vd, $Vn}", - (NOTv16i8 V128:$Vd, V128:$Vn)>; - -def : Pat<(ARM64neg (v8i8 V64:$Rn)), (NEGv8i8 V64:$Rn)>; -def : Pat<(ARM64neg (v16i8 V128:$Rn)), (NEGv16i8 V128:$Rn)>; -def : Pat<(ARM64neg (v4i16 V64:$Rn)), (NEGv4i16 V64:$Rn)>; -def : Pat<(ARM64neg (v8i16 V128:$Rn)), (NEGv8i16 V128:$Rn)>; -def : Pat<(ARM64neg (v2i32 V64:$Rn)), (NEGv2i32 V64:$Rn)>; -def : Pat<(ARM64neg (v4i32 V128:$Rn)), (NEGv4i32 V128:$Rn)>; -def : Pat<(ARM64neg (v2i64 V128:$Rn)), (NEGv2i64 V128:$Rn)>; - -def : Pat<(ARM64not (v8i8 V64:$Rn)), (NOTv8i8 V64:$Rn)>; -def : Pat<(ARM64not (v16i8 V128:$Rn)), (NOTv16i8 V128:$Rn)>; -def : Pat<(ARM64not (v4i16 V64:$Rn)), (NOTv8i8 V64:$Rn)>; -def : Pat<(ARM64not (v8i16 V128:$Rn)), (NOTv16i8 V128:$Rn)>; -def : Pat<(ARM64not (v2i32 V64:$Rn)), (NOTv8i8 V64:$Rn)>; -def : Pat<(ARM64not (v1i64 V64:$Rn)), (NOTv8i8 V64:$Rn)>; -def : Pat<(ARM64not (v4i32 V128:$Rn)), (NOTv16i8 V128:$Rn)>; -def : Pat<(ARM64not (v2i64 V128:$Rn)), (NOTv16i8 V128:$Rn)>; - -def : Pat<(vnot (v4i16 V64:$Rn)), (NOTv8i8 V64:$Rn)>; -def : Pat<(vnot (v8i16 V128:$Rn)), (NOTv16i8 V128:$Rn)>; -def : Pat<(vnot (v2i32 V64:$Rn)), (NOTv8i8 V64:$Rn)>; -def : Pat<(vnot (v4i32 V128:$Rn)), (NOTv16i8 V128:$Rn)>; -def : Pat<(vnot (v2i64 V128:$Rn)), (NOTv16i8 V128:$Rn)>; - -defm RBIT : SIMDTwoVectorB<1, 0b01, 0b00101, "rbit", int_arm64_neon_rbit>; -defm REV16 : SIMDTwoVectorB<0, 0b00, 0b00001, "rev16", ARM64rev16>; -defm REV32 : SIMDTwoVectorBH<1, 0b00000, "rev32", ARM64rev32>; -defm REV64 : SIMDTwoVectorBHS<0, 0b00000, "rev64", ARM64rev64>; -defm SADALP : SIMDLongTwoVectorTied<0, 0b00110, "sadalp", - BinOpFrag<(add node:$LHS, (int_arm64_neon_saddlp node:$RHS))> >; -defm SADDLP : SIMDLongTwoVector<0, 0b00010, "saddlp", int_arm64_neon_saddlp>; -defm SCVTF : SIMDTwoVectorIntToFP<0, 0, 0b11101, "scvtf", sint_to_fp>; -defm SHLL : SIMDVectorLShiftLongBySizeBHS; -defm SQABS : SIMDTwoVectorBHSD<0, 0b00111, "sqabs", int_arm64_neon_sqabs>; -defm SQNEG : SIMDTwoVectorBHSD<1, 0b00111, "sqneg", int_arm64_neon_sqneg>; -defm SQXTN : SIMDMixedTwoVector<0, 0b10100, "sqxtn", int_arm64_neon_sqxtn>; -defm SQXTUN : SIMDMixedTwoVector<1, 0b10010, "sqxtun", int_arm64_neon_sqxtun>; -defm SUQADD : SIMDTwoVectorBHSDTied<0, 0b00011, "suqadd",int_arm64_neon_suqadd>; -defm UADALP : SIMDLongTwoVectorTied<1, 0b00110, "uadalp", - BinOpFrag<(add node:$LHS, (int_arm64_neon_uaddlp node:$RHS))> >; -defm UADDLP : SIMDLongTwoVector<1, 0b00010, "uaddlp", - int_arm64_neon_uaddlp>; -defm UCVTF : SIMDTwoVectorIntToFP<1, 0, 0b11101, "ucvtf", uint_to_fp>; -defm UQXTN : SIMDMixedTwoVector<1, 0b10100, "uqxtn", int_arm64_neon_uqxtn>; -defm URECPE : SIMDTwoVectorS<0, 1, 0b11100, "urecpe", int_arm64_neon_urecpe>; -defm URSQRTE: SIMDTwoVectorS<1, 1, 0b11100, "ursqrte", int_arm64_neon_ursqrte>; -defm USQADD : SIMDTwoVectorBHSDTied<1, 0b00011, "usqadd",int_arm64_neon_usqadd>; -defm XTN : SIMDMixedTwoVector<0, 0b10010, "xtn", trunc>; - -def : Pat<(v2f32 (ARM64rev64 V64:$Rn)), (REV64v2i32 V64:$Rn)>; -def : Pat<(v4f32 (ARM64rev64 V128:$Rn)), (REV64v4i32 V128:$Rn)>; - -// Patterns for vector long shift (by element width). These need to match all -// three of zext, sext and anyext so it's easier to pull the patterns out of the -// definition. 
-multiclass SIMDVectorLShiftLongBySizeBHSPats<SDPatternOperator ext> {
-  def : Pat<(ARM64vshl (v8i16 (ext (v8i8 V64:$Rn))), (i32 8)),
-            (SHLLv8i8 V64:$Rn)>;
-  def : Pat<(ARM64vshl (v8i16 (ext (extract_high_v16i8 V128:$Rn))), (i32 8)),
-            (SHLLv16i8 V128:$Rn)>;
-  def : Pat<(ARM64vshl (v4i32 (ext (v4i16 V64:$Rn))), (i32 16)),
-            (SHLLv4i16 V64:$Rn)>;
-  def : Pat<(ARM64vshl (v4i32 (ext (extract_high_v8i16 V128:$Rn))), (i32 16)),
-            (SHLLv8i16 V128:$Rn)>;
-  def : Pat<(ARM64vshl (v2i64 (ext (v2i32 V64:$Rn))), (i32 32)),
-            (SHLLv2i32 V64:$Rn)>;
-  def : Pat<(ARM64vshl (v2i64 (ext (extract_high_v4i32 V128:$Rn))), (i32 32)),
-            (SHLLv4i32 V128:$Rn)>;
-}
-
-defm : SIMDVectorLShiftLongBySizeBHSPats<anyext>;
-defm : SIMDVectorLShiftLongBySizeBHSPats<zext>;
-defm : SIMDVectorLShiftLongBySizeBHSPats<sext>;
-
-//===----------------------------------------------------------------------===//
-// Advanced SIMD three vector instructions.
-//===----------------------------------------------------------------------===//
-
-defm ADD : SIMDThreeSameVector<0, 0b10000, "add", add>;
-defm ADDP : SIMDThreeSameVector<0, 0b10111, "addp", int_arm64_neon_addp>;
-defm CMEQ : SIMDThreeSameVector<1, 0b10001, "cmeq", ARM64cmeq>;
-defm CMGE : SIMDThreeSameVector<0, 0b00111, "cmge", ARM64cmge>;
-defm CMGT : SIMDThreeSameVector<0, 0b00110, "cmgt", ARM64cmgt>;
-defm CMHI : SIMDThreeSameVector<1, 0b00110, "cmhi", ARM64cmhi>;
-defm CMHS : SIMDThreeSameVector<1, 0b00111, "cmhs", ARM64cmhs>;
-defm CMTST : SIMDThreeSameVector<0, 0b10001, "cmtst", ARM64cmtst>;
-defm FABD : SIMDThreeSameVectorFP<1,1,0b11010,"fabd", int_arm64_neon_fabd>;
-defm FACGE : SIMDThreeSameVectorFPCmp<1,0,0b11101,"facge",int_arm64_neon_facge>;
-defm FACGT : SIMDThreeSameVectorFPCmp<1,1,0b11101,"facgt",int_arm64_neon_facgt>;
-defm FADDP : SIMDThreeSameVectorFP<1,0,0b11010,"faddp",int_arm64_neon_addp>;
-defm FADD : SIMDThreeSameVectorFP<0,0,0b11010,"fadd", fadd>;
-defm FCMEQ : SIMDThreeSameVectorFPCmp<0, 0, 0b11100, "fcmeq", ARM64fcmeq>;
-defm FCMGE : SIMDThreeSameVectorFPCmp<1, 0, 0b11100, "fcmge", ARM64fcmge>;
-defm FCMGT : SIMDThreeSameVectorFPCmp<1, 1, 0b11100, "fcmgt", ARM64fcmgt>;
-defm FDIV : SIMDThreeSameVectorFP<1,0,0b11111,"fdiv", fdiv>;
-defm FMAXNMP : SIMDThreeSameVectorFP<1,0,0b11000,"fmaxnmp", int_arm64_neon_fmaxnmp>;
-defm FMAXNM : SIMDThreeSameVectorFP<0,0,0b11000,"fmaxnm", int_arm64_neon_fmaxnm>;
-defm FMAXP : SIMDThreeSameVectorFP<1,0,0b11110,"fmaxp", int_arm64_neon_fmaxp>;
-defm FMAX : SIMDThreeSameVectorFP<0,0,0b11110,"fmax", ARM64fmax>;
-defm FMINNMP : SIMDThreeSameVectorFP<1,1,0b11000,"fminnmp", int_arm64_neon_fminnmp>;
-defm FMINNM : SIMDThreeSameVectorFP<0,1,0b11000,"fminnm", int_arm64_neon_fminnm>;
-defm FMINP : SIMDThreeSameVectorFP<1,1,0b11110,"fminp", int_arm64_neon_fminp>;
-defm FMIN : SIMDThreeSameVectorFP<0,1,0b11110,"fmin", ARM64fmin>;
-
-// NOTE: The operands of the PatFrag are reordered on FMLA/FMLS because the
-// instruction expects the addend first, while the fma intrinsic puts it last.
-defm FMLA : SIMDThreeSameVectorFPTied<0, 0, 0b11001, "fmla",
-    TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)> >;
-defm FMLS : SIMDThreeSameVectorFPTied<0, 1, 0b11001, "fmls",
-    TriOpFrag<(fma node:$MHS, (fneg node:$RHS), node:$LHS)> >;
-
-// The following def pats catch the case where the LHS of an FMA is negated.
-// The TriOpFrag above catches the case where the middle operand is negated.
-def : Pat<(v2f32 (fma (fneg V64:$Rn), V64:$Rm, V64:$Rd)), - (FMLSv2f32 V64:$Rd, V64:$Rn, V64:$Rm)>; - -def : Pat<(v4f32 (fma (fneg V128:$Rn), V128:$Rm, V128:$Rd)), - (FMLSv4f32 V128:$Rd, V128:$Rn, V128:$Rm)>; - -def : Pat<(v2f64 (fma (fneg V128:$Rn), V128:$Rm, V128:$Rd)), - (FMLSv2f64 V128:$Rd, V128:$Rn, V128:$Rm)>; - -defm FMULX : SIMDThreeSameVectorFP<0,0,0b11011,"fmulx", int_arm64_neon_fmulx>; -defm FMUL : SIMDThreeSameVectorFP<1,0,0b11011,"fmul", fmul>; -defm FRECPS : SIMDThreeSameVectorFP<0,0,0b11111,"frecps", int_arm64_neon_frecps>; -defm FRSQRTS : SIMDThreeSameVectorFP<0,1,0b11111,"frsqrts", int_arm64_neon_frsqrts>; -defm FSUB : SIMDThreeSameVectorFP<0,1,0b11010,"fsub", fsub>; -defm MLA : SIMDThreeSameVectorBHSTied<0, 0b10010, "mla", - TriOpFrag<(add node:$LHS, (mul node:$MHS, node:$RHS))> >; -defm MLS : SIMDThreeSameVectorBHSTied<1, 0b10010, "mls", - TriOpFrag<(sub node:$LHS, (mul node:$MHS, node:$RHS))> >; -defm MUL : SIMDThreeSameVectorBHS<0, 0b10011, "mul", mul>; -defm PMUL : SIMDThreeSameVectorB<1, 0b10011, "pmul", int_arm64_neon_pmul>; -defm SABA : SIMDThreeSameVectorBHSTied<0, 0b01111, "saba", - TriOpFrag<(add node:$LHS, (int_arm64_neon_sabd node:$MHS, node:$RHS))> >; -defm SABD : SIMDThreeSameVectorBHS<0,0b01110,"sabd", int_arm64_neon_sabd>; -defm SHADD : SIMDThreeSameVectorBHS<0,0b00000,"shadd", int_arm64_neon_shadd>; -defm SHSUB : SIMDThreeSameVectorBHS<0,0b00100,"shsub", int_arm64_neon_shsub>; -defm SMAXP : SIMDThreeSameVectorBHS<0,0b10100,"smaxp", int_arm64_neon_smaxp>; -defm SMAX : SIMDThreeSameVectorBHS<0,0b01100,"smax", int_arm64_neon_smax>; -defm SMINP : SIMDThreeSameVectorBHS<0,0b10101,"sminp", int_arm64_neon_sminp>; -defm SMIN : SIMDThreeSameVectorBHS<0,0b01101,"smin", int_arm64_neon_smin>; -defm SQADD : SIMDThreeSameVector<0,0b00001,"sqadd", int_arm64_neon_sqadd>; -defm SQDMULH : SIMDThreeSameVectorHS<0,0b10110,"sqdmulh",int_arm64_neon_sqdmulh>; -defm SQRDMULH : SIMDThreeSameVectorHS<1,0b10110,"sqrdmulh",int_arm64_neon_sqrdmulh>; -defm SQRSHL : SIMDThreeSameVector<0,0b01011,"sqrshl", int_arm64_neon_sqrshl>; -defm SQSHL : SIMDThreeSameVector<0,0b01001,"sqshl", int_arm64_neon_sqshl>; -defm SQSUB : SIMDThreeSameVector<0,0b00101,"sqsub", int_arm64_neon_sqsub>; -defm SRHADD : SIMDThreeSameVectorBHS<0,0b00010,"srhadd",int_arm64_neon_srhadd>; -defm SRSHL : SIMDThreeSameVector<0,0b01010,"srshl", int_arm64_neon_srshl>; -defm SSHL : SIMDThreeSameVector<0,0b01000,"sshl", int_arm64_neon_sshl>; -defm SUB : SIMDThreeSameVector<1,0b10000,"sub", sub>; -defm UABA : SIMDThreeSameVectorBHSTied<1, 0b01111, "uaba", - TriOpFrag<(add node:$LHS, (int_arm64_neon_uabd node:$MHS, node:$RHS))> >; -defm UABD : SIMDThreeSameVectorBHS<1,0b01110,"uabd", int_arm64_neon_uabd>; -defm UHADD : SIMDThreeSameVectorBHS<1,0b00000,"uhadd", int_arm64_neon_uhadd>; -defm UHSUB : SIMDThreeSameVectorBHS<1,0b00100,"uhsub", int_arm64_neon_uhsub>; -defm UMAXP : SIMDThreeSameVectorBHS<1,0b10100,"umaxp", int_arm64_neon_umaxp>; -defm UMAX : SIMDThreeSameVectorBHS<1,0b01100,"umax", int_arm64_neon_umax>; -defm UMINP : SIMDThreeSameVectorBHS<1,0b10101,"uminp", int_arm64_neon_uminp>; -defm UMIN : SIMDThreeSameVectorBHS<1,0b01101,"umin", int_arm64_neon_umin>; -defm UQADD : SIMDThreeSameVector<1,0b00001,"uqadd", int_arm64_neon_uqadd>; -defm UQRSHL : SIMDThreeSameVector<1,0b01011,"uqrshl", int_arm64_neon_uqrshl>; -defm UQSHL : SIMDThreeSameVector<1,0b01001,"uqshl", int_arm64_neon_uqshl>; -defm UQSUB : SIMDThreeSameVector<1,0b00101,"uqsub", int_arm64_neon_uqsub>; -defm URHADD : SIMDThreeSameVectorBHS<1,0b00010,"urhadd", 
int_arm64_neon_urhadd>; -defm URSHL : SIMDThreeSameVector<1,0b01010,"urshl", int_arm64_neon_urshl>; -defm USHL : SIMDThreeSameVector<1,0b01000,"ushl", int_arm64_neon_ushl>; - -defm AND : SIMDLogicalThreeVector<0, 0b00, "and", and>; -defm BIC : SIMDLogicalThreeVector<0, 0b01, "bic", - BinOpFrag<(and node:$LHS, (vnot node:$RHS))> >; -defm BIF : SIMDLogicalThreeVector<1, 0b11, "bif">; -defm BIT : SIMDLogicalThreeVectorTied<1, 0b10, "bit", ARM64bit>; -defm BSL : SIMDLogicalThreeVectorTied<1, 0b01, "bsl", - TriOpFrag<(or (and node:$LHS, node:$MHS), (and (vnot node:$LHS), node:$RHS))>>; -defm EOR : SIMDLogicalThreeVector<1, 0b00, "eor", xor>; -defm ORN : SIMDLogicalThreeVector<0, 0b11, "orn", - BinOpFrag<(or node:$LHS, (vnot node:$RHS))> >; -defm ORR : SIMDLogicalThreeVector<0, 0b10, "orr", or>; - -def : Pat<(ARM64bsl (v8i8 V64:$Rd), V64:$Rn, V64:$Rm), - (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; -def : Pat<(ARM64bsl (v4i16 V64:$Rd), V64:$Rn, V64:$Rm), - (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; -def : Pat<(ARM64bsl (v2i32 V64:$Rd), V64:$Rn, V64:$Rm), - (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; -def : Pat<(ARM64bsl (v1i64 V64:$Rd), V64:$Rn, V64:$Rm), - (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; - -def : Pat<(ARM64bsl (v16i8 V128:$Rd), V128:$Rn, V128:$Rm), - (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; -def : Pat<(ARM64bsl (v8i16 V128:$Rd), V128:$Rn, V128:$Rm), - (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; -def : Pat<(ARM64bsl (v4i32 V128:$Rd), V128:$Rn, V128:$Rm), - (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; -def : Pat<(ARM64bsl (v2i64 V128:$Rd), V128:$Rn, V128:$Rm), - (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; - -def : InstAlias<"mov{\t$dst.16b, $src.16b|.16b\t$dst, $src}", - (ORRv16i8 V128:$dst, V128:$src, V128:$src), 1>; -def : InstAlias<"mov{\t$dst.8h, $src.8h|.8h\t$dst, $src}", - (ORRv16i8 V128:$dst, V128:$src, V128:$src), 0>; -def : InstAlias<"mov{\t$dst.4s, $src.4s|.4s\t$dst, $src}", - (ORRv16i8 V128:$dst, V128:$src, V128:$src), 0>; -def : InstAlias<"mov{\t$dst.2d, $src.2d|.2d\t$dst, $src}", - (ORRv16i8 V128:$dst, V128:$src, V128:$src), 0>; - -def : InstAlias<"mov{\t$dst.8b, $src.8b|.8b\t$dst, $src}", - (ORRv8i8 V64:$dst, V64:$src, V64:$src), 1>; -def : InstAlias<"mov{\t$dst.4h, $src.4h|.4h\t$dst, $src}", - (ORRv8i8 V64:$dst, V64:$src, V64:$src), 0>; -def : InstAlias<"mov{\t$dst.2s, $src.2s|.2s\t$dst, $src}", - (ORRv8i8 V64:$dst, V64:$src, V64:$src), 0>; -def : InstAlias<"mov{\t$dst.1d, $src.1d|.1d\t$dst, $src}", - (ORRv8i8 V64:$dst, V64:$src, V64:$src), 0>; - -def : InstAlias<"{cmls\t$dst.8b, $src1.8b, $src2.8b" # - "|cmls.8b\t$dst, $src1, $src2}", - (CMHSv8i8 V64:$dst, V64:$src2, V64:$src1), 0>; -def : InstAlias<"{cmls\t$dst.16b, $src1.16b, $src2.16b" # - "|cmls.16b\t$dst, $src1, $src2}", - (CMHSv16i8 V128:$dst, V128:$src2, V128:$src1), 0>; -def : InstAlias<"{cmls\t$dst.4h, $src1.4h, $src2.4h" # - "|cmls.4h\t$dst, $src1, $src2}", - (CMHSv4i16 V64:$dst, V64:$src2, V64:$src1), 0>; -def : InstAlias<"{cmls\t$dst.8h, $src1.8h, $src2.8h" # - "|cmls.8h\t$dst, $src1, $src2}", - (CMHSv8i16 V128:$dst, V128:$src2, V128:$src1), 0>; -def : InstAlias<"{cmls\t$dst.2s, $src1.2s, $src2.2s" # - "|cmls.2s\t$dst, $src1, $src2}", - (CMHSv2i32 V64:$dst, V64:$src2, V64:$src1), 0>; -def : InstAlias<"{cmls\t$dst.4s, $src1.4s, $src2.4s" # - "|cmls.4s\t$dst, $src1, $src2}", - (CMHSv4i32 V128:$dst, V128:$src2, V128:$src1), 0>; -def : InstAlias<"{cmls\t$dst.2d, $src1.2d, $src2.2d" # - "|cmls.2d\t$dst, $src1, $src2}", - (CMHSv2i64 V128:$dst, V128:$src2, V128:$src1), 0>; - -def : InstAlias<"{cmlo\t$dst.8b, $src1.8b, $src2.8b" # - 
"|cmlo.8b\t$dst, $src1, $src2}", - (CMHIv8i8 V64:$dst, V64:$src2, V64:$src1), 0>; -def : InstAlias<"{cmlo\t$dst.16b, $src1.16b, $src2.16b" # - "|cmlo.16b\t$dst, $src1, $src2}", - (CMHIv16i8 V128:$dst, V128:$src2, V128:$src1), 0>; -def : InstAlias<"{cmlo\t$dst.4h, $src1.4h, $src2.4h" # - "|cmlo.4h\t$dst, $src1, $src2}", - (CMHIv4i16 V64:$dst, V64:$src2, V64:$src1), 0>; -def : InstAlias<"{cmlo\t$dst.8h, $src1.8h, $src2.8h" # - "|cmlo.8h\t$dst, $src1, $src2}", - (CMHIv8i16 V128:$dst, V128:$src2, V128:$src1), 0>; -def : InstAlias<"{cmlo\t$dst.2s, $src1.2s, $src2.2s" # - "|cmlo.2s\t$dst, $src1, $src2}", - (CMHIv2i32 V64:$dst, V64:$src2, V64:$src1), 0>; -def : InstAlias<"{cmlo\t$dst.4s, $src1.4s, $src2.4s" # - "|cmlo.4s\t$dst, $src1, $src2}", - (CMHIv4i32 V128:$dst, V128:$src2, V128:$src1), 0>; -def : InstAlias<"{cmlo\t$dst.2d, $src1.2d, $src2.2d" # - "|cmlo.2d\t$dst, $src1, $src2}", - (CMHIv2i64 V128:$dst, V128:$src2, V128:$src1), 0>; - -def : InstAlias<"{cmle\t$dst.8b, $src1.8b, $src2.8b" # - "|cmle.8b\t$dst, $src1, $src2}", - (CMGEv8i8 V64:$dst, V64:$src2, V64:$src1), 0>; -def : InstAlias<"{cmle\t$dst.16b, $src1.16b, $src2.16b" # - "|cmle.16b\t$dst, $src1, $src2}", - (CMGEv16i8 V128:$dst, V128:$src2, V128:$src1), 0>; -def : InstAlias<"{cmle\t$dst.4h, $src1.4h, $src2.4h" # - "|cmle.4h\t$dst, $src1, $src2}", - (CMGEv4i16 V64:$dst, V64:$src2, V64:$src1), 0>; -def : InstAlias<"{cmle\t$dst.8h, $src1.8h, $src2.8h" # - "|cmle.8h\t$dst, $src1, $src2}", - (CMGEv8i16 V128:$dst, V128:$src2, V128:$src1), 0>; -def : InstAlias<"{cmle\t$dst.2s, $src1.2s, $src2.2s" # - "|cmle.2s\t$dst, $src1, $src2}", - (CMGEv2i32 V64:$dst, V64:$src2, V64:$src1), 0>; -def : InstAlias<"{cmle\t$dst.4s, $src1.4s, $src2.4s" # - "|cmle.4s\t$dst, $src1, $src2}", - (CMGEv4i32 V128:$dst, V128:$src2, V128:$src1), 0>; -def : InstAlias<"{cmle\t$dst.2d, $src1.2d, $src2.2d" # - "|cmle.2d\t$dst, $src1, $src2}", - (CMGEv2i64 V128:$dst, V128:$src2, V128:$src1), 0>; - -def : InstAlias<"{cmlt\t$dst.8b, $src1.8b, $src2.8b" # - "|cmlt.8b\t$dst, $src1, $src2}", - (CMGTv8i8 V64:$dst, V64:$src2, V64:$src1), 0>; -def : InstAlias<"{cmlt\t$dst.16b, $src1.16b, $src2.16b" # - "|cmlt.16b\t$dst, $src1, $src2}", - (CMGTv16i8 V128:$dst, V128:$src2, V128:$src1), 0>; -def : InstAlias<"{cmlt\t$dst.4h, $src1.4h, $src2.4h" # - "|cmlt.4h\t$dst, $src1, $src2}", - (CMGTv4i16 V64:$dst, V64:$src2, V64:$src1), 0>; -def : InstAlias<"{cmlt\t$dst.8h, $src1.8h, $src2.8h" # - "|cmlt.8h\t$dst, $src1, $src2}", - (CMGTv8i16 V128:$dst, V128:$src2, V128:$src1), 0>; -def : InstAlias<"{cmlt\t$dst.2s, $src1.2s, $src2.2s" # - "|cmlt.2s\t$dst, $src1, $src2}", - (CMGTv2i32 V64:$dst, V64:$src2, V64:$src1), 0>; -def : InstAlias<"{cmlt\t$dst.4s, $src1.4s, $src2.4s" # - "|cmlt.4s\t$dst, $src1, $src2}", - (CMGTv4i32 V128:$dst, V128:$src2, V128:$src1), 0>; -def : InstAlias<"{cmlt\t$dst.2d, $src1.2d, $src2.2d" # - "|cmlt.2d\t$dst, $src1, $src2}", - (CMGTv2i64 V128:$dst, V128:$src2, V128:$src1), 0>; - -def : InstAlias<"{fcmle\t$dst.2s, $src1.2s, $src2.2s" # - "|fcmle.2s\t$dst, $src1, $src2}", - (FCMGEv2f32 V64:$dst, V64:$src2, V64:$src1), 0>; -def : InstAlias<"{fcmle\t$dst.4s, $src1.4s, $src2.4s" # - "|fcmle.4s\t$dst, $src1, $src2}", - (FCMGEv4f32 V128:$dst, V128:$src2, V128:$src1), 0>; -def : InstAlias<"{fcmle\t$dst.2d, $src1.2d, $src2.2d" # - "|fcmle.2d\t$dst, $src1, $src2}", - (FCMGEv2f64 V128:$dst, V128:$src2, V128:$src1), 0>; - -def : InstAlias<"{fcmlt\t$dst.2s, $src1.2s, $src2.2s" # - "|fcmlt.2s\t$dst, $src1, $src2}", - (FCMGTv2f32 V64:$dst, V64:$src2, V64:$src1), 0>; -def : 
InstAlias<"{fcmlt\t$dst.4s, $src1.4s, $src2.4s" # - "|fcmlt.4s\t$dst, $src1, $src2}", - (FCMGTv4f32 V128:$dst, V128:$src2, V128:$src1), 0>; -def : InstAlias<"{fcmlt\t$dst.2d, $src1.2d, $src2.2d" # - "|fcmlt.2d\t$dst, $src1, $src2}", - (FCMGTv2f64 V128:$dst, V128:$src2, V128:$src1), 0>; - -def : InstAlias<"{facle\t$dst.2s, $src1.2s, $src2.2s" # - "|facle.2s\t$dst, $src1, $src2}", - (FACGEv2f32 V64:$dst, V64:$src2, V64:$src1), 0>; -def : InstAlias<"{facle\t$dst.4s, $src1.4s, $src2.4s" # - "|facle.4s\t$dst, $src1, $src2}", - (FACGEv4f32 V128:$dst, V128:$src2, V128:$src1), 0>; -def : InstAlias<"{facle\t$dst.2d, $src1.2d, $src2.2d" # - "|facle.2d\t$dst, $src1, $src2}", - (FACGEv2f64 V128:$dst, V128:$src2, V128:$src1), 0>; - -def : InstAlias<"{faclt\t$dst.2s, $src1.2s, $src2.2s" # - "|faclt.2s\t$dst, $src1, $src2}", - (FACGTv2f32 V64:$dst, V64:$src2, V64:$src1), 0>; -def : InstAlias<"{faclt\t$dst.4s, $src1.4s, $src2.4s" # - "|faclt.4s\t$dst, $src1, $src2}", - (FACGTv4f32 V128:$dst, V128:$src2, V128:$src1), 0>; -def : InstAlias<"{faclt\t$dst.2d, $src1.2d, $src2.2d" # - "|faclt.2d\t$dst, $src1, $src2}", - (FACGTv2f64 V128:$dst, V128:$src2, V128:$src1), 0>; - -//===----------------------------------------------------------------------===// -// Advanced SIMD three scalar instructions. -//===----------------------------------------------------------------------===// - -defm ADD : SIMDThreeScalarD<0, 0b10000, "add", add>; -defm CMEQ : SIMDThreeScalarD<1, 0b10001, "cmeq", ARM64cmeq>; -defm CMGE : SIMDThreeScalarD<0, 0b00111, "cmge", ARM64cmge>; -defm CMGT : SIMDThreeScalarD<0, 0b00110, "cmgt", ARM64cmgt>; -defm CMHI : SIMDThreeScalarD<1, 0b00110, "cmhi", ARM64cmhi>; -defm CMHS : SIMDThreeScalarD<1, 0b00111, "cmhs", ARM64cmhs>; -defm CMTST : SIMDThreeScalarD<0, 0b10001, "cmtst", ARM64cmtst>; -defm FABD : SIMDThreeScalarSD<1, 1, 0b11010, "fabd", int_arm64_sisd_fabd>; -def : Pat<(v1f64 (int_arm64_neon_fabd (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), - (FABD64 FPR64:$Rn, FPR64:$Rm)>; -defm FACGE : SIMDThreeScalarFPCmp<1, 0, 0b11101, "facge", - int_arm64_neon_facge>; -defm FACGT : SIMDThreeScalarFPCmp<1, 1, 0b11101, "facgt", - int_arm64_neon_facgt>; -defm FCMEQ : SIMDThreeScalarFPCmp<0, 0, 0b11100, "fcmeq", ARM64fcmeq>; -defm FCMGE : SIMDThreeScalarFPCmp<1, 0, 0b11100, "fcmge", ARM64fcmge>; -defm FCMGT : SIMDThreeScalarFPCmp<1, 1, 0b11100, "fcmgt", ARM64fcmgt>; -defm FMULX : SIMDThreeScalarSD<0, 0, 0b11011, "fmulx", int_arm64_neon_fmulx>; -defm FRECPS : SIMDThreeScalarSD<0, 0, 0b11111, "frecps", int_arm64_neon_frecps>; -defm FRSQRTS : SIMDThreeScalarSD<0, 1, 0b11111, "frsqrts", int_arm64_neon_frsqrts>; -defm SQADD : SIMDThreeScalarBHSD<0, 0b00001, "sqadd", int_arm64_neon_sqadd>; -defm SQDMULH : SIMDThreeScalarHS< 0, 0b10110, "sqdmulh", int_arm64_neon_sqdmulh>; -defm SQRDMULH : SIMDThreeScalarHS< 1, 0b10110, "sqrdmulh", int_arm64_neon_sqrdmulh>; -defm SQRSHL : SIMDThreeScalarBHSD<0, 0b01011, "sqrshl",int_arm64_neon_sqrshl>; -defm SQSHL : SIMDThreeScalarBHSD<0, 0b01001, "sqshl", int_arm64_neon_sqshl>; -defm SQSUB : SIMDThreeScalarBHSD<0, 0b00101, "sqsub", int_arm64_neon_sqsub>; -defm SRSHL : SIMDThreeScalarD< 0, 0b01010, "srshl", int_arm64_neon_srshl>; -defm SSHL : SIMDThreeScalarD< 0, 0b01000, "sshl", int_arm64_neon_sshl>; -defm SUB : SIMDThreeScalarD< 1, 0b10000, "sub", sub>; -defm UQADD : SIMDThreeScalarBHSD<1, 0b00001, "uqadd", int_arm64_neon_uqadd>; -defm UQRSHL : SIMDThreeScalarBHSD<1, 0b01011, "uqrshl",int_arm64_neon_uqrshl>; -defm UQSHL : SIMDThreeScalarBHSD<1, 0b01001, "uqshl", int_arm64_neon_uqshl>; 
-defm UQSUB : SIMDThreeScalarBHSD<1, 0b00101, "uqsub", int_arm64_neon_uqsub>; -defm URSHL : SIMDThreeScalarD< 1, 0b01010, "urshl", int_arm64_neon_urshl>; -defm USHL : SIMDThreeScalarD< 1, 0b01000, "ushl", int_arm64_neon_ushl>; - -def : InstAlias<"cmls $dst, $src1, $src2", - (CMHSv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; -def : InstAlias<"cmle $dst, $src1, $src2", - (CMGEv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; -def : InstAlias<"cmlo $dst, $src1, $src2", - (CMHIv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; -def : InstAlias<"cmlt $dst, $src1, $src2", - (CMGTv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; -def : InstAlias<"fcmle $dst, $src1, $src2", - (FCMGE32 FPR32:$dst, FPR32:$src2, FPR32:$src1), 0>; -def : InstAlias<"fcmle $dst, $src1, $src2", - (FCMGE64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; -def : InstAlias<"fcmlt $dst, $src1, $src2", - (FCMGT32 FPR32:$dst, FPR32:$src2, FPR32:$src1), 0>; -def : InstAlias<"fcmlt $dst, $src1, $src2", - (FCMGT64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; -def : InstAlias<"facle $dst, $src1, $src2", - (FACGE32 FPR32:$dst, FPR32:$src2, FPR32:$src1), 0>; -def : InstAlias<"facle $dst, $src1, $src2", - (FACGE64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; -def : InstAlias<"faclt $dst, $src1, $src2", - (FACGT32 FPR32:$dst, FPR32:$src2, FPR32:$src1), 0>; -def : InstAlias<"faclt $dst, $src1, $src2", - (FACGT64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; - -//===----------------------------------------------------------------------===// -// Advanced SIMD three scalar instructions (mixed operands). -//===----------------------------------------------------------------------===// -defm SQDMULL : SIMDThreeScalarMixedHS<0, 0b11010, "sqdmull", - int_arm64_neon_sqdmulls_scalar>; -defm SQDMLAL : SIMDThreeScalarMixedTiedHS<0, 0b10010, "sqdmlal">; -defm SQDMLSL : SIMDThreeScalarMixedTiedHS<0, 0b10110, "sqdmlsl">; - -def : Pat<(i64 (int_arm64_neon_sqadd (i64 FPR64:$Rd), - (i64 (int_arm64_neon_sqdmulls_scalar (i32 FPR32:$Rn), - (i32 FPR32:$Rm))))), - (SQDMLALi32 FPR64:$Rd, FPR32:$Rn, FPR32:$Rm)>; -def : Pat<(i64 (int_arm64_neon_sqsub (i64 FPR64:$Rd), - (i64 (int_arm64_neon_sqdmulls_scalar (i32 FPR32:$Rn), - (i32 FPR32:$Rm))))), - (SQDMLSLi32 FPR64:$Rd, FPR32:$Rn, FPR32:$Rm)>; - -//===----------------------------------------------------------------------===// -// Advanced SIMD two scalar instructions. 
-//===----------------------------------------------------------------------===// - -defm ABS : SIMDTwoScalarD< 0, 0b01011, "abs", int_arm64_neon_abs>; -defm CMEQ : SIMDCmpTwoScalarD< 0, 0b01001, "cmeq", ARM64cmeqz>; -defm CMGE : SIMDCmpTwoScalarD< 1, 0b01000, "cmge", ARM64cmgez>; -defm CMGT : SIMDCmpTwoScalarD< 0, 0b01000, "cmgt", ARM64cmgtz>; -defm CMLE : SIMDCmpTwoScalarD< 1, 0b01001, "cmle", ARM64cmlez>; -defm CMLT : SIMDCmpTwoScalarD< 0, 0b01010, "cmlt", ARM64cmltz>; -defm FCMEQ : SIMDCmpTwoScalarSD<0, 1, 0b01101, "fcmeq", ARM64fcmeqz>; -defm FCMGE : SIMDCmpTwoScalarSD<1, 1, 0b01100, "fcmge", ARM64fcmgez>; -defm FCMGT : SIMDCmpTwoScalarSD<0, 1, 0b01100, "fcmgt", ARM64fcmgtz>; -defm FCMLE : SIMDCmpTwoScalarSD<1, 1, 0b01101, "fcmle", ARM64fcmlez>; -defm FCMLT : SIMDCmpTwoScalarSD<0, 1, 0b01110, "fcmlt", ARM64fcmltz>; -defm FCVTAS : SIMDTwoScalarSD< 0, 0, 0b11100, "fcvtas">; -defm FCVTAU : SIMDTwoScalarSD< 1, 0, 0b11100, "fcvtau">; -defm FCVTMS : SIMDTwoScalarSD< 0, 0, 0b11011, "fcvtms">; -defm FCVTMU : SIMDTwoScalarSD< 1, 0, 0b11011, "fcvtmu">; -defm FCVTNS : SIMDTwoScalarSD< 0, 0, 0b11010, "fcvtns">; -defm FCVTNU : SIMDTwoScalarSD< 1, 0, 0b11010, "fcvtnu">; -defm FCVTPS : SIMDTwoScalarSD< 0, 1, 0b11010, "fcvtps">; -defm FCVTPU : SIMDTwoScalarSD< 1, 1, 0b11010, "fcvtpu">; -def FCVTXNv1i64 : SIMDInexactCvtTwoScalar<0b10110, "fcvtxn">; -defm FCVTZS : SIMDTwoScalarSD< 0, 1, 0b11011, "fcvtzs">; -defm FCVTZU : SIMDTwoScalarSD< 1, 1, 0b11011, "fcvtzu">; -defm FRECPE : SIMDTwoScalarSD< 0, 1, 0b11101, "frecpe">; -defm FRECPX : SIMDTwoScalarSD< 0, 1, 0b11111, "frecpx">; -defm FRSQRTE : SIMDTwoScalarSD< 1, 1, 0b11101, "frsqrte">; -defm NEG : SIMDTwoScalarD< 1, 0b01011, "neg", - UnOpFrag<(sub immAllZerosV, node:$LHS)> >; -defm SCVTF : SIMDTwoScalarCVTSD< 0, 0, 0b11101, "scvtf", ARM64sitof>; -defm SQABS : SIMDTwoScalarBHSD< 0, 0b00111, "sqabs", int_arm64_neon_sqabs>; -defm SQNEG : SIMDTwoScalarBHSD< 1, 0b00111, "sqneg", int_arm64_neon_sqneg>; -defm SQXTN : SIMDTwoScalarMixedBHS< 0, 0b10100, "sqxtn", int_arm64_neon_scalar_sqxtn>; -defm SQXTUN : SIMDTwoScalarMixedBHS< 1, 0b10010, "sqxtun", int_arm64_neon_scalar_sqxtun>; -defm SUQADD : SIMDTwoScalarBHSDTied< 0, 0b00011, "suqadd", - int_arm64_neon_suqadd>; -defm UCVTF : SIMDTwoScalarCVTSD< 1, 0, 0b11101, "ucvtf", ARM64uitof>; -defm UQXTN : SIMDTwoScalarMixedBHS<1, 0b10100, "uqxtn", int_arm64_neon_scalar_uqxtn>; -defm USQADD : SIMDTwoScalarBHSDTied< 1, 0b00011, "usqadd", - int_arm64_neon_usqadd>; - -def : Pat<(ARM64neg (v1i64 V64:$Rn)), (NEGv1i64 V64:$Rn)>; - -def : Pat<(v1i64 (int_arm64_neon_fcvtas (v1f64 FPR64:$Rn))), - (FCVTASv1i64 FPR64:$Rn)>; -def : Pat<(v1i64 (int_arm64_neon_fcvtau (v1f64 FPR64:$Rn))), - (FCVTAUv1i64 FPR64:$Rn)>; -def : Pat<(v1i64 (int_arm64_neon_fcvtms (v1f64 FPR64:$Rn))), - (FCVTMSv1i64 FPR64:$Rn)>; -def : Pat<(v1i64 (int_arm64_neon_fcvtmu (v1f64 FPR64:$Rn))), - (FCVTMUv1i64 FPR64:$Rn)>; -def : Pat<(v1i64 (int_arm64_neon_fcvtns (v1f64 FPR64:$Rn))), - (FCVTNSv1i64 FPR64:$Rn)>; -def : Pat<(v1i64 (int_arm64_neon_fcvtnu (v1f64 FPR64:$Rn))), - (FCVTNUv1i64 FPR64:$Rn)>; -def : Pat<(v1i64 (int_arm64_neon_fcvtps (v1f64 FPR64:$Rn))), - (FCVTPSv1i64 FPR64:$Rn)>; -def : Pat<(v1i64 (int_arm64_neon_fcvtpu (v1f64 FPR64:$Rn))), - (FCVTPUv1i64 FPR64:$Rn)>; - -def : Pat<(f32 (int_arm64_neon_frecpe (f32 FPR32:$Rn))), - (FRECPEv1i32 FPR32:$Rn)>; -def : Pat<(f64 (int_arm64_neon_frecpe (f64 FPR64:$Rn))), - (FRECPEv1i64 FPR64:$Rn)>; -def : Pat<(v1f64 (int_arm64_neon_frecpe (v1f64 FPR64:$Rn))), - (FRECPEv1i64 FPR64:$Rn)>; - -def : Pat<(f32 
(int_arm64_neon_frecpx (f32 FPR32:$Rn))), - (FRECPXv1i32 FPR32:$Rn)>; -def : Pat<(f64 (int_arm64_neon_frecpx (f64 FPR64:$Rn))), - (FRECPXv1i64 FPR64:$Rn)>; - -def : Pat<(f32 (int_arm64_neon_frsqrte (f32 FPR32:$Rn))), - (FRSQRTEv1i32 FPR32:$Rn)>; -def : Pat<(f64 (int_arm64_neon_frsqrte (f64 FPR64:$Rn))), - (FRSQRTEv1i64 FPR64:$Rn)>; -def : Pat<(v1f64 (int_arm64_neon_frsqrte (v1f64 FPR64:$Rn))), - (FRSQRTEv1i64 FPR64:$Rn)>; - -// If an integer is about to be converted to a floating point value, -// just load it on the floating point unit. -// Here are the patterns for 8 and 16-bits to float. -// 8-bits -> float. -multiclass UIntToFPROLoadPat { - def : Pat<(DstTy (uint_to_fp (SrcTy - (loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, - ro.Wext:$extend))))), - (UCVTF (INSERT_SUBREG (DstTy (IMPLICIT_DEF)), - (LDRW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend), - sub))>; - - def : Pat<(DstTy (uint_to_fp (SrcTy - (loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, - ro.Wext:$extend))))), - (UCVTF (INSERT_SUBREG (DstTy (IMPLICIT_DEF)), - (LDRX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend), - sub))>; -} - -defm : UIntToFPROLoadPat; -def : Pat <(f32 (uint_to_fp (i32 - (zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))), - (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)), - (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub))>; -def : Pat <(f32 (uint_to_fp (i32 - (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))))), - (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)), - (LDURBi GPR64sp:$Rn, simm9:$offset), bsub))>; -// 16-bits -> float. -defm : UIntToFPROLoadPat; -def : Pat <(f32 (uint_to_fp (i32 - (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))), - (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)), - (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub))>; -def : Pat <(f32 (uint_to_fp (i32 - (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))))), - (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)), - (LDURHi GPR64sp:$Rn, simm9:$offset), hsub))>; -// 32-bits are handled in target specific dag combine: -// performIntToFpCombine. -// 64-bits integer to 32-bits floating point, not possible with -// UCVTF on floating point registers (both source and destination -// must have the same size). - -// Here are the patterns for 8, 16, 32, and 64-bits to double. -// 8-bits -> double. -defm : UIntToFPROLoadPat; -def : Pat <(f64 (uint_to_fp (i32 - (zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))), - (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)), - (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub))>; -def : Pat <(f64 (uint_to_fp (i32 - (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))))), - (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)), - (LDURBi GPR64sp:$Rn, simm9:$offset), bsub))>; -// 16-bits -> double. -defm : UIntToFPROLoadPat; -def : Pat <(f64 (uint_to_fp (i32 - (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))), - (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)), - (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub))>; -def : Pat <(f64 (uint_to_fp (i32 - (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))))), - (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)), - (LDURHi GPR64sp:$Rn, simm9:$offset), hsub))>; -// 32-bits -> double. 
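For reference, the source-level shapes these load-then-convert patterns are aimed at look roughly like the C below (illustrative sketch only, not part of the patch); the point is that the narrow zero-extending load can feed UCVTF directly in an FP register instead of going through a GPR:

/* Illustrative only: unsigned narrow loads converted straight to FP. */
#include <stdint.h>

float  u8_to_f32 (const uint8_t  *p) { return (float)*p;  }   /* 8 bits  -> float  */
float  u16_to_f32(const uint16_t *p) { return (float)*p;  }   /* 16 bits -> float  */
double u8_to_f64 (const uint8_t  *p) { return (double)*p; }   /* 8 bits  -> double */
double u16_to_f64(const uint16_t *p) { return (double)*p; }   /* 16 bits -> double */
double u32_to_f64(const uint32_t *p) { return (double)*p; }   /* 32 bits -> double */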
-defm : UIntToFPROLoadPat; -def : Pat <(f64 (uint_to_fp (i32 - (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))), - (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)), - (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub))>; -def : Pat <(f64 (uint_to_fp (i32 - (load (am_unscaled32 GPR64sp:$Rn, simm9:$offset))))), - (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)), - (LDURSi GPR64sp:$Rn, simm9:$offset), ssub))>; -// 64-bits -> double are handled in target specific dag combine: -// performIntToFpCombine. - -//===----------------------------------------------------------------------===// -// Advanced SIMD three different-sized vector instructions. -//===----------------------------------------------------------------------===// - -defm ADDHN : SIMDNarrowThreeVectorBHS<0,0b0100,"addhn", int_arm64_neon_addhn>; -defm SUBHN : SIMDNarrowThreeVectorBHS<0,0b0110,"subhn", int_arm64_neon_subhn>; -defm RADDHN : SIMDNarrowThreeVectorBHS<1,0b0100,"raddhn",int_arm64_neon_raddhn>; -defm RSUBHN : SIMDNarrowThreeVectorBHS<1,0b0110,"rsubhn",int_arm64_neon_rsubhn>; -defm PMULL : SIMDDifferentThreeVectorBD<0,0b1110,"pmull",int_arm64_neon_pmull>; -defm SABAL : SIMDLongThreeVectorTiedBHSabal<0,0b0101,"sabal", - int_arm64_neon_sabd>; -defm SABDL : SIMDLongThreeVectorBHSabdl<0, 0b0111, "sabdl", - int_arm64_neon_sabd>; -defm SADDL : SIMDLongThreeVectorBHS< 0, 0b0000, "saddl", - BinOpFrag<(add (sext node:$LHS), (sext node:$RHS))>>; -defm SADDW : SIMDWideThreeVectorBHS< 0, 0b0001, "saddw", - BinOpFrag<(add node:$LHS, (sext node:$RHS))>>; -defm SMLAL : SIMDLongThreeVectorTiedBHS<0, 0b1000, "smlal", - TriOpFrag<(add node:$LHS, (int_arm64_neon_smull node:$MHS, node:$RHS))>>; -defm SMLSL : SIMDLongThreeVectorTiedBHS<0, 0b1010, "smlsl", - TriOpFrag<(sub node:$LHS, (int_arm64_neon_smull node:$MHS, node:$RHS))>>; -defm SMULL : SIMDLongThreeVectorBHS<0, 0b1100, "smull", int_arm64_neon_smull>; -defm SQDMLAL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1001, "sqdmlal", - int_arm64_neon_sqadd>; -defm SQDMLSL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1011, "sqdmlsl", - int_arm64_neon_sqsub>; -defm SQDMULL : SIMDLongThreeVectorHS<0, 0b1101, "sqdmull", - int_arm64_neon_sqdmull>; -defm SSUBL : SIMDLongThreeVectorBHS<0, 0b0010, "ssubl", - BinOpFrag<(sub (sext node:$LHS), (sext node:$RHS))>>; -defm SSUBW : SIMDWideThreeVectorBHS<0, 0b0011, "ssubw", - BinOpFrag<(sub node:$LHS, (sext node:$RHS))>>; -defm UABAL : SIMDLongThreeVectorTiedBHSabal<1, 0b0101, "uabal", - int_arm64_neon_uabd>; -defm UABDL : SIMDLongThreeVectorBHSabdl<1, 0b0111, "uabdl", - int_arm64_neon_uabd>; -defm UADDL : SIMDLongThreeVectorBHS<1, 0b0000, "uaddl", - BinOpFrag<(add (zext node:$LHS), (zext node:$RHS))>>; -defm UADDW : SIMDWideThreeVectorBHS<1, 0b0001, "uaddw", - BinOpFrag<(add node:$LHS, (zext node:$RHS))>>; -defm UMLAL : SIMDLongThreeVectorTiedBHS<1, 0b1000, "umlal", - TriOpFrag<(add node:$LHS, (int_arm64_neon_umull node:$MHS, node:$RHS))>>; -defm UMLSL : SIMDLongThreeVectorTiedBHS<1, 0b1010, "umlsl", - TriOpFrag<(sub node:$LHS, (int_arm64_neon_umull node:$MHS, node:$RHS))>>; -defm UMULL : SIMDLongThreeVectorBHS<1, 0b1100, "umull", int_arm64_neon_umull>; -defm USUBL : SIMDLongThreeVectorBHS<1, 0b0010, "usubl", - BinOpFrag<(sub (zext node:$LHS), (zext node:$RHS))>>; -defm USUBW : SIMDWideThreeVectorBHS< 1, 0b0011, "usubw", - BinOpFrag<(sub node:$LHS, (zext node:$RHS))>>; - -// Patterns for 64-bit pmull -def : Pat<(int_arm64_neon_pmull64 V64:$Rn, V64:$Rm), - (PMULLv1i64 V64:$Rn, V64:$Rm)>; -def : Pat<(int_arm64_neon_pmull64 (vector_extract (v2i64 V128:$Rn), (i64 1)), - 
(vector_extract (v2i64 V128:$Rm), (i64 1))), - (PMULLv2i64 V128:$Rn, V128:$Rm)>; - -// CodeGen patterns for addhn and subhn instructions, which can actually be -// written in LLVM IR without too much difficulty. - -// ADDHN -def : Pat<(v8i8 (trunc (v8i16 (ARM64vlshr (add V128:$Rn, V128:$Rm), (i32 8))))), - (ADDHNv8i16_v8i8 V128:$Rn, V128:$Rm)>; -def : Pat<(v4i16 (trunc (v4i32 (ARM64vlshr (add V128:$Rn, V128:$Rm), - (i32 16))))), - (ADDHNv4i32_v4i16 V128:$Rn, V128:$Rm)>; -def : Pat<(v2i32 (trunc (v2i64 (ARM64vlshr (add V128:$Rn, V128:$Rm), - (i32 32))))), - (ADDHNv2i64_v2i32 V128:$Rn, V128:$Rm)>; -def : Pat<(concat_vectors (v8i8 V64:$Rd), - (trunc (v8i16 (ARM64vlshr (add V128:$Rn, V128:$Rm), - (i32 8))))), - (ADDHNv8i16_v16i8 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub), - V128:$Rn, V128:$Rm)>; -def : Pat<(concat_vectors (v4i16 V64:$Rd), - (trunc (v4i32 (ARM64vlshr (add V128:$Rn, V128:$Rm), - (i32 16))))), - (ADDHNv4i32_v8i16 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub), - V128:$Rn, V128:$Rm)>; -def : Pat<(concat_vectors (v2i32 V64:$Rd), - (trunc (v2i64 (ARM64vlshr (add V128:$Rn, V128:$Rm), - (i32 32))))), - (ADDHNv2i64_v4i32 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub), - V128:$Rn, V128:$Rm)>; - -// SUBHN -def : Pat<(v8i8 (trunc (v8i16 (ARM64vlshr (sub V128:$Rn, V128:$Rm), (i32 8))))), - (SUBHNv8i16_v8i8 V128:$Rn, V128:$Rm)>; -def : Pat<(v4i16 (trunc (v4i32 (ARM64vlshr (sub V128:$Rn, V128:$Rm), - (i32 16))))), - (SUBHNv4i32_v4i16 V128:$Rn, V128:$Rm)>; -def : Pat<(v2i32 (trunc (v2i64 (ARM64vlshr (sub V128:$Rn, V128:$Rm), - (i32 32))))), - (SUBHNv2i64_v2i32 V128:$Rn, V128:$Rm)>; -def : Pat<(concat_vectors (v8i8 V64:$Rd), - (trunc (v8i16 (ARM64vlshr (sub V128:$Rn, V128:$Rm), - (i32 8))))), - (SUBHNv8i16_v16i8 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub), - V128:$Rn, V128:$Rm)>; -def : Pat<(concat_vectors (v4i16 V64:$Rd), - (trunc (v4i32 (ARM64vlshr (sub V128:$Rn, V128:$Rm), - (i32 16))))), - (SUBHNv4i32_v8i16 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub), - V128:$Rn, V128:$Rm)>; -def : Pat<(concat_vectors (v2i32 V64:$Rd), - (trunc (v2i64 (ARM64vlshr (sub V128:$Rn, V128:$Rm), - (i32 32))))), - (SUBHNv2i64_v4i32 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub), - V128:$Rn, V128:$Rm)>; - -//---------------------------------------------------------------------------- -// AdvSIMD bitwise extract from vector instruction. -//---------------------------------------------------------------------------- - -defm EXT : SIMDBitwiseExtract<"ext">; - -def : Pat<(v4i16 (ARM64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))), - (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>; -def : Pat<(v8i16 (ARM64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))), - (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>; -def : Pat<(v2i32 (ARM64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))), - (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>; -def : Pat<(v2f32 (ARM64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))), - (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>; -def : Pat<(v4i32 (ARM64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))), - (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>; -def : Pat<(v4f32 (ARM64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))), - (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>; -def : Pat<(v2i64 (ARM64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))), - (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>; -def : Pat<(v2f64 (ARM64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))), - (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>; - -// We use EXT to handle extract_subvector to copy the upper 64-bits of a -// 128-bit vector. 
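As a rough illustration (not part of the patch), the per-lane C equivalent of the IR idiom the addhn/subhn patterns above match is: add or subtract in the wide type, shift right by half the element width, then truncate.

/* Illustrative only: scalar shape of the addhn/subhn idiom, one 16-bit lane. */
#include <stdint.h>

uint8_t addhn_lane(uint16_t a, uint16_t b) {
  return (uint8_t)((uint16_t)(a + b) >> 8);   /* high half of the wrapping sum */
}

uint8_t subhn_lane(uint16_t a, uint16_t b) {
  return (uint8_t)((uint16_t)(a - b) >> 8);   /* high half of the wrapping difference */
}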
-def : Pat<(v8i8 (extract_subvector V128:$Rn, (i64 8))), - (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>; -def : Pat<(v4i16 (extract_subvector V128:$Rn, (i64 4))), - (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>; -def : Pat<(v2i32 (extract_subvector V128:$Rn, (i64 2))), - (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>; -def : Pat<(v1i64 (extract_subvector V128:$Rn, (i64 1))), - (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>; -def : Pat<(v2f32 (extract_subvector V128:$Rn, (i64 2))), - (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>; -def : Pat<(v1f64 (extract_subvector V128:$Rn, (i64 1))), - (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>; - - -//---------------------------------------------------------------------------- -// AdvSIMD zip vector -//---------------------------------------------------------------------------- - -defm TRN1 : SIMDZipVector<0b010, "trn1", ARM64trn1>; -defm TRN2 : SIMDZipVector<0b110, "trn2", ARM64trn2>; -defm UZP1 : SIMDZipVector<0b001, "uzp1", ARM64uzp1>; -defm UZP2 : SIMDZipVector<0b101, "uzp2", ARM64uzp2>; -defm ZIP1 : SIMDZipVector<0b011, "zip1", ARM64zip1>; -defm ZIP2 : SIMDZipVector<0b111, "zip2", ARM64zip2>; - -//---------------------------------------------------------------------------- -// AdvSIMD TBL/TBX instructions -//---------------------------------------------------------------------------- - -defm TBL : SIMDTableLookup< 0, "tbl">; -defm TBX : SIMDTableLookupTied<1, "tbx">; - -def : Pat<(v8i8 (int_arm64_neon_tbl1 (v16i8 VecListOne128:$Rn), (v8i8 V64:$Ri))), - (TBLv8i8One VecListOne128:$Rn, V64:$Ri)>; -def : Pat<(v16i8 (int_arm64_neon_tbl1 (v16i8 V128:$Ri), (v16i8 V128:$Rn))), - (TBLv16i8One V128:$Ri, V128:$Rn)>; - -def : Pat<(v8i8 (int_arm64_neon_tbx1 (v8i8 V64:$Rd), - (v16i8 VecListOne128:$Rn), (v8i8 V64:$Ri))), - (TBXv8i8One V64:$Rd, VecListOne128:$Rn, V64:$Ri)>; -def : Pat<(v16i8 (int_arm64_neon_tbx1 (v16i8 V128:$Rd), - (v16i8 V128:$Ri), (v16i8 V128:$Rn))), - (TBXv16i8One V128:$Rd, V128:$Ri, V128:$Rn)>; - - -//---------------------------------------------------------------------------- -// AdvSIMD scalar CPY instruction -//---------------------------------------------------------------------------- - -defm CPY : SIMDScalarCPY<"cpy">; - -//---------------------------------------------------------------------------- -// AdvSIMD scalar pairwise instructions -//---------------------------------------------------------------------------- - -defm ADDP : SIMDPairwiseScalarD<0, 0b11011, "addp">; -defm FADDP : SIMDPairwiseScalarSD<1, 0, 0b01101, "faddp">; -defm FMAXNMP : SIMDPairwiseScalarSD<1, 0, 0b01100, "fmaxnmp">; -defm FMAXP : SIMDPairwiseScalarSD<1, 0, 0b01111, "fmaxp">; -defm FMINNMP : SIMDPairwiseScalarSD<1, 1, 0b01100, "fminnmp">; -defm FMINP : SIMDPairwiseScalarSD<1, 1, 0b01111, "fminp">; -def : Pat<(i64 (int_arm64_neon_saddv (v2i64 V128:$Rn))), - (ADDPv2i64p V128:$Rn)>; -def : Pat<(i64 (int_arm64_neon_uaddv (v2i64 V128:$Rn))), - (ADDPv2i64p V128:$Rn)>; -def : Pat<(f32 (int_arm64_neon_faddv (v2f32 V64:$Rn))), - (FADDPv2i32p V64:$Rn)>; -def : Pat<(f32 (int_arm64_neon_faddv (v4f32 V128:$Rn))), - (FADDPv2i32p (EXTRACT_SUBREG (FADDPv4f32 V128:$Rn, V128:$Rn), dsub))>; -def : Pat<(f64 (int_arm64_neon_faddv (v2f64 V128:$Rn))), - (FADDPv2i64p V128:$Rn)>; -def : Pat<(f32 (int_arm64_neon_fmaxnmv (v2f32 V64:$Rn))), - (FMAXNMPv2i32p V64:$Rn)>; -def : Pat<(f64 (int_arm64_neon_fmaxnmv (v2f64 V128:$Rn))), - (FMAXNMPv2i64p V128:$Rn)>; -def : Pat<(f32 (int_arm64_neon_fmaxv (v2f32 V64:$Rn))), - 
(FMAXPv2i32p V64:$Rn)>; -def : Pat<(f64 (int_arm64_neon_fmaxv (v2f64 V128:$Rn))), - (FMAXPv2i64p V128:$Rn)>; -def : Pat<(f32 (int_arm64_neon_fminnmv (v2f32 V64:$Rn))), - (FMINNMPv2i32p V64:$Rn)>; -def : Pat<(f64 (int_arm64_neon_fminnmv (v2f64 V128:$Rn))), - (FMINNMPv2i64p V128:$Rn)>; -def : Pat<(f32 (int_arm64_neon_fminv (v2f32 V64:$Rn))), - (FMINPv2i32p V64:$Rn)>; -def : Pat<(f64 (int_arm64_neon_fminv (v2f64 V128:$Rn))), - (FMINPv2i64p V128:$Rn)>; - -//---------------------------------------------------------------------------- -// AdvSIMD INS/DUP instructions -//---------------------------------------------------------------------------- - -def DUPv8i8gpr : SIMDDupFromMain<0, 0b00001, ".8b", v8i8, V64, GPR32>; -def DUPv16i8gpr : SIMDDupFromMain<1, 0b00001, ".16b", v16i8, V128, GPR32>; -def DUPv4i16gpr : SIMDDupFromMain<0, 0b00010, ".4h", v4i16, V64, GPR32>; -def DUPv8i16gpr : SIMDDupFromMain<1, 0b00010, ".8h", v8i16, V128, GPR32>; -def DUPv2i32gpr : SIMDDupFromMain<0, 0b00100, ".2s", v2i32, V64, GPR32>; -def DUPv4i32gpr : SIMDDupFromMain<1, 0b00100, ".4s", v4i32, V128, GPR32>; -def DUPv2i64gpr : SIMDDupFromMain<1, 0b01000, ".2d", v2i64, V128, GPR64>; - -def DUPv2i64lane : SIMDDup64FromElement; -def DUPv2i32lane : SIMDDup32FromElement<0, ".2s", v2i32, V64>; -def DUPv4i32lane : SIMDDup32FromElement<1, ".4s", v4i32, V128>; -def DUPv4i16lane : SIMDDup16FromElement<0, ".4h", v4i16, V64>; -def DUPv8i16lane : SIMDDup16FromElement<1, ".8h", v8i16, V128>; -def DUPv8i8lane : SIMDDup8FromElement <0, ".8b", v8i8, V64>; -def DUPv16i8lane : SIMDDup8FromElement <1, ".16b", v16i8, V128>; - -def : Pat<(v2f32 (ARM64dup (f32 FPR32:$Rn))), - (v2f32 (DUPv2i32lane - (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rn, ssub), - (i64 0)))>; -def : Pat<(v4f32 (ARM64dup (f32 FPR32:$Rn))), - (v4f32 (DUPv4i32lane - (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rn, ssub), - (i64 0)))>; -def : Pat<(v2f64 (ARM64dup (f64 FPR64:$Rn))), - (v2f64 (DUPv2i64lane - (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$Rn, dsub), - (i64 0)))>; - -def : Pat<(v2f32 (ARM64duplane32 (v4f32 V128:$Rn), VectorIndexS:$imm)), - (DUPv2i32lane V128:$Rn, VectorIndexS:$imm)>; -def : Pat<(v4f32 (ARM64duplane32 (v4f32 V128:$Rn), VectorIndexS:$imm)), - (DUPv4i32lane V128:$Rn, VectorIndexS:$imm)>; -def : Pat<(v2f64 (ARM64duplane64 (v2f64 V128:$Rn), VectorIndexD:$imm)), - (DUPv2i64lane V128:$Rn, VectorIndexD:$imm)>; - -// If there's an (ARM64dup (vector_extract ...) ...), we can use a duplane -// instruction even if the types don't match: we just have to remap the lane -// carefully. N.b. this trick only applies to truncations. 
-def VecIndex_x2 : SDNodeXForm<imm, [{
-  return CurDAG->getTargetConstant(2 * N->getZExtValue(), MVT::i64);
-}]>;
-def VecIndex_x4 : SDNodeXForm<imm, [{
-  return CurDAG->getTargetConstant(4 * N->getZExtValue(), MVT::i64);
-}]>;
-def VecIndex_x8 : SDNodeXForm<imm, [{
-  return CurDAG->getTargetConstant(8 * N->getZExtValue(), MVT::i64);
-}]>;
-
-multiclass DUPWithTruncPats<ValueType ResVT, ValueType Src64VT, ValueType Src128VT,
-                            ValueType ScalVT, Instruction DUP, SDNodeXForm IdxXFORM> {
-  def : Pat<(ResVT (ARM64dup (ScalVT (vector_extract (Src128VT V128:$Rn), imm:$idx)))),
-            (DUP V128:$Rn, (IdxXFORM imm:$idx))>;
-
-  def : Pat<(ResVT (ARM64dup (ScalVT (vector_extract (Src64VT V64:$Rn), imm:$idx)))),
-            (DUP (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), (IdxXFORM imm:$idx))>;
-}
-
-defm : DUPWithTruncPats;
-defm : DUPWithTruncPats;
-defm : DUPWithTruncPats;
-
-defm : DUPWithTruncPats;
-defm : DUPWithTruncPats;
-defm : DUPWithTruncPats;
-
-multiclass DUPWithTrunci64Pats<ValueType ResVT, Instruction DUP, SDNodeXForm IdxXFORM> {
-  def : Pat<(ResVT (ARM64dup (i32 (trunc (vector_extract (v2i64 V128:$Rn), imm:$idx))))),
-            (DUP V128:$Rn, (IdxXFORM imm:$idx))>;
-
-  def : Pat<(ResVT (ARM64dup (i32 (trunc (vector_extract (v1i64 V64:$Rn), imm:$idx))))),
-            (DUP (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), (IdxXFORM imm:$idx))>;
-}
-
-defm : DUPWithTrunci64Pats;
-defm : DUPWithTrunci64Pats;
-defm : DUPWithTrunci64Pats;
-
-defm : DUPWithTrunci64Pats;
-defm : DUPWithTrunci64Pats;
-defm : DUPWithTrunci64Pats;
-
-// SMOV and UMOV definitions, with some extra patterns for convenience
-defm SMOV : SMov;
-defm UMOV : UMov;
-
-def : Pat<(sext_inreg (vector_extract (v16i8 V128:$Rn), VectorIndexB:$idx), i8),
-          (i32 (SMOVvi8to32 V128:$Rn, VectorIndexB:$idx))>;
-def : Pat<(sext_inreg (vector_extract (v16i8 V128:$Rn), VectorIndexB:$idx), i8),
-          (i64 (SMOVvi8to64 V128:$Rn, VectorIndexB:$idx))>;
-def : Pat<(sext_inreg (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),i16),
-          (i32 (SMOVvi16to32 V128:$Rn, VectorIndexH:$idx))>;
-def : Pat<(sext_inreg (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),i16),
-          (i64 (SMOVvi16to64 V128:$Rn, VectorIndexH:$idx))>;
-def : Pat<(sext_inreg (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),i16),
-          (i32 (SMOVvi16to32 V128:$Rn, VectorIndexH:$idx))>;
-def : Pat<(sext (i32 (vector_extract (v4i32 V128:$Rn), VectorIndexS:$idx))),
-          (i64 (SMOVvi32to64 V128:$Rn, VectorIndexS:$idx))>;
-
-// Extracting i8 or i16 elements will have the zero-extend transformed to
-// an 'and' mask by type legalization since neither i8 nor i16 are legal types
-// for ARM64. Match these patterns here since UMOV already zeroes out the high
-// bits of the destination register.
-def : Pat<(and (vector_extract (v16i8 V128:$Rn), VectorIndexB:$idx), - (i32 0xff)), - (i32 (UMOVvi8 V128:$Rn, VectorIndexB:$idx))>; -def : Pat<(and (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx), - (i32 0xffff)), - (i32 (UMOVvi16 V128:$Rn, VectorIndexH:$idx))>; - -defm INS : SIMDIns; - -def : Pat<(v16i8 (scalar_to_vector GPR32:$Rn)), - (SUBREG_TO_REG (i32 0), - (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>; -def : Pat<(v8i8 (scalar_to_vector GPR32:$Rn)), - (SUBREG_TO_REG (i32 0), - (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>; - -def : Pat<(v8i16 (scalar_to_vector GPR32:$Rn)), - (SUBREG_TO_REG (i32 0), - (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>; -def : Pat<(v4i16 (scalar_to_vector GPR32:$Rn)), - (SUBREG_TO_REG (i32 0), - (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>; - -def : Pat<(v2i32 (scalar_to_vector (i32 FPR32:$Rn))), - (v2i32 (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)), - (i32 FPR32:$Rn), ssub))>; -def : Pat<(v4i32 (scalar_to_vector (i32 FPR32:$Rn))), - (v4i32 (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), - (i32 FPR32:$Rn), ssub))>; -def : Pat<(v2i64 (scalar_to_vector (i64 FPR64:$Rn))), - (v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), - (i64 FPR64:$Rn), dsub))>; - -def : Pat<(v4f32 (scalar_to_vector (f32 FPR32:$Rn))), - (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>; -def : Pat<(v2f32 (scalar_to_vector (f32 FPR32:$Rn))), - (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>; -def : Pat<(v2f64 (scalar_to_vector (f64 FPR64:$Rn))), - (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$Rn, dsub)>; - -def : Pat<(v2f32 (vector_insert (v2f32 V64:$Rn), - (f32 FPR32:$Rm), (i64 VectorIndexS:$imm))), - (EXTRACT_SUBREG - (INSvi32lane - (v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), V64:$Rn, dsub)), - VectorIndexS:$imm, - (v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rm, ssub)), - (i64 0)), - dsub)>; -def : Pat<(v4f32 (vector_insert (v4f32 V128:$Rn), - (f32 FPR32:$Rm), (i64 VectorIndexS:$imm))), - (INSvi32lane - V128:$Rn, VectorIndexS:$imm, - (v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rm, ssub)), - (i64 0))>; -def : Pat<(v2f64 (vector_insert (v2f64 V128:$Rn), - (f64 FPR64:$Rm), (i64 VectorIndexD:$imm))), - (INSvi64lane - V128:$Rn, VectorIndexD:$imm, - (v2f64 (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$Rm, dsub)), - (i64 0))>; - -// Copy an element at a constant index in one vector into a constant indexed -// element of another. 
-// FIXME refactor to a shared class/dev parameterized on vector type, vector -// index type and INS extension -def : Pat<(v16i8 (int_arm64_neon_vcopy_lane - (v16i8 V128:$Vd), VectorIndexB:$idx, (v16i8 V128:$Vs), - VectorIndexB:$idx2)), - (v16i8 (INSvi8lane - V128:$Vd, VectorIndexB:$idx, V128:$Vs, VectorIndexB:$idx2) - )>; -def : Pat<(v8i16 (int_arm64_neon_vcopy_lane - (v8i16 V128:$Vd), VectorIndexH:$idx, (v8i16 V128:$Vs), - VectorIndexH:$idx2)), - (v8i16 (INSvi16lane - V128:$Vd, VectorIndexH:$idx, V128:$Vs, VectorIndexH:$idx2) - )>; -def : Pat<(v4i32 (int_arm64_neon_vcopy_lane - (v4i32 V128:$Vd), VectorIndexS:$idx, (v4i32 V128:$Vs), - VectorIndexS:$idx2)), - (v4i32 (INSvi32lane - V128:$Vd, VectorIndexS:$idx, V128:$Vs, VectorIndexS:$idx2) - )>; -def : Pat<(v2i64 (int_arm64_neon_vcopy_lane - (v2i64 V128:$Vd), VectorIndexD:$idx, (v2i64 V128:$Vs), - VectorIndexD:$idx2)), - (v2i64 (INSvi64lane - V128:$Vd, VectorIndexD:$idx, V128:$Vs, VectorIndexD:$idx2) - )>; - -multiclass Neon_INS_elt_pattern { - def : Pat<(VT128 (vector_insert V128:$src, - (VTScal (vector_extract (VT128 V128:$Rn), imm:$Immn)), - imm:$Immd)), - (INS V128:$src, imm:$Immd, V128:$Rn, imm:$Immn)>; - - def : Pat<(VT128 (vector_insert V128:$src, - (VTScal (vector_extract (VT64 V64:$Rn), imm:$Immn)), - imm:$Immd)), - (INS V128:$src, imm:$Immd, - (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), imm:$Immn)>; - - def : Pat<(VT64 (vector_insert V64:$src, - (VTScal (vector_extract (VT128 V128:$Rn), imm:$Immn)), - imm:$Immd)), - (EXTRACT_SUBREG (INS (SUBREG_TO_REG (i64 0), V64:$src, dsub), - imm:$Immd, V128:$Rn, imm:$Immn), - dsub)>; - - def : Pat<(VT64 (vector_insert V64:$src, - (VTScal (vector_extract (VT64 V64:$Rn), imm:$Immn)), - imm:$Immd)), - (EXTRACT_SUBREG - (INS (SUBREG_TO_REG (i64 0), V64:$src, dsub), imm:$Immd, - (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), imm:$Immn), - dsub)>; -} - -defm : Neon_INS_elt_pattern; -defm : Neon_INS_elt_pattern; -defm : Neon_INS_elt_pattern; -defm : Neon_INS_elt_pattern; -defm : Neon_INS_elt_pattern; -defm : Neon_INS_elt_pattern; - - -// Floating point vector extractions are codegen'd as either a sequence of -// subregister extractions, possibly fed by an INS if the lane number is -// anything other than zero. -def : Pat<(vector_extract (v2f64 V128:$Rn), 0), - (f64 (EXTRACT_SUBREG V128:$Rn, dsub))>; -def : Pat<(vector_extract (v4f32 V128:$Rn), 0), - (f32 (EXTRACT_SUBREG V128:$Rn, ssub))>; -def : Pat<(vector_extract (v2f64 V128:$Rn), VectorIndexD:$idx), - (f64 (EXTRACT_SUBREG - (INSvi64lane (v2f64 (IMPLICIT_DEF)), 0, - V128:$Rn, VectorIndexD:$idx), - dsub))>; -def : Pat<(vector_extract (v4f32 V128:$Rn), VectorIndexS:$idx), - (f32 (EXTRACT_SUBREG - (INSvi32lane (v4f32 (IMPLICIT_DEF)), 0, - V128:$Rn, VectorIndexS:$idx), - ssub))>; - -// All concat_vectors operations are canonicalised to act on i64 vectors for -// ARM64. In the general case we need an instruction, which had just as well be -// INS. 
-class ConcatPat - : Pat<(DstTy (concat_vectors (SrcTy V64:$Rd), V64:$Rn)), - (INSvi64lane (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), 1, - (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub), 0)>; - -def : ConcatPat; -def : ConcatPat; -def : ConcatPat; -def : ConcatPat; -def : ConcatPat; -def : ConcatPat; - -// If the high lanes are undef, though, we can just ignore them: -class ConcatUndefPat - : Pat<(DstTy (concat_vectors (SrcTy V64:$Rn), undef)), - (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub)>; - -def : ConcatUndefPat; -def : ConcatUndefPat; -def : ConcatUndefPat; -def : ConcatUndefPat; -def : ConcatUndefPat; -def : ConcatUndefPat; - -//---------------------------------------------------------------------------- -// AdvSIMD across lanes instructions -//---------------------------------------------------------------------------- - -defm ADDV : SIMDAcrossLanesBHS<0, 0b11011, "addv">; -defm SMAXV : SIMDAcrossLanesBHS<0, 0b01010, "smaxv">; -defm SMINV : SIMDAcrossLanesBHS<0, 0b11010, "sminv">; -defm UMAXV : SIMDAcrossLanesBHS<1, 0b01010, "umaxv">; -defm UMINV : SIMDAcrossLanesBHS<1, 0b11010, "uminv">; -defm SADDLV : SIMDAcrossLanesHSD<0, 0b00011, "saddlv">; -defm UADDLV : SIMDAcrossLanesHSD<1, 0b00011, "uaddlv">; -defm FMAXNMV : SIMDAcrossLanesS<0b01100, 0, "fmaxnmv", int_arm64_neon_fmaxnmv>; -defm FMAXV : SIMDAcrossLanesS<0b01111, 0, "fmaxv", int_arm64_neon_fmaxv>; -defm FMINNMV : SIMDAcrossLanesS<0b01100, 1, "fminnmv", int_arm64_neon_fminnmv>; -defm FMINV : SIMDAcrossLanesS<0b01111, 1, "fminv", int_arm64_neon_fminv>; - -multiclass SIMDAcrossLanesSignedIntrinsic { -// If there is a sign extension after this intrinsic, consume it as smov already -// performed it - def : Pat<(i32 (sext_inreg (i32 (intOp (v8i8 V64:$Rn))), i8)), - (i32 (SMOVvi8to32 - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub), - (i64 0)))>; - def : Pat<(i32 (intOp (v8i8 V64:$Rn))), - (i32 (SMOVvi8to32 - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub), - (i64 0)))>; -// If there is a sign extension after this intrinsic, consume it as smov already -// performed it -def : Pat<(i32 (sext_inreg (i32 (intOp (v16i8 V128:$Rn))), i8)), - (i32 (SMOVvi8to32 - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub), - (i64 0)))>; -def : Pat<(i32 (intOp (v16i8 V128:$Rn))), - (i32 (SMOVvi8to32 - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub), - (i64 0)))>; -// If there is a sign extension after this intrinsic, consume it as smov already -// performed it -def : Pat<(i32 (sext_inreg (i32 (intOp (v4i16 V64:$Rn))), i16)), - (i32 (SMOVvi16to32 - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub), - (i64 0)))>; -def : Pat<(i32 (intOp (v4i16 V64:$Rn))), - (i32 (SMOVvi16to32 - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub), - (i64 0)))>; -// If there is a sign extension after this intrinsic, consume it as smov already -// performed it -def : Pat<(i32 (sext_inreg (i32 (intOp (v8i16 V128:$Rn))), i16)), - (i32 (SMOVvi16to32 - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub), - (i64 0)))>; -def : Pat<(i32 (intOp (v8i16 V128:$Rn))), - (i32 (SMOVvi16to32 - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub), - (i64 0)))>; - -def : Pat<(i32 (intOp (v4i32 V128:$Rn))), - (i32 (EXTRACT_SUBREG - 
(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v4i32v")) V128:$Rn), ssub), - ssub))>; -} - -multiclass SIMDAcrossLanesUnsignedIntrinsic { -// If there is a masking operation keeping only what has been actually -// generated, consume it. - def : Pat<(i32 (and (i32 (intOp (v8i8 V64:$Rn))), maski8_or_more)), - (i32 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub), - ssub))>; - def : Pat<(i32 (intOp (v8i8 V64:$Rn))), - (i32 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub), - ssub))>; -// If there is a masking operation keeping only what has been actually -// generated, consume it. -def : Pat<(i32 (and (i32 (intOp (v16i8 V128:$Rn))), maski8_or_more)), - (i32 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub), - ssub))>; -def : Pat<(i32 (intOp (v16i8 V128:$Rn))), - (i32 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub), - ssub))>; - -// If there is a masking operation keeping only what has been actually -// generated, consume it. -def : Pat<(i32 (and (i32 (intOp (v4i16 V64:$Rn))), maski16_or_more)), - (i32 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub), - ssub))>; -def : Pat<(i32 (intOp (v4i16 V64:$Rn))), - (i32 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub), - ssub))>; -// If there is a masking operation keeping only what has been actually -// generated, consume it. -def : Pat<(i32 (and (i32 (intOp (v8i16 V128:$Rn))), maski16_or_more)), - (i32 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub), - ssub))>; -def : Pat<(i32 (intOp (v8i16 V128:$Rn))), - (i32 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub), - ssub))>; - -def : Pat<(i32 (intOp (v4i32 V128:$Rn))), - (i32 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v4i32v")) V128:$Rn), ssub), - ssub))>; - -} - -multiclass SIMDAcrossLanesSignedLongIntrinsic { - def : Pat<(i32 (intOp (v8i8 V64:$Rn))), - (i32 (SMOVvi16to32 - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v8i8v")) V64:$Rn), hsub), - (i64 0)))>; -def : Pat<(i32 (intOp (v16i8 V128:$Rn))), - (i32 (SMOVvi16to32 - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v16i8v")) V128:$Rn), hsub), - (i64 0)))>; - -def : Pat<(i32 (intOp (v4i16 V64:$Rn))), - (i32 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v4i16v")) V64:$Rn), ssub), - ssub))>; -def : Pat<(i32 (intOp (v8i16 V128:$Rn))), - (i32 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v8i16v")) V128:$Rn), ssub), - ssub))>; - -def : Pat<(i64 (intOp (v4i32 V128:$Rn))), - (i64 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v4i32v")) V128:$Rn), dsub), - dsub))>; -} - -multiclass SIMDAcrossLanesUnsignedLongIntrinsic { - def : Pat<(i32 (intOp (v8i8 V64:$Rn))), - (i32 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v8i8v")) V64:$Rn), hsub), - ssub))>; -def : Pat<(i32 (intOp (v16i8 V128:$Rn))), - (i32 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, 
"v16i8v")) V128:$Rn), hsub), - ssub))>; - -def : Pat<(i32 (intOp (v4i16 V64:$Rn))), - (i32 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v4i16v")) V64:$Rn), ssub), - ssub))>; -def : Pat<(i32 (intOp (v8i16 V128:$Rn))), - (i32 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v8i16v")) V128:$Rn), ssub), - ssub))>; - -def : Pat<(i64 (intOp (v4i32 V128:$Rn))), - (i64 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v4i32v")) V128:$Rn), dsub), - dsub))>; -} - -defm : SIMDAcrossLanesSignedIntrinsic<"ADDV", int_arm64_neon_saddv>; -// vaddv_[su]32 is special; -> ADDP Vd.2S,Vn.2S,Vm.2S; return Vd.s[0];Vn==Vm -def : Pat<(i32 (int_arm64_neon_saddv (v2i32 V64:$Rn))), - (EXTRACT_SUBREG (ADDPv2i32 V64:$Rn, V64:$Rn), ssub)>; - -defm : SIMDAcrossLanesUnsignedIntrinsic<"ADDV", int_arm64_neon_uaddv>; -// vaddv_[su]32 is special; -> ADDP Vd.2S,Vn.2S,Vm.2S; return Vd.s[0];Vn==Vm -def : Pat<(i32 (int_arm64_neon_uaddv (v2i32 V64:$Rn))), - (EXTRACT_SUBREG (ADDPv2i32 V64:$Rn, V64:$Rn), ssub)>; - -defm : SIMDAcrossLanesSignedIntrinsic<"SMAXV", int_arm64_neon_smaxv>; -def : Pat<(i32 (int_arm64_neon_smaxv (v2i32 V64:$Rn))), - (EXTRACT_SUBREG (SMAXPv2i32 V64:$Rn, V64:$Rn), ssub)>; - -defm : SIMDAcrossLanesSignedIntrinsic<"SMINV", int_arm64_neon_sminv>; -def : Pat<(i32 (int_arm64_neon_sminv (v2i32 V64:$Rn))), - (EXTRACT_SUBREG (SMINPv2i32 V64:$Rn, V64:$Rn), ssub)>; - -defm : SIMDAcrossLanesUnsignedIntrinsic<"UMAXV", int_arm64_neon_umaxv>; -def : Pat<(i32 (int_arm64_neon_umaxv (v2i32 V64:$Rn))), - (EXTRACT_SUBREG (UMAXPv2i32 V64:$Rn, V64:$Rn), ssub)>; - -defm : SIMDAcrossLanesUnsignedIntrinsic<"UMINV", int_arm64_neon_uminv>; -def : Pat<(i32 (int_arm64_neon_uminv (v2i32 V64:$Rn))), - (EXTRACT_SUBREG (UMINPv2i32 V64:$Rn, V64:$Rn), ssub)>; - -defm : SIMDAcrossLanesSignedLongIntrinsic<"SADDLV", int_arm64_neon_saddlv>; -defm : SIMDAcrossLanesUnsignedLongIntrinsic<"UADDLV", int_arm64_neon_uaddlv>; - -// The vaddlv_s32 intrinsic gets mapped to SADDLP. -def : Pat<(i64 (int_arm64_neon_saddlv (v2i32 V64:$Rn))), - (i64 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (SADDLPv2i32_v1i64 V64:$Rn), dsub), - dsub))>; -// The vaddlv_u32 intrinsic gets mapped to UADDLP. 
-def : Pat<(i64 (int_arm64_neon_uaddlv (v2i32 V64:$Rn))), - (i64 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (UADDLPv2i32_v1i64 V64:$Rn), dsub), - dsub))>; - -//------------------------------------------------------------------------------ -// AdvSIMD modified immediate instructions -//------------------------------------------------------------------------------ - -// AdvSIMD BIC -defm BIC : SIMDModifiedImmVectorShiftTied<1, 0b11, 0b01, "bic", ARM64bici>; -// AdvSIMD ORR -defm ORR : SIMDModifiedImmVectorShiftTied<0, 0b11, 0b01, "orr", ARM64orri>; - -def : InstAlias<"bic $Vd.4h, $imm", (BICv4i16 V64:$Vd, imm0_255:$imm, 0)>; -def : InstAlias<"bic $Vd.8h, $imm", (BICv8i16 V128:$Vd, imm0_255:$imm, 0)>; -def : InstAlias<"bic $Vd.2s, $imm", (BICv2i32 V64:$Vd, imm0_255:$imm, 0)>; -def : InstAlias<"bic $Vd.4s, $imm", (BICv4i32 V128:$Vd, imm0_255:$imm, 0)>; - -def : InstAlias<"bic.4h $Vd, $imm", (BICv4i16 V64:$Vd, imm0_255:$imm, 0), 0>; -def : InstAlias<"bic.8h $Vd, $imm", (BICv8i16 V128:$Vd, imm0_255:$imm, 0), 0>; -def : InstAlias<"bic.2s $Vd, $imm", (BICv2i32 V64:$Vd, imm0_255:$imm, 0), 0>; -def : InstAlias<"bic.4s $Vd, $imm", (BICv4i32 V128:$Vd, imm0_255:$imm, 0), 0>; - -def : InstAlias<"orr $Vd.4h, $imm", (ORRv4i16 V64:$Vd, imm0_255:$imm, 0)>; -def : InstAlias<"orr $Vd.8h, $imm", (ORRv8i16 V128:$Vd, imm0_255:$imm, 0)>; -def : InstAlias<"orr $Vd.2s, $imm", (ORRv2i32 V64:$Vd, imm0_255:$imm, 0)>; -def : InstAlias<"orr $Vd.4s, $imm", (ORRv4i32 V128:$Vd, imm0_255:$imm, 0)>; - -def : InstAlias<"orr.4h $Vd, $imm", (ORRv4i16 V64:$Vd, imm0_255:$imm, 0), 0>; -def : InstAlias<"orr.8h $Vd, $imm", (ORRv8i16 V128:$Vd, imm0_255:$imm, 0), 0>; -def : InstAlias<"orr.2s $Vd, $imm", (ORRv2i32 V64:$Vd, imm0_255:$imm, 0), 0>; -def : InstAlias<"orr.4s $Vd, $imm", (ORRv4i32 V128:$Vd, imm0_255:$imm, 0), 0>; - -// AdvSIMD FMOV -def FMOVv2f64_ns : SIMDModifiedImmVectorNoShift<1, 1, 0b1111, V128, fpimm8, - "fmov", ".2d", - [(set (v2f64 V128:$Rd), (ARM64fmov imm0_255:$imm8))]>; -def FMOVv2f32_ns : SIMDModifiedImmVectorNoShift<0, 0, 0b1111, V64, fpimm8, - "fmov", ".2s", - [(set (v2f32 V64:$Rd), (ARM64fmov imm0_255:$imm8))]>; -def FMOVv4f32_ns : SIMDModifiedImmVectorNoShift<1, 0, 0b1111, V128, fpimm8, - "fmov", ".4s", - [(set (v4f32 V128:$Rd), (ARM64fmov imm0_255:$imm8))]>; - -// AdvSIMD MOVI - -// EDIT byte mask: scalar -let isReMaterializable = 1, isAsCheapAsAMove = 1 in -def MOVID : SIMDModifiedImmScalarNoShift<0, 1, 0b1110, "movi", - [(set FPR64:$Rd, simdimmtype10:$imm8)]>; -// The movi_edit node has the immediate value already encoded, so we use -// a plain imm0_255 here. -def : Pat<(f64 (ARM64movi_edit imm0_255:$shift)), - (MOVID imm0_255:$shift)>; - -def : Pat<(v1i64 immAllZerosV), (MOVID (i32 0))>; -def : Pat<(v2i32 immAllZerosV), (MOVID (i32 0))>; -def : Pat<(v4i16 immAllZerosV), (MOVID (i32 0))>; -def : Pat<(v8i8 immAllZerosV), (MOVID (i32 0))>; - -def : Pat<(v1i64 immAllOnesV), (MOVID (i32 255))>; -def : Pat<(v2i32 immAllOnesV), (MOVID (i32 255))>; -def : Pat<(v4i16 immAllOnesV), (MOVID (i32 255))>; -def : Pat<(v8i8 immAllOnesV), (MOVID (i32 255))>; - -// EDIT byte mask: 2d - -// The movi_edit node has the immediate value already encoded, so we use -// a plain imm0_255 in the pattern -let isReMaterializable = 1, isAsCheapAsAMove = 1 in -def MOVIv2d_ns : SIMDModifiedImmVectorNoShift<1, 1, 0b1110, V128, - simdimmtype10, - "movi", ".2d", - [(set (v2i64 V128:$Rd), (ARM64movi_edit imm0_255:$imm8))]>; - - -// Use movi.2d to materialize 0.0 if the HW does zero-cycle zeroing. 
-// Complexity is added to break a tie with a plain MOVI. -let AddedComplexity = 1 in { -def : Pat<(f32 fpimm0), - (f32 (EXTRACT_SUBREG (v2i64 (MOVIv2d_ns (i32 0))), ssub))>, - Requires<[HasZCZ]>; -def : Pat<(f64 fpimm0), - (f64 (EXTRACT_SUBREG (v2i64 (MOVIv2d_ns (i32 0))), dsub))>, - Requires<[HasZCZ]>; -} - -def : Pat<(v2i64 immAllZerosV), (MOVIv2d_ns (i32 0))>; -def : Pat<(v4i32 immAllZerosV), (MOVIv2d_ns (i32 0))>; -def : Pat<(v8i16 immAllZerosV), (MOVIv2d_ns (i32 0))>; -def : Pat<(v16i8 immAllZerosV), (MOVIv2d_ns (i32 0))>; - -def : Pat<(v2i64 immAllOnesV), (MOVIv2d_ns (i32 255))>; -def : Pat<(v4i32 immAllOnesV), (MOVIv2d_ns (i32 255))>; -def : Pat<(v8i16 immAllOnesV), (MOVIv2d_ns (i32 255))>; -def : Pat<(v16i8 immAllOnesV), (MOVIv2d_ns (i32 255))>; - -def : Pat<(v2f64 (ARM64dup (f64 fpimm0))), (MOVIv2d_ns (i32 0))>; -def : Pat<(v4f32 (ARM64dup (f32 fpimm0))), (MOVIv2d_ns (i32 0))>; - -// EDIT per word & halfword: 2s, 4h, 4s, & 8h -defm MOVI : SIMDModifiedImmVectorShift<0, 0b10, 0b00, "movi">; - -def : InstAlias<"movi $Vd.4h, $imm", (MOVIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>; -def : InstAlias<"movi $Vd.8h, $imm", (MOVIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>; -def : InstAlias<"movi $Vd.2s, $imm", (MOVIv2i32 V64:$Vd, imm0_255:$imm, 0), 0>; -def : InstAlias<"movi $Vd.4s, $imm", (MOVIv4i32 V128:$Vd, imm0_255:$imm, 0), 0>; - -def : InstAlias<"movi.4h $Vd, $imm", (MOVIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>; -def : InstAlias<"movi.8h $Vd, $imm", (MOVIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>; -def : InstAlias<"movi.2s $Vd, $imm", (MOVIv2i32 V64:$Vd, imm0_255:$imm, 0), 0>; -def : InstAlias<"movi.4s $Vd, $imm", (MOVIv4i32 V128:$Vd, imm0_255:$imm, 0), 0>; - -def : Pat<(v2i32 (ARM64movi_shift imm0_255:$imm8, (i32 imm:$shift))), - (MOVIv2i32 imm0_255:$imm8, imm:$shift)>; -def : Pat<(v4i32 (ARM64movi_shift imm0_255:$imm8, (i32 imm:$shift))), - (MOVIv4i32 imm0_255:$imm8, imm:$shift)>; -def : Pat<(v4i16 (ARM64movi_shift imm0_255:$imm8, (i32 imm:$shift))), - (MOVIv4i16 imm0_255:$imm8, imm:$shift)>; -def : Pat<(v8i16 (ARM64movi_shift imm0_255:$imm8, (i32 imm:$shift))), - (MOVIv8i16 imm0_255:$imm8, imm:$shift)>; - -// EDIT per word: 2s & 4s with MSL shifter -def MOVIv2s_msl : SIMDModifiedImmMoveMSL<0, 0, {1,1,0,?}, V64, "movi", ".2s", - [(set (v2i32 V64:$Rd), - (ARM64movi_msl imm0_255:$imm8, (i32 imm:$shift)))]>; -def MOVIv4s_msl : SIMDModifiedImmMoveMSL<1, 0, {1,1,0,?}, V128, "movi", ".4s", - [(set (v4i32 V128:$Rd), - (ARM64movi_msl imm0_255:$imm8, (i32 imm:$shift)))]>; - -// Per byte: 8b & 16b -def MOVIv8b_ns : SIMDModifiedImmVectorNoShift<0, 0, 0b1110, V64, imm0_255, - "movi", ".8b", - [(set (v8i8 V64:$Rd), (ARM64movi imm0_255:$imm8))]>; -def MOVIv16b_ns : SIMDModifiedImmVectorNoShift<1, 0, 0b1110, V128, imm0_255, - "movi", ".16b", - [(set (v16i8 V128:$Rd), (ARM64movi imm0_255:$imm8))]>; - -// AdvSIMD MVNI - -// EDIT per word & halfword: 2s, 4h, 4s, & 8h -defm MVNI : SIMDModifiedImmVectorShift<1, 0b10, 0b00, "mvni">; - -def : InstAlias<"mvni $Vd.4h, $imm", (MVNIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>; -def : InstAlias<"mvni $Vd.8h, $imm", (MVNIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>; -def : InstAlias<"mvni $Vd.2s, $imm", (MVNIv2i32 V64:$Vd, imm0_255:$imm, 0), 0>; -def : InstAlias<"mvni $Vd.4s, $imm", (MVNIv4i32 V128:$Vd, imm0_255:$imm, 0), 0>; - -def : InstAlias<"mvni.4h $Vd, $imm", (MVNIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>; -def : InstAlias<"mvni.8h $Vd, $imm", (MVNIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>; -def : InstAlias<"mvni.2s $Vd, $imm", (MVNIv2i32 V64:$Vd, imm0_255:$imm, 0), 0>; -def : InstAlias<"mvni.4s 
$Vd, $imm", (MVNIv4i32 V128:$Vd, imm0_255:$imm, 0), 0>; - -def : Pat<(v2i32 (ARM64mvni_shift imm0_255:$imm8, (i32 imm:$shift))), - (MVNIv2i32 imm0_255:$imm8, imm:$shift)>; -def : Pat<(v4i32 (ARM64mvni_shift imm0_255:$imm8, (i32 imm:$shift))), - (MVNIv4i32 imm0_255:$imm8, imm:$shift)>; -def : Pat<(v4i16 (ARM64mvni_shift imm0_255:$imm8, (i32 imm:$shift))), - (MVNIv4i16 imm0_255:$imm8, imm:$shift)>; -def : Pat<(v8i16 (ARM64mvni_shift imm0_255:$imm8, (i32 imm:$shift))), - (MVNIv8i16 imm0_255:$imm8, imm:$shift)>; - -// EDIT per word: 2s & 4s with MSL shifter -def MVNIv2s_msl : SIMDModifiedImmMoveMSL<0, 1, {1,1,0,?}, V64, "mvni", ".2s", - [(set (v2i32 V64:$Rd), - (ARM64mvni_msl imm0_255:$imm8, (i32 imm:$shift)))]>; -def MVNIv4s_msl : SIMDModifiedImmMoveMSL<1, 1, {1,1,0,?}, V128, "mvni", ".4s", - [(set (v4i32 V128:$Rd), - (ARM64mvni_msl imm0_255:$imm8, (i32 imm:$shift)))]>; - -//---------------------------------------------------------------------------- -// AdvSIMD indexed element -//---------------------------------------------------------------------------- - -let neverHasSideEffects = 1 in { - defm FMLA : SIMDFPIndexedSDTied<0, 0b0001, "fmla">; - defm FMLS : SIMDFPIndexedSDTied<0, 0b0101, "fmls">; -} - -// NOTE: Operands are reordered in the FMLA/FMLS PatFrags because the -// instruction expects the addend first, while the intrinsic expects it last. - -// On the other hand, there are quite a few valid combinatorial options due to -// the commutativity of multiplication and the fact that (-x) * y = x * (-y). -defm : SIMDFPIndexedSDTiedPatterns<"FMLA", - TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)>>; -defm : SIMDFPIndexedSDTiedPatterns<"FMLA", - TriOpFrag<(fma node:$MHS, node:$RHS, node:$LHS)>>; - -defm : SIMDFPIndexedSDTiedPatterns<"FMLS", - TriOpFrag<(fma node:$MHS, (fneg node:$RHS), node:$LHS)> >; -defm : SIMDFPIndexedSDTiedPatterns<"FMLS", - TriOpFrag<(fma node:$RHS, (fneg node:$MHS), node:$LHS)> >; -defm : SIMDFPIndexedSDTiedPatterns<"FMLS", - TriOpFrag<(fma (fneg node:$RHS), node:$MHS, node:$LHS)> >; -defm : SIMDFPIndexedSDTiedPatterns<"FMLS", - TriOpFrag<(fma (fneg node:$MHS), node:$RHS, node:$LHS)> >; - -multiclass FMLSIndexedAfterNegPatterns { - // 3 variants for the .2s version: DUPLANE from 128-bit, DUPLANE from 64-bit - // and DUP scalar. - def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), - (ARM64duplane32 (v4f32 (fneg V128:$Rm)), - VectorIndexS:$idx))), - (FMLSv2i32_indexed V64:$Rd, V64:$Rn, V128:$Rm, VectorIndexS:$idx)>; - def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), - (v2f32 (ARM64duplane32 - (v4f32 (insert_subvector undef, - (v2f32 (fneg V64:$Rm)), - (i32 0))), - VectorIndexS:$idx)))), - (FMLSv2i32_indexed V64:$Rd, V64:$Rn, - (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), - VectorIndexS:$idx)>; - def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), - (ARM64dup (f32 (fneg FPR32Op:$Rm))))), - (FMLSv2i32_indexed V64:$Rd, V64:$Rn, - (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>; - - // 3 variants for the .4s version: DUPLANE from 128-bit, DUPLANE from 64-bit - // and DUP scalar. 
- def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), - (ARM64duplane32 (v4f32 (fneg V128:$Rm)), - VectorIndexS:$idx))), - (FMLSv4i32_indexed V128:$Rd, V128:$Rn, V128:$Rm, - VectorIndexS:$idx)>; - def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), - (v4f32 (ARM64duplane32 - (v4f32 (insert_subvector undef, - (v2f32 (fneg V64:$Rm)), - (i32 0))), - VectorIndexS:$idx)))), - (FMLSv4i32_indexed V128:$Rd, V128:$Rn, - (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), - VectorIndexS:$idx)>; - def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), - (ARM64dup (f32 (fneg FPR32Op:$Rm))))), - (FMLSv4i32_indexed V128:$Rd, V128:$Rn, - (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>; - - // 2 variants for the .2d version: DUPLANE from 128-bit, and DUP scalar - // (DUPLANE from 64-bit would be trivial). - def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), - (ARM64duplane64 (v2f64 (fneg V128:$Rm)), - VectorIndexD:$idx))), - (FMLSv2i64_indexed - V128:$Rd, V128:$Rn, V128:$Rm, VectorIndexS:$idx)>; - def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), - (ARM64dup (f64 (fneg FPR64Op:$Rm))))), - (FMLSv2i64_indexed V128:$Rd, V128:$Rn, - (SUBREG_TO_REG (i32 0), FPR64Op:$Rm, dsub), (i64 0))>; - - // 2 variants for 32-bit scalar version: extract from .2s or from .4s - def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn), - (vector_extract (v4f32 (fneg V128:$Rm)), - VectorIndexS:$idx))), - (FMLSv1i32_indexed FPR32:$Rd, FPR32:$Rn, - V128:$Rm, VectorIndexS:$idx)>; - def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn), - (vector_extract (v2f32 (fneg V64:$Rm)), - VectorIndexS:$idx))), - (FMLSv1i32_indexed FPR32:$Rd, FPR32:$Rn, - (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), VectorIndexS:$idx)>; - - // 1 variant for 64-bit scalar version: extract from .1d or from .2d - def : Pat<(f64 (OpNode (f64 FPR64:$Rd), (f64 FPR64:$Rn), - (vector_extract (v2f64 (fneg V128:$Rm)), - VectorIndexS:$idx))), - (FMLSv1i64_indexed FPR64:$Rd, FPR64:$Rn, - V128:$Rm, VectorIndexS:$idx)>; -} - -defm : FMLSIndexedAfterNegPatterns< - TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)> >; -defm : FMLSIndexedAfterNegPatterns< - TriOpFrag<(fma node:$MHS, node:$RHS, node:$LHS)> >; - -defm FMULX : SIMDFPIndexedSD<1, 0b1001, "fmulx", int_arm64_neon_fmulx>; -defm FMUL : SIMDFPIndexedSD<0, 0b1001, "fmul", fmul>; - -def : Pat<(v2f32 (fmul V64:$Rn, (ARM64dup (f32 FPR32:$Rm)))), - (FMULv2i32_indexed V64:$Rn, - (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rm, ssub), - (i64 0))>; -def : Pat<(v4f32 (fmul V128:$Rn, (ARM64dup (f32 FPR32:$Rm)))), - (FMULv4i32_indexed V128:$Rn, - (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rm, ssub), - (i64 0))>; -def : Pat<(v2f64 (fmul V128:$Rn, (ARM64dup (f64 FPR64:$Rm)))), - (FMULv2i64_indexed V128:$Rn, - (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$Rm, dsub), - (i64 0))>; - -defm SQDMULH : SIMDIndexedHS<0, 0b1100, "sqdmulh", int_arm64_neon_sqdmulh>; -defm SQRDMULH : SIMDIndexedHS<0, 0b1101, "sqrdmulh", int_arm64_neon_sqrdmulh>; -defm MLA : SIMDVectorIndexedHSTied<1, 0b0000, "mla", - TriOpFrag<(add node:$LHS, (mul node:$MHS, node:$RHS))>>; -defm MLS : SIMDVectorIndexedHSTied<1, 0b0100, "mls", - TriOpFrag<(sub node:$LHS, (mul node:$MHS, node:$RHS))>>; -defm MUL : SIMDVectorIndexedHS<0, 0b1000, "mul", mul>; -defm SMLAL : SIMDVectorIndexedLongSDTied<0, 0b0010, "smlal", - TriOpFrag<(add node:$LHS, (int_arm64_neon_smull node:$MHS, node:$RHS))>>; -defm SMLSL : SIMDVectorIndexedLongSDTied<0, 0b0110, "smlsl", - TriOpFrag<(sub node:$LHS, (int_arm64_neon_smull node:$MHS, node:$RHS))>>; -defm SMULL 
: SIMDVectorIndexedLongSD<0, 0b1010, "smull", - int_arm64_neon_smull>; -defm SQDMLAL : SIMDIndexedLongSQDMLXSDTied<0, 0b0011, "sqdmlal", - int_arm64_neon_sqadd>; -defm SQDMLSL : SIMDIndexedLongSQDMLXSDTied<0, 0b0111, "sqdmlsl", - int_arm64_neon_sqsub>; -defm SQDMULL : SIMDIndexedLongSD<0, 0b1011, "sqdmull", int_arm64_neon_sqdmull>; -defm UMLAL : SIMDVectorIndexedLongSDTied<1, 0b0010, "umlal", - TriOpFrag<(add node:$LHS, (int_arm64_neon_umull node:$MHS, node:$RHS))>>; -defm UMLSL : SIMDVectorIndexedLongSDTied<1, 0b0110, "umlsl", - TriOpFrag<(sub node:$LHS, (int_arm64_neon_umull node:$MHS, node:$RHS))>>; -defm UMULL : SIMDVectorIndexedLongSD<1, 0b1010, "umull", - int_arm64_neon_umull>; - -// A scalar sqdmull with the second operand being a vector lane can be -// handled directly with the indexed instruction encoding. -def : Pat<(int_arm64_neon_sqdmulls_scalar (i32 FPR32:$Rn), - (vector_extract (v4i32 V128:$Vm), - VectorIndexS:$idx)), - (SQDMULLv1i64_indexed FPR32:$Rn, V128:$Vm, VectorIndexS:$idx)>; - -//---------------------------------------------------------------------------- -// AdvSIMD scalar shift instructions -//---------------------------------------------------------------------------- -defm FCVTZS : SIMDScalarRShiftSD<0, 0b11111, "fcvtzs">; -defm FCVTZU : SIMDScalarRShiftSD<1, 0b11111, "fcvtzu">; -defm SCVTF : SIMDScalarRShiftSD<0, 0b11100, "scvtf">; -defm UCVTF : SIMDScalarRShiftSD<1, 0b11100, "ucvtf">; -// Codegen patterns for the above. We don't put these directly on the -// instructions because TableGen's type inference can't handle the truth. -// Having the same base pattern for fp <--> int totally freaks it out. -def : Pat<(int_arm64_neon_vcvtfp2fxs FPR32:$Rn, vecshiftR32:$imm), - (FCVTZSs FPR32:$Rn, vecshiftR32:$imm)>; -def : Pat<(int_arm64_neon_vcvtfp2fxu FPR32:$Rn, vecshiftR32:$imm), - (FCVTZUs FPR32:$Rn, vecshiftR32:$imm)>; -def : Pat<(i64 (int_arm64_neon_vcvtfp2fxs (f64 FPR64:$Rn), vecshiftR64:$imm)), - (FCVTZSd FPR64:$Rn, vecshiftR64:$imm)>; -def : Pat<(i64 (int_arm64_neon_vcvtfp2fxu (f64 FPR64:$Rn), vecshiftR64:$imm)), - (FCVTZUd FPR64:$Rn, vecshiftR64:$imm)>; -def : Pat<(v1i64 (int_arm64_neon_vcvtfp2fxs (v1f64 FPR64:$Rn), - vecshiftR64:$imm)), - (FCVTZSd FPR64:$Rn, vecshiftR64:$imm)>; -def : Pat<(v1i64 (int_arm64_neon_vcvtfp2fxu (v1f64 FPR64:$Rn), - vecshiftR64:$imm)), - (FCVTZUd FPR64:$Rn, vecshiftR64:$imm)>; -def : Pat<(int_arm64_neon_vcvtfxs2fp FPR32:$Rn, vecshiftR32:$imm), - (SCVTFs FPR32:$Rn, vecshiftR32:$imm)>; -def : Pat<(int_arm64_neon_vcvtfxu2fp FPR32:$Rn, vecshiftR32:$imm), - (UCVTFs FPR32:$Rn, vecshiftR32:$imm)>; -def : Pat<(f64 (int_arm64_neon_vcvtfxs2fp (i64 FPR64:$Rn), vecshiftR64:$imm)), - (SCVTFd FPR64:$Rn, vecshiftR64:$imm)>; -def : Pat<(f64 (int_arm64_neon_vcvtfxu2fp (i64 FPR64:$Rn), vecshiftR64:$imm)), - (UCVTFd FPR64:$Rn, vecshiftR64:$imm)>; -def : Pat<(v1f64 (int_arm64_neon_vcvtfxs2fp (v1i64 FPR64:$Rn), - vecshiftR64:$imm)), - (SCVTFd FPR64:$Rn, vecshiftR64:$imm)>; -def : Pat<(v1f64 (int_arm64_neon_vcvtfxu2fp (v1i64 FPR64:$Rn), - vecshiftR64:$imm)), - (UCVTFd FPR64:$Rn, vecshiftR64:$imm)>; - -defm SHL : SIMDScalarLShiftD< 0, 0b01010, "shl", ARM64vshl>; -defm SLI : SIMDScalarLShiftDTied<1, 0b01010, "sli">; -defm SQRSHRN : SIMDScalarRShiftBHS< 0, 0b10011, "sqrshrn", - int_arm64_neon_sqrshrn>; -defm SQRSHRUN : SIMDScalarRShiftBHS< 1, 0b10001, "sqrshrun", - int_arm64_neon_sqrshrun>; -defm SQSHLU : SIMDScalarLShiftBHSD<1, 0b01100, "sqshlu", ARM64sqshlui>; -defm SQSHL : SIMDScalarLShiftBHSD<0, 0b01110, "sqshl", ARM64sqshli>; -defm SQSHRN : 
SIMDScalarRShiftBHS< 0, 0b10010, "sqshrn", - int_arm64_neon_sqshrn>; -defm SQSHRUN : SIMDScalarRShiftBHS< 1, 0b10000, "sqshrun", - int_arm64_neon_sqshrun>; -defm SRI : SIMDScalarRShiftDTied< 1, 0b01000, "sri">; -defm SRSHR : SIMDScalarRShiftD< 0, 0b00100, "srshr", ARM64srshri>; -defm SRSRA : SIMDScalarRShiftDTied< 0, 0b00110, "srsra", - TriOpFrag<(add node:$LHS, - (ARM64srshri node:$MHS, node:$RHS))>>; -defm SSHR : SIMDScalarRShiftD< 0, 0b00000, "sshr", ARM64vashr>; -defm SSRA : SIMDScalarRShiftDTied< 0, 0b00010, "ssra", - TriOpFrag<(add node:$LHS, - (ARM64vashr node:$MHS, node:$RHS))>>; -defm UQRSHRN : SIMDScalarRShiftBHS< 1, 0b10011, "uqrshrn", - int_arm64_neon_uqrshrn>; -defm UQSHL : SIMDScalarLShiftBHSD<1, 0b01110, "uqshl", ARM64uqshli>; -defm UQSHRN : SIMDScalarRShiftBHS< 1, 0b10010, "uqshrn", - int_arm64_neon_uqshrn>; -defm URSHR : SIMDScalarRShiftD< 1, 0b00100, "urshr", ARM64urshri>; -defm URSRA : SIMDScalarRShiftDTied< 1, 0b00110, "ursra", - TriOpFrag<(add node:$LHS, - (ARM64urshri node:$MHS, node:$RHS))>>; -defm USHR : SIMDScalarRShiftD< 1, 0b00000, "ushr", ARM64vlshr>; -defm USRA : SIMDScalarRShiftDTied< 1, 0b00010, "usra", - TriOpFrag<(add node:$LHS, - (ARM64vlshr node:$MHS, node:$RHS))>>; - -//---------------------------------------------------------------------------- -// AdvSIMD vector shift instructions -//---------------------------------------------------------------------------- -defm FCVTZS:SIMDVectorRShiftSD<0, 0b11111, "fcvtzs", int_arm64_neon_vcvtfp2fxs>; -defm FCVTZU:SIMDVectorRShiftSD<1, 0b11111, "fcvtzu", int_arm64_neon_vcvtfp2fxu>; -defm SCVTF: SIMDVectorRShiftSDToFP<0, 0b11100, "scvtf", - int_arm64_neon_vcvtfxs2fp>; -defm RSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10001, "rshrn", - int_arm64_neon_rshrn>; -defm SHL : SIMDVectorLShiftBHSD<0, 0b01010, "shl", ARM64vshl>; -defm SHRN : SIMDVectorRShiftNarrowBHS<0, 0b10000, "shrn", - BinOpFrag<(trunc (ARM64vashr node:$LHS, node:$RHS))>>; -defm SLI : SIMDVectorLShiftBHSDTied<1, 0b01010, "sli", int_arm64_neon_vsli>; -def : Pat<(v1i64 (int_arm64_neon_vsli (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn), - (i32 vecshiftL64:$imm))), - (SLId FPR64:$Rd, FPR64:$Rn, vecshiftL64:$imm)>; -defm SQRSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10011, "sqrshrn", - int_arm64_neon_sqrshrn>; -defm SQRSHRUN: SIMDVectorRShiftNarrowBHS<1, 0b10001, "sqrshrun", - int_arm64_neon_sqrshrun>; -defm SQSHLU : SIMDVectorLShiftBHSD<1, 0b01100, "sqshlu", ARM64sqshlui>; -defm SQSHL : SIMDVectorLShiftBHSD<0, 0b01110, "sqshl", ARM64sqshli>; -defm SQSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10010, "sqshrn", - int_arm64_neon_sqshrn>; -defm SQSHRUN : SIMDVectorRShiftNarrowBHS<1, 0b10000, "sqshrun", - int_arm64_neon_sqshrun>; -defm SRI : SIMDVectorRShiftBHSDTied<1, 0b01000, "sri", int_arm64_neon_vsri>; -def : Pat<(v1i64 (int_arm64_neon_vsri (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn), - (i32 vecshiftR64:$imm))), - (SRId FPR64:$Rd, FPR64:$Rn, vecshiftR64:$imm)>; -defm SRSHR : SIMDVectorRShiftBHSD<0, 0b00100, "srshr", ARM64srshri>; -defm SRSRA : SIMDVectorRShiftBHSDTied<0, 0b00110, "srsra", - TriOpFrag<(add node:$LHS, - (ARM64srshri node:$MHS, node:$RHS))> >; -defm SSHLL : SIMDVectorLShiftLongBHSD<0, 0b10100, "sshll", - BinOpFrag<(ARM64vshl (sext node:$LHS), node:$RHS)>>; - -defm SSHR : SIMDVectorRShiftBHSD<0, 0b00000, "sshr", ARM64vashr>; -defm SSRA : SIMDVectorRShiftBHSDTied<0, 0b00010, "ssra", - TriOpFrag<(add node:$LHS, (ARM64vashr node:$MHS, node:$RHS))>>; -defm UCVTF : SIMDVectorRShiftSDToFP<1, 0b11100, "ucvtf", - int_arm64_neon_vcvtfxu2fp>; -defm UQRSHRN : SIMDVectorRShiftNarrowBHS<1, 
0b10011, "uqrshrn", - int_arm64_neon_uqrshrn>; -defm UQSHL : SIMDVectorLShiftBHSD<1, 0b01110, "uqshl", ARM64uqshli>; -defm UQSHRN : SIMDVectorRShiftNarrowBHS<1, 0b10010, "uqshrn", - int_arm64_neon_uqshrn>; -defm URSHR : SIMDVectorRShiftBHSD<1, 0b00100, "urshr", ARM64urshri>; -defm URSRA : SIMDVectorRShiftBHSDTied<1, 0b00110, "ursra", - TriOpFrag<(add node:$LHS, - (ARM64urshri node:$MHS, node:$RHS))> >; -defm USHLL : SIMDVectorLShiftLongBHSD<1, 0b10100, "ushll", - BinOpFrag<(ARM64vshl (zext node:$LHS), node:$RHS)>>; -defm USHR : SIMDVectorRShiftBHSD<1, 0b00000, "ushr", ARM64vlshr>; -defm USRA : SIMDVectorRShiftBHSDTied<1, 0b00010, "usra", - TriOpFrag<(add node:$LHS, (ARM64vlshr node:$MHS, node:$RHS))> >; - -// SHRN patterns for when a logical right shift was used instead of arithmetic -// (the immediate guarantees no sign bits actually end up in the result so it -// doesn't matter). -def : Pat<(v8i8 (trunc (ARM64vlshr (v8i16 V128:$Rn), vecshiftR16Narrow:$imm))), - (SHRNv8i8_shift V128:$Rn, vecshiftR16Narrow:$imm)>; -def : Pat<(v4i16 (trunc (ARM64vlshr (v4i32 V128:$Rn), vecshiftR32Narrow:$imm))), - (SHRNv4i16_shift V128:$Rn, vecshiftR32Narrow:$imm)>; -def : Pat<(v2i32 (trunc (ARM64vlshr (v2i64 V128:$Rn), vecshiftR64Narrow:$imm))), - (SHRNv2i32_shift V128:$Rn, vecshiftR64Narrow:$imm)>; - -def : Pat<(v16i8 (concat_vectors (v8i8 V64:$Rd), - (trunc (ARM64vlshr (v8i16 V128:$Rn), - vecshiftR16Narrow:$imm)))), - (SHRNv16i8_shift (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), - V128:$Rn, vecshiftR16Narrow:$imm)>; -def : Pat<(v8i16 (concat_vectors (v4i16 V64:$Rd), - (trunc (ARM64vlshr (v4i32 V128:$Rn), - vecshiftR32Narrow:$imm)))), - (SHRNv8i16_shift (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), - V128:$Rn, vecshiftR32Narrow:$imm)>; -def : Pat<(v4i32 (concat_vectors (v2i32 V64:$Rd), - (trunc (ARM64vlshr (v2i64 V128:$Rn), - vecshiftR64Narrow:$imm)))), - (SHRNv4i32_shift (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), - V128:$Rn, vecshiftR32Narrow:$imm)>; - -// Vector sign and zero extensions are implemented with SSHLL and USSHLL. -// Anyexts are implemented as zexts. -def : Pat<(v8i16 (sext (v8i8 V64:$Rn))), (SSHLLv8i8_shift V64:$Rn, (i32 0))>; -def : Pat<(v8i16 (zext (v8i8 V64:$Rn))), (USHLLv8i8_shift V64:$Rn, (i32 0))>; -def : Pat<(v8i16 (anyext (v8i8 V64:$Rn))), (USHLLv8i8_shift V64:$Rn, (i32 0))>; -def : Pat<(v4i32 (sext (v4i16 V64:$Rn))), (SSHLLv4i16_shift V64:$Rn, (i32 0))>; -def : Pat<(v4i32 (zext (v4i16 V64:$Rn))), (USHLLv4i16_shift V64:$Rn, (i32 0))>; -def : Pat<(v4i32 (anyext (v4i16 V64:$Rn))), (USHLLv4i16_shift V64:$Rn, (i32 0))>; -def : Pat<(v2i64 (sext (v2i32 V64:$Rn))), (SSHLLv2i32_shift V64:$Rn, (i32 0))>; -def : Pat<(v2i64 (zext (v2i32 V64:$Rn))), (USHLLv2i32_shift V64:$Rn, (i32 0))>; -def : Pat<(v2i64 (anyext (v2i32 V64:$Rn))), (USHLLv2i32_shift V64:$Rn, (i32 0))>; -// Also match an extend from the upper half of a 128 bit source register. 
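
To make the narrowing and widening shift patterns above concrete, a small hedged C sketch with ACLE intrinsics (the upper-half extend patterns referred to in the last comment follow immediately below); the function names are illustrative and the instructions in the comments are assumptions about what these patterns select, not guarantees:

    #include <arm_neon.h>

    /* Narrowing right shift: expected to select SHRN Vd.8B, Vn.8H, #3.
       Whether the source shift was logical or arithmetic does not matter,
       since the immediate guarantees no sign bits reach the result. */
    uint8x8_t narrow(uint16x8_t v) { return vshrn_n_u16(v, 3); }

    /* Sign extension of the low half: expected SSHLL #0 (the SXTL alias). */
    int16x8_t widen_lo(int8x8_t v) { return vmovl_s8(v); }

    /* Extension from the upper half of a 128-bit register: expected to use
       the second-half form (SSHLL2/SXTL2) via the extract_subvector
       patterns below. */
    int32x4_t widen_hi(int16x8_t v) { return vmovl_s16(vget_high_s16(v)); }
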
-def : Pat<(v8i16 (anyext (v8i8 (extract_subvector V128:$Rn, (i64 8)) ))), - (USHLLv16i8_shift V128:$Rn, (i32 0))>; -def : Pat<(v8i16 (zext (v8i8 (extract_subvector V128:$Rn, (i64 8)) ))), - (USHLLv16i8_shift V128:$Rn, (i32 0))>; -def : Pat<(v8i16 (sext (v8i8 (extract_subvector V128:$Rn, (i64 8)) ))), - (SSHLLv16i8_shift V128:$Rn, (i32 0))>; -def : Pat<(v4i32 (anyext (v4i16 (extract_subvector V128:$Rn, (i64 4)) ))), - (USHLLv8i16_shift V128:$Rn, (i32 0))>; -def : Pat<(v4i32 (zext (v4i16 (extract_subvector V128:$Rn, (i64 4)) ))), - (USHLLv8i16_shift V128:$Rn, (i32 0))>; -def : Pat<(v4i32 (sext (v4i16 (extract_subvector V128:$Rn, (i64 4)) ))), - (SSHLLv8i16_shift V128:$Rn, (i32 0))>; -def : Pat<(v2i64 (anyext (v2i32 (extract_subvector V128:$Rn, (i64 2)) ))), - (USHLLv4i32_shift V128:$Rn, (i32 0))>; -def : Pat<(v2i64 (zext (v2i32 (extract_subvector V128:$Rn, (i64 2)) ))), - (USHLLv4i32_shift V128:$Rn, (i32 0))>; -def : Pat<(v2i64 (sext (v2i32 (extract_subvector V128:$Rn, (i64 2)) ))), - (SSHLLv4i32_shift V128:$Rn, (i32 0))>; - -// Vector shift sxtl aliases -def : InstAlias<"sxtl.8h $dst, $src1", - (SSHLLv8i8_shift V128:$dst, V64:$src1, 0)>; -def : InstAlias<"sxtl $dst.8h, $src1.8b", - (SSHLLv8i8_shift V128:$dst, V64:$src1, 0)>; -def : InstAlias<"sxtl.4s $dst, $src1", - (SSHLLv4i16_shift V128:$dst, V64:$src1, 0)>; -def : InstAlias<"sxtl $dst.4s, $src1.4h", - (SSHLLv4i16_shift V128:$dst, V64:$src1, 0)>; -def : InstAlias<"sxtl.2d $dst, $src1", - (SSHLLv2i32_shift V128:$dst, V64:$src1, 0)>; -def : InstAlias<"sxtl $dst.2d, $src1.2s", - (SSHLLv2i32_shift V128:$dst, V64:$src1, 0)>; - -// Vector shift sxtl2 aliases -def : InstAlias<"sxtl2.8h $dst, $src1", - (SSHLLv16i8_shift V128:$dst, V128:$src1, 0)>; -def : InstAlias<"sxtl2 $dst.8h, $src1.16b", - (SSHLLv16i8_shift V128:$dst, V128:$src1, 0)>; -def : InstAlias<"sxtl2.4s $dst, $src1", - (SSHLLv8i16_shift V128:$dst, V128:$src1, 0)>; -def : InstAlias<"sxtl2 $dst.4s, $src1.8h", - (SSHLLv8i16_shift V128:$dst, V128:$src1, 0)>; -def : InstAlias<"sxtl2.2d $dst, $src1", - (SSHLLv4i32_shift V128:$dst, V128:$src1, 0)>; -def : InstAlias<"sxtl2 $dst.2d, $src1.4s", - (SSHLLv4i32_shift V128:$dst, V128:$src1, 0)>; - -// Vector shift uxtl aliases -def : InstAlias<"uxtl.8h $dst, $src1", - (USHLLv8i8_shift V128:$dst, V64:$src1, 0)>; -def : InstAlias<"uxtl $dst.8h, $src1.8b", - (USHLLv8i8_shift V128:$dst, V64:$src1, 0)>; -def : InstAlias<"uxtl.4s $dst, $src1", - (USHLLv4i16_shift V128:$dst, V64:$src1, 0)>; -def : InstAlias<"uxtl $dst.4s, $src1.4h", - (USHLLv4i16_shift V128:$dst, V64:$src1, 0)>; -def : InstAlias<"uxtl.2d $dst, $src1", - (USHLLv2i32_shift V128:$dst, V64:$src1, 0)>; -def : InstAlias<"uxtl $dst.2d, $src1.2s", - (USHLLv2i32_shift V128:$dst, V64:$src1, 0)>; - -// Vector shift uxtl2 aliases -def : InstAlias<"uxtl2.8h $dst, $src1", - (USHLLv16i8_shift V128:$dst, V128:$src1, 0)>; -def : InstAlias<"uxtl2 $dst.8h, $src1.16b", - (USHLLv16i8_shift V128:$dst, V128:$src1, 0)>; -def : InstAlias<"uxtl2.4s $dst, $src1", - (USHLLv8i16_shift V128:$dst, V128:$src1, 0)>; -def : InstAlias<"uxtl2 $dst.4s, $src1.8h", - (USHLLv8i16_shift V128:$dst, V128:$src1, 0)>; -def : InstAlias<"uxtl2.2d $dst, $src1", - (USHLLv4i32_shift V128:$dst, V128:$src1, 0)>; -def : InstAlias<"uxtl2 $dst.2d, $src1.4s", - (USHLLv4i32_shift V128:$dst, V128:$src1, 0)>; - -// If an integer is about to be converted to a floating point value, -// just load it on the floating point unit. -// These patterns are more complex because floating point loads do not -// support sign extension. 
-// The sign extension has to be explicitly added and is only supported for
-// one step: byte-to-half, half-to-word, word-to-doubleword.
-// SCVTF GPR -> FPR is 9 cycles.
-// SCVTF FPR -> FPR is 4 cycles.
-// (sign extension with lengthen) SXTL FPR -> FPR is 2 cycles.
-// Therefore, we can do 2 sign extensions and one SCVTF FPR -> FPR
-// and still be faster.
-// However, this is not good for code size.
-// 8-bits -> float. 2 sizes step-up.
-class SExtLoadi8CVTf32Pat<dag addrmode, dag INST>
-  : Pat<(f32 (sint_to_fp (i32 (sextloadi8 addrmode)))),
-        (SCVTFv1i32 (f32 (EXTRACT_SUBREG
-                            (SSHLLv4i16_shift
-                              (f64
-                                (EXTRACT_SUBREG
-                                  (SSHLLv8i8_shift
-                                    (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
-                                        INST,
-                                        bsub),
-                                    0),
-                                  dsub)),
-                              0),
-                            ssub)))>, Requires<[NotForCodeSize]>;
-
-def : SExtLoadi8CVTf32Pat<(ro8.Wpat GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext),
-                          (LDRBroW GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext)>;
-def : SExtLoadi8CVTf32Pat<(ro8.Xpat GPR64sp:$Rn, GPR64:$Rm, ro8.Xext:$ext),
-                          (LDRBroX GPR64sp:$Rn, GPR64:$Rm, ro8.Xext:$ext)>;
-def : SExtLoadi8CVTf32Pat<(am_indexed8 GPR64sp:$Rn, uimm12s1:$offset),
-                          (LDRBui GPR64sp:$Rn, uimm12s1:$offset)>;
-def : SExtLoadi8CVTf32Pat<(am_unscaled8 GPR64sp:$Rn, simm9:$offset),
-                          (LDURBi GPR64sp:$Rn, simm9:$offset)>;
-
-// 16-bits -> float. 1 size step-up.
-class SExtLoadi16CVTf32Pat<dag addrmode, dag INST>
-  : Pat<(f32 (sint_to_fp (i32 (sextloadi16 addrmode)))),
-        (SCVTFv1i32 (f32 (EXTRACT_SUBREG
-                            (SSHLLv4i16_shift
-                                (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
-                                  INST,
-                                  hsub),
-                                0),
-                            ssub)))>, Requires<[NotForCodeSize]>;
-
-def : SExtLoadi16CVTf32Pat<(ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext),
-                           (LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext)>;
-def : SExtLoadi16CVTf32Pat<(ro16.Xpat GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext),
-                           (LDRHroX GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext)>;
-def : SExtLoadi16CVTf32Pat<(am_indexed16 GPR64sp:$Rn, uimm12s2:$offset),
-                           (LDRHui GPR64sp:$Rn, uimm12s2:$offset)>;
-def : SExtLoadi16CVTf32Pat<(am_unscaled16 GPR64sp:$Rn, simm9:$offset),
-                           (LDURHi GPR64sp:$Rn, simm9:$offset)>;
-
-// 32-bits to 32-bits are handled in target specific dag combine:
-// performIntToFpCombine.
-// 64-bits integer to 32-bits floating point, not possible with
-// SCVTF on floating point registers (both source and destination
-// must have the same size).
-
-// Here are the patterns for 8, 16, 32, and 64-bits to double.
-// 8-bits -> double. 3 size step-up: give up.
-// 16-bits -> double. 2 size step.
-class SExtLoadi16CVTf64Pat<dag addrmode, dag INST>
-  : Pat <(f64 (sint_to_fp (i32 (sextloadi16 addrmode)))),
-         (SCVTFv1i64 (f64 (EXTRACT_SUBREG
-                             (SSHLLv2i32_shift
-                                (f64
-                                  (EXTRACT_SUBREG
-                                    (SSHLLv4i16_shift
-                                     (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
-                                      INST,
-                                      hsub),
-                                     0),
-                                   dsub)),
-                               0),
-                             dsub)))>, Requires<[NotForCodeSize]>;
-
-def : SExtLoadi16CVTf64Pat<(ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext),
-                           (LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext)>;
-def : SExtLoadi16CVTf64Pat<(ro16.Xpat GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext),
-                           (LDRHroX GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext)>;
-def : SExtLoadi16CVTf64Pat<(am_indexed16 GPR64sp:$Rn, uimm12s2:$offset),
-                           (LDRHui GPR64sp:$Rn, uimm12s2:$offset)>;
-def : SExtLoadi16CVTf64Pat<(am_unscaled16 GPR64sp:$Rn, simm9:$offset),
-                           (LDURHi GPR64sp:$Rn, simm9:$offset)>;
-// 32-bits -> double. 1 size step-up.
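
Before the 32-bit-to-double class that follows, here is a rough C-level picture of the code the cycle-count reasoning above is about. The instruction sequences in the comments are expectations based on those comments, assume an arm64/aarch64 target, and only apply when not optimizing for size (the Requires<[NotForCodeSize]> predicate):

    #include <stdint.h>

    /* Expected: load the byte straight into an FP/SIMD register (LDR Bd),
       sign-extend twice on the vector unit (SXTL b->h, then h->s) and
       convert with SCVTF FPR->FPR, instead of LDRSB Wn + SCVTF GPR->FPR. */
    float byte_to_float(const int8_t *p) { return (float)*p; }

    /* One size step up: LDR Hd, a single SXTL, then SCVTF FPR->FPR. */
    float half_to_float(const int16_t *p) { return (float)*p; }
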
-class SExtLoadi32CVTf64Pat - : Pat <(f64 (sint_to_fp (i32 (load addrmode)))), - (SCVTFv1i64 (f64 (EXTRACT_SUBREG - (SSHLLv2i32_shift - (INSERT_SUBREG (f64 (IMPLICIT_DEF)), - INST, - ssub), - 0), - dsub)))>, Requires<[NotForCodeSize]>; - -def : SExtLoadi32CVTf64Pat<(ro32.Wpat GPR64sp:$Rn, GPR32:$Rm, ro32.Wext:$ext), - (LDRSroW GPR64sp:$Rn, GPR32:$Rm, ro32.Wext:$ext)>; -def : SExtLoadi32CVTf64Pat<(ro32.Xpat GPR64sp:$Rn, GPR64:$Rm, ro32.Xext:$ext), - (LDRSroX GPR64sp:$Rn, GPR64:$Rm, ro32.Xext:$ext)>; -def : SExtLoadi32CVTf64Pat<(am_indexed32 GPR64sp:$Rn, uimm12s4:$offset), - (LDRSui GPR64sp:$Rn, uimm12s4:$offset)>; -def : SExtLoadi32CVTf64Pat<(am_unscaled32 GPR64sp:$Rn, simm9:$offset), - (LDURSi GPR64sp:$Rn, simm9:$offset)>; - -// 64-bits -> double are handled in target specific dag combine: -// performIntToFpCombine. - - -//---------------------------------------------------------------------------- -// AdvSIMD Load-Store Structure -//---------------------------------------------------------------------------- -defm LD1 : SIMDLd1Multiple<"ld1">; -defm LD2 : SIMDLd2Multiple<"ld2">; -defm LD3 : SIMDLd3Multiple<"ld3">; -defm LD4 : SIMDLd4Multiple<"ld4">; - -defm ST1 : SIMDSt1Multiple<"st1">; -defm ST2 : SIMDSt2Multiple<"st2">; -defm ST3 : SIMDSt3Multiple<"st3">; -defm ST4 : SIMDSt4Multiple<"st4">; - -class Ld1Pat - : Pat<(ty (load GPR64sp:$Rn)), (INST GPR64sp:$Rn)>; - -def : Ld1Pat; -def : Ld1Pat; -def : Ld1Pat; -def : Ld1Pat; -def : Ld1Pat; -def : Ld1Pat; -def : Ld1Pat; -def : Ld1Pat; - -class St1Pat - : Pat<(store ty:$Vt, GPR64sp:$Rn), - (INST ty:$Vt, GPR64sp:$Rn)>; - -def : St1Pat; -def : St1Pat; -def : St1Pat; -def : St1Pat; -def : St1Pat; -def : St1Pat; -def : St1Pat; -def : St1Pat; - -//--- -// Single-element -//--- - -defm LD1R : SIMDLdR<0, 0b110, 0, "ld1r", "One", 1, 2, 4, 8>; -defm LD2R : SIMDLdR<1, 0b110, 0, "ld2r", "Two", 2, 4, 8, 16>; -defm LD3R : SIMDLdR<0, 0b111, 0, "ld3r", "Three", 3, 6, 12, 24>; -defm LD4R : SIMDLdR<1, 0b111, 0, "ld4r", "Four", 4, 8, 16, 32>; -let mayLoad = 1, neverHasSideEffects = 1 in { -defm LD1 : SIMDLdSingleBTied<0, 0b000, "ld1", VecListOneb, GPR64pi1>; -defm LD1 : SIMDLdSingleHTied<0, 0b010, 0, "ld1", VecListOneh, GPR64pi2>; -defm LD1 : SIMDLdSingleSTied<0, 0b100, 0b00, "ld1", VecListOnes, GPR64pi4>; -defm LD1 : SIMDLdSingleDTied<0, 0b100, 0b01, "ld1", VecListOned, GPR64pi8>; -defm LD2 : SIMDLdSingleBTied<1, 0b000, "ld2", VecListTwob, GPR64pi2>; -defm LD2 : SIMDLdSingleHTied<1, 0b010, 0, "ld2", VecListTwoh, GPR64pi4>; -defm LD2 : SIMDLdSingleSTied<1, 0b100, 0b00, "ld2", VecListTwos, GPR64pi8>; -defm LD2 : SIMDLdSingleDTied<1, 0b100, 0b01, "ld2", VecListTwod, GPR64pi16>; -defm LD3 : SIMDLdSingleBTied<0, 0b001, "ld3", VecListThreeb, GPR64pi3>; -defm LD3 : SIMDLdSingleHTied<0, 0b011, 0, "ld3", VecListThreeh, GPR64pi6>; -defm LD3 : SIMDLdSingleSTied<0, 0b101, 0b00, "ld3", VecListThrees, GPR64pi12>; -defm LD3 : SIMDLdSingleDTied<0, 0b101, 0b01, "ld3", VecListThreed, GPR64pi24>; -defm LD4 : SIMDLdSingleBTied<1, 0b001, "ld4", VecListFourb, GPR64pi4>; -defm LD4 : SIMDLdSingleHTied<1, 0b011, 0, "ld4", VecListFourh, GPR64pi8>; -defm LD4 : SIMDLdSingleSTied<1, 0b101, 0b00, "ld4", VecListFours, GPR64pi16>; -defm LD4 : SIMDLdSingleDTied<1, 0b101, 0b01, "ld4", VecListFourd, GPR64pi32>; -} - -def : Pat<(v8i8 (ARM64dup (i32 (extloadi8 GPR64sp:$Rn)))), - (LD1Rv8b GPR64sp:$Rn)>; -def : Pat<(v16i8 (ARM64dup (i32 (extloadi8 GPR64sp:$Rn)))), - (LD1Rv16b GPR64sp:$Rn)>; -def : Pat<(v4i16 (ARM64dup (i32 (extloadi16 GPR64sp:$Rn)))), - (LD1Rv4h GPR64sp:$Rn)>; -def : Pat<(v8i16 
(ARM64dup (i32 (extloadi16 GPR64sp:$Rn)))), - (LD1Rv8h GPR64sp:$Rn)>; -def : Pat<(v2i32 (ARM64dup (i32 (load GPR64sp:$Rn)))), - (LD1Rv2s GPR64sp:$Rn)>; -def : Pat<(v4i32 (ARM64dup (i32 (load GPR64sp:$Rn)))), - (LD1Rv4s GPR64sp:$Rn)>; -def : Pat<(v2i64 (ARM64dup (i64 (load GPR64sp:$Rn)))), - (LD1Rv2d GPR64sp:$Rn)>; -def : Pat<(v1i64 (ARM64dup (i64 (load GPR64sp:$Rn)))), - (LD1Rv1d GPR64sp:$Rn)>; -// Grab the floating point version too -def : Pat<(v2f32 (ARM64dup (f32 (load GPR64sp:$Rn)))), - (LD1Rv2s GPR64sp:$Rn)>; -def : Pat<(v4f32 (ARM64dup (f32 (load GPR64sp:$Rn)))), - (LD1Rv4s GPR64sp:$Rn)>; -def : Pat<(v2f64 (ARM64dup (f64 (load GPR64sp:$Rn)))), - (LD1Rv2d GPR64sp:$Rn)>; -def : Pat<(v1f64 (ARM64dup (f64 (load GPR64sp:$Rn)))), - (LD1Rv1d GPR64sp:$Rn)>; - -class Ld1Lane128Pat - : Pat<(vector_insert (VTy VecListOne128:$Rd), - (STy (scalar_load GPR64sp:$Rn)), VecIndex:$idx), - (LD1 VecListOne128:$Rd, VecIndex:$idx, GPR64sp:$Rn)>; - -def : Ld1Lane128Pat; -def : Ld1Lane128Pat; -def : Ld1Lane128Pat; -def : Ld1Lane128Pat; -def : Ld1Lane128Pat; -def : Ld1Lane128Pat; - -class Ld1Lane64Pat - : Pat<(vector_insert (VTy VecListOne64:$Rd), - (STy (scalar_load GPR64sp:$Rn)), VecIndex:$idx), - (EXTRACT_SUBREG - (LD1 (SUBREG_TO_REG (i32 0), VecListOne64:$Rd, dsub), - VecIndex:$idx, GPR64sp:$Rn), - dsub)>; - -def : Ld1Lane64Pat; -def : Ld1Lane64Pat; -def : Ld1Lane64Pat; -def : Ld1Lane64Pat; - - -defm LD1 : SIMDLdSt1SingleAliases<"ld1">; -defm LD2 : SIMDLdSt2SingleAliases<"ld2">; -defm LD3 : SIMDLdSt3SingleAliases<"ld3">; -defm LD4 : SIMDLdSt4SingleAliases<"ld4">; - -// Stores -defm ST1 : SIMDStSingleB<0, 0b000, "st1", VecListOneb, GPR64pi1>; -defm ST1 : SIMDStSingleH<0, 0b010, 0, "st1", VecListOneh, GPR64pi2>; -defm ST1 : SIMDStSingleS<0, 0b100, 0b00, "st1", VecListOnes, GPR64pi4>; -defm ST1 : SIMDStSingleD<0, 0b100, 0b01, "st1", VecListOned, GPR64pi8>; - -let AddedComplexity = 15 in -class St1Lane128Pat - : Pat<(scalar_store - (STy (vector_extract (VTy VecListOne128:$Vt), VecIndex:$idx)), - GPR64sp:$Rn), - (ST1 VecListOne128:$Vt, VecIndex:$idx, GPR64sp:$Rn)>; - -def : St1Lane128Pat; -def : St1Lane128Pat; -def : St1Lane128Pat; -def : St1Lane128Pat; -def : St1Lane128Pat; -def : St1Lane128Pat; - -let AddedComplexity = 15 in -class St1Lane64Pat - : Pat<(scalar_store - (STy (vector_extract (VTy VecListOne64:$Vt), VecIndex:$idx)), - GPR64sp:$Rn), - (ST1 (SUBREG_TO_REG (i32 0), VecListOne64:$Vt, dsub), - VecIndex:$idx, GPR64sp:$Rn)>; - -def : St1Lane64Pat; -def : St1Lane64Pat; -def : St1Lane64Pat; -def : St1Lane64Pat; - -multiclass St1LanePost64Pat { - def : Pat<(scalar_store - (STy (vector_extract (VTy VecListOne64:$Vt), VecIndex:$idx)), - GPR64sp:$Rn, offset), - (ST1 (SUBREG_TO_REG (i32 0), VecListOne64:$Vt, dsub), - VecIndex:$idx, GPR64sp:$Rn, XZR)>; - - def : Pat<(scalar_store - (STy (vector_extract (VTy VecListOne64:$Vt), VecIndex:$idx)), - GPR64sp:$Rn, GPR64:$Rm), - (ST1 (SUBREG_TO_REG (i32 0), VecListOne64:$Vt, dsub), - VecIndex:$idx, GPR64sp:$Rn, $Rm)>; -} - -defm : St1LanePost64Pat; -defm : St1LanePost64Pat; -defm : St1LanePost64Pat; -defm : St1LanePost64Pat; -defm : St1LanePost64Pat; -defm : St1LanePost64Pat; - -multiclass St1LanePost128Pat { - def : Pat<(scalar_store - (STy (vector_extract (VTy VecListOne128:$Vt), VecIndex:$idx)), - GPR64sp:$Rn, offset), - (ST1 VecListOne128:$Vt, VecIndex:$idx, GPR64sp:$Rn, XZR)>; - - def : Pat<(scalar_store - (STy (vector_extract (VTy VecListOne128:$Vt), VecIndex:$idx)), - GPR64sp:$Rn, GPR64:$Rm), - (ST1 VecListOne128:$Vt, VecIndex:$idx, GPR64sp:$Rn, $Rm)>; -} - 
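
The LD1R dup-load patterns and the single-lane load/store patterns above correspond, roughly, to the following intrinsic-level operations; a minimal sketch with invented function names, where the selections in the comments are the expected outcome on arm64/aarch64 rather than a guarantee:

    #include <arm_neon.h>

    /* Load one float and replicate it to all four lanes: expected LD1R { v.4s }. */
    float32x4_t splat_load(const float *p) { return vld1q_dup_f32(p); }

    /* Load into a single lane of an existing vector: expected LD1 { v.h }[2]. */
    int16x8_t load_lane2(int16x8_t v, const int16_t *p) {
      return vld1q_lane_s16(p, v, 2);
    }

    /* Store a single lane: expected ST1 { v.s }[1]. */
    void store_lane1(float *p, float32x4_t v) { vst1q_lane_f32(p, v, 1); }
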
-defm : St1LanePost128Pat;
-defm : St1LanePost128Pat;
-defm : St1LanePost128Pat;
-defm : St1LanePost128Pat;
-defm : St1LanePost128Pat;
-defm : St1LanePost128Pat;
-
-let mayStore = 1, neverHasSideEffects = 1 in {
-defm ST2 : SIMDStSingleB<1, 0b000, "st2", VecListTwob, GPR64pi2>;
-defm ST2 : SIMDStSingleH<1, 0b010, 0, "st2", VecListTwoh, GPR64pi4>;
-defm ST2 : SIMDStSingleS<1, 0b100, 0b00, "st2", VecListTwos, GPR64pi8>;
-defm ST2 : SIMDStSingleD<1, 0b100, 0b01, "st2", VecListTwod, GPR64pi16>;
-defm ST3 : SIMDStSingleB<0, 0b001, "st3", VecListThreeb, GPR64pi3>;
-defm ST3 : SIMDStSingleH<0, 0b011, 0, "st3", VecListThreeh, GPR64pi6>;
-defm ST3 : SIMDStSingleS<0, 0b101, 0b00, "st3", VecListThrees, GPR64pi12>;
-defm ST3 : SIMDStSingleD<0, 0b101, 0b01, "st3", VecListThreed, GPR64pi24>;
-defm ST4 : SIMDStSingleB<1, 0b001, "st4", VecListFourb, GPR64pi4>;
-defm ST4 : SIMDStSingleH<1, 0b011, 0, "st4", VecListFourh, GPR64pi8>;
-defm ST4 : SIMDStSingleS<1, 0b101, 0b00, "st4", VecListFours, GPR64pi16>;
-defm ST4 : SIMDStSingleD<1, 0b101, 0b01, "st4", VecListFourd, GPR64pi32>;
-}
-
-defm ST1 : SIMDLdSt1SingleAliases<"st1">;
-defm ST2 : SIMDLdSt2SingleAliases<"st2">;
-defm ST3 : SIMDLdSt3SingleAliases<"st3">;
-defm ST4 : SIMDLdSt4SingleAliases<"st4">;
-
-//----------------------------------------------------------------------------
-// Crypto extensions
-//----------------------------------------------------------------------------
-
-def AESErr : AESTiedInst<0b0100, "aese", int_arm64_crypto_aese>;
-def AESDrr : AESTiedInst<0b0101, "aesd", int_arm64_crypto_aesd>;
-def AESMCrr : AESInst< 0b0110, "aesmc", int_arm64_crypto_aesmc>;
-def AESIMCrr : AESInst< 0b0111, "aesimc", int_arm64_crypto_aesimc>;
-
-def SHA1Crrr : SHATiedInstQSV<0b000, "sha1c", int_arm64_crypto_sha1c>;
-def SHA1Prrr : SHATiedInstQSV<0b001, "sha1p", int_arm64_crypto_sha1p>;
-def SHA1Mrrr : SHATiedInstQSV<0b010, "sha1m", int_arm64_crypto_sha1m>;
-def SHA1SU0rrr : SHATiedInstVVV<0b011, "sha1su0", int_arm64_crypto_sha1su0>;
-def SHA256Hrrr : SHATiedInstQQV<0b100, "sha256h", int_arm64_crypto_sha256h>;
-def SHA256H2rrr : SHATiedInstQQV<0b101, "sha256h2",int_arm64_crypto_sha256h2>;
-def SHA256SU1rrr :SHATiedInstVVV<0b110, "sha256su1",int_arm64_crypto_sha256su1>;
-
-def SHA1Hrr : SHAInstSS< 0b0000, "sha1h", int_arm64_crypto_sha1h>;
-def SHA1SU1rr : SHATiedInstVV<0b0001, "sha1su1", int_arm64_crypto_sha1su1>;
-def SHA256SU0rr : SHATiedInstVV<0b0010, "sha256su0",int_arm64_crypto_sha256su0>;
-
-//----------------------------------------------------------------------------
-// Compiler-pseudos
-//----------------------------------------------------------------------------
-// FIXME: Like for X86, these should go in their own separate .td file.
-
-// Any instruction that defines a 32-bit result leaves the high half of the
-// register. Truncate can be lowered to EXTRACT_SUBREG. CopyFromReg may
-// be copying from a truncate. But any other 32-bit operation will zero-extend
-// up to 64 bits.
-// FIXME: X86 also checks for CMOV here. Do we need something similar?
-def def32 : PatLeaf<(i32 GPR32:$src), [{
-  return N->getOpcode() != ISD::TRUNCATE &&
-         N->getOpcode() != TargetOpcode::EXTRACT_SUBREG &&
-         N->getOpcode() != ISD::CopyFromReg;
-}]>;
-
-// In the case of a 32-bit def that is known to implicitly zero-extend,
-// we can use a SUBREG_TO_REG.
-def : Pat<(i64 (zext def32:$src)), (SUBREG_TO_REG (i64 0), GPR32:$src, sub_32)>;
-
-// For an anyext, we don't care what the high bits are, so we can perform an
-// INSERT_SUBREG into an IMPLICIT_DEF.
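
At the C level, the def32/SUBREG_TO_REG trick above and the explicit UBFM/SBFM patterns that follow are what make these conversions cheap; a minimal sketch, with the comments describing the expected (not guaranteed) selection:

    #include <stdint.h>

    /* The 32-bit ADD already zeroes the upper half, so the cast is expected
       to be free: just a SUBREG_TO_REG at the MI level, no extra instruction. */
    uint64_t add_zext(uint32_t a, uint32_t b) { return (uint64_t)(a + b); }

    /* When the producer is not known to zero-extend (e.g. an incoming
       argument, i.e. a CopyFromReg), the explicit zero-extension pattern
       below is expected to use UBFM. */
    uint64_t zext_arg(uint32_t a) { return a; }

    /* Sign extension is expected to use SBFM, printed as SXTW. */
    int64_t sext_arg(int32_t a) { return a; }
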
-def : Pat<(i64 (anyext GPR32:$src)), - (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32)>; - -// When we need to explicitly zero-extend, we use an unsigned bitfield move -// instruction (UBFM) on the enclosing super-reg. -def : Pat<(i64 (zext GPR32:$src)), - (UBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32), 0, 31)>; - -// To sign extend, we use a signed bitfield move instruction (SBFM) on the -// containing super-reg. -def : Pat<(i64 (sext GPR32:$src)), - (SBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32), 0, 31)>; -def : Pat<(i64 (sext_inreg GPR64:$src, i32)), (SBFMXri GPR64:$src, 0, 31)>; -def : Pat<(i64 (sext_inreg GPR64:$src, i16)), (SBFMXri GPR64:$src, 0, 15)>; -def : Pat<(i64 (sext_inreg GPR64:$src, i8)), (SBFMXri GPR64:$src, 0, 7)>; -def : Pat<(i64 (sext_inreg GPR64:$src, i1)), (SBFMXri GPR64:$src, 0, 0)>; -def : Pat<(i32 (sext_inreg GPR32:$src, i16)), (SBFMWri GPR32:$src, 0, 15)>; -def : Pat<(i32 (sext_inreg GPR32:$src, i8)), (SBFMWri GPR32:$src, 0, 7)>; -def : Pat<(i32 (sext_inreg GPR32:$src, i1)), (SBFMWri GPR32:$src, 0, 0)>; - -def : Pat<(shl (sext_inreg GPR32:$Rn, i8), (i64 imm0_31:$imm)), - (SBFMWri GPR32:$Rn, (i64 (i32shift_a imm0_31:$imm)), - (i64 (i32shift_sext_i8 imm0_31:$imm)))>; -def : Pat<(shl (sext_inreg GPR64:$Rn, i8), (i64 imm0_63:$imm)), - (SBFMXri GPR64:$Rn, (i64 (i64shift_a imm0_63:$imm)), - (i64 (i64shift_sext_i8 imm0_63:$imm)))>; - -def : Pat<(shl (sext_inreg GPR32:$Rn, i16), (i64 imm0_31:$imm)), - (SBFMWri GPR32:$Rn, (i64 (i32shift_a imm0_31:$imm)), - (i64 (i32shift_sext_i16 imm0_31:$imm)))>; -def : Pat<(shl (sext_inreg GPR64:$Rn, i16), (i64 imm0_63:$imm)), - (SBFMXri GPR64:$Rn, (i64 (i64shift_a imm0_63:$imm)), - (i64 (i64shift_sext_i16 imm0_63:$imm)))>; - -def : Pat<(shl (i64 (sext GPR32:$Rn)), (i64 imm0_63:$imm)), - (SBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$Rn, sub_32), - (i64 (i64shift_a imm0_63:$imm)), - (i64 (i64shift_sext_i32 imm0_63:$imm)))>; - -// sra patterns have an AddedComplexity of 10, so make sure we have a higher -// AddedComplexity for the following patterns since we want to match sext + sra -// patterns before we attempt to match a single sra node. -let AddedComplexity = 20 in { -// We support all sext + sra combinations which preserve at least one bit of the -// original value which is to be sign extended. E.g. we support shifts up to -// bitwidth-1 bits. -def : Pat<(sra (sext_inreg GPR32:$Rn, i8), (i64 imm0_7:$imm)), - (SBFMWri GPR32:$Rn, (i64 imm0_7:$imm), 7)>; -def : Pat<(sra (sext_inreg GPR64:$Rn, i8), (i64 imm0_7:$imm)), - (SBFMXri GPR64:$Rn, (i64 imm0_7:$imm), 7)>; - -def : Pat<(sra (sext_inreg GPR32:$Rn, i16), (i64 imm0_15:$imm)), - (SBFMWri GPR32:$Rn, (i64 imm0_15:$imm), 15)>; -def : Pat<(sra (sext_inreg GPR64:$Rn, i16), (i64 imm0_15:$imm)), - (SBFMXri GPR64:$Rn, (i64 imm0_15:$imm), 15)>; - -def : Pat<(sra (i64 (sext GPR32:$Rn)), (i64 imm0_31:$imm)), - (SBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$Rn, sub_32), - (i64 imm0_31:$imm), 31)>; -} // AddedComplexity = 20 - -// To truncate, we can simply extract from a subregister. -def : Pat<(i32 (trunc GPR64sp:$src)), - (i32 (EXTRACT_SUBREG GPR64sp:$src, sub_32))>; - -// __builtin_trap() uses the BRK instruction on ARM64. -def : Pat<(trap), (BRK 1)>; - -// Conversions within AdvSIMD types in the same register size are free. -// But because we need a consistent lane ordering, in big endian many -// conversions require one or more REV instructions. -// -// Consider a simple memory load followed by a bitconvert then a store. 
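
At the source level, the sequence being considered is roughly the following (a hypothetical sketch; the comments restate the IR steps that the explanation below walks through):

    #include <arm_neon.h>

    void bitcast_roundtrip(const int32_t *src, int16_t *dst) {
      int32x2_t v0 = vld1_s32(src);             /* v0 = load v2i32          */
      int16x4_t v1 = vreinterpret_s16_s32(v0);  /* v1 = BITCAST v0 to v4i16 */
      vst1_s16(dst, v1);                        /* store v4i16 v1           */
    }
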
-// v0 = load v2i32 -// v1 = BITCAST v2i32 v0 to v4i16 -// store v4i16 v2 -// -// In big endian mode every memory access has an implicit byte swap. LDR and -// STR do a 64-bit byte swap, whereas LD1/ST1 do a byte swap per lane - that -// is, they treat the vector as a sequence of elements to be byte-swapped. -// The two pairs of instructions are fundamentally incompatible. We've decided -// to use LD1/ST1 only to simplify compiler implementation. -// -// LD1/ST1 perform the equivalent of a sequence of LDR/STR + REV. This makes -// the original code sequence: -// v0 = load v2i32 -// v1 = REV v2i32 (implicit) -// v2 = BITCAST v2i32 v1 to v4i16 -// v3 = REV v4i16 v2 (implicit) -// store v4i16 v3 -// -// But this is now broken - the value stored is different to the value loaded -// due to lane reordering. To fix this, on every BITCAST we must perform two -// other REVs: -// v0 = load v2i32 -// v1 = REV v2i32 (implicit) -// v2 = REV v2i32 -// v3 = BITCAST v2i32 v2 to v4i16 -// v4 = REV v4i16 -// v5 = REV v4i16 v4 (implicit) -// store v4i16 v5 -// -// This means an extra two instructions, but actually in most cases the two REV -// instructions can be combined into one. For example: -// (REV64_2s (REV64_4h X)) === (REV32_4h X) -// -// There is also no 128-bit REV instruction. This must be synthesized with an -// EXT instruction. -// -// Most bitconverts require some sort of conversion. The only exceptions are: -// a) Identity conversions - vNfX <-> vNiX -// b) Single-lane-to-scalar - v1fX <-> fX or v1iX <-> iX -// - -let Predicates = [IsLE] in { -def : Pat<(v8i8 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; -def : Pat<(v4i16 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; -def : Pat<(v2i32 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; -def : Pat<(v2f32 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; - -def : Pat<(i64 (bitconvert (v8i8 V64:$Vn))), - (COPY_TO_REGCLASS V64:$Vn, GPR64)>; -def : Pat<(i64 (bitconvert (v4i16 V64:$Vn))), - (COPY_TO_REGCLASS V64:$Vn, GPR64)>; -def : Pat<(i64 (bitconvert (v2i32 V64:$Vn))), - (COPY_TO_REGCLASS V64:$Vn, GPR64)>; -def : Pat<(i64 (bitconvert (v2f32 V64:$Vn))), - (COPY_TO_REGCLASS V64:$Vn, GPR64)>; -def : Pat<(i64 (bitconvert (v1f64 V64:$Vn))), - (COPY_TO_REGCLASS V64:$Vn, GPR64)>; -} -let Predicates = [IsBE] in { -def : Pat<(v8i8 (bitconvert GPR64:$Xn)), - (REV64v8i8 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>; -def : Pat<(v4i16 (bitconvert GPR64:$Xn)), - (REV64v4i16 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>; -def : Pat<(v2i32 (bitconvert GPR64:$Xn)), - (REV64v2i32 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>; -def : Pat<(v2f32 (bitconvert GPR64:$Xn)), - (REV64v2i32 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>; - -def : Pat<(i64 (bitconvert (v8i8 V64:$Vn))), - (REV64v8i8 (COPY_TO_REGCLASS V64:$Vn, GPR64))>; -def : Pat<(i64 (bitconvert (v4i16 V64:$Vn))), - (REV64v4i16 (COPY_TO_REGCLASS V64:$Vn, GPR64))>; -def : Pat<(i64 (bitconvert (v2i32 V64:$Vn))), - (REV64v2i32 (COPY_TO_REGCLASS V64:$Vn, GPR64))>; -def : Pat<(i64 (bitconvert (v2f32 V64:$Vn))), - (REV64v2i32 (COPY_TO_REGCLASS V64:$Vn, GPR64))>; -} -def : Pat<(v1i64 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; -def : Pat<(v1f64 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; -def : Pat<(i64 (bitconvert (v1i64 V64:$Vn))), - (COPY_TO_REGCLASS V64:$Vn, GPR64)>; -def : Pat<(v1i64 (scalar_to_vector GPR64:$Xn)), - (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; -def : Pat<(v1f64 (scalar_to_vector GPR64:$Xn)), - (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; -def 
: Pat<(v1f64 (scalar_to_vector (f64 FPR64:$Xn))), (v1f64 FPR64:$Xn)>; - -def : Pat<(f32 (bitconvert (i32 GPR32:$Xn))), - (COPY_TO_REGCLASS GPR32:$Xn, FPR32)>; -def : Pat<(i32 (bitconvert (f32 FPR32:$Xn))), - (COPY_TO_REGCLASS FPR32:$Xn, GPR32)>; -def : Pat<(f64 (bitconvert (i64 GPR64:$Xn))), - (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; -def : Pat<(i64 (bitconvert (f64 FPR64:$Xn))), - (COPY_TO_REGCLASS FPR64:$Xn, GPR64)>; -def : Pat<(i64 (bitconvert (v1f64 V64:$Vn))), - (COPY_TO_REGCLASS V64:$Vn, GPR64)>; - -let Predicates = [IsLE] in { -def : Pat<(v1i64 (bitconvert (v2i32 FPR64:$src))), (v1i64 FPR64:$src)>; -def : Pat<(v1i64 (bitconvert (v4i16 FPR64:$src))), (v1i64 FPR64:$src)>; -def : Pat<(v1i64 (bitconvert (v8i8 FPR64:$src))), (v1i64 FPR64:$src)>; -def : Pat<(v1i64 (bitconvert (v2f32 FPR64:$src))), (v1i64 FPR64:$src)>; -} -let Predicates = [IsBE] in { -def : Pat<(v1i64 (bitconvert (v2i32 FPR64:$src))), - (v1i64 (REV64v2i32 FPR64:$src))>; -def : Pat<(v1i64 (bitconvert (v4i16 FPR64:$src))), - (v1i64 (REV64v4i16 FPR64:$src))>; -def : Pat<(v1i64 (bitconvert (v8i8 FPR64:$src))), - (v1i64 (REV64v8i8 FPR64:$src))>; -def : Pat<(v1i64 (bitconvert (v2f32 FPR64:$src))), - (v1i64 (REV64v2i32 FPR64:$src))>; -} -def : Pat<(v1i64 (bitconvert (v1f64 FPR64:$src))), (v1i64 FPR64:$src)>; -def : Pat<(v1i64 (bitconvert (f64 FPR64:$src))), (v1i64 FPR64:$src)>; - -let Predicates = [IsLE] in { -def : Pat<(v2i32 (bitconvert (v1i64 FPR64:$src))), (v2i32 FPR64:$src)>; -def : Pat<(v2i32 (bitconvert (v4i16 FPR64:$src))), (v2i32 FPR64:$src)>; -def : Pat<(v2i32 (bitconvert (v8i8 FPR64:$src))), (v2i32 FPR64:$src)>; -def : Pat<(v2i32 (bitconvert (f64 FPR64:$src))), (v2i32 FPR64:$src)>; -def : Pat<(v2i32 (bitconvert (v1f64 FPR64:$src))), (v2i32 FPR64:$src)>; -} -let Predicates = [IsBE] in { -def : Pat<(v2i32 (bitconvert (v1i64 FPR64:$src))), - (v2i32 (REV64v2i32 FPR64:$src))>; -def : Pat<(v2i32 (bitconvert (v4i16 FPR64:$src))), - (v2i32 (REV32v4i16 FPR64:$src))>; -def : Pat<(v2i32 (bitconvert (v8i8 FPR64:$src))), - (v2i32 (REV32v8i8 FPR64:$src))>; -def : Pat<(v2i32 (bitconvert (f64 FPR64:$src))), - (v2i32 (REV64v2i32 FPR64:$src))>; -def : Pat<(v2i32 (bitconvert (v1f64 FPR64:$src))), - (v2i32 (REV64v2i32 FPR64:$src))>; -} -def : Pat<(v2i32 (bitconvert (v2f32 FPR64:$src))), (v2i32 FPR64:$src)>; - -let Predicates = [IsLE] in { -def : Pat<(v4i16 (bitconvert (v1i64 FPR64:$src))), (v4i16 FPR64:$src)>; -def : Pat<(v4i16 (bitconvert (v2i32 FPR64:$src))), (v4i16 FPR64:$src)>; -def : Pat<(v4i16 (bitconvert (v8i8 FPR64:$src))), (v4i16 FPR64:$src)>; -def : Pat<(v4i16 (bitconvert (f64 FPR64:$src))), (v4i16 FPR64:$src)>; -def : Pat<(v4i16 (bitconvert (v2f32 FPR64:$src))), (v4i16 FPR64:$src)>; -def : Pat<(v4i16 (bitconvert (v1f64 FPR64:$src))), (v4i16 FPR64:$src)>; -} -let Predicates = [IsBE] in { -def : Pat<(v4i16 (bitconvert (v1i64 FPR64:$src))), - (v4i16 (REV64v4i16 FPR64:$src))>; -def : Pat<(v4i16 (bitconvert (v2i32 FPR64:$src))), - (v4i16 (REV32v4i16 FPR64:$src))>; -def : Pat<(v4i16 (bitconvert (v8i8 FPR64:$src))), - (v4i16 (REV16v8i8 FPR64:$src))>; -def : Pat<(v4i16 (bitconvert (f64 FPR64:$src))), - (v4i16 (REV64v4i16 FPR64:$src))>; -def : Pat<(v4i16 (bitconvert (v2f32 FPR64:$src))), - (v4i16 (REV32v4i16 FPR64:$src))>; -def : Pat<(v4i16 (bitconvert (v1f64 FPR64:$src))), - (v4i16 (REV64v4i16 FPR64:$src))>; -} - -let Predicates = [IsLE] in { -def : Pat<(v8i8 (bitconvert (v1i64 FPR64:$src))), (v8i8 FPR64:$src)>; -def : Pat<(v8i8 (bitconvert (v2i32 FPR64:$src))), (v8i8 FPR64:$src)>; -def : Pat<(v8i8 (bitconvert (v4i16 FPR64:$src))), (v8i8 
FPR64:$src)>; -def : Pat<(v8i8 (bitconvert (f64 FPR64:$src))), (v8i8 FPR64:$src)>; -def : Pat<(v8i8 (bitconvert (v2f32 FPR64:$src))), (v8i8 FPR64:$src)>; -def : Pat<(v8i8 (bitconvert (v1f64 FPR64:$src))), (v8i8 FPR64:$src)>; -} -let Predicates = [IsBE] in { -def : Pat<(v8i8 (bitconvert (v1i64 FPR64:$src))), - (v8i8 (REV64v8i8 FPR64:$src))>; -def : Pat<(v8i8 (bitconvert (v2i32 FPR64:$src))), - (v8i8 (REV32v8i8 FPR64:$src))>; -def : Pat<(v8i8 (bitconvert (v4i16 FPR64:$src))), - (v8i8 (REV16v8i8 FPR64:$src))>; -def : Pat<(v8i8 (bitconvert (f64 FPR64:$src))), - (v8i8 (REV64v8i8 FPR64:$src))>; -def : Pat<(v8i8 (bitconvert (v2f32 FPR64:$src))), - (v8i8 (REV32v8i8 FPR64:$src))>; -def : Pat<(v8i8 (bitconvert (v1f64 FPR64:$src))), - (v8i8 (REV64v8i8 FPR64:$src))>; -} - -let Predicates = [IsLE] in { -def : Pat<(f64 (bitconvert (v2i32 FPR64:$src))), (f64 FPR64:$src)>; -def : Pat<(f64 (bitconvert (v4i16 FPR64:$src))), (f64 FPR64:$src)>; -def : Pat<(f64 (bitconvert (v2f32 FPR64:$src))), (f64 FPR64:$src)>; -def : Pat<(f64 (bitconvert (v8i8 FPR64:$src))), (f64 FPR64:$src)>; -} -let Predicates = [IsBE] in { -def : Pat<(f64 (bitconvert (v2i32 FPR64:$src))), - (f64 (REV64v2i32 FPR64:$src))>; -def : Pat<(f64 (bitconvert (v4i16 FPR64:$src))), - (f64 (REV64v4i16 FPR64:$src))>; -def : Pat<(f64 (bitconvert (v2f32 FPR64:$src))), - (f64 (REV64v2i32 FPR64:$src))>; -def : Pat<(f64 (bitconvert (v8i8 FPR64:$src))), - (f64 (REV64v8i8 FPR64:$src))>; -} -def : Pat<(f64 (bitconvert (v1i64 FPR64:$src))), (f64 FPR64:$src)>; -def : Pat<(f64 (bitconvert (v1f64 FPR64:$src))), (f64 FPR64:$src)>; - -let Predicates = [IsLE] in { -def : Pat<(v1f64 (bitconvert (v2i32 FPR64:$src))), (v1f64 FPR64:$src)>; -def : Pat<(v1f64 (bitconvert (v4i16 FPR64:$src))), (v1f64 FPR64:$src)>; -def : Pat<(v1f64 (bitconvert (v8i8 FPR64:$src))), (v1f64 FPR64:$src)>; -def : Pat<(v1f64 (bitconvert (v2f32 FPR64:$src))), (v1f64 FPR64:$src)>; -} -let Predicates = [IsBE] in { -def : Pat<(v1f64 (bitconvert (v2i32 FPR64:$src))), - (v1f64 (REV64v2i32 FPR64:$src))>; -def : Pat<(v1f64 (bitconvert (v4i16 FPR64:$src))), - (v1f64 (REV64v4i16 FPR64:$src))>; -def : Pat<(v1f64 (bitconvert (v8i8 FPR64:$src))), - (v1f64 (REV64v8i8 FPR64:$src))>; -def : Pat<(v1f64 (bitconvert (v2f32 FPR64:$src))), - (v1f64 (REV64v2i32 FPR64:$src))>; -} -def : Pat<(v1f64 (bitconvert (v1i64 FPR64:$src))), (v1f64 FPR64:$src)>; -def : Pat<(v1f64 (bitconvert (f64 FPR64:$src))), (v1f64 FPR64:$src)>; - -let Predicates = [IsLE] in { -def : Pat<(v2f32 (bitconvert (v1i64 FPR64:$src))), (v2f32 FPR64:$src)>; -def : Pat<(v2f32 (bitconvert (v4i16 FPR64:$src))), (v2f32 FPR64:$src)>; -def : Pat<(v2f32 (bitconvert (v8i8 FPR64:$src))), (v2f32 FPR64:$src)>; -def : Pat<(v2f32 (bitconvert (v1f64 FPR64:$src))), (v2f32 FPR64:$src)>; -def : Pat<(v2f32 (bitconvert (f64 FPR64:$src))), (v2f32 FPR64:$src)>; -} -let Predicates = [IsBE] in { -def : Pat<(v2f32 (bitconvert (v1i64 FPR64:$src))), - (v2f32 (REV64v2i32 FPR64:$src))>; -def : Pat<(v2f32 (bitconvert (v4i16 FPR64:$src))), - (v2f32 (REV32v4i16 FPR64:$src))>; -def : Pat<(v2f32 (bitconvert (v8i8 FPR64:$src))), - (v2f32 (REV32v8i8 FPR64:$src))>; -def : Pat<(v2f32 (bitconvert (v1f64 FPR64:$src))), - (v2f32 (REV64v2i32 FPR64:$src))>; -def : Pat<(v2f32 (bitconvert (f64 FPR64:$src))), - (v2f32 (REV64v2i32 FPR64:$src))>; -} -def : Pat<(v2f32 (bitconvert (v2i32 FPR64:$src))), (v2f32 FPR64:$src)>; - -let Predicates = [IsLE] in { -def : Pat<(f128 (bitconvert (v2i64 FPR128:$src))), (f128 FPR128:$src)>; -def : Pat<(f128 (bitconvert (v4i32 FPR128:$src))), (f128 FPR128:$src)>; 
-def : Pat<(f128 (bitconvert (v8i16 FPR128:$src))), (f128 FPR128:$src)>; -def : Pat<(f128 (bitconvert (v2f64 FPR128:$src))), (f128 FPR128:$src)>; -def : Pat<(f128 (bitconvert (v4f32 FPR128:$src))), (f128 FPR128:$src)>; -def : Pat<(f128 (bitconvert (v16i8 FPR128:$src))), (f128 FPR128:$src)>; -} -let Predicates = [IsBE] in { -def : Pat<(f128 (bitconvert (v2i64 FPR128:$src))), - (f128 (EXTv16i8 FPR128:$src, FPR128:$src, (i32 8)))>; -def : Pat<(f128 (bitconvert (v4i32 FPR128:$src))), - (f128 (EXTv16i8 (REV64v4i32 FPR128:$src), - (REV64v4i32 FPR128:$src), (i32 8)))>; -def : Pat<(f128 (bitconvert (v8i16 FPR128:$src))), - (f128 (EXTv16i8 (REV64v8i16 FPR128:$src), - (REV64v8i16 FPR128:$src), (i32 8)))>; -def : Pat<(f128 (bitconvert (v2f64 FPR128:$src))), - (f128 (EXTv16i8 FPR128:$src, FPR128:$src, (i32 8)))>; -def : Pat<(f128 (bitconvert (v4f32 FPR128:$src))), - (f128 (EXTv16i8 (REV64v4i32 FPR128:$src), - (REV64v4i32 FPR128:$src), (i32 8)))>; -def : Pat<(f128 (bitconvert (v16i8 FPR128:$src))), - (f128 (EXTv16i8 (REV64v16i8 FPR128:$src), - (REV64v16i8 FPR128:$src), (i32 8)))>; -} - -let Predicates = [IsLE] in { -def : Pat<(v2f64 (bitconvert (f128 FPR128:$src))), (v2f64 FPR128:$src)>; -def : Pat<(v2f64 (bitconvert (v4i32 FPR128:$src))), (v2f64 FPR128:$src)>; -def : Pat<(v2f64 (bitconvert (v8i16 FPR128:$src))), (v2f64 FPR128:$src)>; -def : Pat<(v2f64 (bitconvert (v16i8 FPR128:$src))), (v2f64 FPR128:$src)>; -def : Pat<(v2f64 (bitconvert (v4f32 FPR128:$src))), (v2f64 FPR128:$src)>; -} -let Predicates = [IsBE] in { -def : Pat<(v2f64 (bitconvert (f128 FPR128:$src))), - (v2f64 (EXTv16i8 FPR128:$src, - FPR128:$src, (i32 8)))>; -def : Pat<(v2f64 (bitconvert (v4i32 FPR128:$src))), - (v2f64 (REV64v4i32 FPR128:$src))>; -def : Pat<(v2f64 (bitconvert (v8i16 FPR128:$src))), - (v2f64 (REV64v8i16 FPR128:$src))>; -def : Pat<(v2f64 (bitconvert (v16i8 FPR128:$src))), - (v2f64 (REV64v16i8 FPR128:$src))>; -def : Pat<(v2f64 (bitconvert (v4f32 FPR128:$src))), - (v2f64 (REV64v4i32 FPR128:$src))>; -} -def : Pat<(v2f64 (bitconvert (v2i64 FPR128:$src))), (v2f64 FPR128:$src)>; - -let Predicates = [IsLE] in { -def : Pat<(v4f32 (bitconvert (f128 FPR128:$src))), (v4f32 FPR128:$src)>; -def : Pat<(v4f32 (bitconvert (v8i16 FPR128:$src))), (v4f32 FPR128:$src)>; -def : Pat<(v4f32 (bitconvert (v16i8 FPR128:$src))), (v4f32 FPR128:$src)>; -def : Pat<(v4f32 (bitconvert (v2i64 FPR128:$src))), (v4f32 FPR128:$src)>; -def : Pat<(v4f32 (bitconvert (v2f64 FPR128:$src))), (v4f32 FPR128:$src)>; -} -let Predicates = [IsBE] in { -def : Pat<(v4f32 (bitconvert (f128 FPR128:$src))), - (v4f32 (EXTv16i8 (REV64v4i32 FPR128:$src), - (REV64v4i32 FPR128:$src), (i32 8)))>; -def : Pat<(v4f32 (bitconvert (v8i16 FPR128:$src))), - (v4f32 (REV32v8i16 FPR128:$src))>; -def : Pat<(v4f32 (bitconvert (v16i8 FPR128:$src))), - (v4f32 (REV32v16i8 FPR128:$src))>; -def : Pat<(v4f32 (bitconvert (v2i64 FPR128:$src))), - (v4f32 (REV64v4i32 FPR128:$src))>; -def : Pat<(v4f32 (bitconvert (v2f64 FPR128:$src))), - (v4f32 (REV64v4i32 FPR128:$src))>; -} -def : Pat<(v4f32 (bitconvert (v4i32 FPR128:$src))), (v4f32 FPR128:$src)>; - -let Predicates = [IsLE] in { -def : Pat<(v2i64 (bitconvert (f128 FPR128:$src))), (v2i64 FPR128:$src)>; -def : Pat<(v2i64 (bitconvert (v4i32 FPR128:$src))), (v2i64 FPR128:$src)>; -def : Pat<(v2i64 (bitconvert (v8i16 FPR128:$src))), (v2i64 FPR128:$src)>; -def : Pat<(v2i64 (bitconvert (v16i8 FPR128:$src))), (v2i64 FPR128:$src)>; -def : Pat<(v2i64 (bitconvert (v4f32 FPR128:$src))), (v2i64 FPR128:$src)>; -} -let Predicates = [IsBE] in { -def : Pat<(v2i64 
(bitconvert (f128 FPR128:$src))), - (v2i64 (EXTv16i8 FPR128:$src, - FPR128:$src, (i32 8)))>; -def : Pat<(v2i64 (bitconvert (v4i32 FPR128:$src))), - (v2i64 (REV64v4i32 FPR128:$src))>; -def : Pat<(v2i64 (bitconvert (v8i16 FPR128:$src))), - (v2i64 (REV64v8i16 FPR128:$src))>; -def : Pat<(v2i64 (bitconvert (v16i8 FPR128:$src))), - (v2i64 (REV64v16i8 FPR128:$src))>; -def : Pat<(v2i64 (bitconvert (v4f32 FPR128:$src))), - (v2i64 (REV64v4i32 FPR128:$src))>; -} -def : Pat<(v2i64 (bitconvert (v2f64 FPR128:$src))), (v2i64 FPR128:$src)>; - -let Predicates = [IsLE] in { -def : Pat<(v4i32 (bitconvert (f128 FPR128:$src))), (v4i32 FPR128:$src)>; -def : Pat<(v4i32 (bitconvert (v2i64 FPR128:$src))), (v4i32 FPR128:$src)>; -def : Pat<(v4i32 (bitconvert (v8i16 FPR128:$src))), (v4i32 FPR128:$src)>; -def : Pat<(v4i32 (bitconvert (v16i8 FPR128:$src))), (v4i32 FPR128:$src)>; -def : Pat<(v4i32 (bitconvert (v2f64 FPR128:$src))), (v4i32 FPR128:$src)>; -} -let Predicates = [IsBE] in { -def : Pat<(v4i32 (bitconvert (f128 FPR128:$src))), - (v4i32 (EXTv16i8 (REV64v4i32 FPR128:$src), - (REV64v4i32 FPR128:$src), - (i32 8)))>; -def : Pat<(v4i32 (bitconvert (v2i64 FPR128:$src))), - (v4i32 (REV64v4i32 FPR128:$src))>; -def : Pat<(v4i32 (bitconvert (v8i16 FPR128:$src))), - (v4i32 (REV32v8i16 FPR128:$src))>; -def : Pat<(v4i32 (bitconvert (v16i8 FPR128:$src))), - (v4i32 (REV32v16i8 FPR128:$src))>; -def : Pat<(v4i32 (bitconvert (v2f64 FPR128:$src))), - (v4i32 (REV64v4i32 FPR128:$src))>; -} -def : Pat<(v4i32 (bitconvert (v4f32 FPR128:$src))), (v4i32 FPR128:$src)>; - -let Predicates = [IsLE] in { -def : Pat<(v8i16 (bitconvert (f128 FPR128:$src))), (v8i16 FPR128:$src)>; -def : Pat<(v8i16 (bitconvert (v2i64 FPR128:$src))), (v8i16 FPR128:$src)>; -def : Pat<(v8i16 (bitconvert (v4i32 FPR128:$src))), (v8i16 FPR128:$src)>; -def : Pat<(v8i16 (bitconvert (v16i8 FPR128:$src))), (v8i16 FPR128:$src)>; -def : Pat<(v8i16 (bitconvert (v2f64 FPR128:$src))), (v8i16 FPR128:$src)>; -def : Pat<(v8i16 (bitconvert (v4f32 FPR128:$src))), (v8i16 FPR128:$src)>; -} -let Predicates = [IsBE] in { -def : Pat<(v8i16 (bitconvert (f128 FPR128:$src))), - (v8i16 (EXTv16i8 (REV64v8i16 FPR128:$src), - (REV64v8i16 FPR128:$src), - (i32 8)))>; -def : Pat<(v8i16 (bitconvert (v2i64 FPR128:$src))), - (v8i16 (REV64v8i16 FPR128:$src))>; -def : Pat<(v8i16 (bitconvert (v4i32 FPR128:$src))), - (v8i16 (REV32v8i16 FPR128:$src))>; -def : Pat<(v8i16 (bitconvert (v16i8 FPR128:$src))), - (v8i16 (REV16v16i8 FPR128:$src))>; -def : Pat<(v8i16 (bitconvert (v2f64 FPR128:$src))), - (v8i16 (REV64v8i16 FPR128:$src))>; -def : Pat<(v8i16 (bitconvert (v4f32 FPR128:$src))), - (v8i16 (REV32v8i16 FPR128:$src))>; -} - -let Predicates = [IsLE] in { -def : Pat<(v16i8 (bitconvert (f128 FPR128:$src))), (v16i8 FPR128:$src)>; -def : Pat<(v16i8 (bitconvert (v2i64 FPR128:$src))), (v16i8 FPR128:$src)>; -def : Pat<(v16i8 (bitconvert (v4i32 FPR128:$src))), (v16i8 FPR128:$src)>; -def : Pat<(v16i8 (bitconvert (v8i16 FPR128:$src))), (v16i8 FPR128:$src)>; -def : Pat<(v16i8 (bitconvert (v2f64 FPR128:$src))), (v16i8 FPR128:$src)>; -def : Pat<(v16i8 (bitconvert (v4f32 FPR128:$src))), (v16i8 FPR128:$src)>; -} -let Predicates = [IsBE] in { -def : Pat<(v16i8 (bitconvert (f128 FPR128:$src))), - (v16i8 (EXTv16i8 (REV64v16i8 FPR128:$src), - (REV64v16i8 FPR128:$src), - (i32 8)))>; -def : Pat<(v16i8 (bitconvert (v2i64 FPR128:$src))), - (v16i8 (REV64v16i8 FPR128:$src))>; -def : Pat<(v16i8 (bitconvert (v4i32 FPR128:$src))), - (v16i8 (REV32v16i8 FPR128:$src))>; -def : Pat<(v16i8 (bitconvert (v8i16 FPR128:$src))), - (v16i8 
(REV16v16i8 FPR128:$src))>; -def : Pat<(v16i8 (bitconvert (v2f64 FPR128:$src))), - (v16i8 (REV64v16i8 FPR128:$src))>; -def : Pat<(v16i8 (bitconvert (v4f32 FPR128:$src))), - (v16i8 (REV32v16i8 FPR128:$src))>; -} - -def : Pat<(v8i8 (extract_subvector (v16i8 FPR128:$Rn), (i64 1))), - (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>; -def : Pat<(v4i16 (extract_subvector (v8i16 FPR128:$Rn), (i64 1))), - (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>; -def : Pat<(v2i32 (extract_subvector (v4i32 FPR128:$Rn), (i64 1))), - (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>; -def : Pat<(v1i64 (extract_subvector (v2i64 FPR128:$Rn), (i64 1))), - (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>; - -// A 64-bit subvector insert to the first 128-bit vector position -// is a subregister copy that needs no instruction. -def : Pat<(insert_subvector undef, (v1i64 FPR64:$src), (i32 0)), - (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), FPR64:$src, dsub)>; -def : Pat<(insert_subvector undef, (v1f64 FPR64:$src), (i32 0)), - (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$src, dsub)>; -def : Pat<(insert_subvector undef, (v2i32 FPR64:$src), (i32 0)), - (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$src, dsub)>; -def : Pat<(insert_subvector undef, (v2f32 FPR64:$src), (i32 0)), - (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR64:$src, dsub)>; -def : Pat<(insert_subvector undef, (v4i16 FPR64:$src), (i32 0)), - (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR64:$src, dsub)>; -def : Pat<(insert_subvector undef, (v8i8 FPR64:$src), (i32 0)), - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), FPR64:$src, dsub)>; - -// Use pair-wise add instructions when summing up the lanes for v2f64, v2i64 -// or v2f32. -def : Pat<(i64 (add (vector_extract (v2i64 FPR128:$Rn), (i64 0)), - (vector_extract (v2i64 FPR128:$Rn), (i64 1)))), - (i64 (ADDPv2i64p (v2i64 FPR128:$Rn)))>; -def : Pat<(f64 (fadd (vector_extract (v2f64 FPR128:$Rn), (i64 0)), - (vector_extract (v2f64 FPR128:$Rn), (i64 1)))), - (f64 (FADDPv2i64p (v2f64 FPR128:$Rn)))>; - // vector_extract on 64-bit vectors gets promoted to a 128 bit vector, - // so we match on v4f32 here, not v2f32. This will also catch adding - // the low two lanes of a true v4f32 vector. -def : Pat<(fadd (vector_extract (v4f32 FPR128:$Rn), (i64 0)), - (vector_extract (v4f32 FPR128:$Rn), (i64 1))), - (f32 (FADDPv2i32p (EXTRACT_SUBREG FPR128:$Rn, dsub)))>; - -// Scalar 64-bit shifts in FPR64 registers. -def : Pat<(i64 (int_arm64_neon_sshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))), - (SSHLv1i64 FPR64:$Rn, FPR64:$Rm)>; -def : Pat<(i64 (int_arm64_neon_ushl (i64 FPR64:$Rn), (i64 FPR64:$Rm))), - (USHLv1i64 FPR64:$Rn, FPR64:$Rm)>; -def : Pat<(i64 (int_arm64_neon_srshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))), - (SRSHLv1i64 FPR64:$Rn, FPR64:$Rm)>; -def : Pat<(i64 (int_arm64_neon_urshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))), - (URSHLv1i64 FPR64:$Rn, FPR64:$Rm)>; - -// Tail call return handling. These are all compiler pseudo-instructions, -// so no encoding information or anything like that. 
-let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in { - def TCRETURNdi : Pseudo<(outs), (ins i64imm:$dst, i32imm:$FPDiff),[]>; - def TCRETURNri : Pseudo<(outs), (ins tcGPR64:$dst, i32imm:$FPDiff), []>; -} - -def : Pat<(ARM64tcret tcGPR64:$dst, (i32 timm:$FPDiff)), - (TCRETURNri tcGPR64:$dst, imm:$FPDiff)>; -def : Pat<(ARM64tcret tglobaladdr:$dst, (i32 timm:$FPDiff)), - (TCRETURNdi texternalsym:$dst, imm:$FPDiff)>; -def : Pat<(ARM64tcret texternalsym:$dst, (i32 timm:$FPDiff)), - (TCRETURNdi texternalsym:$dst, imm:$FPDiff)>; - -include "ARM64InstrAtomics.td" diff --git a/lib/Target/ARM64/ARM64LoadStoreOptimizer.cpp b/lib/Target/ARM64/ARM64LoadStoreOptimizer.cpp deleted file mode 100644 index e2c4b13f036..00000000000 --- a/lib/Target/ARM64/ARM64LoadStoreOptimizer.cpp +++ /dev/null @@ -1,944 +0,0 @@ -//===-- ARM64LoadStoreOptimizer.cpp - ARM64 load/store opt. pass --*- C++ -*-=// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains a pass that performs load / store related peephole -// optimizations. This pass should be run after register allocation. -// -//===----------------------------------------------------------------------===// - -#include "ARM64InstrInfo.h" -#include "MCTargetDesc/ARM64AddressingModes.h" -#include "llvm/ADT/BitVector.h" -#include "llvm/CodeGen/MachineBasicBlock.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetRegisterInfo.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/ADT/Statistic.h" -using namespace llvm; - -#define DEBUG_TYPE "arm64-ldst-opt" - -/// ARM64AllocLoadStoreOpt - Post-register allocation pass to combine -/// load / store instructions to form ldp / stp instructions. - -STATISTIC(NumPairCreated, "Number of load/store pair instructions generated"); -STATISTIC(NumPostFolded, "Number of post-index updates folded"); -STATISTIC(NumPreFolded, "Number of pre-index updates folded"); -STATISTIC(NumUnscaledPairCreated, - "Number of load/store from unscaled generated"); - -static cl::opt ScanLimit("arm64-load-store-scan-limit", cl::init(20), - cl::Hidden); - -// Place holder while testing unscaled load/store combining -static cl::opt -EnableARM64UnscaledMemOp("arm64-unscaled-mem-op", cl::Hidden, - cl::desc("Allow ARM64 unscaled load/store combining"), - cl::init(true)); - -namespace { -struct ARM64LoadStoreOpt : public MachineFunctionPass { - static char ID; - ARM64LoadStoreOpt() : MachineFunctionPass(ID) {} - - const ARM64InstrInfo *TII; - const TargetRegisterInfo *TRI; - - // Scan the instructions looking for a load/store that can be combined - // with the current instruction into a load/store pair. - // Return the matching instruction if one is found, else MBB->end(). - // If a matching instruction is found, mergeForward is set to true if the - // merge is to remove the first instruction and replace the second with - // a pair-wise insn, and false if the reverse is true. 
- MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I, - bool &mergeForward, - unsigned Limit); - // Merge the two instructions indicated into a single pair-wise instruction. - // If mergeForward is true, erase the first instruction and fold its - // operation into the second. If false, the reverse. Return the instruction - // following the first instruction (which may change during processing). - MachineBasicBlock::iterator - mergePairedInsns(MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Paired, bool mergeForward); - - // Scan the instruction list to find a base register update that can - // be combined with the current instruction (a load or store) using - // pre or post indexed addressing with writeback. Scan forwards. - MachineBasicBlock::iterator - findMatchingUpdateInsnForward(MachineBasicBlock::iterator I, unsigned Limit, - int Value); - - // Scan the instruction list to find a base register update that can - // be combined with the current instruction (a load or store) using - // pre or post indexed addressing with writeback. Scan backwards. - MachineBasicBlock::iterator - findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit); - - // Merge a pre-index base register update into a ld/st instruction. - MachineBasicBlock::iterator - mergePreIdxUpdateInsn(MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Update); - - // Merge a post-index base register update into a ld/st instruction. - MachineBasicBlock::iterator - mergePostIdxUpdateInsn(MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Update); - - bool optimizeBlock(MachineBasicBlock &MBB); - - bool runOnMachineFunction(MachineFunction &Fn) override; - - const char *getPassName() const override { - return "ARM64 load / store optimization pass"; - } - -private: - int getMemSize(MachineInstr *MemMI); -}; -char ARM64LoadStoreOpt::ID = 0; -} - -static bool isUnscaledLdst(unsigned Opc) { - switch (Opc) { - default: - return false; - case ARM64::STURSi: - return true; - case ARM64::STURDi: - return true; - case ARM64::STURQi: - return true; - case ARM64::STURWi: - return true; - case ARM64::STURXi: - return true; - case ARM64::LDURSi: - return true; - case ARM64::LDURDi: - return true; - case ARM64::LDURQi: - return true; - case ARM64::LDURWi: - return true; - case ARM64::LDURXi: - return true; - } -} - -// Size in bytes of the data moved by an unscaled load or store -int ARM64LoadStoreOpt::getMemSize(MachineInstr *MemMI) { - switch (MemMI->getOpcode()) { - default: - llvm_unreachable("Opcode has has unknown size!"); - case ARM64::STRSui: - case ARM64::STURSi: - return 4; - case ARM64::STRDui: - case ARM64::STURDi: - return 8; - case ARM64::STRQui: - case ARM64::STURQi: - return 16; - case ARM64::STRWui: - case ARM64::STURWi: - return 4; - case ARM64::STRXui: - case ARM64::STURXi: - return 8; - case ARM64::LDRSui: - case ARM64::LDURSi: - return 4; - case ARM64::LDRDui: - case ARM64::LDURDi: - return 8; - case ARM64::LDRQui: - case ARM64::LDURQi: - return 16; - case ARM64::LDRWui: - case ARM64::LDURWi: - return 4; - case ARM64::LDRXui: - case ARM64::LDURXi: - return 8; - } -} - -static unsigned getMatchingPairOpcode(unsigned Opc) { - switch (Opc) { - default: - llvm_unreachable("Opcode has no pairwise equivalent!"); - case ARM64::STRSui: - case ARM64::STURSi: - return ARM64::STPSi; - case ARM64::STRDui: - case ARM64::STURDi: - return ARM64::STPDi; - case ARM64::STRQui: - case ARM64::STURQi: - return ARM64::STPQi; - case ARM64::STRWui: - case ARM64::STURWi: - 
return ARM64::STPWi; - case ARM64::STRXui: - case ARM64::STURXi: - return ARM64::STPXi; - case ARM64::LDRSui: - case ARM64::LDURSi: - return ARM64::LDPSi; - case ARM64::LDRDui: - case ARM64::LDURDi: - return ARM64::LDPDi; - case ARM64::LDRQui: - case ARM64::LDURQi: - return ARM64::LDPQi; - case ARM64::LDRWui: - case ARM64::LDURWi: - return ARM64::LDPWi; - case ARM64::LDRXui: - case ARM64::LDURXi: - return ARM64::LDPXi; - } -} - -static unsigned getPreIndexedOpcode(unsigned Opc) { - switch (Opc) { - default: - llvm_unreachable("Opcode has no pre-indexed equivalent!"); - case ARM64::STRSui: return ARM64::STRSpre; - case ARM64::STRDui: return ARM64::STRDpre; - case ARM64::STRQui: return ARM64::STRQpre; - case ARM64::STRWui: return ARM64::STRWpre; - case ARM64::STRXui: return ARM64::STRXpre; - case ARM64::LDRSui: return ARM64::LDRSpre; - case ARM64::LDRDui: return ARM64::LDRDpre; - case ARM64::LDRQui: return ARM64::LDRQpre; - case ARM64::LDRWui: return ARM64::LDRWpre; - case ARM64::LDRXui: return ARM64::LDRXpre; - } -} - -static unsigned getPostIndexedOpcode(unsigned Opc) { - switch (Opc) { - default: - llvm_unreachable("Opcode has no post-indexed wise equivalent!"); - case ARM64::STRSui: - return ARM64::STRSpost; - case ARM64::STRDui: - return ARM64::STRDpost; - case ARM64::STRQui: - return ARM64::STRQpost; - case ARM64::STRWui: - return ARM64::STRWpost; - case ARM64::STRXui: - return ARM64::STRXpost; - case ARM64::LDRSui: - return ARM64::LDRSpost; - case ARM64::LDRDui: - return ARM64::LDRDpost; - case ARM64::LDRQui: - return ARM64::LDRQpost; - case ARM64::LDRWui: - return ARM64::LDRWpost; - case ARM64::LDRXui: - return ARM64::LDRXpost; - } -} - -MachineBasicBlock::iterator -ARM64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Paired, - bool mergeForward) { - MachineBasicBlock::iterator NextI = I; - ++NextI; - // If NextI is the second of the two instructions to be merged, we need - // to skip one further. Either way we merge will invalidate the iterator, - // and we don't need to scan the new instruction, as it's a pairwise - // instruction, which we're not considering for further action anyway. - if (NextI == Paired) - ++NextI; - - bool IsUnscaled = isUnscaledLdst(I->getOpcode()); - int OffsetStride = IsUnscaled && EnableARM64UnscaledMemOp ? getMemSize(I) : 1; - - unsigned NewOpc = getMatchingPairOpcode(I->getOpcode()); - // Insert our new paired instruction after whichever of the paired - // instructions mergeForward indicates. - MachineBasicBlock::iterator InsertionPoint = mergeForward ? Paired : I; - // Also based on mergeForward is from where we copy the base register operand - // so we get the flags compatible with the input code. - MachineOperand &BaseRegOp = - mergeForward ? Paired->getOperand(1) : I->getOperand(1); - - // Which register is Rt and which is Rt2 depends on the offset order. - MachineInstr *RtMI, *Rt2MI; - if (I->getOperand(2).getImm() == - Paired->getOperand(2).getImm() + OffsetStride) { - RtMI = Paired; - Rt2MI = I; - } else { - RtMI = I; - Rt2MI = Paired; - } - // Handle Unscaled - int OffsetImm = RtMI->getOperand(2).getImm(); - if (IsUnscaled && EnableARM64UnscaledMemOp) - OffsetImm /= OffsetStride; - - // Construct the new instruction. 
- MachineInstrBuilder MIB = BuildMI(*I->getParent(), InsertionPoint, - I->getDebugLoc(), TII->get(NewOpc)) - .addOperand(RtMI->getOperand(0)) - .addOperand(Rt2MI->getOperand(0)) - .addOperand(BaseRegOp) - .addImm(OffsetImm); - (void)MIB; - - // FIXME: Do we need/want to copy the mem operands from the source - // instructions? Probably. What uses them after this? - - DEBUG(dbgs() << "Creating pair load/store. Replacing instructions:\n "); - DEBUG(I->print(dbgs())); - DEBUG(dbgs() << " "); - DEBUG(Paired->print(dbgs())); - DEBUG(dbgs() << " with instruction:\n "); - DEBUG(((MachineInstr *)MIB)->print(dbgs())); - DEBUG(dbgs() << "\n"); - - // Erase the old instructions. - I->eraseFromParent(); - Paired->eraseFromParent(); - - return NextI; -} - -/// trackRegDefsUses - Remember what registers the specified instruction uses -/// and modifies. -static void trackRegDefsUses(MachineInstr *MI, BitVector &ModifiedRegs, - BitVector &UsedRegs, - const TargetRegisterInfo *TRI) { - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI->getOperand(i); - if (MO.isRegMask()) - ModifiedRegs.setBitsNotInMask(MO.getRegMask()); - - if (!MO.isReg()) - continue; - unsigned Reg = MO.getReg(); - if (MO.isDef()) { - for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) - ModifiedRegs.set(*AI); - } else { - assert(MO.isUse() && "Reg operand not a def and not a use?!?"); - for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) - UsedRegs.set(*AI); - } - } -} - -static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) { - if (!IsUnscaled && (Offset > 63 || Offset < -64)) - return false; - if (IsUnscaled) { - // Convert the byte-offset used by unscaled into an "element" offset used - // by the scaled pair load/store instructions. - int elemOffset = Offset / OffsetStride; - if (elemOffset > 63 || elemOffset < -64) - return false; - } - return true; -} - -// Do alignment, specialized to power of 2 and for signed ints, -// avoiding having to do a C-style cast from uint_64t to int when -// using RoundUpToAlignment from include/llvm/Support/MathExtras.h. -// FIXME: Move this function to include/MathExtras.h? -static int alignTo(int Num, int PowOf2) { - return (Num + PowOf2 - 1) & ~(PowOf2 - 1); -} - -/// findMatchingInsn - Scan the instructions looking for a load/store that can -/// be combined with the current instruction into a load/store pair. -MachineBasicBlock::iterator -ARM64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, - bool &mergeForward, unsigned Limit) { - MachineBasicBlock::iterator E = I->getParent()->end(); - MachineBasicBlock::iterator MBBI = I; - MachineInstr *FirstMI = I; - ++MBBI; - - int Opc = FirstMI->getOpcode(); - bool mayLoad = FirstMI->mayLoad(); - bool IsUnscaled = isUnscaledLdst(Opc); - unsigned Reg = FirstMI->getOperand(0).getReg(); - unsigned BaseReg = FirstMI->getOperand(1).getReg(); - int Offset = FirstMI->getOperand(2).getImm(); - - // Early exit if the first instruction modifies the base register. - // e.g., ldr x0, [x0] - // Early exit if the offset if not possible to match. (6 bits of positive - // range, plus allow an extra one in case we find a later insn that matches - // with Offset-1 - if (FirstMI->modifiesRegister(BaseReg, TRI)) - return E; - int OffsetStride = - IsUnscaled && EnableARM64UnscaledMemOp ? 
getMemSize(FirstMI) : 1; - if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride)) - return E; - - // Track which registers have been modified and used between the first insn - // (inclusive) and the second insn. - BitVector ModifiedRegs, UsedRegs; - ModifiedRegs.resize(TRI->getNumRegs()); - UsedRegs.resize(TRI->getNumRegs()); - for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) { - MachineInstr *MI = MBBI; - // Skip DBG_VALUE instructions. Otherwise debug info can affect the - // optimization by changing how far we scan. - if (MI->isDebugValue()) - continue; - - // Now that we know this is a real instruction, count it. - ++Count; - - if (Opc == MI->getOpcode() && MI->getOperand(2).isImm()) { - // If we've found another instruction with the same opcode, check to see - // if the base and offset are compatible with our starting instruction. - // These instructions all have scaled immediate operands, so we just - // check for +1/-1. Make sure to check the new instruction offset is - // actually an immediate and not a symbolic reference destined for - // a relocation. - // - // Pairwise instructions have a 7-bit signed offset field. Single insns - // have a 12-bit unsigned offset field. To be a valid combine, the - // final offset must be in range. - unsigned MIBaseReg = MI->getOperand(1).getReg(); - int MIOffset = MI->getOperand(2).getImm(); - if (BaseReg == MIBaseReg && ((Offset == MIOffset + OffsetStride) || - (Offset + OffsetStride == MIOffset))) { - int MinOffset = Offset < MIOffset ? Offset : MIOffset; - // If this is a volatile load/store that otherwise matched, stop looking - // as something is going on that we don't have enough information to - // safely transform. Similarly, stop if we see a hint to avoid pairs. - if (MI->hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI)) - return E; - // If the resultant immediate offset of merging these instructions - // is out of range for a pairwise instruction, bail and keep looking. - bool MIIsUnscaled = isUnscaledLdst(MI->getOpcode()); - if (!inBoundsForPair(MIIsUnscaled, MinOffset, OffsetStride)) { - trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); - continue; - } - // If the alignment requirements of the paired (scaled) instruction - // can't express the offset of the unscaled input, bail and keep - // looking. - if (IsUnscaled && EnableARM64UnscaledMemOp && - (alignTo(MinOffset, OffsetStride) != MinOffset)) { - trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); - continue; - } - // If the destination register of the loads is the same register, bail - // and keep looking. A load-pair instruction with both destination - // registers the same is UNPREDICTABLE and will result in an exception. - if (mayLoad && Reg == MI->getOperand(0).getReg()) { - trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); - continue; - } - - // If the Rt of the second instruction was not modified or used between - // the two instructions, we can combine the second into the first. - if (!ModifiedRegs[MI->getOperand(0).getReg()] && - !UsedRegs[MI->getOperand(0).getReg()]) { - mergeForward = false; - return MBBI; - } - - // Likewise, if the Rt of the first instruction is not modified or used - // between the two instructions, we can combine the first into the - // second. - if (!ModifiedRegs[FirstMI->getOperand(0).getReg()] && - !UsedRegs[FirstMI->getOperand(0).getReg()]) { - mergeForward = true; - return MBBI; - } - // Unable to combine these instructions due to interference in between. - // Keep looking. 
- } - } - - // If the instruction wasn't a matching load or store, but does (or can) - // modify memory, stop searching, as we don't have alias analysis or - // anything like that to tell us whether the access is tromping on the - // locations we care about. The big one we want to catch is calls. - // - // FIXME: Theoretically, we can do better than that for SP and FP based - // references since we can effectively know where those are touching. It's - // unclear if it's worth the extra code, though. Most paired instructions - // will be sequential, perhaps with a few intervening non-memory related - // instructions. - if (MI->mayStore() || MI->isCall()) - return E; - // Likewise, if we're matching a store instruction, we don't want to - // move across a load, as it may be reading the same location. - if (FirstMI->mayStore() && MI->mayLoad()) - return E; - - // Update modified / uses register lists. - trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); - - // Otherwise, if the base register is modified, we have no match, so - // return early. - if (ModifiedRegs[BaseReg]) - return E; - } - return E; -} - -MachineBasicBlock::iterator -ARM64LoadStoreOpt::mergePreIdxUpdateInsn(MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Update) { - assert((Update->getOpcode() == ARM64::ADDXri || - Update->getOpcode() == ARM64::SUBXri) && - "Unexpected base register update instruction to merge!"); - MachineBasicBlock::iterator NextI = I; - // Return the instruction following the merged instruction, which is - // the instruction following our unmerged load. Unless that's the add/sub - // instruction we're merging, in which case it's the one after that. - if (++NextI == Update) - ++NextI; - - int Value = Update->getOperand(2).getImm(); - assert(ARM64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 && - "Can't merge 1 << 12 offset into pre-indexed load / store"); - if (Update->getOpcode() == ARM64::SUBXri) - Value = -Value; - - unsigned NewOpc = getPreIndexedOpcode(I->getOpcode()); - MachineInstrBuilder MIB = - BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) - .addOperand(Update->getOperand(0)) - .addOperand(I->getOperand(0)) - .addOperand(I->getOperand(1)) - .addImm(Value); - (void)MIB; - - DEBUG(dbgs() << "Creating pre-indexed load/store."); - DEBUG(dbgs() << " Replacing instructions:\n "); - DEBUG(I->print(dbgs())); - DEBUG(dbgs() << " "); - DEBUG(Update->print(dbgs())); - DEBUG(dbgs() << " with instruction:\n "); - DEBUG(((MachineInstr *)MIB)->print(dbgs())); - DEBUG(dbgs() << "\n"); - - // Erase the old instructions for the block. - I->eraseFromParent(); - Update->eraseFromParent(); - - return NextI; -} - -MachineBasicBlock::iterator -ARM64LoadStoreOpt::mergePostIdxUpdateInsn(MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Update) { - assert((Update->getOpcode() == ARM64::ADDXri || - Update->getOpcode() == ARM64::SUBXri) && - "Unexpected base register update instruction to merge!"); - MachineBasicBlock::iterator NextI = I; - // Return the instruction following the merged instruction, which is - // the instruction following our unmerged load. Unless that's the add/sub - // instruction we're merging, in which case it's the one after that. 
- if (++NextI == Update) - ++NextI; - - int Value = Update->getOperand(2).getImm(); - assert(ARM64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 && - "Can't merge 1 << 12 offset into post-indexed load / store"); - if (Update->getOpcode() == ARM64::SUBXri) - Value = -Value; - - unsigned NewOpc = getPostIndexedOpcode(I->getOpcode()); - MachineInstrBuilder MIB = - BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) - .addOperand(Update->getOperand(0)) - .addOperand(I->getOperand(0)) - .addOperand(I->getOperand(1)) - .addImm(Value); - (void)MIB; - - DEBUG(dbgs() << "Creating post-indexed load/store."); - DEBUG(dbgs() << " Replacing instructions:\n "); - DEBUG(I->print(dbgs())); - DEBUG(dbgs() << " "); - DEBUG(Update->print(dbgs())); - DEBUG(dbgs() << " with instruction:\n "); - DEBUG(((MachineInstr *)MIB)->print(dbgs())); - DEBUG(dbgs() << "\n"); - - // Erase the old instructions for the block. - I->eraseFromParent(); - Update->eraseFromParent(); - - return NextI; -} - -static bool isMatchingUpdateInsn(MachineInstr *MI, unsigned BaseReg, - int Offset) { - switch (MI->getOpcode()) { - default: - break; - case ARM64::SUBXri: - // Negate the offset for a SUB instruction. - Offset *= -1; - // FALLTHROUGH - case ARM64::ADDXri: - // Make sure it's a vanilla immediate operand, not a relocation or - // anything else we can't handle. - if (!MI->getOperand(2).isImm()) - break; - // Watch out for 1 << 12 shifted value. - if (ARM64_AM::getShiftValue(MI->getOperand(3).getImm())) - break; - // If the instruction has the base register as source and dest and the - // immediate will fit in a signed 9-bit integer, then we have a match. - if (MI->getOperand(0).getReg() == BaseReg && - MI->getOperand(1).getReg() == BaseReg && - MI->getOperand(2).getImm() <= 255 && - MI->getOperand(2).getImm() >= -256) { - // If we have a non-zero Offset, we check that it matches the amount - // we're adding to the register. - if (!Offset || Offset == MI->getOperand(2).getImm()) - return true; - } - break; - } - return false; -} - -MachineBasicBlock::iterator -ARM64LoadStoreOpt::findMatchingUpdateInsnForward(MachineBasicBlock::iterator I, - unsigned Limit, int Value) { - MachineBasicBlock::iterator E = I->getParent()->end(); - MachineInstr *MemMI = I; - MachineBasicBlock::iterator MBBI = I; - const MachineFunction &MF = *MemMI->getParent()->getParent(); - - unsigned DestReg = MemMI->getOperand(0).getReg(); - unsigned BaseReg = MemMI->getOperand(1).getReg(); - int Offset = MemMI->getOperand(2).getImm() * - TII->getRegClass(MemMI->getDesc(), 0, TRI, MF)->getSize(); - - // If the base register overlaps the destination register, we can't - // merge the update. - if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg)) - return E; - - // Scan forward looking for post-index opportunities. - // Updating instructions can't be formed if the memory insn already - // has an offset other than the value we're looking for. - if (Offset != Value) - return E; - - // Track which registers have been modified and used between the first insn - // (inclusive) and the second insn. - BitVector ModifiedRegs, UsedRegs; - ModifiedRegs.resize(TRI->getNumRegs()); - UsedRegs.resize(TRI->getNumRegs()); - ++MBBI; - for (unsigned Count = 0; MBBI != E; ++MBBI) { - MachineInstr *MI = MBBI; - // Skip DBG_VALUE instructions. Otherwise debug info can affect the - // optimization by changing how far we scan. - if (MI->isDebugValue()) - continue; - - // Now that we know this is a real instruction, count it. 
- ++Count; - - // If we found a match, return it. - if (isMatchingUpdateInsn(MI, BaseReg, Value)) - return MBBI; - - // Update the status of what the instruction clobbered and used. - trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); - - // Otherwise, if the base register is used or modified, we have no match, so - // return early. - if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg]) - return E; - } - return E; -} - -MachineBasicBlock::iterator -ARM64LoadStoreOpt::findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, - unsigned Limit) { - MachineBasicBlock::iterator B = I->getParent()->begin(); - MachineBasicBlock::iterator E = I->getParent()->end(); - MachineInstr *MemMI = I; - MachineBasicBlock::iterator MBBI = I; - const MachineFunction &MF = *MemMI->getParent()->getParent(); - - unsigned DestReg = MemMI->getOperand(0).getReg(); - unsigned BaseReg = MemMI->getOperand(1).getReg(); - int Offset = MemMI->getOperand(2).getImm(); - unsigned RegSize = TII->getRegClass(MemMI->getDesc(), 0, TRI, MF)->getSize(); - - // If the load/store is the first instruction in the block, there's obviously - // not any matching update. Ditto if the memory offset isn't zero. - if (MBBI == B || Offset != 0) - return E; - // If the base register overlaps the destination register, we can't - // merge the update. - if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg)) - return E; - - // Track which registers have been modified and used between the first insn - // (inclusive) and the second insn. - BitVector ModifiedRegs, UsedRegs; - ModifiedRegs.resize(TRI->getNumRegs()); - UsedRegs.resize(TRI->getNumRegs()); - --MBBI; - for (unsigned Count = 0; MBBI != B; --MBBI) { - MachineInstr *MI = MBBI; - // Skip DBG_VALUE instructions. Otherwise debug info can affect the - // optimization by changing how far we scan. - if (MI->isDebugValue()) - continue; - - // Now that we know this is a real instruction, count it. - ++Count; - - // If we found a match, return it. - if (isMatchingUpdateInsn(MI, BaseReg, RegSize)) - return MBBI; - - // Update the status of what the instruction clobbered and used. - trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); - - // Otherwise, if the base register is used or modified, we have no match, so - // return early. - if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg]) - return E; - } - return E; -} - -bool ARM64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) { - bool Modified = false; - // Two tranformations to do here: - // 1) Find loads and stores that can be merged into a single load or store - // pair instruction. - // e.g., - // ldr x0, [x2] - // ldr x1, [x2, #8] - // ; becomes - // ldp x0, x1, [x2] - // 2) Find base register updates that can be merged into the load or store - // as a base-reg writeback. - // e.g., - // ldr x0, [x2] - // add x2, x2, #4 - // ; becomes - // ldr x0, [x2], #4 - - for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); - MBBI != E;) { - MachineInstr *MI = MBBI; - switch (MI->getOpcode()) { - default: - // Just move on to the next instruction. 
- ++MBBI; - break; - case ARM64::STRSui: - case ARM64::STRDui: - case ARM64::STRQui: - case ARM64::STRXui: - case ARM64::STRWui: - case ARM64::LDRSui: - case ARM64::LDRDui: - case ARM64::LDRQui: - case ARM64::LDRXui: - case ARM64::LDRWui: - // do the unscaled versions as well - case ARM64::STURSi: - case ARM64::STURDi: - case ARM64::STURQi: - case ARM64::STURWi: - case ARM64::STURXi: - case ARM64::LDURSi: - case ARM64::LDURDi: - case ARM64::LDURQi: - case ARM64::LDURWi: - case ARM64::LDURXi: { - // If this is a volatile load/store, don't mess with it. - if (MI->hasOrderedMemoryRef()) { - ++MBBI; - break; - } - // Make sure this is a reg+imm (as opposed to an address reloc). - if (!MI->getOperand(2).isImm()) { - ++MBBI; - break; - } - // Check if this load/store has a hint to avoid pair formation. - // MachineMemOperands hints are set by the ARM64StorePairSuppress pass. - if (TII->isLdStPairSuppressed(MI)) { - ++MBBI; - break; - } - // Look ahead up to ScanLimit instructions for a pairable instruction. - bool mergeForward = false; - MachineBasicBlock::iterator Paired = - findMatchingInsn(MBBI, mergeForward, ScanLimit); - if (Paired != E) { - // Merge the loads into a pair. Keeping the iterator straight is a - // pain, so we let the merge routine tell us what the next instruction - // is after it's done mucking about. - MBBI = mergePairedInsns(MBBI, Paired, mergeForward); - - Modified = true; - ++NumPairCreated; - if (isUnscaledLdst(MI->getOpcode())) - ++NumUnscaledPairCreated; - break; - } - ++MBBI; - break; - } - // FIXME: Do the other instructions. - } - } - - for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); - MBBI != E;) { - MachineInstr *MI = MBBI; - // Do update merging. It's simpler to keep this separate from the above - // switch, though not strictly necessary. - int Opc = MI->getOpcode(); - switch (Opc) { - default: - // Just move on to the next instruction. - ++MBBI; - break; - case ARM64::STRSui: - case ARM64::STRDui: - case ARM64::STRQui: - case ARM64::STRXui: - case ARM64::STRWui: - case ARM64::LDRSui: - case ARM64::LDRDui: - case ARM64::LDRQui: - case ARM64::LDRXui: - case ARM64::LDRWui: - // do the unscaled versions as well - case ARM64::STURSi: - case ARM64::STURDi: - case ARM64::STURQi: - case ARM64::STURWi: - case ARM64::STURXi: - case ARM64::LDURSi: - case ARM64::LDURDi: - case ARM64::LDURQi: - case ARM64::LDURWi: - case ARM64::LDURXi: { - // Make sure this is a reg+imm (as opposed to an address reloc). - if (!MI->getOperand(2).isImm()) { - ++MBBI; - break; - } - // Look ahead up to ScanLimit instructions for a mergable instruction. - MachineBasicBlock::iterator Update = - findMatchingUpdateInsnForward(MBBI, ScanLimit, 0); - if (Update != E) { - // Merge the update into the ld/st. - MBBI = mergePostIdxUpdateInsn(MBBI, Update); - Modified = true; - ++NumPostFolded; - break; - } - // Don't know how to handle pre/post-index versions, so move to the next - // instruction. - if (isUnscaledLdst(Opc)) { - ++MBBI; - break; - } - - // Look back to try to find a pre-index instruction. For example, - // add x0, x0, #8 - // ldr x1, [x0] - // merged into: - // ldr x1, [x0, #8]! - Update = findMatchingUpdateInsnBackward(MBBI, ScanLimit); - if (Update != E) { - // Merge the update into the ld/st. - MBBI = mergePreIdxUpdateInsn(MBBI, Update); - Modified = true; - ++NumPreFolded; - break; - } - - // Look forward to try to find a post-index instruction. For example, - // ldr x1, [x0, #64] - // add x0, x0, #64 - // merged into: - // ldr x1, [x0, #64]! 
- - // The immediate in the load/store is scaled by the size of the register - // being loaded. The immediate in the add we're looking for, - // however, is not, so adjust here. - int Value = MI->getOperand(2).getImm() * - TII->getRegClass(MI->getDesc(), 0, TRI, *(MBB.getParent())) - ->getSize(); - Update = findMatchingUpdateInsnForward(MBBI, ScanLimit, Value); - if (Update != E) { - // Merge the update into the ld/st. - MBBI = mergePreIdxUpdateInsn(MBBI, Update); - Modified = true; - ++NumPreFolded; - break; - } - - // Nothing found. Just move to the next instruction. - ++MBBI; - break; - } - // FIXME: Do the other instructions. - } - } - - return Modified; -} - -bool ARM64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { - const TargetMachine &TM = Fn.getTarget(); - TII = static_cast(TM.getInstrInfo()); - TRI = TM.getRegisterInfo(); - - bool Modified = false; - for (auto &MBB : Fn) - Modified |= optimizeBlock(MBB); - - return Modified; -} - -// FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep -// loads and stores near one another? - -/// createARMLoadStoreOptimizationPass - returns an instance of the load / store -/// optimization pass. -FunctionPass *llvm::createARM64LoadStoreOptimizationPass() { - return new ARM64LoadStoreOpt(); -} diff --git a/lib/Target/ARM64/ARM64MCInstLower.cpp b/lib/Target/ARM64/ARM64MCInstLower.cpp deleted file mode 100644 index 525f484ca4c..00000000000 --- a/lib/Target/ARM64/ARM64MCInstLower.cpp +++ /dev/null @@ -1,201 +0,0 @@ -//===-- ARM64MCInstLower.cpp - Convert ARM64 MachineInstr to an MCInst---===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains code to lower ARM64 MachineInstrs to their corresponding -// MCInst records. -// -//===----------------------------------------------------------------------===// - -#include "ARM64MCInstLower.h" -#include "MCTargetDesc/ARM64MCExpr.h" -#include "Utils/ARM64BaseInfo.h" -#include "llvm/CodeGen/AsmPrinter.h" -#include "llvm/CodeGen/MachineBasicBlock.h" -#include "llvm/CodeGen/MachineInstr.h" -#include "llvm/IR/Mangler.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCInst.h" -#include "llvm/Support/CodeGen.h" -#include "llvm/Target/TargetMachine.h" -using namespace llvm; - -ARM64MCInstLower::ARM64MCInstLower(MCContext &ctx, Mangler &mang, - AsmPrinter &printer) - : Ctx(ctx), Printer(printer), TargetTriple(printer.getTargetTriple()) {} - -MCSymbol * -ARM64MCInstLower::GetGlobalAddressSymbol(const MachineOperand &MO) const { - return Printer.getSymbol(MO.getGlobal()); -} - -MCSymbol * -ARM64MCInstLower::GetExternalSymbolSymbol(const MachineOperand &MO) const { - return Printer.GetExternalSymbolSymbol(MO.getSymbolName()); -} - -MCOperand ARM64MCInstLower::lowerSymbolOperandDarwin(const MachineOperand &MO, - MCSymbol *Sym) const { - // FIXME: We would like an efficient form for this, so we don't have to do a - // lot of extra uniquing. 
- MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_None; - if ((MO.getTargetFlags() & ARM64II::MO_GOT) != 0) { - if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == ARM64II::MO_PAGE) - RefKind = MCSymbolRefExpr::VK_GOTPAGE; - else if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == - ARM64II::MO_PAGEOFF) - RefKind = MCSymbolRefExpr::VK_GOTPAGEOFF; - else - assert(0 && "Unexpected target flags with MO_GOT on GV operand"); - } else if ((MO.getTargetFlags() & ARM64II::MO_TLS) != 0) { - if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == ARM64II::MO_PAGE) - RefKind = MCSymbolRefExpr::VK_TLVPPAGE; - else if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == - ARM64II::MO_PAGEOFF) - RefKind = MCSymbolRefExpr::VK_TLVPPAGEOFF; - else - llvm_unreachable("Unexpected target flags with MO_TLS on GV operand"); - } else { - if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == ARM64II::MO_PAGE) - RefKind = MCSymbolRefExpr::VK_PAGE; - else if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == - ARM64II::MO_PAGEOFF) - RefKind = MCSymbolRefExpr::VK_PAGEOFF; - } - const MCExpr *Expr = MCSymbolRefExpr::Create(Sym, RefKind, Ctx); - if (!MO.isJTI() && MO.getOffset()) - Expr = MCBinaryExpr::CreateAdd( - Expr, MCConstantExpr::Create(MO.getOffset(), Ctx), Ctx); - return MCOperand::CreateExpr(Expr); -} - -MCOperand ARM64MCInstLower::lowerSymbolOperandELF(const MachineOperand &MO, - MCSymbol *Sym) const { - uint32_t RefFlags = 0; - - if (MO.getTargetFlags() & ARM64II::MO_GOT) - RefFlags |= ARM64MCExpr::VK_GOT; - else if (MO.getTargetFlags() & ARM64II::MO_TLS) { - TLSModel::Model Model; - if (MO.isGlobal()) { - const GlobalValue *GV = MO.getGlobal(); - Model = Printer.TM.getTLSModel(GV); - } else { - assert(MO.isSymbol() && - StringRef(MO.getSymbolName()) == "_TLS_MODULE_BASE_" && - "unexpected external TLS symbol"); - Model = TLSModel::GeneralDynamic; - } - switch (Model) { - case TLSModel::InitialExec: - RefFlags |= ARM64MCExpr::VK_GOTTPREL; - break; - case TLSModel::LocalExec: - RefFlags |= ARM64MCExpr::VK_TPREL; - break; - case TLSModel::LocalDynamic: - RefFlags |= ARM64MCExpr::VK_DTPREL; - break; - case TLSModel::GeneralDynamic: - RefFlags |= ARM64MCExpr::VK_TLSDESC; - break; - } - } else { - // No modifier means this is a generic reference, classified as absolute for - // the cases where it matters (:abs_g0: etc). 
- RefFlags |= ARM64MCExpr::VK_ABS; - } - - if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == ARM64II::MO_PAGE) - RefFlags |= ARM64MCExpr::VK_PAGE; - else if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == ARM64II::MO_PAGEOFF) - RefFlags |= ARM64MCExpr::VK_PAGEOFF; - else if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == ARM64II::MO_G3) - RefFlags |= ARM64MCExpr::VK_G3; - else if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == ARM64II::MO_G2) - RefFlags |= ARM64MCExpr::VK_G2; - else if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == ARM64II::MO_G1) - RefFlags |= ARM64MCExpr::VK_G1; - else if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == ARM64II::MO_G0) - RefFlags |= ARM64MCExpr::VK_G0; - - if (MO.getTargetFlags() & ARM64II::MO_NC) - RefFlags |= ARM64MCExpr::VK_NC; - - const MCExpr *Expr = - MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_None, Ctx); - if (!MO.isJTI() && MO.getOffset()) - Expr = MCBinaryExpr::CreateAdd( - Expr, MCConstantExpr::Create(MO.getOffset(), Ctx), Ctx); - - ARM64MCExpr::VariantKind RefKind; - RefKind = static_cast(RefFlags); - Expr = ARM64MCExpr::Create(Expr, RefKind, Ctx); - - return MCOperand::CreateExpr(Expr); -} - -MCOperand ARM64MCInstLower::LowerSymbolOperand(const MachineOperand &MO, - MCSymbol *Sym) const { - if (TargetTriple.isOSDarwin()) - return lowerSymbolOperandDarwin(MO, Sym); - - assert(TargetTriple.isOSBinFormatELF() && "Expect Darwin or ELF target"); - return lowerSymbolOperandELF(MO, Sym); -} - -bool ARM64MCInstLower::lowerOperand(const MachineOperand &MO, - MCOperand &MCOp) const { - switch (MO.getType()) { - default: - assert(0 && "unknown operand type"); - case MachineOperand::MO_Register: - // Ignore all implicit register operands. - if (MO.isImplicit()) - return false; - MCOp = MCOperand::CreateReg(MO.getReg()); - break; - case MachineOperand::MO_RegisterMask: - // Regmasks are like implicit defs. - return false; - case MachineOperand::MO_Immediate: - MCOp = MCOperand::CreateImm(MO.getImm()); - break; - case MachineOperand::MO_MachineBasicBlock: - MCOp = MCOperand::CreateExpr( - MCSymbolRefExpr::Create(MO.getMBB()->getSymbol(), Ctx)); - break; - case MachineOperand::MO_GlobalAddress: - MCOp = LowerSymbolOperand(MO, GetGlobalAddressSymbol(MO)); - break; - case MachineOperand::MO_ExternalSymbol: - MCOp = LowerSymbolOperand(MO, GetExternalSymbolSymbol(MO)); - break; - case MachineOperand::MO_JumpTableIndex: - MCOp = LowerSymbolOperand(MO, Printer.GetJTISymbol(MO.getIndex())); - break; - case MachineOperand::MO_ConstantPoolIndex: - MCOp = LowerSymbolOperand(MO, Printer.GetCPISymbol(MO.getIndex())); - break; - case MachineOperand::MO_BlockAddress: - MCOp = LowerSymbolOperand( - MO, Printer.GetBlockAddressSymbol(MO.getBlockAddress())); - break; - } - return true; -} - -void ARM64MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { - OutMI.setOpcode(MI->getOpcode()); - - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - MCOperand MCOp; - if (lowerOperand(MI->getOperand(i), MCOp)) - OutMI.addOperand(MCOp); - } -} diff --git a/lib/Target/ARM64/ARM64MCInstLower.h b/lib/Target/ARM64/ARM64MCInstLower.h deleted file mode 100644 index 7e3a2c8e54f..00000000000 --- a/lib/Target/ARM64/ARM64MCInstLower.h +++ /dev/null @@ -1,52 +0,0 @@ -//===-- ARM64MCInstLower.h - Lower MachineInstr to MCInst ----------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// - -#ifndef ARM64_MCINSTLOWER_H -#define ARM64_MCINSTLOWER_H - -#include "llvm/ADT/Triple.h" -#include "llvm/Support/Compiler.h" - -namespace llvm { -class AsmPrinter; -class MCAsmInfo; -class MCContext; -class MCInst; -class MCOperand; -class MCSymbol; -class MachineInstr; -class MachineModuleInfoMachO; -class MachineOperand; -class Mangler; - -/// ARM64MCInstLower - This class is used to lower an MachineInstr -/// into an MCInst. -class LLVM_LIBRARY_VISIBILITY ARM64MCInstLower { - MCContext &Ctx; - AsmPrinter &Printer; - Triple TargetTriple; - -public: - ARM64MCInstLower(MCContext &ctx, Mangler &mang, AsmPrinter &printer); - - bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const; - void Lower(const MachineInstr *MI, MCInst &OutMI) const; - - MCOperand lowerSymbolOperandDarwin(const MachineOperand &MO, - MCSymbol *Sym) const; - MCOperand lowerSymbolOperandELF(const MachineOperand &MO, - MCSymbol *Sym) const; - MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const; - - MCSymbol *GetGlobalAddressSymbol(const MachineOperand &MO) const; - MCSymbol *GetExternalSymbolSymbol(const MachineOperand &MO) const; -}; -} - -#endif diff --git a/lib/Target/ARM64/ARM64MachineFunctionInfo.h b/lib/Target/ARM64/ARM64MachineFunctionInfo.h deleted file mode 100644 index 0b6f4f1ec64..00000000000 --- a/lib/Target/ARM64/ARM64MachineFunctionInfo.h +++ /dev/null @@ -1,163 +0,0 @@ -//===- ARM64MachineFuctionInfo.h - ARM64 machine function info --*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file declares ARM64-specific per-machine-function information. -// -//===----------------------------------------------------------------------===// - -#ifndef ARM64MACHINEFUNCTIONINFO_H -#define ARM64MACHINEFUNCTIONINFO_H - -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/MC/MCLinkerOptimizationHint.h" - -namespace llvm { - -/// ARM64FunctionInfo - This class is derived from MachineFunctionInfo and -/// contains private ARM64-specific information for each MachineFunction. -class ARM64FunctionInfo : public MachineFunctionInfo { - - /// Number of bytes of arguments this function has on the stack. If the callee - /// is expected to restore the argument stack this should be a multiple of 16, - /// all usable during a tail call. - /// - /// The alternative would forbid tail call optimisation in some cases: if we - /// want to transfer control from a function with 8-bytes of stack-argument - /// space to a function with 16-bytes then misalignment of this value would - /// make a stack adjustment necessary, which could not be undone by the - /// callee. - unsigned BytesInStackArgArea; - - /// The number of bytes to restore to deallocate space for incoming - /// arguments. Canonically 0 in the C calling convention, but non-zero when - /// callee is expected to pop the args. - unsigned ArgumentStackToRestore; - - /// HasStackFrame - True if this function has a stack frame. Set by - /// processFunctionBeforeCalleeSavedScan(). - bool HasStackFrame; - - /// \brief Amount of stack frame size, not including callee-saved registers. 
- unsigned LocalStackSize; - - /// \brief Number of TLS accesses using the special (combinable) - /// _TLS_MODULE_BASE_ symbol. - unsigned NumLocalDynamicTLSAccesses; - - /// \brief FrameIndex for start of varargs area for arguments passed on the - /// stack. - int VarArgsStackIndex; - - /// \brief FrameIndex for start of varargs area for arguments passed in - /// general purpose registers. - int VarArgsGPRIndex; - - /// \brief Size of the varargs area for arguments passed in general purpose - /// registers. - unsigned VarArgsGPRSize; - - /// \brief FrameIndex for start of varargs area for arguments passed in - /// floating-point registers. - int VarArgsFPRIndex; - - /// \brief Size of the varargs area for arguments passed in floating-point - /// registers. - unsigned VarArgsFPRSize; - -public: - ARM64FunctionInfo() - : BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false), - NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0), - VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0) {} - - explicit ARM64FunctionInfo(MachineFunction &MF) - : BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false), - NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0), - VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0) { - (void)MF; - } - - unsigned getBytesInStackArgArea() const { return BytesInStackArgArea; } - void setBytesInStackArgArea(unsigned bytes) { BytesInStackArgArea = bytes; } - - unsigned getArgumentStackToRestore() const { return ArgumentStackToRestore; } - void setArgumentStackToRestore(unsigned bytes) { - ArgumentStackToRestore = bytes; - } - - bool hasStackFrame() const { return HasStackFrame; } - void setHasStackFrame(bool s) { HasStackFrame = s; } - - void setLocalStackSize(unsigned Size) { LocalStackSize = Size; } - unsigned getLocalStackSize() const { return LocalStackSize; } - - void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamicTLSAccesses; } - unsigned getNumLocalDynamicTLSAccesses() const { - return NumLocalDynamicTLSAccesses; - } - - int getVarArgsStackIndex() const { return VarArgsStackIndex; } - void setVarArgsStackIndex(int Index) { VarArgsStackIndex = Index; } - - int getVarArgsGPRIndex() const { return VarArgsGPRIndex; } - void setVarArgsGPRIndex(int Index) { VarArgsGPRIndex = Index; } - - unsigned getVarArgsGPRSize() const { return VarArgsGPRSize; } - void setVarArgsGPRSize(unsigned Size) { VarArgsGPRSize = Size; } - - int getVarArgsFPRIndex() const { return VarArgsFPRIndex; } - void setVarArgsFPRIndex(int Index) { VarArgsFPRIndex = Index; } - - unsigned getVarArgsFPRSize() const { return VarArgsFPRSize; } - void setVarArgsFPRSize(unsigned Size) { VarArgsFPRSize = Size; } - - typedef SmallPtrSet SetOfInstructions; - - const SetOfInstructions &getLOHRelated() const { return LOHRelated; } - - // Shortcuts for LOH related types. - class MILOHDirective { - MCLOHType Kind; - - /// Arguments of this directive. Order matters. - SmallVector Args; - - public: - typedef SmallVectorImpl LOHArgs; - - MILOHDirective(MCLOHType Kind, const LOHArgs &Args) - : Kind(Kind), Args(Args.begin(), Args.end()) { - assert(isValidMCLOHType(Kind) && "Invalid LOH directive type!"); - } - - MCLOHType getKind() const { return Kind; } - const LOHArgs &getArgs() const { return Args; } - }; - - typedef MILOHDirective::LOHArgs MILOHArgs; - typedef SmallVector MILOHContainer; - - const MILOHContainer &getLOHContainer() const { return LOHContainerSet; } - - /// Add a LOH directive of this @p Kind and this @p Args. 
- void addLOHDirective(MCLOHType Kind, const MILOHArgs &Args) { - LOHContainerSet.push_back(MILOHDirective(Kind, Args)); - LOHRelated.insert(Args.begin(), Args.end()); - } - -private: - // Hold the lists of LOHs. - MILOHContainer LOHContainerSet; - SetOfInstructions LOHRelated; -}; -} // End llvm namespace - -#endif // ARM64MACHINEFUNCTIONINFO_H diff --git a/lib/Target/ARM64/ARM64PerfectShuffle.h b/lib/Target/ARM64/ARM64PerfectShuffle.h deleted file mode 100644 index 6759236fd14..00000000000 --- a/lib/Target/ARM64/ARM64PerfectShuffle.h +++ /dev/null @@ -1,6586 +0,0 @@ -//===-- ARM64PerfectShuffle.h - AdvSIMD Perfect Shuffle Table -------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file, which was autogenerated by llvm-PerfectShuffle, contains data -// for the optimal way to build a perfect shuffle using AdvSIMD instructions. -// -//===----------------------------------------------------------------------===// - -// 31 entries have cost 0 -// 242 entries have cost 1 -// 1447 entries have cost 2 -// 3602 entries have cost 3 -// 1237 entries have cost 4 -// 2 entries have cost 5 - -// This table is 6561*4 = 26244 bytes in size. -static const unsigned PerfectShuffleTable[6561+1] = { - 135053414U, // <0,0,0,0>: Cost 1 vdup0 LHS - 1543503974U, // <0,0,0,1>: Cost 2 vext2 <0,0,0,0>, LHS - 2618572962U, // <0,0,0,2>: Cost 3 vext2 <0,2,0,0>, <0,2,0,0> - 2568054923U, // <0,0,0,3>: Cost 3 vext1 <3,0,0,0>, <3,0,0,0> - 1476398390U, // <0,0,0,4>: Cost 2 vext1 <0,0,0,0>, RHS - 2550140624U, // <0,0,0,5>: Cost 3 vext1 <0,0,0,0>, <5,1,7,3> - 2550141434U, // <0,0,0,6>: Cost 3 vext1 <0,0,0,0>, <6,2,7,3> - 2591945711U, // <0,0,0,7>: Cost 3 vext1 <7,0,0,0>, <7,0,0,0> - 135053414U, // <0,0,0,u>: Cost 1 vdup0 LHS - 2886516736U, // <0,0,1,0>: Cost 3 vzipl LHS, <0,0,0,0> - 1812775014U, // <0,0,1,1>: Cost 2 vzipl LHS, LHS - 1618133094U, // <0,0,1,2>: Cost 2 vext3 <1,2,3,0>, LHS - 2625209292U, // <0,0,1,3>: Cost 3 vext2 <1,3,0,0>, <1,3,0,0> - 2886558034U, // <0,0,1,4>: Cost 3 vzipl LHS, <0,4,1,5> - 2617246864U, // <0,0,1,5>: Cost 3 vext2 <0,0,0,0>, <1,5,3,7> - 3659723031U, // <0,0,1,6>: Cost 4 vext1 <6,0,0,1>, <6,0,0,1> - 2591953904U, // <0,0,1,7>: Cost 3 vext1 <7,0,0,1>, <7,0,0,1> - 1812775581U, // <0,0,1,u>: Cost 2 vzipl LHS, LHS - 3020734464U, // <0,0,2,0>: Cost 3 vtrnl LHS, <0,0,0,0> - 3020734474U, // <0,0,2,1>: Cost 3 vtrnl LHS, <0,0,1,1> - 1946992742U, // <0,0,2,2>: Cost 2 vtrnl LHS, LHS - 2631181989U, // <0,0,2,3>: Cost 3 vext2 <2,3,0,0>, <2,3,0,0> - 3020734668U, // <0,0,2,4>: Cost 3 vtrnl LHS, <0,2,4,6> - 3826550569U, // <0,0,2,5>: Cost 4 vuzpl <0,2,0,2>, <2,4,5,6> - 2617247674U, // <0,0,2,6>: Cost 3 vext2 <0,0,0,0>, <2,6,3,7> - 2591962097U, // <0,0,2,7>: Cost 3 vext1 <7,0,0,2>, <7,0,0,2> - 1946992796U, // <0,0,2,u>: Cost 2 vtrnl LHS, LHS - 2635163787U, // <0,0,3,0>: Cost 3 vext2 <3,0,0,0>, <3,0,0,0> - 2686419196U, // <0,0,3,1>: Cost 3 vext3 <0,3,1,0>, <0,3,1,0> - 2686492933U, // <0,0,3,2>: Cost 3 vext3 <0,3,2,0>, <0,3,2,0> - 2617248156U, // <0,0,3,3>: Cost 3 vext2 <0,0,0,0>, <3,3,3,3> - 2617248258U, // <0,0,3,4>: Cost 3 vext2 <0,0,0,0>, <3,4,5,6> - 3826551298U, // <0,0,3,5>: Cost 4 vuzpl <0,2,0,2>, <3,4,5,6> - 3690990200U, // <0,0,3,6>: Cost 4 vext2 <0,0,0,0>, <3,6,0,7> - 3713551042U, // <0,0,3,7>: Cost 4 vext2 <3,7,0,0>, <3,7,0,0> - 2635163787U, // <0,0,3,u>: Cost 3 vext2 <3,0,0,0>, 
<3,0,0,0> - 2617248658U, // <0,0,4,0>: Cost 3 vext2 <0,0,0,0>, <4,0,5,1> - 2888450150U, // <0,0,4,1>: Cost 3 vzipl <0,4,1,5>, LHS - 3021570150U, // <0,0,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS - 3641829519U, // <0,0,4,3>: Cost 4 vext1 <3,0,0,4>, <3,0,0,4> - 3021570252U, // <0,0,4,4>: Cost 3 vtrnl <0,2,4,6>, <0,2,4,6> - 1543507254U, // <0,0,4,5>: Cost 2 vext2 <0,0,0,0>, RHS - 2752810294U, // <0,0,4,6>: Cost 3 vuzpl <0,2,0,2>, RHS - 3786998152U, // <0,0,4,7>: Cost 4 vext3 <4,7,5,0>, <0,4,7,5> - 1543507497U, // <0,0,4,u>: Cost 2 vext2 <0,0,0,0>, RHS - 2684354972U, // <0,0,5,0>: Cost 3 vext3 <0,0,0,0>, <0,5,0,7> - 2617249488U, // <0,0,5,1>: Cost 3 vext2 <0,0,0,0>, <5,1,7,3> - 3765617070U, // <0,0,5,2>: Cost 4 vext3 <1,2,3,0>, <0,5,2,7> - 3635865780U, // <0,0,5,3>: Cost 4 vext1 <2,0,0,5>, <3,0,4,5> - 2617249734U, // <0,0,5,4>: Cost 3 vext2 <0,0,0,0>, <5,4,7,6> - 2617249796U, // <0,0,5,5>: Cost 3 vext2 <0,0,0,0>, <5,5,5,5> - 2718712274U, // <0,0,5,6>: Cost 3 vext3 <5,6,7,0>, <0,5,6,7> - 2617249960U, // <0,0,5,7>: Cost 3 vext2 <0,0,0,0>, <5,7,5,7> - 2720039396U, // <0,0,5,u>: Cost 3 vext3 <5,u,7,0>, <0,5,u,7> - 2684355053U, // <0,0,6,0>: Cost 3 vext3 <0,0,0,0>, <0,6,0,7> - 3963609190U, // <0,0,6,1>: Cost 4 vzipl <0,6,2,7>, LHS - 2617250298U, // <0,0,6,2>: Cost 3 vext2 <0,0,0,0>, <6,2,7,3> - 3796435464U, // <0,0,6,3>: Cost 4 vext3 <6,3,7,0>, <0,6,3,7> - 3659762998U, // <0,0,6,4>: Cost 4 vext1 <6,0,0,6>, RHS - 3659763810U, // <0,0,6,5>: Cost 4 vext1 <6,0,0,6>, <5,6,7,0> - 2617250616U, // <0,0,6,6>: Cost 3 vext2 <0,0,0,0>, <6,6,6,6> - 2657727309U, // <0,0,6,7>: Cost 3 vext2 <6,7,0,0>, <6,7,0,0> - 2658390942U, // <0,0,6,u>: Cost 3 vext2 <6,u,0,0>, <6,u,0,0> - 2659054575U, // <0,0,7,0>: Cost 3 vext2 <7,0,0,0>, <7,0,0,0> - 3635880854U, // <0,0,7,1>: Cost 4 vext1 <2,0,0,7>, <1,2,3,0> - 3635881401U, // <0,0,7,2>: Cost 4 vext1 <2,0,0,7>, <2,0,0,7> - 3734787298U, // <0,0,7,3>: Cost 4 vext2 <7,3,0,0>, <7,3,0,0> - 2617251174U, // <0,0,7,4>: Cost 3 vext2 <0,0,0,0>, <7,4,5,6> - 3659772002U, // <0,0,7,5>: Cost 4 vext1 <6,0,0,7>, <5,6,7,0> - 3659772189U, // <0,0,7,6>: Cost 4 vext1 <6,0,0,7>, <6,0,0,7> - 2617251436U, // <0,0,7,7>: Cost 3 vext2 <0,0,0,0>, <7,7,7,7> - 2659054575U, // <0,0,7,u>: Cost 3 vext2 <7,0,0,0>, <7,0,0,0> - 135053414U, // <0,0,u,0>: Cost 1 vdup0 LHS - 1817419878U, // <0,0,u,1>: Cost 2 vzipl LHS, LHS - 1947435110U, // <0,0,u,2>: Cost 2 vtrnl LHS, LHS - 2568120467U, // <0,0,u,3>: Cost 3 vext1 <3,0,0,u>, <3,0,0,u> - 1476463926U, // <0,0,u,4>: Cost 2 vext1 <0,0,0,u>, RHS - 1543510170U, // <0,0,u,5>: Cost 2 vext2 <0,0,0,0>, RHS - 2752813210U, // <0,0,u,6>: Cost 3 vuzpl <0,2,0,2>, RHS - 2592011255U, // <0,0,u,7>: Cost 3 vext1 <7,0,0,u>, <7,0,0,u> - 135053414U, // <0,0,u,u>: Cost 1 vdup0 LHS - 2618581002U, // <0,1,0,0>: Cost 3 vext2 <0,2,0,1>, <0,0,1,1> - 1557446758U, // <0,1,0,1>: Cost 2 vext2 <2,3,0,1>, LHS - 2618581155U, // <0,1,0,2>: Cost 3 vext2 <0,2,0,1>, <0,2,0,1> - 2690548468U, // <0,1,0,3>: Cost 3 vext3 <1,0,3,0>, <1,0,3,0> - 2626543954U, // <0,1,0,4>: Cost 3 vext2 <1,5,0,1>, <0,4,1,5> - 4094985216U, // <0,1,0,5>: Cost 4 vtrnl <0,2,0,2>, <1,3,5,7> - 2592019278U, // <0,1,0,6>: Cost 3 vext1 <7,0,1,0>, <6,7,0,1> - 2592019448U, // <0,1,0,7>: Cost 3 vext1 <7,0,1,0>, <7,0,1,0> - 1557447325U, // <0,1,0,u>: Cost 2 vext2 <2,3,0,1>, LHS - 1476476938U, // <0,1,1,0>: Cost 2 vext1 <0,0,1,1>, <0,0,1,1> - 2886517556U, // <0,1,1,1>: Cost 3 vzipl LHS, <1,1,1,1> - 2886517654U, // <0,1,1,2>: Cost 3 vzipl LHS, <1,2,3,0> - 2886517720U, // <0,1,1,3>: Cost 3 vzipl LHS, <1,3,1,3> - 1476480310U, // <0,1,1,4>: Cost 2 
vext1 <0,0,1,1>, RHS - 2886558864U, // <0,1,1,5>: Cost 3 vzipl LHS, <1,5,3,7> - 2550223354U, // <0,1,1,6>: Cost 3 vext1 <0,0,1,1>, <6,2,7,3> - 2550223856U, // <0,1,1,7>: Cost 3 vext1 <0,0,1,1>, <7,0,0,1> - 1476482862U, // <0,1,1,u>: Cost 2 vext1 <0,0,1,1>, LHS - 1494401126U, // <0,1,2,0>: Cost 2 vext1 <3,0,1,2>, LHS - 3020735284U, // <0,1,2,1>: Cost 3 vtrnl LHS, <1,1,1,1> - 2562172349U, // <0,1,2,2>: Cost 3 vext1 <2,0,1,2>, <2,0,1,2> - 835584U, // <0,1,2,3>: Cost 0 copy LHS - 1494404406U, // <0,1,2,4>: Cost 2 vext1 <3,0,1,2>, RHS - 3020735488U, // <0,1,2,5>: Cost 3 vtrnl LHS, <1,3,5,7> - 2631190458U, // <0,1,2,6>: Cost 3 vext2 <2,3,0,1>, <2,6,3,7> - 1518294010U, // <0,1,2,7>: Cost 2 vext1 <7,0,1,2>, <7,0,1,2> - 835584U, // <0,1,2,u>: Cost 0 copy LHS - 2692318156U, // <0,1,3,0>: Cost 3 vext3 <1,3,0,0>, <1,3,0,0> - 2691875800U, // <0,1,3,1>: Cost 3 vext3 <1,2,3,0>, <1,3,1,3> - 2691875806U, // <0,1,3,2>: Cost 3 vext3 <1,2,3,0>, <1,3,2,0> - 2692539367U, // <0,1,3,3>: Cost 3 vext3 <1,3,3,0>, <1,3,3,0> - 2562182454U, // <0,1,3,4>: Cost 3 vext1 <2,0,1,3>, RHS - 2691875840U, // <0,1,3,5>: Cost 3 vext3 <1,2,3,0>, <1,3,5,7> - 2692760578U, // <0,1,3,6>: Cost 3 vext3 <1,3,6,0>, <1,3,6,0> - 2639817411U, // <0,1,3,7>: Cost 3 vext2 <3,7,0,1>, <3,7,0,1> - 2691875863U, // <0,1,3,u>: Cost 3 vext3 <1,2,3,0>, <1,3,u,3> - 2568159334U, // <0,1,4,0>: Cost 3 vext1 <3,0,1,4>, LHS - 4095312692U, // <0,1,4,1>: Cost 4 vtrnl <0,2,4,6>, <1,1,1,1> - 2568160934U, // <0,1,4,2>: Cost 3 vext1 <3,0,1,4>, <2,3,0,1> - 2568161432U, // <0,1,4,3>: Cost 3 vext1 <3,0,1,4>, <3,0,1,4> - 2568162614U, // <0,1,4,4>: Cost 3 vext1 <3,0,1,4>, RHS - 1557450038U, // <0,1,4,5>: Cost 2 vext2 <2,3,0,1>, RHS - 2754235702U, // <0,1,4,6>: Cost 3 vuzpl <0,4,1,5>, RHS - 2592052220U, // <0,1,4,7>: Cost 3 vext1 <7,0,1,4>, <7,0,1,4> - 1557450281U, // <0,1,4,u>: Cost 2 vext2 <2,3,0,1>, RHS - 3765617775U, // <0,1,5,0>: Cost 4 vext3 <1,2,3,0>, <1,5,0,1> - 2647781007U, // <0,1,5,1>: Cost 3 vext2 <5,1,0,1>, <5,1,0,1> - 3704934138U, // <0,1,5,2>: Cost 4 vext2 <2,3,0,1>, <5,2,3,0> - 2691875984U, // <0,1,5,3>: Cost 3 vext3 <1,2,3,0>, <1,5,3,7> - 2657734598U, // <0,1,5,4>: Cost 3 vext2 <6,7,0,1>, <5,4,7,6> - 2650435539U, // <0,1,5,5>: Cost 3 vext2 <5,5,0,1>, <5,5,0,1> - 2651099172U, // <0,1,5,6>: Cost 3 vext2 <5,6,0,1>, <5,6,0,1> - 2651762805U, // <0,1,5,7>: Cost 3 vext2 <5,7,0,1>, <5,7,0,1> - 2691876029U, // <0,1,5,u>: Cost 3 vext3 <1,2,3,0>, <1,5,u,7> - 2592063590U, // <0,1,6,0>: Cost 3 vext1 <7,0,1,6>, LHS - 3765617871U, // <0,1,6,1>: Cost 4 vext3 <1,2,3,0>, <1,6,1,7> - 2654417337U, // <0,1,6,2>: Cost 3 vext2 <6,2,0,1>, <6,2,0,1> - 3765617889U, // <0,1,6,3>: Cost 4 vext3 <1,2,3,0>, <1,6,3,7> - 2592066870U, // <0,1,6,4>: Cost 3 vext1 <7,0,1,6>, RHS - 3765617907U, // <0,1,6,5>: Cost 4 vext3 <1,2,3,0>, <1,6,5,7> - 2657071869U, // <0,1,6,6>: Cost 3 vext2 <6,6,0,1>, <6,6,0,1> - 1583993678U, // <0,1,6,7>: Cost 2 vext2 <6,7,0,1>, <6,7,0,1> - 1584657311U, // <0,1,6,u>: Cost 2 vext2 <6,u,0,1>, <6,u,0,1> - 2657735672U, // <0,1,7,0>: Cost 3 vext2 <6,7,0,1>, <7,0,1,0> - 2657735808U, // <0,1,7,1>: Cost 3 vext2 <6,7,0,1>, <7,1,7,1> - 2631193772U, // <0,1,7,2>: Cost 3 vext2 <2,3,0,1>, <7,2,3,0> - 2661053667U, // <0,1,7,3>: Cost 3 vext2 <7,3,0,1>, <7,3,0,1> - 2657736038U, // <0,1,7,4>: Cost 3 vext2 <6,7,0,1>, <7,4,5,6> - 3721524621U, // <0,1,7,5>: Cost 4 vext2 <5,1,0,1>, <7,5,1,0> - 2657736158U, // <0,1,7,6>: Cost 3 vext2 <6,7,0,1>, <7,6,1,0> - 2657736300U, // <0,1,7,7>: Cost 3 vext2 <6,7,0,1>, <7,7,7,7> - 2657736322U, // <0,1,7,u>: Cost 3 vext2 <6,7,0,1>, <7,u,1,2> - 
1494450278U, // <0,1,u,0>: Cost 2 vext1 <3,0,1,u>, LHS - 1557452590U, // <0,1,u,1>: Cost 2 vext2 <2,3,0,1>, LHS - 2754238254U, // <0,1,u,2>: Cost 3 vuzpl <0,4,1,5>, LHS - 835584U, // <0,1,u,3>: Cost 0 copy LHS - 1494453558U, // <0,1,u,4>: Cost 2 vext1 <3,0,1,u>, RHS - 1557452954U, // <0,1,u,5>: Cost 2 vext2 <2,3,0,1>, RHS - 2754238618U, // <0,1,u,6>: Cost 3 vuzpl <0,4,1,5>, RHS - 1518343168U, // <0,1,u,7>: Cost 2 vext1 <7,0,1,u>, <7,0,1,u> - 835584U, // <0,1,u,u>: Cost 0 copy LHS - 2752299008U, // <0,2,0,0>: Cost 3 vuzpl LHS, <0,0,0,0> - 1544847462U, // <0,2,0,1>: Cost 2 vext2 <0,2,0,2>, LHS - 1678557286U, // <0,2,0,2>: Cost 2 vuzpl LHS, LHS - 2696521165U, // <0,2,0,3>: Cost 3 vext3 <2,0,3,0>, <2,0,3,0> - 2752340172U, // <0,2,0,4>: Cost 3 vuzpl LHS, <0,2,4,6> - 2691876326U, // <0,2,0,5>: Cost 3 vext3 <1,2,3,0>, <2,0,5,7> - 2618589695U, // <0,2,0,6>: Cost 3 vext2 <0,2,0,2>, <0,6,2,7> - 2592093185U, // <0,2,0,7>: Cost 3 vext1 <7,0,2,0>, <7,0,2,0> - 1678557340U, // <0,2,0,u>: Cost 2 vuzpl LHS, LHS - 2618589942U, // <0,2,1,0>: Cost 3 vext2 <0,2,0,2>, <1,0,3,2> - 2752299828U, // <0,2,1,1>: Cost 3 vuzpl LHS, <1,1,1,1> - 2886518376U, // <0,2,1,2>: Cost 3 vzipl LHS, <2,2,2,2> - 2752299766U, // <0,2,1,3>: Cost 3 vuzpl LHS, <1,0,3,2> - 2550295862U, // <0,2,1,4>: Cost 3 vext1 <0,0,2,1>, RHS - 2752340992U, // <0,2,1,5>: Cost 3 vuzpl LHS, <1,3,5,7> - 2886559674U, // <0,2,1,6>: Cost 3 vzipl LHS, <2,6,3,7> - 3934208106U, // <0,2,1,7>: Cost 4 vuzpr <7,0,1,2>, <0,1,2,7> - 2752340771U, // <0,2,1,u>: Cost 3 vuzpl LHS, <1,0,u,2> - 1476558868U, // <0,2,2,0>: Cost 2 vext1 <0,0,2,2>, <0,0,2,2> - 2226628029U, // <0,2,2,1>: Cost 3 vrev <2,0,1,2> - 2752300648U, // <0,2,2,2>: Cost 3 vuzpl LHS, <2,2,2,2> - 3020736114U, // <0,2,2,3>: Cost 3 vtrnl LHS, <2,2,3,3> - 1476562230U, // <0,2,2,4>: Cost 2 vext1 <0,0,2,2>, RHS - 2550304464U, // <0,2,2,5>: Cost 3 vext1 <0,0,2,2>, <5,1,7,3> - 2618591162U, // <0,2,2,6>: Cost 3 vext2 <0,2,0,2>, <2,6,3,7> - 2550305777U, // <0,2,2,7>: Cost 3 vext1 <0,0,2,2>, <7,0,0,2> - 1476564782U, // <0,2,2,u>: Cost 2 vext1 <0,0,2,2>, LHS - 2618591382U, // <0,2,3,0>: Cost 3 vext2 <0,2,0,2>, <3,0,1,2> - 2752301206U, // <0,2,3,1>: Cost 3 vuzpl LHS, <3,0,1,2> - 3826043121U, // <0,2,3,2>: Cost 4 vuzpl LHS, <3,1,2,3> - 2752301468U, // <0,2,3,3>: Cost 3 vuzpl LHS, <3,3,3,3> - 2618591746U, // <0,2,3,4>: Cost 3 vext2 <0,2,0,2>, <3,4,5,6> - 2752301570U, // <0,2,3,5>: Cost 3 vuzpl LHS, <3,4,5,6> - 3830688102U, // <0,2,3,6>: Cost 4 vuzpl LHS, <3,2,6,3> - 2698807012U, // <0,2,3,7>: Cost 3 vext3 <2,3,7,0>, <2,3,7,0> - 2752301269U, // <0,2,3,u>: Cost 3 vuzpl LHS, <3,0,u,2> - 2562261094U, // <0,2,4,0>: Cost 3 vext1 <2,0,2,4>, LHS - 4095313828U, // <0,2,4,1>: Cost 4 vtrnl <0,2,4,6>, <2,6,1,3> - 2226718152U, // <0,2,4,2>: Cost 3 vrev <2,0,2,4> - 2568235169U, // <0,2,4,3>: Cost 3 vext1 <3,0,2,4>, <3,0,2,4> - 2562264374U, // <0,2,4,4>: Cost 3 vext1 <2,0,2,4>, RHS - 1544850742U, // <0,2,4,5>: Cost 2 vext2 <0,2,0,2>, RHS - 1678560566U, // <0,2,4,6>: Cost 2 vuzpl LHS, RHS - 2592125957U, // <0,2,4,7>: Cost 3 vext1 <7,0,2,4>, <7,0,2,4> - 1678560584U, // <0,2,4,u>: Cost 2 vuzpl LHS, RHS - 2691876686U, // <0,2,5,0>: Cost 3 vext3 <1,2,3,0>, <2,5,0,7> - 2618592976U, // <0,2,5,1>: Cost 3 vext2 <0,2,0,2>, <5,1,7,3> - 3765618528U, // <0,2,5,2>: Cost 4 vext3 <1,2,3,0>, <2,5,2,7> - 3765618536U, // <0,2,5,3>: Cost 4 vext3 <1,2,3,0>, <2,5,3,6> - 2618593222U, // <0,2,5,4>: Cost 3 vext2 <0,2,0,2>, <5,4,7,6> - 2752303108U, // <0,2,5,5>: Cost 3 vuzpl LHS, <5,5,5,5> - 2618593378U, // <0,2,5,6>: Cost 3 vext2 <0,2,0,2>, <5,6,7,0> - 
2824785206U, // <0,2,5,7>: Cost 3 vuzpr <1,0,3,2>, RHS - 2824785207U, // <0,2,5,u>: Cost 3 vuzpr <1,0,3,2>, RHS - 2752303950U, // <0,2,6,0>: Cost 3 vuzpl LHS, <6,7,0,1> - 3830690081U, // <0,2,6,1>: Cost 4 vuzpl LHS, <6,0,1,2> - 2618593786U, // <0,2,6,2>: Cost 3 vext2 <0,2,0,2>, <6,2,7,3> - 2691876794U, // <0,2,6,3>: Cost 3 vext3 <1,2,3,0>, <2,6,3,7> - 2752303990U, // <0,2,6,4>: Cost 3 vuzpl LHS, <6,7,4,5> - 3830690445U, // <0,2,6,5>: Cost 4 vuzpl LHS, <6,4,5,6> - 2752303928U, // <0,2,6,6>: Cost 3 vuzpl LHS, <6,6,6,6> - 2657743695U, // <0,2,6,7>: Cost 3 vext2 <6,7,0,2>, <6,7,0,2> - 2691876839U, // <0,2,6,u>: Cost 3 vext3 <1,2,3,0>, <2,6,u,7> - 2659070961U, // <0,2,7,0>: Cost 3 vext2 <7,0,0,2>, <7,0,0,2> - 2659734594U, // <0,2,7,1>: Cost 3 vext2 <7,1,0,2>, <7,1,0,2> - 3734140051U, // <0,2,7,2>: Cost 4 vext2 <7,2,0,2>, <7,2,0,2> - 2701166596U, // <0,2,7,3>: Cost 3 vext3 <2,7,3,0>, <2,7,3,0> - 2662389094U, // <0,2,7,4>: Cost 3 vext2 <7,5,0,2>, <7,4,5,6> - 2662389126U, // <0,2,7,5>: Cost 3 vext2 <7,5,0,2>, <7,5,0,2> - 3736794583U, // <0,2,7,6>: Cost 4 vext2 <7,6,0,2>, <7,6,0,2> - 2752304748U, // <0,2,7,7>: Cost 3 vuzpl LHS, <7,7,7,7> - 2659070961U, // <0,2,7,u>: Cost 3 vext2 <7,0,0,2>, <7,0,0,2> - 1476608026U, // <0,2,u,0>: Cost 2 vext1 <0,0,2,u>, <0,0,2,u> - 1544853294U, // <0,2,u,1>: Cost 2 vext2 <0,2,0,2>, LHS - 1678563118U, // <0,2,u,2>: Cost 2 vuzpl LHS, LHS - 3021178482U, // <0,2,u,3>: Cost 3 vtrnl LHS, <2,2,3,3> - 1476611382U, // <0,2,u,4>: Cost 2 vext1 <0,0,2,u>, RHS - 1544853658U, // <0,2,u,5>: Cost 2 vext2 <0,2,0,2>, RHS - 1678563482U, // <0,2,u,6>: Cost 2 vuzpl LHS, RHS - 2824785449U, // <0,2,u,7>: Cost 3 vuzpr <1,0,3,2>, RHS - 1678563172U, // <0,2,u,u>: Cost 2 vuzpl LHS, LHS - 2556329984U, // <0,3,0,0>: Cost 3 vext1 <1,0,3,0>, <0,0,0,0> - 2686421142U, // <0,3,0,1>: Cost 3 vext3 <0,3,1,0>, <3,0,1,2> - 2562303437U, // <0,3,0,2>: Cost 3 vext1 <2,0,3,0>, <2,0,3,0> - 4094986652U, // <0,3,0,3>: Cost 4 vtrnl <0,2,0,2>, <3,3,3,3> - 2556333366U, // <0,3,0,4>: Cost 3 vext1 <1,0,3,0>, RHS - 4094986754U, // <0,3,0,5>: Cost 4 vtrnl <0,2,0,2>, <3,4,5,6> - 3798796488U, // <0,3,0,6>: Cost 4 vext3 <6,7,3,0>, <3,0,6,7> - 3776530634U, // <0,3,0,7>: Cost 4 vext3 <3,0,7,0>, <3,0,7,0> - 2556335918U, // <0,3,0,u>: Cost 3 vext1 <1,0,3,0>, LHS - 2886518934U, // <0,3,1,0>: Cost 3 vzipl LHS, <3,0,1,2> - 2556338933U, // <0,3,1,1>: Cost 3 vext1 <1,0,3,1>, <1,0,3,1> - 2691877105U, // <0,3,1,2>: Cost 3 vext3 <1,2,3,0>, <3,1,2,3> - 2886519196U, // <0,3,1,3>: Cost 3 vzipl LHS, <3,3,3,3> - 2886519298U, // <0,3,1,4>: Cost 3 vzipl LHS, <3,4,5,6> - 4095740418U, // <0,3,1,5>: Cost 4 vtrnl <0,3,1,4>, <3,4,5,6> - 3659944242U, // <0,3,1,6>: Cost 4 vext1 <6,0,3,1>, <6,0,3,1> - 3769600286U, // <0,3,1,7>: Cost 4 vext3 <1,u,3,0>, <3,1,7,3> - 2886519582U, // <0,3,1,u>: Cost 3 vzipl LHS, <3,u,1,2> - 1482604646U, // <0,3,2,0>: Cost 2 vext1 <1,0,3,2>, LHS - 1482605302U, // <0,3,2,1>: Cost 2 vext1 <1,0,3,2>, <1,0,3,2> - 2556348008U, // <0,3,2,2>: Cost 3 vext1 <1,0,3,2>, <2,2,2,2> - 3020736924U, // <0,3,2,3>: Cost 3 vtrnl LHS, <3,3,3,3> - 1482607926U, // <0,3,2,4>: Cost 2 vext1 <1,0,3,2>, RHS - 3020737026U, // <0,3,2,5>: Cost 3 vtrnl LHS, <3,4,5,6> - 2598154746U, // <0,3,2,6>: Cost 3 vext1 , <6,2,7,3> - 2598155258U, // <0,3,2,7>: Cost 3 vext1 , <7,0,1,2> - 1482610478U, // <0,3,2,u>: Cost 2 vext1 <1,0,3,2>, LHS - 3692341398U, // <0,3,3,0>: Cost 4 vext2 <0,2,0,3>, <3,0,1,2> - 2635851999U, // <0,3,3,1>: Cost 3 vext2 <3,1,0,3>, <3,1,0,3> - 3636069840U, // <0,3,3,2>: Cost 4 vext1 <2,0,3,3>, <2,0,3,3> - 2691877276U, // <0,3,3,3>: Cost 3 
vext3 <1,2,3,0>, <3,3,3,3> - 3961522690U, // <0,3,3,4>: Cost 4 vzipl <0,3,1,4>, <3,4,5,6> - 3826797058U, // <0,3,3,5>: Cost 4 vuzpl <0,2,3,5>, <3,4,5,6> - 3703622282U, // <0,3,3,6>: Cost 4 vext2 <2,1,0,3>, <3,6,2,7> - 3769600452U, // <0,3,3,7>: Cost 4 vext3 <1,u,3,0>, <3,3,7,7> - 2640497430U, // <0,3,3,u>: Cost 3 vext2 <3,u,0,3>, <3,u,0,3> - 3962194070U, // <0,3,4,0>: Cost 4 vzipl <0,4,1,5>, <3,0,1,2> - 2232617112U, // <0,3,4,1>: Cost 3 vrev <3,0,1,4> - 2232690849U, // <0,3,4,2>: Cost 3 vrev <3,0,2,4> - 4095314332U, // <0,3,4,3>: Cost 4 vtrnl <0,2,4,6>, <3,3,3,3> - 3962194434U, // <0,3,4,4>: Cost 4 vzipl <0,4,1,5>, <3,4,5,6> - 2691877378U, // <0,3,4,5>: Cost 3 vext3 <1,2,3,0>, <3,4,5,6> - 3826765110U, // <0,3,4,6>: Cost 4 vuzpl <0,2,3,1>, RHS - 3665941518U, // <0,3,4,7>: Cost 4 vext1 <7,0,3,4>, <7,0,3,4> - 2691877405U, // <0,3,4,u>: Cost 3 vext3 <1,2,3,0>, <3,4,u,6> - 3630112870U, // <0,3,5,0>: Cost 4 vext1 <1,0,3,5>, LHS - 3630113526U, // <0,3,5,1>: Cost 4 vext1 <1,0,3,5>, <1,0,3,2> - 4035199734U, // <0,3,5,2>: Cost 4 vzipr <1,4,0,5>, <1,0,3,2> - 3769600578U, // <0,3,5,3>: Cost 4 vext3 <1,u,3,0>, <3,5,3,7> - 2232846516U, // <0,3,5,4>: Cost 3 vrev <3,0,4,5> - 3779037780U, // <0,3,5,5>: Cost 4 vext3 <3,4,5,0>, <3,5,5,7> - 2718714461U, // <0,3,5,6>: Cost 3 vext3 <5,6,7,0>, <3,5,6,7> - 2706106975U, // <0,3,5,7>: Cost 3 vext3 <3,5,7,0>, <3,5,7,0> - 2233141464U, // <0,3,5,u>: Cost 3 vrev <3,0,u,5> - 2691877496U, // <0,3,6,0>: Cost 3 vext3 <1,2,3,0>, <3,6,0,7> - 3727511914U, // <0,3,6,1>: Cost 4 vext2 <6,1,0,3>, <6,1,0,3> - 3765619338U, // <0,3,6,2>: Cost 4 vext3 <1,2,3,0>, <3,6,2,7> - 3765619347U, // <0,3,6,3>: Cost 4 vext3 <1,2,3,0>, <3,6,3,7> - 3765987996U, // <0,3,6,4>: Cost 4 vext3 <1,2,u,0>, <3,6,4,7> - 3306670270U, // <0,3,6,5>: Cost 4 vrev <3,0,5,6> - 3792456365U, // <0,3,6,6>: Cost 4 vext3 <5,6,7,0>, <3,6,6,6> - 2706770608U, // <0,3,6,7>: Cost 3 vext3 <3,6,7,0>, <3,6,7,0> - 2706844345U, // <0,3,6,u>: Cost 3 vext3 <3,6,u,0>, <3,6,u,0> - 3769600707U, // <0,3,7,0>: Cost 4 vext3 <1,u,3,0>, <3,7,0,1> - 2659742787U, // <0,3,7,1>: Cost 3 vext2 <7,1,0,3>, <7,1,0,3> - 3636102612U, // <0,3,7,2>: Cost 4 vext1 <2,0,3,7>, <2,0,3,7> - 3769600740U, // <0,3,7,3>: Cost 4 vext3 <1,u,3,0>, <3,7,3,7> - 3769600747U, // <0,3,7,4>: Cost 4 vext3 <1,u,3,0>, <3,7,4,5> - 3769600758U, // <0,3,7,5>: Cost 4 vext3 <1,u,3,0>, <3,7,5,7> - 3659993400U, // <0,3,7,6>: Cost 4 vext1 <6,0,3,7>, <6,0,3,7> - 3781176065U, // <0,3,7,7>: Cost 4 vext3 <3,7,7,0>, <3,7,7,0> - 2664388218U, // <0,3,7,u>: Cost 3 vext2 <7,u,0,3>, <7,u,0,3> - 1482653798U, // <0,3,u,0>: Cost 2 vext1 <1,0,3,u>, LHS - 1482654460U, // <0,3,u,1>: Cost 2 vext1 <1,0,3,u>, <1,0,3,u> - 2556397160U, // <0,3,u,2>: Cost 3 vext1 <1,0,3,u>, <2,2,2,2> - 3021179292U, // <0,3,u,3>: Cost 3 vtrnl LHS, <3,3,3,3> - 1482657078U, // <0,3,u,4>: Cost 2 vext1 <1,0,3,u>, RHS - 3021179394U, // <0,3,u,5>: Cost 3 vtrnl LHS, <3,4,5,6> - 2598203898U, // <0,3,u,6>: Cost 3 vext1 , <6,2,7,3> - 2708097874U, // <0,3,u,7>: Cost 3 vext3 <3,u,7,0>, <3,u,7,0> - 1482659630U, // <0,3,u,u>: Cost 2 vext1 <1,0,3,u>, LHS - 2617278468U, // <0,4,0,0>: Cost 3 vext2 <0,0,0,4>, <0,0,0,4> - 2618605670U, // <0,4,0,1>: Cost 3 vext2 <0,2,0,4>, LHS - 2618605734U, // <0,4,0,2>: Cost 3 vext2 <0,2,0,4>, <0,2,0,4> - 3642091695U, // <0,4,0,3>: Cost 4 vext1 <3,0,4,0>, <3,0,4,0> - 2753134796U, // <0,4,0,4>: Cost 3 vuzpl <0,2,4,6>, <0,2,4,6> - 2718714770U, // <0,4,0,5>: Cost 3 vext3 <5,6,7,0>, <4,0,5,1> - 3021245750U, // <0,4,0,6>: Cost 3 vtrnl <0,2,0,2>, RHS - 3665982483U, // <0,4,0,7>: Cost 4 vext1 <7,0,4,0>, 
<7,0,4,0> - 3021245768U, // <0,4,0,u>: Cost 3 vtrnl <0,2,0,2>, RHS - 2568355942U, // <0,4,1,0>: Cost 3 vext1 <3,0,4,1>, LHS - 3692348212U, // <0,4,1,1>: Cost 4 vext2 <0,2,0,4>, <1,1,1,1> - 3692348310U, // <0,4,1,2>: Cost 4 vext2 <0,2,0,4>, <1,2,3,0> - 2568358064U, // <0,4,1,3>: Cost 3 vext1 <3,0,4,1>, <3,0,4,1> - 2568359222U, // <0,4,1,4>: Cost 3 vext1 <3,0,4,1>, RHS - 1812778294U, // <0,4,1,5>: Cost 2 vzipl LHS, RHS - 3022671158U, // <0,4,1,6>: Cost 3 vtrnl <0,4,1,5>, RHS - 2592248852U, // <0,4,1,7>: Cost 3 vext1 <7,0,4,1>, <7,0,4,1> - 1812778537U, // <0,4,1,u>: Cost 2 vzipl LHS, RHS - 2568364134U, // <0,4,2,0>: Cost 3 vext1 <3,0,4,2>, LHS - 2238573423U, // <0,4,2,1>: Cost 3 vrev <4,0,1,2> - 3692349032U, // <0,4,2,2>: Cost 4 vext2 <0,2,0,4>, <2,2,2,2> - 2631214761U, // <0,4,2,3>: Cost 3 vext2 <2,3,0,4>, <2,3,0,4> - 2568367414U, // <0,4,2,4>: Cost 3 vext1 <3,0,4,2>, RHS - 2887028022U, // <0,4,2,5>: Cost 3 vzipl <0,2,0,2>, RHS - 1946996022U, // <0,4,2,6>: Cost 2 vtrnl LHS, RHS - 2592257045U, // <0,4,2,7>: Cost 3 vext1 <7,0,4,2>, <7,0,4,2> - 1946996040U, // <0,4,2,u>: Cost 2 vtrnl LHS, RHS - 3692349590U, // <0,4,3,0>: Cost 4 vext2 <0,2,0,4>, <3,0,1,2> - 3826878614U, // <0,4,3,1>: Cost 4 vuzpl <0,2,4,6>, <3,0,1,2> - 3826878625U, // <0,4,3,2>: Cost 4 vuzpl <0,2,4,6>, <3,0,2,4> - 3692349852U, // <0,4,3,3>: Cost 4 vext2 <0,2,0,4>, <3,3,3,3> - 3692349954U, // <0,4,3,4>: Cost 4 vext2 <0,2,0,4>, <3,4,5,6> - 3826878978U, // <0,4,3,5>: Cost 4 vuzpl <0,2,4,6>, <3,4,5,6> - 4095200566U, // <0,4,3,6>: Cost 4 vtrnl <0,2,3,1>, RHS - 3713583814U, // <0,4,3,7>: Cost 4 vext2 <3,7,0,4>, <3,7,0,4> - 3692350238U, // <0,4,3,u>: Cost 4 vext2 <0,2,0,4>, <3,u,1,2> - 2550464552U, // <0,4,4,0>: Cost 3 vext1 <0,0,4,4>, <0,0,4,4> - 3962194914U, // <0,4,4,1>: Cost 4 vzipl <0,4,1,5>, <4,1,5,0> - 3693677631U, // <0,4,4,2>: Cost 4 vext2 <0,4,0,4>, <4,2,6,3> - 3642124467U, // <0,4,4,3>: Cost 4 vext1 <3,0,4,4>, <3,0,4,4> - 2718715088U, // <0,4,4,4>: Cost 3 vext3 <5,6,7,0>, <4,4,4,4> - 2618608950U, // <0,4,4,5>: Cost 3 vext2 <0,2,0,4>, RHS - 2753137974U, // <0,4,4,6>: Cost 3 vuzpl <0,2,4,6>, RHS - 3666015255U, // <0,4,4,7>: Cost 4 vext1 <7,0,4,4>, <7,0,4,4> - 2618609193U, // <0,4,4,u>: Cost 3 vext2 <0,2,0,4>, RHS - 2568388710U, // <0,4,5,0>: Cost 3 vext1 <3,0,4,5>, LHS - 2568389526U, // <0,4,5,1>: Cost 3 vext1 <3,0,4,5>, <1,2,3,0> - 3636159963U, // <0,4,5,2>: Cost 4 vext1 <2,0,4,5>, <2,0,4,5> - 2568390836U, // <0,4,5,3>: Cost 3 vext1 <3,0,4,5>, <3,0,4,5> - 2568391990U, // <0,4,5,4>: Cost 3 vext1 <3,0,4,5>, RHS - 2718715180U, // <0,4,5,5>: Cost 3 vext3 <5,6,7,0>, <4,5,5,6> - 1618136374U, // <0,4,5,6>: Cost 2 vext3 <1,2,3,0>, RHS - 2592281624U, // <0,4,5,7>: Cost 3 vext1 <7,0,4,5>, <7,0,4,5> - 1618136392U, // <0,4,5,u>: Cost 2 vext3 <1,2,3,0>, RHS - 2550480938U, // <0,4,6,0>: Cost 3 vext1 <0,0,4,6>, <0,0,4,6> - 3826880801U, // <0,4,6,1>: Cost 4 vuzpl <0,2,4,6>, <6,0,1,2> - 2562426332U, // <0,4,6,2>: Cost 3 vext1 <2,0,4,6>, <2,0,4,6> - 3786190181U, // <0,4,6,3>: Cost 4 vext3 <4,6,3,0>, <4,6,3,0> - 2718715252U, // <0,4,6,4>: Cost 3 vext3 <5,6,7,0>, <4,6,4,6> - 3826881165U, // <0,4,6,5>: Cost 4 vuzpl <0,2,4,6>, <6,4,5,6> - 2712669568U, // <0,4,6,6>: Cost 3 vext3 <4,6,6,0>, <4,6,6,0> - 2657760081U, // <0,4,6,7>: Cost 3 vext2 <6,7,0,4>, <6,7,0,4> - 2718715284U, // <0,4,6,u>: Cost 3 vext3 <5,6,7,0>, <4,6,u,2> - 3654090854U, // <0,4,7,0>: Cost 4 vext1 <5,0,4,7>, LHS - 3934229326U, // <0,4,7,1>: Cost 4 vuzpr <7,0,1,4>, <6,7,0,1> - 3734156437U, // <0,4,7,2>: Cost 4 vext2 <7,2,0,4>, <7,2,0,4> - 3734820070U, // <0,4,7,3>: Cost 4 vext2 
<7,3,0,4>, <7,3,0,4> - 3654094134U, // <0,4,7,4>: Cost 4 vext1 <5,0,4,7>, RHS - 2713259464U, // <0,4,7,5>: Cost 3 vext3 <4,7,5,0>, <4,7,5,0> - 2713333201U, // <0,4,7,6>: Cost 3 vext3 <4,7,6,0>, <4,7,6,0> - 3654095866U, // <0,4,7,7>: Cost 4 vext1 <5,0,4,7>, <7,0,1,2> - 2713259464U, // <0,4,7,u>: Cost 3 vext3 <4,7,5,0>, <4,7,5,0> - 2568413286U, // <0,4,u,0>: Cost 3 vext1 <3,0,4,u>, LHS - 2618611502U, // <0,4,u,1>: Cost 3 vext2 <0,2,0,4>, LHS - 2753140526U, // <0,4,u,2>: Cost 3 vuzpl <0,2,4,6>, LHS - 2568415415U, // <0,4,u,3>: Cost 3 vext1 <3,0,4,u>, <3,0,4,u> - 2568416566U, // <0,4,u,4>: Cost 3 vext1 <3,0,4,u>, RHS - 1817423158U, // <0,4,u,5>: Cost 2 vzipl LHS, RHS - 1947438390U, // <0,4,u,6>: Cost 2 vtrnl LHS, RHS - 2592306203U, // <0,4,u,7>: Cost 3 vext1 <7,0,4,u>, <7,0,4,u> - 1947438408U, // <0,4,u,u>: Cost 2 vtrnl LHS, RHS - 3630219264U, // <0,5,0,0>: Cost 4 vext1 <1,0,5,0>, <0,0,0,0> - 2625912934U, // <0,5,0,1>: Cost 3 vext2 <1,4,0,5>, LHS - 3692355748U, // <0,5,0,2>: Cost 4 vext2 <0,2,0,5>, <0,2,0,2> - 3693019384U, // <0,5,0,3>: Cost 4 vext2 <0,3,0,5>, <0,3,0,5> - 3630222646U, // <0,5,0,4>: Cost 4 vext1 <1,0,5,0>, RHS - 3699655062U, // <0,5,0,5>: Cost 4 vext2 <1,4,0,5>, <0,5,0,1> - 2718715508U, // <0,5,0,6>: Cost 3 vext3 <5,6,7,0>, <5,0,6,1> - 3087011126U, // <0,5,0,7>: Cost 3 vtrnr <0,0,0,0>, RHS - 2625913501U, // <0,5,0,u>: Cost 3 vext2 <1,4,0,5>, LHS - 1500659814U, // <0,5,1,0>: Cost 2 vext1 <4,0,5,1>, LHS - 2886520528U, // <0,5,1,1>: Cost 3 vzipl LHS, <5,1,7,3> - 2574403176U, // <0,5,1,2>: Cost 3 vext1 <4,0,5,1>, <2,2,2,2> - 2574403734U, // <0,5,1,3>: Cost 3 vext1 <4,0,5,1>, <3,0,1,2> - 1500662674U, // <0,5,1,4>: Cost 2 vext1 <4,0,5,1>, <4,0,5,1> - 2886520836U, // <0,5,1,5>: Cost 3 vzipl LHS, <5,5,5,5> - 2886520930U, // <0,5,1,6>: Cost 3 vzipl LHS, <5,6,7,0> - 2718715600U, // <0,5,1,7>: Cost 3 vext3 <5,6,7,0>, <5,1,7,3> - 1500665646U, // <0,5,1,u>: Cost 2 vext1 <4,0,5,1>, LHS - 2556493926U, // <0,5,2,0>: Cost 3 vext1 <1,0,5,2>, LHS - 2244546120U, // <0,5,2,1>: Cost 3 vrev <5,0,1,2> - 3692357256U, // <0,5,2,2>: Cost 4 vext2 <0,2,0,5>, <2,2,5,7> - 2568439994U, // <0,5,2,3>: Cost 3 vext1 <3,0,5,2>, <3,0,5,2> - 2556497206U, // <0,5,2,4>: Cost 3 vext1 <1,0,5,2>, RHS - 3020738564U, // <0,5,2,5>: Cost 3 vtrnl LHS, <5,5,5,5> - 4027877161U, // <0,5,2,6>: Cost 4 vzipr <0,2,0,2>, <2,4,5,6> - 3093220662U, // <0,5,2,7>: Cost 3 vtrnr <1,0,3,2>, RHS - 3093220663U, // <0,5,2,u>: Cost 3 vtrnr <1,0,3,2>, RHS - 3699656854U, // <0,5,3,0>: Cost 4 vext2 <1,4,0,5>, <3,0,1,2> - 3699656927U, // <0,5,3,1>: Cost 4 vext2 <1,4,0,5>, <3,1,0,3> - 3699657006U, // <0,5,3,2>: Cost 4 vext2 <1,4,0,5>, <3,2,0,1> - 3699657116U, // <0,5,3,3>: Cost 4 vext2 <1,4,0,5>, <3,3,3,3> - 2637859284U, // <0,5,3,4>: Cost 3 vext2 <3,4,0,5>, <3,4,0,5> - 3790319453U, // <0,5,3,5>: Cost 4 vext3 <5,3,5,0>, <5,3,5,0> - 3699657354U, // <0,5,3,6>: Cost 4 vext2 <1,4,0,5>, <3,6,2,7> - 2716725103U, // <0,5,3,7>: Cost 3 vext3 <5,3,7,0>, <5,3,7,0> - 2716798840U, // <0,5,3,u>: Cost 3 vext3 <5,3,u,0>, <5,3,u,0> - 2661747602U, // <0,5,4,0>: Cost 3 vext2 <7,4,0,5>, <4,0,5,1> - 3630252810U, // <0,5,4,1>: Cost 4 vext1 <1,0,5,4>, <1,0,5,4> - 3636225507U, // <0,5,4,2>: Cost 4 vext1 <2,0,5,4>, <2,0,5,4> - 3716910172U, // <0,5,4,3>: Cost 4 vext2 <4,3,0,5>, <4,3,0,5> - 3962195892U, // <0,5,4,4>: Cost 4 vzipl <0,4,1,5>, <5,4,5,6> - 2625916214U, // <0,5,4,5>: Cost 3 vext2 <1,4,0,5>, RHS - 3718901071U, // <0,5,4,6>: Cost 4 vext2 <4,6,0,5>, <4,6,0,5> - 2718715846U, // <0,5,4,7>: Cost 3 vext3 <5,6,7,0>, <5,4,7,6> - 2625916457U, // <0,5,4,u>: Cost 3 vext2 
<1,4,0,5>, RHS - 3791278034U, // <0,5,5,0>: Cost 4 vext3 <5,5,0,0>, <5,5,0,0> - 3791351771U, // <0,5,5,1>: Cost 4 vext3 <5,5,1,0>, <5,5,1,0> - 3318386260U, // <0,5,5,2>: Cost 4 vrev <5,0,2,5> - 3791499245U, // <0,5,5,3>: Cost 4 vext3 <5,5,3,0>, <5,5,3,0> - 3318533734U, // <0,5,5,4>: Cost 4 vrev <5,0,4,5> - 2718715908U, // <0,5,5,5>: Cost 3 vext3 <5,6,7,0>, <5,5,5,5> - 2657767522U, // <0,5,5,6>: Cost 3 vext2 <6,7,0,5>, <5,6,7,0> - 2718715928U, // <0,5,5,7>: Cost 3 vext3 <5,6,7,0>, <5,5,7,7> - 2718715937U, // <0,5,5,u>: Cost 3 vext3 <5,6,7,0>, <5,5,u,7> - 2592358502U, // <0,5,6,0>: Cost 3 vext1 <7,0,5,6>, LHS - 3792015404U, // <0,5,6,1>: Cost 4 vext3 <5,6,1,0>, <5,6,1,0> - 3731509754U, // <0,5,6,2>: Cost 4 vext2 <6,7,0,5>, <6,2,7,3> - 3785748546U, // <0,5,6,3>: Cost 4 vext3 <4,5,6,0>, <5,6,3,4> - 2592361782U, // <0,5,6,4>: Cost 3 vext1 <7,0,5,6>, RHS - 2592362594U, // <0,5,6,5>: Cost 3 vext1 <7,0,5,6>, <5,6,7,0> - 3785748576U, // <0,5,6,6>: Cost 4 vext3 <4,5,6,0>, <5,6,6,7> - 1644974178U, // <0,5,6,7>: Cost 2 vext3 <5,6,7,0>, <5,6,7,0> - 1645047915U, // <0,5,6,u>: Cost 2 vext3 <5,6,u,0>, <5,6,u,0> - 2562506854U, // <0,5,7,0>: Cost 3 vext1 <2,0,5,7>, LHS - 2562507670U, // <0,5,7,1>: Cost 3 vext1 <2,0,5,7>, <1,2,3,0> - 2562508262U, // <0,5,7,2>: Cost 3 vext1 <2,0,5,7>, <2,0,5,7> - 3636250774U, // <0,5,7,3>: Cost 4 vext1 <2,0,5,7>, <3,0,1,2> - 2562510134U, // <0,5,7,4>: Cost 3 vext1 <2,0,5,7>, RHS - 2718716072U, // <0,5,7,5>: Cost 3 vext3 <5,6,7,0>, <5,7,5,7> - 2718716074U, // <0,5,7,6>: Cost 3 vext3 <5,6,7,0>, <5,7,6,0> - 2719379635U, // <0,5,7,7>: Cost 3 vext3 <5,7,7,0>, <5,7,7,0> - 2562512686U, // <0,5,7,u>: Cost 3 vext1 <2,0,5,7>, LHS - 1500717158U, // <0,5,u,0>: Cost 2 vext1 <4,0,5,u>, LHS - 2625918766U, // <0,5,u,1>: Cost 3 vext2 <1,4,0,5>, LHS - 2719674583U, // <0,5,u,2>: Cost 3 vext3 <5,u,2,0>, <5,u,2,0> - 2568489152U, // <0,5,u,3>: Cost 3 vext1 <3,0,5,u>, <3,0,5,u> - 1500720025U, // <0,5,u,4>: Cost 2 vext1 <4,0,5,u>, <4,0,5,u> - 2625919130U, // <0,5,u,5>: Cost 3 vext2 <1,4,0,5>, RHS - 2586407243U, // <0,5,u,6>: Cost 3 vext1 <6,0,5,u>, <6,0,5,u> - 1646301444U, // <0,5,u,7>: Cost 2 vext3 <5,u,7,0>, <5,u,7,0> - 1646375181U, // <0,5,u,u>: Cost 2 vext3 <5,u,u,0>, <5,u,u,0> - 2586411110U, // <0,6,0,0>: Cost 3 vext1 <6,0,6,0>, LHS - 2619949158U, // <0,6,0,1>: Cost 3 vext2 <0,4,0,6>, LHS - 2619949220U, // <0,6,0,2>: Cost 3 vext2 <0,4,0,6>, <0,2,0,2> - 3785748789U, // <0,6,0,3>: Cost 4 vext3 <4,5,6,0>, <6,0,3,4> - 2619949386U, // <0,6,0,4>: Cost 3 vext2 <0,4,0,6>, <0,4,0,6> - 2586415202U, // <0,6,0,5>: Cost 3 vext1 <6,0,6,0>, <5,6,7,0> - 2586415436U, // <0,6,0,6>: Cost 3 vext1 <6,0,6,0>, <6,0,6,0> - 2952793398U, // <0,6,0,7>: Cost 3 vzipr <0,0,0,0>, RHS - 2619949725U, // <0,6,0,u>: Cost 3 vext2 <0,4,0,6>, LHS - 2562531430U, // <0,6,1,0>: Cost 3 vext1 <2,0,6,1>, LHS - 3693691700U, // <0,6,1,1>: Cost 4 vext2 <0,4,0,6>, <1,1,1,1> - 2886521338U, // <0,6,1,2>: Cost 3 vzipl LHS, <6,2,7,3> - 3693691864U, // <0,6,1,3>: Cost 4 vext2 <0,4,0,6>, <1,3,1,3> - 2562534710U, // <0,6,1,4>: Cost 3 vext1 <2,0,6,1>, RHS - 2580450932U, // <0,6,1,5>: Cost 3 vext1 <5,0,6,1>, <5,0,6,1> - 2886521656U, // <0,6,1,6>: Cost 3 vzipl LHS, <6,6,6,6> - 2966736182U, // <0,6,1,7>: Cost 3 vzipr <2,3,0,1>, RHS - 2966736183U, // <0,6,1,u>: Cost 3 vzipr <2,3,0,1>, RHS - 1500741734U, // <0,6,2,0>: Cost 2 vext1 <4,0,6,2>, LHS - 2250518817U, // <0,6,2,1>: Cost 3 vrev <6,0,1,2> - 2574485096U, // <0,6,2,2>: Cost 3 vext1 <4,0,6,2>, <2,2,2,2> - 2631894694U, // <0,6,2,3>: Cost 3 vext2 <2,4,0,6>, <2,3,0,1> - 1500744604U, // <0,6,2,4>: Cost 2 
vext1 <4,0,6,2>, <4,0,6,2> - 2574487248U, // <0,6,2,5>: Cost 3 vext1 <4,0,6,2>, <5,1,7,3> - 3020739384U, // <0,6,2,6>: Cost 3 vtrnl LHS, <6,6,6,6> - 2954136886U, // <0,6,2,7>: Cost 3 vzipr <0,2,0,2>, RHS - 1500747566U, // <0,6,2,u>: Cost 2 vext1 <4,0,6,2>, LHS - 3693693078U, // <0,6,3,0>: Cost 4 vext2 <0,4,0,6>, <3,0,1,2> - 3705637136U, // <0,6,3,1>: Cost 4 vext2 <2,4,0,6>, <3,1,5,7> - 3705637192U, // <0,6,3,2>: Cost 4 vext2 <2,4,0,6>, <3,2,3,0> - 3693693340U, // <0,6,3,3>: Cost 4 vext2 <0,4,0,6>, <3,3,3,3> - 2637867477U, // <0,6,3,4>: Cost 3 vext2 <3,4,0,6>, <3,4,0,6> - 3705637424U, // <0,6,3,5>: Cost 4 vext2 <2,4,0,6>, <3,5,1,7> - 3666154056U, // <0,6,3,6>: Cost 4 vext1 <7,0,6,3>, <6,3,7,0> - 2722697800U, // <0,6,3,7>: Cost 3 vext3 <6,3,7,0>, <6,3,7,0> - 2722771537U, // <0,6,3,u>: Cost 3 vext3 <6,3,u,0>, <6,3,u,0> - 2562556006U, // <0,6,4,0>: Cost 3 vext1 <2,0,6,4>, LHS - 4095316257U, // <0,6,4,1>: Cost 4 vtrnl <0,2,4,6>, <6,0,1,2> - 2562557420U, // <0,6,4,2>: Cost 3 vext1 <2,0,6,4>, <2,0,6,4> - 3636299926U, // <0,6,4,3>: Cost 4 vext1 <2,0,6,4>, <3,0,1,2> - 2562559286U, // <0,6,4,4>: Cost 3 vext1 <2,0,6,4>, RHS - 2619952438U, // <0,6,4,5>: Cost 3 vext2 <0,4,0,6>, RHS - 2723287696U, // <0,6,4,6>: Cost 3 vext3 <6,4,6,0>, <6,4,6,0> - 4027895094U, // <0,6,4,7>: Cost 4 vzipr <0,2,0,4>, RHS - 2619952681U, // <0,6,4,u>: Cost 3 vext2 <0,4,0,6>, RHS - 2718716594U, // <0,6,5,0>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7> - 3648250774U, // <0,6,5,1>: Cost 4 vext1 <4,0,6,5>, <1,2,3,0> - 3792458436U, // <0,6,5,2>: Cost 4 vext3 <5,6,7,0>, <6,5,2,7> - 3705638767U, // <0,6,5,3>: Cost 5 vext2 <2,4,0,6>, <5,3,7,0> - 3648252831U, // <0,6,5,4>: Cost 4 vext1 <4,0,6,5>, <4,0,6,5> - 3797619416U, // <0,6,5,5>: Cost 4 vext3 <6,5,5,0>, <6,5,5,0> - 3792458472U, // <0,6,5,6>: Cost 4 vext3 <5,6,7,0>, <6,5,6,7> - 4035202358U, // <0,6,5,7>: Cost 4 vzipr <1,4,0,5>, RHS - 2718716594U, // <0,6,5,u>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7> - 3786412796U, // <0,6,6,0>: Cost 4 vext3 <4,6,6,0>, <6,6,0,0> - 3792458504U, // <0,6,6,1>: Cost 4 vext3 <5,6,7,0>, <6,6,1,3> - 3728200126U, // <0,6,6,2>: Cost 4 vext2 <6,2,0,6>, <6,2,0,6> - 3798135575U, // <0,6,6,3>: Cost 4 vext3 <6,6,3,0>, <6,6,3,0> - 3786412836U, // <0,6,6,4>: Cost 4 vext3 <4,6,6,0>, <6,6,4,4> - 3792458543U, // <0,6,6,5>: Cost 4 vext3 <5,6,7,0>, <6,6,5,6> - 2718716728U, // <0,6,6,6>: Cost 3 vext3 <5,6,7,0>, <6,6,6,6> - 2718716738U, // <0,6,6,7>: Cost 3 vext3 <5,6,7,0>, <6,6,7,7> - 2718716747U, // <0,6,6,u>: Cost 3 vext3 <5,6,7,0>, <6,6,u,7> - 2718716750U, // <0,6,7,0>: Cost 3 vext3 <5,6,7,0>, <6,7,0,1> - 2724909910U, // <0,6,7,1>: Cost 3 vext3 <6,7,1,0>, <6,7,1,0> - 3636323823U, // <0,6,7,2>: Cost 4 vext1 <2,0,6,7>, <2,0,6,7> - 2725057384U, // <0,6,7,3>: Cost 3 vext3 <6,7,3,0>, <6,7,3,0> - 2718716790U, // <0,6,7,4>: Cost 3 vext3 <5,6,7,0>, <6,7,4,5> - 2718716800U, // <0,6,7,5>: Cost 3 vext3 <5,6,7,0>, <6,7,5,6> - 3792458629U, // <0,6,7,6>: Cost 4 vext3 <5,6,7,0>, <6,7,6,2> - 2725352332U, // <0,6,7,7>: Cost 3 vext3 <6,7,7,0>, <6,7,7,0> - 2718716822U, // <0,6,7,u>: Cost 3 vext3 <5,6,7,0>, <6,7,u,1> - 1500790886U, // <0,6,u,0>: Cost 2 vext1 <4,0,6,u>, LHS - 2619954990U, // <0,6,u,1>: Cost 3 vext2 <0,4,0,6>, LHS - 2562590192U, // <0,6,u,2>: Cost 3 vext1 <2,0,6,u>, <2,0,6,u> - 2725721017U, // <0,6,u,3>: Cost 3 vext3 <6,u,3,0>, <6,u,3,0> - 1500793762U, // <0,6,u,4>: Cost 2 vext1 <4,0,6,u>, <4,0,6,u> - 2619955354U, // <0,6,u,5>: Cost 3 vext2 <0,4,0,6>, RHS - 2725942228U, // <0,6,u,6>: Cost 3 vext3 <6,u,6,0>, <6,u,6,0> - 2954186038U, // <0,6,u,7>: Cost 3 vzipr <0,2,0,u>, RHS - 1500796718U, 
// <0,6,u,u>: Cost 2 vext1 <4,0,6,u>, LHS - 2256401391U, // <0,7,0,0>: Cost 3 vrev <7,0,0,0> - 2632564838U, // <0,7,0,1>: Cost 3 vext2 <2,5,0,7>, LHS - 2256548865U, // <0,7,0,2>: Cost 3 vrev <7,0,2,0> - 3700998396U, // <0,7,0,3>: Cost 4 vext2 <1,6,0,7>, <0,3,1,0> - 2718716952U, // <0,7,0,4>: Cost 3 vext3 <5,6,7,0>, <7,0,4,5> - 2718716962U, // <0,7,0,5>: Cost 3 vext3 <5,6,7,0>, <7,0,5,6> - 2621284845U, // <0,7,0,6>: Cost 3 vext2 <0,6,0,7>, <0,6,0,7> - 3904685542U, // <0,7,0,7>: Cost 4 vuzpr <2,0,5,7>, <2,0,5,7> - 2632565405U, // <0,7,0,u>: Cost 3 vext2 <2,5,0,7>, LHS - 2256409584U, // <0,7,1,0>: Cost 3 vrev <7,0,0,1> - 3706307380U, // <0,7,1,1>: Cost 4 vext2 <2,5,0,7>, <1,1,1,1> - 2632565654U, // <0,7,1,2>: Cost 3 vext2 <2,5,0,7>, <1,2,3,0> - 3769603168U, // <0,7,1,3>: Cost 4 vext3 <1,u,3,0>, <7,1,3,5> - 2256704532U, // <0,7,1,4>: Cost 3 vrev <7,0,4,1> - 3769603184U, // <0,7,1,5>: Cost 4 vext3 <1,u,3,0>, <7,1,5,3> - 3700999366U, // <0,7,1,6>: Cost 4 vext2 <1,6,0,7>, <1,6,0,7> - 2886522476U, // <0,7,1,7>: Cost 3 vzipl LHS, <7,7,7,7> - 2256999480U, // <0,7,1,u>: Cost 3 vrev <7,0,u,1> - 2586501222U, // <0,7,2,0>: Cost 3 vext1 <6,0,7,2>, LHS - 1182749690U, // <0,7,2,1>: Cost 2 vrev <7,0,1,2> - 3636356595U, // <0,7,2,2>: Cost 4 vext1 <2,0,7,2>, <2,0,7,2> - 2727711916U, // <0,7,2,3>: Cost 3 vext3 <7,2,3,0>, <7,2,3,0> - 2586504502U, // <0,7,2,4>: Cost 3 vext1 <6,0,7,2>, RHS - 2632566606U, // <0,7,2,5>: Cost 3 vext2 <2,5,0,7>, <2,5,0,7> - 2586505559U, // <0,7,2,6>: Cost 3 vext1 <6,0,7,2>, <6,0,7,2> - 3020740204U, // <0,7,2,7>: Cost 3 vtrnl LHS, <7,7,7,7> - 1183265849U, // <0,7,2,u>: Cost 2 vrev <7,0,u,2> - 3701000342U, // <0,7,3,0>: Cost 4 vext2 <1,6,0,7>, <3,0,1,2> - 3706308849U, // <0,7,3,1>: Cost 4 vext2 <2,5,0,7>, <3,1,2,3> - 3330315268U, // <0,7,3,2>: Cost 4 vrev <7,0,2,3> - 3706309020U, // <0,7,3,3>: Cost 4 vext2 <2,5,0,7>, <3,3,3,3> - 3706309122U, // <0,7,3,4>: Cost 4 vext2 <2,5,0,7>, <3,4,5,6> - 3712281127U, // <0,7,3,5>: Cost 4 vext2 <3,5,0,7>, <3,5,0,7> - 2639202936U, // <0,7,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7> - 3802412321U, // <0,7,3,7>: Cost 4 vext3 <7,3,7,0>, <7,3,7,0> - 2640530202U, // <0,7,3,u>: Cost 3 vext2 <3,u,0,7>, <3,u,0,7> - 3654287462U, // <0,7,4,0>: Cost 4 vext1 <5,0,7,4>, LHS - 2256507900U, // <0,7,4,1>: Cost 3 vrev <7,0,1,4> - 2256581637U, // <0,7,4,2>: Cost 3 vrev <7,0,2,4> - 3660262008U, // <0,7,4,3>: Cost 4 vext1 <6,0,7,4>, <3,6,0,7> - 3786413405U, // <0,7,4,4>: Cost 4 vext3 <4,6,6,0>, <7,4,4,6> - 2632568118U, // <0,7,4,5>: Cost 3 vext2 <2,5,0,7>, RHS - 3718917457U, // <0,7,4,6>: Cost 4 vext2 <4,6,0,7>, <4,6,0,7> - 3787003255U, // <0,7,4,7>: Cost 4 vext3 <4,7,5,0>, <7,4,7,5> - 2632568361U, // <0,7,4,u>: Cost 3 vext2 <2,5,0,7>, RHS - 3706310268U, // <0,7,5,0>: Cost 4 vext2 <2,5,0,7>, <5,0,7,0> - 3792459156U, // <0,7,5,1>: Cost 4 vext3 <5,6,7,0>, <7,5,1,7> - 3330331654U, // <0,7,5,2>: Cost 4 vrev <7,0,2,5> - 3722899255U, // <0,7,5,3>: Cost 4 vext2 <5,3,0,7>, <5,3,0,7> - 2256737304U, // <0,7,5,4>: Cost 3 vrev <7,0,4,5> - 3724226521U, // <0,7,5,5>: Cost 4 vext2 <5,5,0,7>, <5,5,0,7> - 2718717377U, // <0,7,5,6>: Cost 3 vext3 <5,6,7,0>, <7,5,6,7> - 2729997763U, // <0,7,5,7>: Cost 3 vext3 <7,5,7,0>, <7,5,7,0> - 2720044499U, // <0,7,5,u>: Cost 3 vext3 <5,u,7,0>, <7,5,u,7> - 3712946517U, // <0,7,6,0>: Cost 4 vext2 <3,6,0,7>, <6,0,7,0> - 2256524286U, // <0,7,6,1>: Cost 3 vrev <7,0,1,6> - 3792459246U, // <0,7,6,2>: Cost 4 vext3 <5,6,7,0>, <7,6,2,7> - 3796440567U, // <0,7,6,3>: Cost 4 vext3 <6,3,7,0>, <7,6,3,7> - 3654307126U, // <0,7,6,4>: Cost 4 vext1 <5,0,7,6>, RHS - 
2656457394U, // <0,7,6,5>: Cost 3 vext2 <6,5,0,7>, <6,5,0,7> - 3792459281U, // <0,7,6,6>: Cost 4 vext3 <5,6,7,0>, <7,6,6,6> - 2730661396U, // <0,7,6,7>: Cost 3 vext3 <7,6,7,0>, <7,6,7,0> - 2658448293U, // <0,7,6,u>: Cost 3 vext2 <6,u,0,7>, <6,u,0,7> - 3787003431U, // <0,7,7,0>: Cost 4 vext3 <4,7,5,0>, <7,7,0,1> - 3654312854U, // <0,7,7,1>: Cost 4 vext1 <5,0,7,7>, <1,2,3,0> - 3654313446U, // <0,7,7,2>: Cost 4 vext1 <5,0,7,7>, <2,0,5,7> - 3804771905U, // <0,7,7,3>: Cost 4 vext3 <7,7,3,0>, <7,7,3,0> - 3654315318U, // <0,7,7,4>: Cost 4 vext1 <5,0,7,7>, RHS - 3654315651U, // <0,7,7,5>: Cost 4 vext1 <5,0,7,7>, <5,0,7,7> - 3660288348U, // <0,7,7,6>: Cost 4 vext1 <6,0,7,7>, <6,0,7,7> - 2718717548U, // <0,7,7,7>: Cost 3 vext3 <5,6,7,0>, <7,7,7,7> - 2664420990U, // <0,7,7,u>: Cost 3 vext2 <7,u,0,7>, <7,u,0,7> - 2256466935U, // <0,7,u,0>: Cost 3 vrev <7,0,0,u> - 1182798848U, // <0,7,u,1>: Cost 2 vrev <7,0,1,u> - 2256614409U, // <0,7,u,2>: Cost 3 vrev <7,0,2,u> - 2731693714U, // <0,7,u,3>: Cost 3 vext3 <7,u,3,0>, <7,u,3,0> - 2256761883U, // <0,7,u,4>: Cost 3 vrev <7,0,4,u> - 2632571034U, // <0,7,u,5>: Cost 3 vext2 <2,5,0,7>, RHS - 2669066421U, // <0,7,u,6>: Cost 3 vext2 , - 2731988662U, // <0,7,u,7>: Cost 3 vext3 <7,u,7,0>, <7,u,7,0> - 1183315007U, // <0,7,u,u>: Cost 2 vrev <7,0,u,u> - 135053414U, // <0,u,0,0>: Cost 1 vdup0 LHS - 1544896614U, // <0,u,0,1>: Cost 2 vext2 <0,2,0,u>, LHS - 1678999654U, // <0,u,0,2>: Cost 2 vuzpl LHS, LHS - 2691880677U, // <0,u,0,3>: Cost 3 vext3 <1,2,3,0>, - 1476988214U, // <0,u,0,4>: Cost 2 vext1 <0,0,u,0>, RHS - 2718791419U, // <0,u,0,5>: Cost 3 vext3 <5,6,u,0>, - 3021248666U, // <0,u,0,6>: Cost 3 vtrnl <0,2,0,2>, RHS - 2592535607U, // <0,u,0,7>: Cost 3 vext1 <7,0,u,0>, <7,0,u,0> - 135053414U, // <0,u,0,u>: Cost 1 vdup0 LHS - 1476993097U, // <0,u,1,0>: Cost 2 vext1 <0,0,u,1>, <0,0,u,1> - 1812780846U, // <0,u,1,1>: Cost 2 vzipl LHS, LHS - 1618138926U, // <0,u,1,2>: Cost 2 vext3 <1,2,3,0>, LHS - 2752742134U, // <0,u,1,3>: Cost 3 vuzpl LHS, <1,0,3,2> - 1476996406U, // <0,u,1,4>: Cost 2 vext1 <0,0,u,1>, RHS - 1812781210U, // <0,u,1,5>: Cost 2 vzipl LHS, RHS - 2887006416U, // <0,u,1,6>: Cost 3 vzipl LHS, - 2966736200U, // <0,u,1,7>: Cost 3 vzipr <2,3,0,1>, RHS - 1812781413U, // <0,u,1,u>: Cost 2 vzipl LHS, LHS - 1482973286U, // <0,u,2,0>: Cost 2 vext1 <1,0,u,2>, LHS - 1482973987U, // <0,u,2,1>: Cost 2 vext1 <1,0,u,2>, <1,0,u,2> - 1946998574U, // <0,u,2,2>: Cost 2 vtrnl LHS, LHS - 835584U, // <0,u,2,3>: Cost 0 copy LHS - 1482976566U, // <0,u,2,4>: Cost 2 vext1 <1,0,u,2>, RHS - 3020781631U, // <0,u,2,5>: Cost 3 vtrnl LHS, - 1946998938U, // <0,u,2,6>: Cost 2 vtrnl LHS, RHS - 1518810169U, // <0,u,2,7>: Cost 2 vext1 <7,0,u,2>, <7,0,u,2> - 835584U, // <0,u,2,u>: Cost 0 copy LHS - 2618640534U, // <0,u,3,0>: Cost 3 vext2 <0,2,0,u>, <3,0,1,2> - 2752743574U, // <0,u,3,1>: Cost 3 vuzpl LHS, <3,0,1,2> - 2636556597U, // <0,u,3,2>: Cost 3 vext2 <3,2,0,u>, <3,2,0,u> - 2752743836U, // <0,u,3,3>: Cost 3 vuzpl LHS, <3,3,3,3> - 2618640898U, // <0,u,3,4>: Cost 3 vext2 <0,2,0,u>, <3,4,5,6> - 2752743938U, // <0,u,3,5>: Cost 3 vuzpl LHS, <3,4,5,6> - 2639202936U, // <0,u,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7> - 2639874762U, // <0,u,3,7>: Cost 3 vext2 <3,7,0,u>, <3,7,0,u> - 2752743637U, // <0,u,3,u>: Cost 3 vuzpl LHS, <3,0,u,2> - 2562703462U, // <0,u,4,0>: Cost 3 vext1 <2,0,u,4>, LHS - 2888455982U, // <0,u,4,1>: Cost 3 vzipl <0,4,1,5>, LHS - 3021575982U, // <0,u,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS - 2568677591U, // <0,u,4,3>: Cost 3 vext1 <3,0,u,4>, <3,0,u,4> - 2562706742U, // <0,u,4,4>: Cost 3 vext1 
<2,0,u,4>, RHS - 1544899894U, // <0,u,4,5>: Cost 2 vext2 <0,2,0,u>, RHS - 1679002934U, // <0,u,4,6>: Cost 2 vuzpl LHS, RHS - 2718718033U, // <0,u,4,7>: Cost 3 vext3 <5,6,7,0>, - 1679002952U, // <0,u,4,u>: Cost 2 vuzpl LHS, RHS - 2568683622U, // <0,u,5,0>: Cost 3 vext1 <3,0,u,5>, LHS - 2568684438U, // <0,u,5,1>: Cost 3 vext1 <3,0,u,5>, <1,2,3,0> - 3765622902U, // <0,u,5,2>: Cost 4 vext3 <1,2,3,0>, - 2691881087U, // <0,u,5,3>: Cost 3 vext3 <1,2,3,0>, - 2568686902U, // <0,u,5,4>: Cost 3 vext1 <3,0,u,5>, RHS - 2650492890U, // <0,u,5,5>: Cost 3 vext2 <5,5,0,u>, <5,5,0,u> - 1618139290U, // <0,u,5,6>: Cost 2 vext3 <1,2,3,0>, RHS - 2824834358U, // <0,u,5,7>: Cost 3 vuzpr <1,0,3,u>, RHS - 1618139308U, // <0,u,5,u>: Cost 2 vext3 <1,2,3,0>, RHS - 2592579686U, // <0,u,6,0>: Cost 3 vext1 <7,0,u,6>, LHS - 2262496983U, // <0,u,6,1>: Cost 3 vrev - 2654474688U, // <0,u,6,2>: Cost 3 vext2 <6,2,0,u>, <6,2,0,u> - 2691881168U, // <0,u,6,3>: Cost 3 vext3 <1,2,3,0>, - 2592582966U, // <0,u,6,4>: Cost 3 vext1 <7,0,u,6>, RHS - 2656465587U, // <0,u,6,5>: Cost 3 vext2 <6,5,0,u>, <6,5,0,u> - 2657129220U, // <0,u,6,6>: Cost 3 vext2 <6,6,0,u>, <6,6,0,u> - 1584051029U, // <0,u,6,7>: Cost 2 vext2 <6,7,0,u>, <6,7,0,u> - 1584714662U, // <0,u,6,u>: Cost 2 vext2 <6,u,0,u>, <6,u,0,u> - 2562728038U, // <0,u,7,0>: Cost 3 vext1 <2,0,u,7>, LHS - 2562728854U, // <0,u,7,1>: Cost 3 vext1 <2,0,u,7>, <1,2,3,0> - 2562729473U, // <0,u,7,2>: Cost 3 vext1 <2,0,u,7>, <2,0,u,7> - 2661111018U, // <0,u,7,3>: Cost 3 vext2 <7,3,0,u>, <7,3,0,u> - 2562731318U, // <0,u,7,4>: Cost 3 vext1 <2,0,u,7>, RHS - 2718718258U, // <0,u,7,5>: Cost 3 vext3 <5,6,7,0>, - 2586620261U, // <0,u,7,6>: Cost 3 vext1 <6,0,u,7>, <6,0,u,7> - 2657793644U, // <0,u,7,7>: Cost 3 vext2 <6,7,0,u>, <7,7,7,7> - 2562733870U, // <0,u,7,u>: Cost 3 vext1 <2,0,u,7>, LHS - 135053414U, // <0,u,u,0>: Cost 1 vdup0 LHS - 1544902446U, // <0,u,u,1>: Cost 2 vext2 <0,2,0,u>, LHS - 1679005486U, // <0,u,u,2>: Cost 2 vuzpl LHS, LHS - 835584U, // <0,u,u,3>: Cost 0 copy LHS - 1483025718U, // <0,u,u,4>: Cost 2 vext1 <1,0,u,u>, RHS - 1544902810U, // <0,u,u,5>: Cost 2 vext2 <0,2,0,u>, RHS - 1679005850U, // <0,u,u,6>: Cost 2 vuzpl LHS, RHS - 1518859327U, // <0,u,u,7>: Cost 2 vext1 <7,0,u,u>, <7,0,u,u> - 835584U, // <0,u,u,u>: Cost 0 copy LHS - 2689744896U, // <1,0,0,0>: Cost 3 vext3 <0,u,1,1>, <0,0,0,0> - 1610694666U, // <1,0,0,1>: Cost 2 vext3 <0,0,1,1>, <0,0,1,1> - 2689744916U, // <1,0,0,2>: Cost 3 vext3 <0,u,1,1>, <0,0,2,2> - 2619310332U, // <1,0,0,3>: Cost 3 vext2 <0,3,1,0>, <0,3,1,0> - 2684657701U, // <1,0,0,4>: Cost 3 vext3 <0,0,4,1>, <0,0,4,1> - 2620637598U, // <1,0,0,5>: Cost 3 vext2 <0,5,1,0>, <0,5,1,0> - 3708977654U, // <1,0,0,6>: Cost 4 vext2 <3,0,1,0>, <0,6,1,7> - 3666351168U, // <1,0,0,7>: Cost 4 vext1 <7,1,0,0>, <7,1,0,0> - 1611210825U, // <1,0,0,u>: Cost 2 vext3 <0,0,u,1>, <0,0,u,1> - 2556780646U, // <1,0,1,0>: Cost 3 vext1 <1,1,0,1>, LHS - 2556781355U, // <1,0,1,1>: Cost 3 vext1 <1,1,0,1>, <1,1,0,1> - 1616003174U, // <1,0,1,2>: Cost 2 vext3 <0,u,1,1>, LHS - 3693052888U, // <1,0,1,3>: Cost 4 vext2 <0,3,1,0>, <1,3,1,3> - 2556783926U, // <1,0,1,4>: Cost 3 vext1 <1,1,0,1>, RHS - 2580672143U, // <1,0,1,5>: Cost 3 vext1 <5,1,0,1>, <5,1,0,1> - 2724839566U, // <1,0,1,6>: Cost 3 vext3 <6,7,0,1>, <0,1,6,7> - 3654415354U, // <1,0,1,7>: Cost 4 vext1 <5,1,0,1>, <7,0,1,2> - 1616003228U, // <1,0,1,u>: Cost 2 vext3 <0,u,1,1>, LHS - 2685690019U, // <1,0,2,0>: Cost 3 vext3 <0,2,0,1>, <0,2,0,1> - 2685763756U, // <1,0,2,1>: Cost 3 vext3 <0,2,1,1>, <0,2,1,1> - 2698297524U, // <1,0,2,2>: Cost 3 vext3 
<2,3,0,1>, <0,2,2,0> - 2685911230U, // <1,0,2,3>: Cost 3 vext3 <0,2,3,1>, <0,2,3,1> - 2689745100U, // <1,0,2,4>: Cost 3 vext3 <0,u,1,1>, <0,2,4,6> - 3764814038U, // <1,0,2,5>: Cost 4 vext3 <1,1,1,1>, <0,2,5,7> - 2724839640U, // <1,0,2,6>: Cost 3 vext3 <6,7,0,1>, <0,2,6,0> - 2592625658U, // <1,0,2,7>: Cost 3 vext1 <7,1,0,2>, <7,0,1,2> - 2686279915U, // <1,0,2,u>: Cost 3 vext3 <0,2,u,1>, <0,2,u,1> - 3087843328U, // <1,0,3,0>: Cost 3 vtrnr LHS, <0,0,0,0> - 3087843338U, // <1,0,3,1>: Cost 3 vtrnr LHS, <0,0,1,1> - 67944550U, // <1,0,3,2>: Cost 1 vrev LHS - 2568743135U, // <1,0,3,3>: Cost 3 vext1 <3,1,0,3>, <3,1,0,3> - 2562772278U, // <1,0,3,4>: Cost 3 vext1 <2,1,0,3>, RHS - 4099850454U, // <1,0,3,5>: Cost 4 vtrnl <1,0,3,2>, <0,2,5,7> - 3704998538U, // <1,0,3,6>: Cost 4 vext2 <2,3,1,0>, <3,6,2,7> - 2592633923U, // <1,0,3,7>: Cost 3 vext1 <7,1,0,3>, <7,1,0,3> - 68386972U, // <1,0,3,u>: Cost 1 vrev LHS - 2620640146U, // <1,0,4,0>: Cost 3 vext2 <0,5,1,0>, <4,0,5,1> - 2689745234U, // <1,0,4,1>: Cost 3 vext3 <0,u,1,1>, <0,4,1,5> - 2689745244U, // <1,0,4,2>: Cost 3 vext3 <0,u,1,1>, <0,4,2,6> - 3760980320U, // <1,0,4,3>: Cost 4 vext3 <0,4,3,1>, <0,4,3,1> - 3761054057U, // <1,0,4,4>: Cost 4 vext3 <0,4,4,1>, <0,4,4,1> - 2619313462U, // <1,0,4,5>: Cost 3 vext2 <0,3,1,0>, RHS - 3761201531U, // <1,0,4,6>: Cost 4 vext3 <0,4,6,1>, <0,4,6,1> - 3666383940U, // <1,0,4,7>: Cost 4 vext1 <7,1,0,4>, <7,1,0,4> - 2619313705U, // <1,0,4,u>: Cost 3 vext2 <0,3,1,0>, RHS - 4029300736U, // <1,0,5,0>: Cost 4 vzipr <0,4,1,5>, <0,0,0,0> - 2895249510U, // <1,0,5,1>: Cost 3 vzipl <1,5,3,7>, LHS - 3028287590U, // <1,0,5,2>: Cost 3 vtrnl <1,3,5,7>, LHS - 3642501345U, // <1,0,5,3>: Cost 4 vext1 <3,1,0,5>, <3,1,0,5> - 2215592058U, // <1,0,5,4>: Cost 3 vrev <0,1,4,5> - 3724242907U, // <1,0,5,5>: Cost 4 vext2 <5,5,1,0>, <5,5,1,0> - 3724906540U, // <1,0,5,6>: Cost 4 vext2 <5,6,1,0>, <5,6,1,0> - 3911118134U, // <1,0,5,7>: Cost 4 vuzpr <3,1,3,0>, RHS - 3028287644U, // <1,0,5,u>: Cost 3 vtrnl <1,3,5,7>, LHS - 3762086375U, // <1,0,6,0>: Cost 4 vext3 <0,6,0,1>, <0,6,0,1> - 2698297846U, // <1,0,6,1>: Cost 3 vext3 <2,3,0,1>, <0,6,1,7> - 3760022015U, // <1,0,6,2>: Cost 4 vext3 <0,2,u,1>, <0,6,2,7> - 3642509538U, // <1,0,6,3>: Cost 4 vext1 <3,1,0,6>, <3,1,0,6> - 3762381323U, // <1,0,6,4>: Cost 4 vext3 <0,6,4,1>, <0,6,4,1> - 3730215604U, // <1,0,6,5>: Cost 4 vext2 <6,5,1,0>, <6,5,1,0> - 3730879237U, // <1,0,6,6>: Cost 4 vext2 <6,6,1,0>, <6,6,1,0> - 2657801046U, // <1,0,6,7>: Cost 3 vext2 <6,7,1,0>, <6,7,1,0> - 2658464679U, // <1,0,6,u>: Cost 3 vext2 <6,u,1,0>, <6,u,1,0> - 2659128312U, // <1,0,7,0>: Cost 3 vext2 <7,0,1,0>, <7,0,1,0> - 4047898278U, // <1,0,7,1>: Cost 4 vzipr <3,5,1,7>, <2,3,0,1> - 2215460970U, // <1,0,7,2>: Cost 3 vrev <0,1,2,7> - 3734861035U, // <1,0,7,3>: Cost 4 vext2 <7,3,1,0>, <7,3,1,0> - 3731543398U, // <1,0,7,4>: Cost 4 vext2 <6,7,1,0>, <7,4,5,6> - 3736188301U, // <1,0,7,5>: Cost 4 vext2 <7,5,1,0>, <7,5,1,0> - 2663110110U, // <1,0,7,6>: Cost 3 vext2 <7,6,1,0>, <7,6,1,0> - 3731543660U, // <1,0,7,7>: Cost 4 vext2 <6,7,1,0>, <7,7,7,7> - 2664437376U, // <1,0,7,u>: Cost 3 vext2 <7,u,1,0>, <7,u,1,0> - 3087884288U, // <1,0,u,0>: Cost 3 vtrnr LHS, <0,0,0,0> - 1616003730U, // <1,0,u,1>: Cost 2 vext3 <0,u,1,1>, <0,u,1,1> - 67985515U, // <1,0,u,2>: Cost 1 vrev LHS - 2689893028U, // <1,0,u,3>: Cost 3 vext3 <0,u,3,1>, <0,u,3,1> - 2689745586U, // <1,0,u,4>: Cost 3 vext3 <0,u,1,1>, <0,u,4,6> - 2619316378U, // <1,0,u,5>: Cost 3 vext2 <0,3,1,0>, RHS - 2669082807U, // <1,0,u,6>: Cost 3 vext2 , - 2592674888U, // <1,0,u,7>: Cost 3 vext1 
<7,1,0,u>, <7,1,0,u> - 68427937U, // <1,0,u,u>: Cost 1 vrev LHS - 1543585802U, // <1,1,0,0>: Cost 2 vext2 <0,0,1,1>, <0,0,1,1> - 1548894310U, // <1,1,0,1>: Cost 2 vext2 <0,u,1,1>, LHS - 2618654892U, // <1,1,0,2>: Cost 3 vext2 <0,2,1,1>, <0,2,1,1> - 2689745654U, // <1,1,0,3>: Cost 3 vext3 <0,u,1,1>, <1,0,3,2> - 2622636370U, // <1,1,0,4>: Cost 3 vext2 <0,u,1,1>, <0,4,1,5> - 2620645791U, // <1,1,0,5>: Cost 3 vext2 <0,5,1,1>, <0,5,1,1> - 3696378367U, // <1,1,0,6>: Cost 4 vext2 <0,u,1,1>, <0,6,2,7> - 3666424905U, // <1,1,0,7>: Cost 4 vext1 <7,1,1,0>, <7,1,1,0> - 1548894866U, // <1,1,0,u>: Cost 2 vext2 <0,u,1,1>, <0,u,1,1> - 1483112550U, // <1,1,1,0>: Cost 2 vext1 <1,1,1,1>, LHS - 202162278U, // <1,1,1,1>: Cost 1 vdup1 LHS - 2622636950U, // <1,1,1,2>: Cost 3 vext2 <0,u,1,1>, <1,2,3,0> - 2622637016U, // <1,1,1,3>: Cost 3 vext2 <0,u,1,1>, <1,3,1,3> - 1483115830U, // <1,1,1,4>: Cost 2 vext1 <1,1,1,1>, RHS - 2622637200U, // <1,1,1,5>: Cost 3 vext2 <0,u,1,1>, <1,5,3,7> - 2622637263U, // <1,1,1,6>: Cost 3 vext2 <0,u,1,1>, <1,6,1,7> - 2592691274U, // <1,1,1,7>: Cost 3 vext1 <7,1,1,1>, <7,1,1,1> - 202162278U, // <1,1,1,u>: Cost 1 vdup1 LHS - 2550890588U, // <1,1,2,0>: Cost 3 vext1 <0,1,1,2>, <0,1,1,2> - 2617329183U, // <1,1,2,1>: Cost 3 vext2 <0,0,1,1>, <2,1,3,1> - 2622637672U, // <1,1,2,2>: Cost 3 vext2 <0,u,1,1>, <2,2,2,2> - 2622637734U, // <1,1,2,3>: Cost 3 vext2 <0,u,1,1>, <2,3,0,1> - 2550893878U, // <1,1,2,4>: Cost 3 vext1 <0,1,1,2>, RHS - 3696379744U, // <1,1,2,5>: Cost 4 vext2 <0,u,1,1>, <2,5,2,7> - 2622638010U, // <1,1,2,6>: Cost 3 vext2 <0,u,1,1>, <2,6,3,7> - 3804554170U, // <1,1,2,7>: Cost 4 vext3 <7,7,0,1>, <1,2,7,0> - 2622638139U, // <1,1,2,u>: Cost 3 vext2 <0,u,1,1>, <2,u,0,1> - 2622638230U, // <1,1,3,0>: Cost 3 vext2 <0,u,1,1>, <3,0,1,2> - 3087844148U, // <1,1,3,1>: Cost 3 vtrnr LHS, <1,1,1,1> - 4161585244U, // <1,1,3,2>: Cost 4 vtrnr LHS, <0,1,1,2> - 2014101606U, // <1,1,3,3>: Cost 2 vtrnr LHS, LHS - 2622638594U, // <1,1,3,4>: Cost 3 vext2 <0,u,1,1>, <3,4,5,6> - 2689745920U, // <1,1,3,5>: Cost 3 vext3 <0,u,1,1>, <1,3,5,7> - 3763487753U, // <1,1,3,6>: Cost 4 vext3 <0,u,1,1>, <1,3,6,7> - 2592707660U, // <1,1,3,7>: Cost 3 vext1 <7,1,1,3>, <7,1,1,3> - 2014101611U, // <1,1,3,u>: Cost 2 vtrnr LHS, LHS - 2556878950U, // <1,1,4,0>: Cost 3 vext1 <1,1,1,4>, LHS - 2221335351U, // <1,1,4,1>: Cost 3 vrev <1,1,1,4> - 3696380988U, // <1,1,4,2>: Cost 4 vext2 <0,u,1,1>, <4,2,6,0> - 3763487805U, // <1,1,4,3>: Cost 4 vext3 <0,u,1,1>, <1,4,3,5> - 2556882230U, // <1,1,4,4>: Cost 3 vext1 <1,1,1,4>, RHS - 1548897590U, // <1,1,4,5>: Cost 2 vext2 <0,u,1,1>, RHS - 2758184246U, // <1,1,4,6>: Cost 3 vuzpl <1,1,1,1>, RHS - 3666457677U, // <1,1,4,7>: Cost 4 vext1 <7,1,1,4>, <7,1,1,4> - 1548897833U, // <1,1,4,u>: Cost 2 vext2 <0,u,1,1>, RHS - 2693653615U, // <1,1,5,0>: Cost 3 vext3 <1,5,0,1>, <1,5,0,1> - 2617331408U, // <1,1,5,1>: Cost 3 vext2 <0,0,1,1>, <5,1,7,3> - 4029302934U, // <1,1,5,2>: Cost 4 vzipr <0,4,1,5>, <3,0,1,2> - 2689746064U, // <1,1,5,3>: Cost 3 vext3 <0,u,1,1>, <1,5,3,7> - 2221564755U, // <1,1,5,4>: Cost 3 vrev <1,1,4,5> - 2955559250U, // <1,1,5,5>: Cost 3 vzipr <0,4,1,5>, <0,4,1,5> - 2617331810U, // <1,1,5,6>: Cost 3 vext2 <0,0,1,1>, <5,6,7,0> - 2825293110U, // <1,1,5,7>: Cost 3 vuzpr <1,1,1,1>, RHS - 2689746109U, // <1,1,5,u>: Cost 3 vext3 <0,u,1,1>, <1,5,u,7> - 3696382241U, // <1,1,6,0>: Cost 4 vext2 <0,u,1,1>, <6,0,1,2> - 2689746127U, // <1,1,6,1>: Cost 3 vext3 <0,u,1,1>, <1,6,1,7> - 2617332218U, // <1,1,6,2>: Cost 3 vext2 <0,0,1,1>, <6,2,7,3> - 3763487969U, // <1,1,6,3>: Cost 4 vext3 <0,u,1,1>, 
<1,6,3,7> - 3696382605U, // <1,1,6,4>: Cost 4 vext2 <0,u,1,1>, <6,4,5,6> - 4029309266U, // <1,1,6,5>: Cost 4 vzipr <0,4,1,6>, <0,4,1,5> - 2617332536U, // <1,1,6,6>: Cost 3 vext2 <0,0,1,1>, <6,6,6,6> - 2724840702U, // <1,1,6,7>: Cost 3 vext3 <6,7,0,1>, <1,6,7,0> - 2725504263U, // <1,1,6,u>: Cost 3 vext3 <6,u,0,1>, <1,6,u,0> - 2617332720U, // <1,1,7,0>: Cost 3 vext2 <0,0,1,1>, <7,0,0,1> - 2659800138U, // <1,1,7,1>: Cost 3 vext2 <7,1,1,1>, <7,1,1,1> - 3691074717U, // <1,1,7,2>: Cost 4 vext2 <0,0,1,1>, <7,2,1,3> - 4167811174U, // <1,1,7,3>: Cost 4 vtrnr <1,1,5,7>, LHS - 2617333094U, // <1,1,7,4>: Cost 3 vext2 <0,0,1,1>, <7,4,5,6> - 3295396702U, // <1,1,7,5>: Cost 4 vrev <1,1,5,7> - 3803891014U, // <1,1,7,6>: Cost 4 vext3 <7,6,0,1>, <1,7,6,0> - 2617333356U, // <1,1,7,7>: Cost 3 vext2 <0,0,1,1>, <7,7,7,7> - 2659800138U, // <1,1,7,u>: Cost 3 vext2 <7,1,1,1>, <7,1,1,1> - 1483112550U, // <1,1,u,0>: Cost 2 vext1 <1,1,1,1>, LHS - 202162278U, // <1,1,u,1>: Cost 1 vdup1 LHS - 2622642056U, // <1,1,u,2>: Cost 3 vext2 <0,u,1,1>, - 2014142566U, // <1,1,u,3>: Cost 2 vtrnr LHS, LHS - 1483115830U, // <1,1,u,4>: Cost 2 vext1 <1,1,1,1>, RHS - 1548900506U, // <1,1,u,5>: Cost 2 vext2 <0,u,1,1>, RHS - 2622642384U, // <1,1,u,6>: Cost 3 vext2 <0,u,1,1>, - 2825293353U, // <1,1,u,7>: Cost 3 vuzpr <1,1,1,1>, RHS - 202162278U, // <1,1,u,u>: Cost 1 vdup1 LHS - 2635251712U, // <1,2,0,0>: Cost 3 vext2 <3,0,1,2>, <0,0,0,0> - 1561509990U, // <1,2,0,1>: Cost 2 vext2 <3,0,1,2>, LHS - 2618663085U, // <1,2,0,2>: Cost 3 vext2 <0,2,1,2>, <0,2,1,2> - 2696529358U, // <1,2,0,3>: Cost 3 vext3 <2,0,3,1>, <2,0,3,1> - 2635252050U, // <1,2,0,4>: Cost 3 vext2 <3,0,1,2>, <0,4,1,5> - 3769533926U, // <1,2,0,5>: Cost 4 vext3 <1,u,2,1>, <2,0,5,7> - 2621317617U, // <1,2,0,6>: Cost 3 vext2 <0,6,1,2>, <0,6,1,2> - 2659140170U, // <1,2,0,7>: Cost 3 vext2 <7,0,1,2>, <0,7,2,1> - 1561510557U, // <1,2,0,u>: Cost 2 vext2 <3,0,1,2>, LHS - 2623308516U, // <1,2,1,0>: Cost 3 vext2 <1,0,1,2>, <1,0,1,2> - 2635252532U, // <1,2,1,1>: Cost 3 vext2 <3,0,1,2>, <1,1,1,1> - 2631271318U, // <1,2,1,2>: Cost 3 vext2 <2,3,1,2>, <1,2,3,0> - 2958180454U, // <1,2,1,3>: Cost 3 vzipr <0,u,1,1>, LHS - 2550959414U, // <1,2,1,4>: Cost 3 vext1 <0,1,2,1>, RHS - 2635252880U, // <1,2,1,5>: Cost 3 vext2 <3,0,1,2>, <1,5,3,7> - 2635252952U, // <1,2,1,6>: Cost 3 vext2 <3,0,1,2>, <1,6,2,7> - 3732882731U, // <1,2,1,7>: Cost 4 vext2 <7,0,1,2>, <1,7,3,0> - 2958180459U, // <1,2,1,u>: Cost 3 vzipr <0,u,1,1>, LHS - 2629281213U, // <1,2,2,0>: Cost 3 vext2 <2,0,1,2>, <2,0,1,2> - 2635253280U, // <1,2,2,1>: Cost 3 vext2 <3,0,1,2>, <2,1,3,2> - 2618664552U, // <1,2,2,2>: Cost 3 vext2 <0,2,1,2>, <2,2,2,2> - 2689746546U, // <1,2,2,3>: Cost 3 vext3 <0,u,1,1>, <2,2,3,3> - 3764815485U, // <1,2,2,4>: Cost 4 vext3 <1,1,1,1>, <2,2,4,5> - 3760023176U, // <1,2,2,5>: Cost 4 vext3 <0,2,u,1>, <2,2,5,7> - 2635253690U, // <1,2,2,6>: Cost 3 vext2 <3,0,1,2>, <2,6,3,7> - 2659141610U, // <1,2,2,7>: Cost 3 vext2 <7,0,1,2>, <2,7,0,1> - 2689746591U, // <1,2,2,u>: Cost 3 vext3 <0,u,1,1>, <2,2,u,3> - 403488870U, // <1,2,3,0>: Cost 1 vext1 LHS, LHS - 1477231350U, // <1,2,3,1>: Cost 2 vext1 LHS, <1,0,3,2> - 1477232232U, // <1,2,3,2>: Cost 2 vext1 LHS, <2,2,2,2> - 1477233052U, // <1,2,3,3>: Cost 2 vext1 LHS, <3,3,3,3> - 403492150U, // <1,2,3,4>: Cost 1 vext1 LHS, RHS - 1525010128U, // <1,2,3,5>: Cost 2 vext1 LHS, <5,1,7,3> - 1525010938U, // <1,2,3,6>: Cost 2 vext1 LHS, <6,2,7,3> - 1525011450U, // <1,2,3,7>: Cost 2 vext1 LHS, <7,0,1,2> - 403494702U, // <1,2,3,u>: Cost 1 vext1 LHS, LHS - 2641226607U, // <1,2,4,0>: Cost 3 vext2 
<4,0,1,2>, <4,0,1,2> - 3624723446U, // <1,2,4,1>: Cost 4 vext1 <0,1,2,4>, <1,3,4,6> - 3301123609U, // <1,2,4,2>: Cost 4 vrev <2,1,2,4> - 2598759198U, // <1,2,4,3>: Cost 3 vext1 , <3,u,1,2> - 2659142864U, // <1,2,4,4>: Cost 3 vext2 <7,0,1,2>, <4,4,4,4> - 1561513270U, // <1,2,4,5>: Cost 2 vext2 <3,0,1,2>, RHS - 2659143028U, // <1,2,4,6>: Cost 3 vext2 <7,0,1,2>, <4,6,4,6> - 2659143112U, // <1,2,4,7>: Cost 3 vext2 <7,0,1,2>, <4,7,5,0> - 1561513513U, // <1,2,4,u>: Cost 2 vext2 <3,0,1,2>, RHS - 2550988902U, // <1,2,5,0>: Cost 3 vext1 <0,1,2,5>, LHS - 2550989824U, // <1,2,5,1>: Cost 3 vext1 <0,1,2,5>, <1,3,5,7> - 3624732264U, // <1,2,5,2>: Cost 4 vext1 <0,1,2,5>, <2,2,2,2> - 2955559014U, // <1,2,5,3>: Cost 3 vzipr <0,4,1,5>, LHS - 2550992182U, // <1,2,5,4>: Cost 3 vext1 <0,1,2,5>, RHS - 2659143684U, // <1,2,5,5>: Cost 3 vext2 <7,0,1,2>, <5,5,5,5> - 2659143778U, // <1,2,5,6>: Cost 3 vext2 <7,0,1,2>, <5,6,7,0> - 2659143848U, // <1,2,5,7>: Cost 3 vext2 <7,0,1,2>, <5,7,5,7> - 2550994734U, // <1,2,5,u>: Cost 3 vext1 <0,1,2,5>, LHS - 2700289945U, // <1,2,6,0>: Cost 3 vext3 <2,6,0,1>, <2,6,0,1> - 2635256232U, // <1,2,6,1>: Cost 3 vext2 <3,0,1,2>, <6,1,7,2> - 2659144186U, // <1,2,6,2>: Cost 3 vext2 <7,0,1,2>, <6,2,7,3> - 2689746874U, // <1,2,6,3>: Cost 3 vext3 <0,u,1,1>, <2,6,3,7> - 3763488705U, // <1,2,6,4>: Cost 4 vext3 <0,u,1,1>, <2,6,4,5> - 3763488716U, // <1,2,6,5>: Cost 4 vext3 <0,u,1,1>, <2,6,5,7> - 2659144504U, // <1,2,6,6>: Cost 3 vext2 <7,0,1,2>, <6,6,6,6> - 2657817432U, // <1,2,6,7>: Cost 3 vext2 <6,7,1,2>, <6,7,1,2> - 2689746919U, // <1,2,6,u>: Cost 3 vext3 <0,u,1,1>, <2,6,u,7> - 1585402874U, // <1,2,7,0>: Cost 2 vext2 <7,0,1,2>, <7,0,1,2> - 2659144770U, // <1,2,7,1>: Cost 3 vext2 <7,0,1,2>, <7,1,0,2> - 3708998858U, // <1,2,7,2>: Cost 4 vext2 <3,0,1,2>, <7,2,6,3> - 2635257059U, // <1,2,7,3>: Cost 3 vext2 <3,0,1,2>, <7,3,0,1> - 2659145062U, // <1,2,7,4>: Cost 3 vext2 <7,0,1,2>, <7,4,5,6> - 3732886916U, // <1,2,7,5>: Cost 4 vext2 <7,0,1,2>, <7,5,0,0> - 3732886998U, // <1,2,7,6>: Cost 4 vext2 <7,0,1,2>, <7,6,0,1> - 2659145255U, // <1,2,7,7>: Cost 3 vext2 <7,0,1,2>, <7,7,0,1> - 1590711938U, // <1,2,7,u>: Cost 2 vext2 <7,u,1,2>, <7,u,1,2> - 403529835U, // <1,2,u,0>: Cost 1 vext1 LHS, LHS - 1477272310U, // <1,2,u,1>: Cost 2 vext1 LHS, <1,0,3,2> - 1477273192U, // <1,2,u,2>: Cost 2 vext1 LHS, <2,2,2,2> - 1477273750U, // <1,2,u,3>: Cost 2 vext1 LHS, <3,0,1,2> - 403533110U, // <1,2,u,4>: Cost 1 vext1 LHS, RHS - 1561516186U, // <1,2,u,5>: Cost 2 vext2 <3,0,1,2>, RHS - 1525051898U, // <1,2,u,6>: Cost 2 vext1 LHS, <6,2,7,3> - 1525052410U, // <1,2,u,7>: Cost 2 vext1 LHS, <7,0,1,2> - 403535662U, // <1,2,u,u>: Cost 1 vext1 LHS, LHS - 2819407872U, // <1,3,0,0>: Cost 3 vuzpr LHS, <0,0,0,0> - 1551564902U, // <1,3,0,1>: Cost 2 vext2 <1,3,1,3>, LHS - 2819408630U, // <1,3,0,2>: Cost 3 vuzpr LHS, <1,0,3,2> - 2619334911U, // <1,3,0,3>: Cost 3 vext2 <0,3,1,3>, <0,3,1,3> - 2625306962U, // <1,3,0,4>: Cost 3 vext2 <1,3,1,3>, <0,4,1,5> - 3832725879U, // <1,3,0,5>: Cost 4 vuzpl <1,2,3,0>, <0,4,5,6> - 3699048959U, // <1,3,0,6>: Cost 4 vext2 <1,3,1,3>, <0,6,2,7> - 3776538827U, // <1,3,0,7>: Cost 4 vext3 <3,0,7,1>, <3,0,7,1> - 1551565469U, // <1,3,0,u>: Cost 2 vext2 <1,3,1,3>, LHS - 2618671862U, // <1,3,1,0>: Cost 3 vext2 <0,2,1,3>, <1,0,3,2> - 2819408692U, // <1,3,1,1>: Cost 3 vuzpr LHS, <1,1,1,1> - 2624643975U, // <1,3,1,2>: Cost 3 vext2 <1,2,1,3>, <1,2,1,3> - 1745666150U, // <1,3,1,3>: Cost 2 vuzpr LHS, LHS - 2557005110U, // <1,3,1,4>: Cost 3 vext1 <1,1,3,1>, RHS - 2625307792U, // <1,3,1,5>: Cost 3 vext2 <1,3,1,3>, 
- [generated perfect-shuffle cost table, continued: one entry per shuffle index from <1,3,1,6> through <2,u,1,5>, each of the form "NNNNNNNNNNU, // <i,j,k,l>: Cost N <op> <operands>"]
- 2618123489U, // <2,u,1,6>: Cost 3 vext2 LHS, <1,6,3,7> - 2726254427U, // <2,u,1,7>: Cost 3 vext3 <7,0,1,2>, - 1544381823U, // <2,u,1,u>: Cost 2 vext2 LHS, <1,u,3,3> - 1478328422U, // <2,u,2,0>: Cost 2 vext1 <0,2,u,2>, LHS - 2618123807U, // <2,u,2,1>: Cost 3 vext2 LHS, <2,1,3,1> - 269271142U, // <2,u,2,2>: Cost 1 vdup2 LHS - 1544382118U, // <2,u,2,3>: Cost 2 vext2 LHS, <2,3,0,1> - 1478331702U, // <2,u,2,4>: Cost 2 vext1 <0,2,u,2>, RHS - 2618124136U, // <2,u,2,5>: Cost 3 vext2 LHS, <2,5,3,6> - 1544382394U, // <2,u,2,6>: Cost 2 vext2 LHS, <2,6,3,7> - 3088354857U, // <2,u,2,7>: Cost 3 vtrnr <0,2,0,2>, RHS - 269271142U, // <2,u,2,u>: Cost 1 vdup2 LHS - 1544382614U, // <2,u,3,0>: Cost 2 vext2 LHS, <3,0,1,2> - 2953627374U, // <2,u,3,1>: Cost 3 vzipr LHS, <2,3,u,1> - 1490282143U, // <2,u,3,2>: Cost 2 vext1 <2,2,u,3>, <2,2,u,3> - 1879883932U, // <2,u,3,3>: Cost 2 vzipr LHS, LHS - 1544382978U, // <2,u,3,4>: Cost 2 vext2 LHS, <3,4,5,6> - 2953627378U, // <2,u,3,5>: Cost 3 vzipr LHS, <2,3,u,5> - 1514172931U, // <2,u,3,6>: Cost 2 vext1 <6,2,u,3>, <6,2,u,3> - 1879887176U, // <2,u,3,7>: Cost 2 vzipr LHS, RHS - 1879883937U, // <2,u,3,u>: Cost 2 vzipr LHS, LHS - 1484316774U, // <2,u,4,0>: Cost 2 vext1 <1,2,u,4>, LHS - 1484317639U, // <2,u,4,1>: Cost 2 vext1 <1,2,u,4>, <1,2,u,4> - 2552088270U, // <2,u,4,2>: Cost 3 vext1 <0,2,u,4>, <2,3,4,5> - 1190213513U, // <2,u,4,3>: Cost 2 vrev - 1484320054U, // <2,u,4,4>: Cost 2 vext1 <1,2,u,4>, RHS - 470641974U, // <2,u,4,5>: Cost 1 vext2 LHS, RHS - 1592159604U, // <2,u,4,6>: Cost 2 vext2 LHS, <4,6,4,6> - 3094564393U, // <2,u,4,7>: Cost 3 vtrnr <1,2,3,4>, RHS - 470642217U, // <2,u,4,u>: Cost 1 vext2 LHS, RHS - 2552094959U, // <2,u,5,0>: Cost 3 vext1 <0,2,u,5>, <0,2,u,5> - 1592159952U, // <2,u,5,1>: Cost 2 vext2 LHS, <5,1,7,3> - 2564040353U, // <2,u,5,2>: Cost 3 vext1 <2,2,u,5>, <2,2,u,5> - 2690275455U, // <2,u,5,3>: Cost 3 vext3 <0,u,u,2>, - 1592160198U, // <2,u,5,4>: Cost 2 vext2 LHS, <5,4,7,6> - 1592160260U, // <2,u,5,5>: Cost 2 vext2 LHS, <5,5,5,5> - 1611962522U, // <2,u,5,6>: Cost 2 vext3 <0,2,0,2>, RHS - 1592160424U, // <2,u,5,7>: Cost 2 vext2 LHS, <5,7,5,7> - 1611962540U, // <2,u,5,u>: Cost 2 vext3 <0,2,0,2>, RHS - 1478361190U, // <2,u,6,0>: Cost 2 vext1 <0,2,u,6>, LHS - 2552103670U, // <2,u,6,1>: Cost 3 vext1 <0,2,u,6>, <1,0,3,2> - 1592160762U, // <2,u,6,2>: Cost 2 vext2 LHS, <6,2,7,3> - 2685704400U, // <2,u,6,3>: Cost 3 vext3 <0,2,0,2>, - 1478364470U, // <2,u,6,4>: Cost 2 vext1 <0,2,u,6>, RHS - 2901891226U, // <2,u,6,5>: Cost 3 vzipl <2,6,3,7>, RHS - 1592161080U, // <2,u,6,6>: Cost 2 vext2 LHS, <6,6,6,6> - 1592161102U, // <2,u,6,7>: Cost 2 vext2 LHS, <6,7,0,1> - 1478367022U, // <2,u,6,u>: Cost 2 vext1 <0,2,u,6>, LHS - 1592161274U, // <2,u,7,0>: Cost 2 vext2 LHS, <7,0,1,2> - 2659931226U, // <2,u,7,1>: Cost 3 vext2 <7,1,2,u>, <7,1,2,u> - 2564056739U, // <2,u,7,2>: Cost 3 vext1 <2,2,u,7>, <2,2,u,7> - 2665903331U, // <2,u,7,3>: Cost 3 vext2 LHS, <7,3,0,1> - 1592161638U, // <2,u,7,4>: Cost 2 vext2 LHS, <7,4,5,6> - 2665903494U, // <2,u,7,5>: Cost 3 vext2 LHS, <7,5,0,2> - 2587947527U, // <2,u,7,6>: Cost 3 vext1 <6,2,u,7>, <6,2,u,7> - 1592161900U, // <2,u,7,7>: Cost 2 vext2 LHS, <7,7,7,7> - 1592161922U, // <2,u,7,u>: Cost 2 vext2 LHS, <7,u,1,2> - 1478377574U, // <2,u,u,0>: Cost 2 vext1 <0,2,u,u>, LHS - 470644526U, // <2,u,u,1>: Cost 1 vext2 LHS, LHS - 269271142U, // <2,u,u,2>: Cost 1 vdup2 LHS - 1879924892U, // <2,u,u,3>: Cost 2 vzipr LHS, LHS - 1478380854U, // <2,u,u,4>: Cost 2 vext1 <0,2,u,u>, RHS - 470644890U, // <2,u,u,5>: Cost 1 vext2 LHS, RHS - 1611962765U, // 
<2,u,u,6>: Cost 2 vext3 <0,2,0,2>, RHS - 1879928136U, // <2,u,u,7>: Cost 2 vzipr LHS, RHS - 470645093U, // <2,u,u,u>: Cost 1 vext2 LHS, LHS - 1611448320U, // <3,0,0,0>: Cost 2 vext3 LHS, <0,0,0,0> - 1611890698U, // <3,0,0,1>: Cost 2 vext3 LHS, <0,0,1,1> - 1611890708U, // <3,0,0,2>: Cost 2 vext3 LHS, <0,0,2,2> - 3763576860U, // <3,0,0,3>: Cost 4 vext3 LHS, <0,0,3,1> - 2689835045U, // <3,0,0,4>: Cost 3 vext3 LHS, <0,0,4,1> - 3698508206U, // <3,0,0,5>: Cost 4 vext2 <1,2,3,0>, <0,5,2,7> - 3763576887U, // <3,0,0,6>: Cost 4 vext3 LHS, <0,0,6,1> - 3667678434U, // <3,0,0,7>: Cost 4 vext1 <7,3,0,0>, <7,3,0,0> - 1616093258U, // <3,0,0,u>: Cost 2 vext3 LHS, <0,0,u,2> - 1490337894U, // <3,0,1,0>: Cost 2 vext1 <2,3,0,1>, LHS - 2685632602U, // <3,0,1,1>: Cost 3 vext3 LHS, <0,1,1,0> - 537706598U, // <3,0,1,2>: Cost 1 vext3 LHS, LHS - 2624766936U, // <3,0,1,3>: Cost 3 vext2 <1,2,3,0>, <1,3,1,3> - 1490341174U, // <3,0,1,4>: Cost 2 vext1 <2,3,0,1>, RHS - 2624767120U, // <3,0,1,5>: Cost 3 vext2 <1,2,3,0>, <1,5,3,7> - 2732966030U, // <3,0,1,6>: Cost 3 vext3 LHS, <0,1,6,7> - 2593944803U, // <3,0,1,7>: Cost 3 vext1 <7,3,0,1>, <7,3,0,1> - 537706652U, // <3,0,1,u>: Cost 1 vext3 LHS, LHS - 1611890852U, // <3,0,2,0>: Cost 2 vext3 LHS, <0,2,0,2> - 2685632684U, // <3,0,2,1>: Cost 3 vext3 LHS, <0,2,1,1> - 2685632692U, // <3,0,2,2>: Cost 3 vext3 LHS, <0,2,2,0> - 2685632702U, // <3,0,2,3>: Cost 3 vext3 LHS, <0,2,3,1> - 1611890892U, // <3,0,2,4>: Cost 2 vext3 LHS, <0,2,4,6> - 2732966102U, // <3,0,2,5>: Cost 3 vext3 LHS, <0,2,5,7> - 2624767930U, // <3,0,2,6>: Cost 3 vext2 <1,2,3,0>, <2,6,3,7> - 2685632744U, // <3,0,2,7>: Cost 3 vext3 LHS, <0,2,7,7> - 1611890924U, // <3,0,2,u>: Cost 2 vext3 LHS, <0,2,u,2> - 2624768150U, // <3,0,3,0>: Cost 3 vext2 <1,2,3,0>, <3,0,1,2> - 2685632764U, // <3,0,3,1>: Cost 3 vext3 LHS, <0,3,1,0> - 2685632774U, // <3,0,3,2>: Cost 3 vext3 LHS, <0,3,2,1> - 2624768412U, // <3,0,3,3>: Cost 3 vext2 <1,2,3,0>, <3,3,3,3> - 2624768514U, // <3,0,3,4>: Cost 3 vext2 <1,2,3,0>, <3,4,5,6> - 3702491714U, // <3,0,3,5>: Cost 4 vext2 <1,u,3,0>, <3,5,3,7> - 2624768632U, // <3,0,3,6>: Cost 3 vext2 <1,2,3,0>, <3,6,0,7> - 3702491843U, // <3,0,3,7>: Cost 4 vext2 <1,u,3,0>, <3,7,0,1> - 2686959934U, // <3,0,3,u>: Cost 3 vext3 <0,3,u,3>, <0,3,u,3> - 2689835336U, // <3,0,4,0>: Cost 3 vext3 LHS, <0,4,0,4> - 1611891026U, // <3,0,4,1>: Cost 2 vext3 LHS, <0,4,1,5> - 1611891036U, // <3,0,4,2>: Cost 2 vext3 LHS, <0,4,2,6> - 3763577184U, // <3,0,4,3>: Cost 4 vext3 LHS, <0,4,3,1> - 2689835374U, // <3,0,4,4>: Cost 3 vext3 LHS, <0,4,4,6> - 1551027510U, // <3,0,4,5>: Cost 2 vext2 <1,2,3,0>, RHS - 2666573172U, // <3,0,4,6>: Cost 3 vext2 , <4,6,4,6> - 3667711206U, // <3,0,4,7>: Cost 4 vext1 <7,3,0,4>, <7,3,0,4> - 1616093586U, // <3,0,4,u>: Cost 2 vext3 LHS, <0,4,u,6> - 2685190556U, // <3,0,5,0>: Cost 3 vext3 LHS, <0,5,0,7> - 2666573520U, // <3,0,5,1>: Cost 3 vext2 , <5,1,7,3> - 3040886886U, // <3,0,5,2>: Cost 3 vtrnl <3,4,5,6>, LHS - 3625912834U, // <3,0,5,3>: Cost 4 vext1 <0,3,0,5>, <3,4,5,6> - 2666573766U, // <3,0,5,4>: Cost 3 vext2 , <5,4,7,6> - 2666573828U, // <3,0,5,5>: Cost 3 vext2 , <5,5,5,5> - 2732966354U, // <3,0,5,6>: Cost 3 vext3 LHS, <0,5,6,7> - 2666573992U, // <3,0,5,7>: Cost 3 vext2 , <5,7,5,7> - 3040886940U, // <3,0,5,u>: Cost 3 vtrnl <3,4,5,6>, LHS - 2685190637U, // <3,0,6,0>: Cost 3 vext3 LHS, <0,6,0,7> - 2732966390U, // <3,0,6,1>: Cost 3 vext3 LHS, <0,6,1,7> - 2689835519U, // <3,0,6,2>: Cost 3 vext3 LHS, <0,6,2,7> - 3667724438U, // <3,0,6,3>: Cost 4 vext1 <7,3,0,6>, <3,0,1,2> - 3763577355U, // <3,0,6,4>: Cost 4 vext3 
LHS, <0,6,4,1> - 3806708243U, // <3,0,6,5>: Cost 4 vext3 LHS, <0,6,5,0> - 2666574648U, // <3,0,6,6>: Cost 3 vext2 , <6,6,6,6> - 2657948520U, // <3,0,6,7>: Cost 3 vext2 <6,7,3,0>, <6,7,3,0> - 2689835573U, // <3,0,6,u>: Cost 3 vext3 LHS, <0,6,u,7> - 2666574842U, // <3,0,7,0>: Cost 3 vext2 , <7,0,1,2> - 2685633095U, // <3,0,7,1>: Cost 3 vext3 LHS, <0,7,1,7> - 2660603052U, // <3,0,7,2>: Cost 3 vext2 <7,2,3,0>, <7,2,3,0> - 3643844997U, // <3,0,7,3>: Cost 4 vext1 <3,3,0,7>, <3,3,0,7> - 2666575206U, // <3,0,7,4>: Cost 3 vext2 , <7,4,5,6> - 3655790391U, // <3,0,7,5>: Cost 4 vext1 <5,3,0,7>, <5,3,0,7> - 3731690968U, // <3,0,7,6>: Cost 4 vext2 <6,7,3,0>, <7,6,0,3> - 2666575468U, // <3,0,7,7>: Cost 3 vext2 , <7,7,7,7> - 2664584850U, // <3,0,7,u>: Cost 3 vext2 <7,u,3,0>, <7,u,3,0> - 1616093834U, // <3,0,u,0>: Cost 2 vext3 LHS, <0,u,0,2> - 1611891346U, // <3,0,u,1>: Cost 2 vext3 LHS, <0,u,1,1> - 537707165U, // <3,0,u,2>: Cost 1 vext3 LHS, LHS - 2689835684U, // <3,0,u,3>: Cost 3 vext3 LHS, <0,u,3,1> - 1616093874U, // <3,0,u,4>: Cost 2 vext3 LHS, <0,u,4,6> - 1551030426U, // <3,0,u,5>: Cost 2 vext2 <1,2,3,0>, RHS - 2624772304U, // <3,0,u,6>: Cost 3 vext2 <1,2,3,0>, - 2594002154U, // <3,0,u,7>: Cost 3 vext1 <7,3,0,u>, <7,3,0,u> - 537707219U, // <3,0,u,u>: Cost 1 vext3 LHS, LHS - 2552201318U, // <3,1,0,0>: Cost 3 vext1 <0,3,1,0>, LHS - 2618802278U, // <3,1,0,1>: Cost 3 vext2 <0,2,3,1>, LHS - 2618802366U, // <3,1,0,2>: Cost 3 vext2 <0,2,3,1>, <0,2,3,1> - 1611449078U, // <3,1,0,3>: Cost 2 vext3 LHS, <1,0,3,2> - 2552204598U, // <3,1,0,4>: Cost 3 vext1 <0,3,1,0>, RHS - 2732966663U, // <3,1,0,5>: Cost 3 vext3 LHS, <1,0,5,1> - 3906258396U, // <3,1,0,6>: Cost 4 vuzpr <2,3,0,1>, <2,0,4,6> - 3667752171U, // <3,1,0,7>: Cost 4 vext1 <7,3,1,0>, <7,3,1,0> - 1611891491U, // <3,1,0,u>: Cost 2 vext3 LHS, <1,0,u,2> - 2689835819U, // <3,1,1,0>: Cost 3 vext3 LHS, <1,1,0,1> - 1611449140U, // <3,1,1,1>: Cost 2 vext3 LHS, <1,1,1,1> - 2624775063U, // <3,1,1,2>: Cost 3 vext2 <1,2,3,1>, <1,2,3,1> - 1611891528U, // <3,1,1,3>: Cost 2 vext3 LHS, <1,1,3,3> - 2689835859U, // <3,1,1,4>: Cost 3 vext3 LHS, <1,1,4,5> - 2689835868U, // <3,1,1,5>: Cost 3 vext3 LHS, <1,1,5,5> - 3763577701U, // <3,1,1,6>: Cost 4 vext3 LHS, <1,1,6,5> - 3765273452U, // <3,1,1,7>: Cost 4 vext3 <1,1,7,3>, <1,1,7,3> - 1611891573U, // <3,1,1,u>: Cost 2 vext3 LHS, <1,1,u,3> - 2629420494U, // <3,1,2,0>: Cost 3 vext2 <2,0,3,1>, <2,0,3,1> - 2689835911U, // <3,1,2,1>: Cost 3 vext3 LHS, <1,2,1,3> - 2564163248U, // <3,1,2,2>: Cost 3 vext1 <2,3,1,2>, <2,3,1,2> - 1611449238U, // <3,1,2,3>: Cost 2 vext3 LHS, <1,2,3,0> - 2564164918U, // <3,1,2,4>: Cost 3 vext1 <2,3,1,2>, RHS - 2689835947U, // <3,1,2,5>: Cost 3 vext3 LHS, <1,2,5,3> - 3692545978U, // <3,1,2,6>: Cost 4 vext2 <0,2,3,1>, <2,6,3,7> - 2732966842U, // <3,1,2,7>: Cost 3 vext3 LHS, <1,2,7,0> - 1611891651U, // <3,1,2,u>: Cost 2 vext3 LHS, <1,2,u,0> - 1484456038U, // <3,1,3,0>: Cost 2 vext1 <1,3,1,3>, LHS - 1611891672U, // <3,1,3,1>: Cost 2 vext3 LHS, <1,3,1,3> - 2685633502U, // <3,1,3,2>: Cost 3 vext3 LHS, <1,3,2,0> - 2685633512U, // <3,1,3,3>: Cost 3 vext3 LHS, <1,3,3,1> - 1484459318U, // <3,1,3,4>: Cost 2 vext1 <1,3,1,3>, RHS - 1611891712U, // <3,1,3,5>: Cost 2 vext3 LHS, <1,3,5,7> - 2689836041U, // <3,1,3,6>: Cost 3 vext3 LHS, <1,3,6,7> - 2733409294U, // <3,1,3,7>: Cost 3 vext3 LHS, <1,3,7,3> - 1611891735U, // <3,1,3,u>: Cost 2 vext3 LHS, <1,3,u,3> - 2552234086U, // <3,1,4,0>: Cost 3 vext1 <0,3,1,4>, LHS - 2732966955U, // <3,1,4,1>: Cost 3 vext3 LHS, <1,4,1,5> - 2732966964U, // <3,1,4,2>: Cost 3 vext3 LHS, <1,4,2,5> - 
2685633597U, // <3,1,4,3>: Cost 3 vext3 LHS, <1,4,3,5> - 2552237366U, // <3,1,4,4>: Cost 3 vext1 <0,3,1,4>, RHS - 2618805558U, // <3,1,4,5>: Cost 3 vext2 <0,2,3,1>, RHS - 2769472822U, // <3,1,4,6>: Cost 3 vuzpl <3,0,1,2>, RHS - 3667784943U, // <3,1,4,7>: Cost 4 vext1 <7,3,1,4>, <7,3,1,4> - 2685633642U, // <3,1,4,u>: Cost 3 vext3 LHS, <1,4,u,5> - 2689836143U, // <3,1,5,0>: Cost 3 vext3 LHS, <1,5,0,1> - 2564187280U, // <3,1,5,1>: Cost 3 vext1 <2,3,1,5>, <1,5,3,7> - 2564187827U, // <3,1,5,2>: Cost 3 vext1 <2,3,1,5>, <2,3,1,5> - 1611891856U, // <3,1,5,3>: Cost 2 vext3 LHS, <1,5,3,7> - 2689836183U, // <3,1,5,4>: Cost 3 vext3 LHS, <1,5,4,5> - 3759375522U, // <3,1,5,5>: Cost 4 vext3 LHS, <1,5,5,7> - 3720417378U, // <3,1,5,6>: Cost 4 vext2 <4,u,3,1>, <5,6,7,0> - 2832518454U, // <3,1,5,7>: Cost 3 vuzpr <2,3,0,1>, RHS - 1611891901U, // <3,1,5,u>: Cost 2 vext3 LHS, <1,5,u,7> - 3763578048U, // <3,1,6,0>: Cost 4 vext3 LHS, <1,6,0,1> - 2689836239U, // <3,1,6,1>: Cost 3 vext3 LHS, <1,6,1,7> - 2732967128U, // <3,1,6,2>: Cost 3 vext3 LHS, <1,6,2,7> - 2685633761U, // <3,1,6,3>: Cost 3 vext3 LHS, <1,6,3,7> - 3763578088U, // <3,1,6,4>: Cost 4 vext3 LHS, <1,6,4,5> - 2689836275U, // <3,1,6,5>: Cost 3 vext3 LHS, <1,6,5,7> - 3763578108U, // <3,1,6,6>: Cost 4 vext3 LHS, <1,6,6,7> - 2732967166U, // <3,1,6,7>: Cost 3 vext3 LHS, <1,6,7,0> - 2685633806U, // <3,1,6,u>: Cost 3 vext3 LHS, <1,6,u,7> - 3631972454U, // <3,1,7,0>: Cost 4 vext1 <1,3,1,7>, LHS - 2659947612U, // <3,1,7,1>: Cost 3 vext2 <7,1,3,1>, <7,1,3,1> - 4036102294U, // <3,1,7,2>: Cost 4 vzipr <1,5,3,7>, <3,0,1,2> - 3095396454U, // <3,1,7,3>: Cost 3 vtrnr <1,3,5,7>, LHS - 3631975734U, // <3,1,7,4>: Cost 4 vext1 <1,3,1,7>, RHS - 2222982144U, // <3,1,7,5>: Cost 3 vrev <1,3,5,7> - 3296797705U, // <3,1,7,6>: Cost 4 vrev <1,3,6,7> - 3720418924U, // <3,1,7,7>: Cost 4 vext2 <4,u,3,1>, <7,7,7,7> - 3095396459U, // <3,1,7,u>: Cost 3 vtrnr <1,3,5,7>, LHS - 1484496998U, // <3,1,u,0>: Cost 2 vext1 <1,3,1,u>, LHS - 1611892077U, // <3,1,u,1>: Cost 2 vext3 LHS, <1,u,1,3> - 2685633907U, // <3,1,u,2>: Cost 3 vext3 LHS, <1,u,2,0> - 1611892092U, // <3,1,u,3>: Cost 2 vext3 LHS, <1,u,3,0> - 1484500278U, // <3,1,u,4>: Cost 2 vext1 <1,3,1,u>, RHS - 1611892117U, // <3,1,u,5>: Cost 2 vext3 LHS, <1,u,5,7> - 2685633950U, // <3,1,u,6>: Cost 3 vext3 LHS, <1,u,6,7> - 2832518697U, // <3,1,u,7>: Cost 3 vuzpr <2,3,0,1>, RHS - 1611892140U, // <3,1,u,u>: Cost 2 vext3 LHS, <1,u,u,3> - 2623455232U, // <3,2,0,0>: Cost 3 vext2 <1,0,3,2>, <0,0,0,0> - 1549713510U, // <3,2,0,1>: Cost 2 vext2 <1,0,3,2>, LHS - 2689836484U, // <3,2,0,2>: Cost 3 vext3 LHS, <2,0,2,0> - 2685633997U, // <3,2,0,3>: Cost 3 vext3 LHS, <2,0,3,0> - 2623455570U, // <3,2,0,4>: Cost 3 vext2 <1,0,3,2>, <0,4,1,5> - 2732967398U, // <3,2,0,5>: Cost 3 vext3 LHS, <2,0,5,7> - 2689836524U, // <3,2,0,6>: Cost 3 vext3 LHS, <2,0,6,4> - 2229044964U, // <3,2,0,7>: Cost 3 vrev <2,3,7,0> - 1549714077U, // <3,2,0,u>: Cost 2 vext2 <1,0,3,2>, LHS - 1549714166U, // <3,2,1,0>: Cost 2 vext2 <1,0,3,2>, <1,0,3,2> - 2623456052U, // <3,2,1,1>: Cost 3 vext2 <1,0,3,2>, <1,1,1,1> - 2623456150U, // <3,2,1,2>: Cost 3 vext2 <1,0,3,2>, <1,2,3,0> - 2685634079U, // <3,2,1,3>: Cost 3 vext3 LHS, <2,1,3,1> - 2552286518U, // <3,2,1,4>: Cost 3 vext1 <0,3,2,1>, RHS - 2623456400U, // <3,2,1,5>: Cost 3 vext2 <1,0,3,2>, <1,5,3,7> - 2689836604U, // <3,2,1,6>: Cost 3 vext3 LHS, <2,1,6,3> - 3667834101U, // <3,2,1,7>: Cost 4 vext1 <7,3,2,1>, <7,3,2,1> - 1155385070U, // <3,2,1,u>: Cost 2 vrev <2,3,u,1> - 2689836629U, // <3,2,2,0>: Cost 3 vext3 LHS, <2,2,0,1> - 2689836640U, // 
<3,2,2,1>: Cost 3 vext3 LHS, <2,2,1,3> - 1611449960U, // <3,2,2,2>: Cost 2 vext3 LHS, <2,2,2,2> - 1611892338U, // <3,2,2,3>: Cost 2 vext3 LHS, <2,2,3,3> - 2689836669U, // <3,2,2,4>: Cost 3 vext3 LHS, <2,2,4,5> - 2689836680U, // <3,2,2,5>: Cost 3 vext3 LHS, <2,2,5,7> - 2689836688U, // <3,2,2,6>: Cost 3 vext3 LHS, <2,2,6,6> - 3763578518U, // <3,2,2,7>: Cost 4 vext3 LHS, <2,2,7,3> - 1611892383U, // <3,2,2,u>: Cost 2 vext3 LHS, <2,2,u,3> - 1611450022U, // <3,2,3,0>: Cost 2 vext3 LHS, <2,3,0,1> - 2685191854U, // <3,2,3,1>: Cost 3 vext3 LHS, <2,3,1,0> - 2685191865U, // <3,2,3,2>: Cost 3 vext3 LHS, <2,3,2,2> - 2685191875U, // <3,2,3,3>: Cost 3 vext3 LHS, <2,3,3,3> - 1611450062U, // <3,2,3,4>: Cost 2 vext3 LHS, <2,3,4,5> - 2732967635U, // <3,2,3,5>: Cost 3 vext3 LHS, <2,3,5,1> - 2732967645U, // <3,2,3,6>: Cost 3 vext3 LHS, <2,3,6,2> - 2732967652U, // <3,2,3,7>: Cost 3 vext3 LHS, <2,3,7,0> - 1611450094U, // <3,2,3,u>: Cost 2 vext3 LHS, <2,3,u,1> - 2558279782U, // <3,2,4,0>: Cost 3 vext1 <1,3,2,4>, LHS - 2558280602U, // <3,2,4,1>: Cost 3 vext1 <1,3,2,4>, <1,2,3,4> - 2732967692U, // <3,2,4,2>: Cost 3 vext3 LHS, <2,4,2,4> - 2685634326U, // <3,2,4,3>: Cost 3 vext3 LHS, <2,4,3,5> - 2558283062U, // <3,2,4,4>: Cost 3 vext1 <1,3,2,4>, RHS - 1549716790U, // <3,2,4,5>: Cost 2 vext2 <1,0,3,2>, RHS - 2689836844U, // <3,2,4,6>: Cost 3 vext3 LHS, <2,4,6,0> - 2229077736U, // <3,2,4,7>: Cost 3 vrev <2,3,7,4> - 1549717033U, // <3,2,4,u>: Cost 2 vext2 <1,0,3,2>, RHS - 2552316006U, // <3,2,5,0>: Cost 3 vext1 <0,3,2,5>, LHS - 2228643507U, // <3,2,5,1>: Cost 3 vrev <2,3,1,5> - 2689836896U, // <3,2,5,2>: Cost 3 vext3 LHS, <2,5,2,7> - 2685634408U, // <3,2,5,3>: Cost 3 vext3 LHS, <2,5,3,6> - 1155122894U, // <3,2,5,4>: Cost 2 vrev <2,3,4,5> - 2665263108U, // <3,2,5,5>: Cost 3 vext2 , <5,5,5,5> - 2689836932U, // <3,2,5,6>: Cost 3 vext3 LHS, <2,5,6,7> - 2665263272U, // <3,2,5,7>: Cost 3 vext2 , <5,7,5,7> - 1155417842U, // <3,2,5,u>: Cost 2 vrev <2,3,u,5> - 2689836953U, // <3,2,6,0>: Cost 3 vext3 LHS, <2,6,0,1> - 2689836964U, // <3,2,6,1>: Cost 3 vext3 LHS, <2,6,1,3> - 2689836976U, // <3,2,6,2>: Cost 3 vext3 LHS, <2,6,2,6> - 1611892666U, // <3,2,6,3>: Cost 2 vext3 LHS, <2,6,3,7> - 2689836993U, // <3,2,6,4>: Cost 3 vext3 LHS, <2,6,4,5> - 2689837004U, // <3,2,6,5>: Cost 3 vext3 LHS, <2,6,5,7> - 2689837013U, // <3,2,6,6>: Cost 3 vext3 LHS, <2,6,6,7> - 2665263950U, // <3,2,6,7>: Cost 3 vext2 , <6,7,0,1> - 1611892711U, // <3,2,6,u>: Cost 2 vext3 LHS, <2,6,u,7> - 2665264122U, // <3,2,7,0>: Cost 3 vext2 , <7,0,1,2> - 2623460419U, // <3,2,7,1>: Cost 3 vext2 <1,0,3,2>, <7,1,0,3> - 4169138340U, // <3,2,7,2>: Cost 4 vtrnr <1,3,5,7>, <0,2,0,2> - 2962358374U, // <3,2,7,3>: Cost 3 vzipr <1,5,3,7>, LHS - 2665264486U, // <3,2,7,4>: Cost 3 vext2 , <7,4,5,6> - 2228954841U, // <3,2,7,5>: Cost 3 vrev <2,3,5,7> - 2229028578U, // <3,2,7,6>: Cost 3 vrev <2,3,6,7> - 2665264748U, // <3,2,7,7>: Cost 3 vext2 , <7,7,7,7> - 2962358379U, // <3,2,7,u>: Cost 3 vzipr <1,5,3,7>, LHS - 1611892795U, // <3,2,u,0>: Cost 2 vext3 LHS, <2,u,0,1> - 1549719342U, // <3,2,u,1>: Cost 2 vext2 <1,0,3,2>, LHS - 1611449960U, // <3,2,u,2>: Cost 2 vext3 LHS, <2,2,2,2> - 1611892824U, // <3,2,u,3>: Cost 2 vext3 LHS, <2,u,3,3> - 1611892835U, // <3,2,u,4>: Cost 2 vext3 LHS, <2,u,4,5> - 1549719706U, // <3,2,u,5>: Cost 2 vext2 <1,0,3,2>, RHS - 2689837168U, // <3,2,u,6>: Cost 3 vext3 LHS, <2,u,6,0> - 2665265408U, // <3,2,u,7>: Cost 3 vext2 , - 1611892867U, // <3,2,u,u>: Cost 2 vext3 LHS, <2,u,u,1> - 2685192331U, // <3,3,0,0>: Cost 3 vext3 LHS, <3,0,0,0> - 1611450518U, // <3,3,0,1>: 
Cost 2 vext3 LHS, <3,0,1,2> - 2685634717U, // <3,3,0,2>: Cost 3 vext3 LHS, <3,0,2,0> - 2564294806U, // <3,3,0,3>: Cost 3 vext1 <2,3,3,0>, <3,0,1,2> - 2685634736U, // <3,3,0,4>: Cost 3 vext3 LHS, <3,0,4,1> - 2732968122U, // <3,3,0,5>: Cost 3 vext3 LHS, <3,0,5,2> - 3763579075U, // <3,3,0,6>: Cost 4 vext3 LHS, <3,0,6,2> - 4034053264U, // <3,3,0,7>: Cost 4 vzipr <1,2,3,0>, <1,5,3,7> - 1611450581U, // <3,3,0,u>: Cost 2 vext3 LHS, <3,0,u,2> - 2685192415U, // <3,3,1,0>: Cost 3 vext3 LHS, <3,1,0,3> - 1550385992U, // <3,3,1,1>: Cost 2 vext2 <1,1,3,3>, <1,1,3,3> - 2685192433U, // <3,3,1,2>: Cost 3 vext3 LHS, <3,1,2,3> - 2685634808U, // <3,3,1,3>: Cost 3 vext3 LHS, <3,1,3,1> - 2558332214U, // <3,3,1,4>: Cost 3 vext1 <1,3,3,1>, RHS - 2685634828U, // <3,3,1,5>: Cost 3 vext3 LHS, <3,1,5,3> - 3759376661U, // <3,3,1,6>: Cost 4 vext3 LHS, <3,1,6,3> - 2703477022U, // <3,3,1,7>: Cost 3 vext3 <3,1,7,3>, <3,1,7,3> - 1555031423U, // <3,3,1,u>: Cost 2 vext2 <1,u,3,3>, <1,u,3,3> - 2564309094U, // <3,3,2,0>: Cost 3 vext1 <2,3,3,2>, LHS - 2630100513U, // <3,3,2,1>: Cost 3 vext2 <2,1,3,3>, <2,1,3,3> - 1557022322U, // <3,3,2,2>: Cost 2 vext2 <2,2,3,3>, <2,2,3,3> - 2685192520U, // <3,3,2,3>: Cost 3 vext3 LHS, <3,2,3,0> - 2564312374U, // <3,3,2,4>: Cost 3 vext1 <2,3,3,2>, RHS - 2732968286U, // <3,3,2,5>: Cost 3 vext3 LHS, <3,2,5,4> - 2685634918U, // <3,3,2,6>: Cost 3 vext3 LHS, <3,2,6,3> - 2704140655U, // <3,3,2,7>: Cost 3 vext3 <3,2,7,3>, <3,2,7,3> - 1561004120U, // <3,3,2,u>: Cost 2 vext2 <2,u,3,3>, <2,u,3,3> - 1496547430U, // <3,3,3,0>: Cost 2 vext1 <3,3,3,3>, LHS - 2624129256U, // <3,3,3,1>: Cost 3 vext2 <1,1,3,3>, <3,1,1,3> - 2630764866U, // <3,3,3,2>: Cost 3 vext2 <2,2,3,3>, <3,2,2,3> - 336380006U, // <3,3,3,3>: Cost 1 vdup3 LHS - 1496550710U, // <3,3,3,4>: Cost 2 vext1 <3,3,3,3>, RHS - 2732968368U, // <3,3,3,5>: Cost 3 vext3 LHS, <3,3,5,5> - 2624129683U, // <3,3,3,6>: Cost 3 vext2 <1,1,3,3>, <3,6,3,7> - 2594182400U, // <3,3,3,7>: Cost 3 vext1 <7,3,3,3>, <7,3,3,3> - 336380006U, // <3,3,3,u>: Cost 1 vdup3 LHS - 2558353510U, // <3,3,4,0>: Cost 3 vext1 <1,3,3,4>, LHS - 2558354411U, // <3,3,4,1>: Cost 3 vext1 <1,3,3,4>, <1,3,3,4> - 2564327108U, // <3,3,4,2>: Cost 3 vext1 <2,3,3,4>, <2,3,3,4> - 2564327938U, // <3,3,4,3>: Cost 3 vext1 <2,3,3,4>, <3,4,5,6> - 2960343962U, // <3,3,4,4>: Cost 3 vzipr <1,2,3,4>, <1,2,3,4> - 1611893250U, // <3,3,4,5>: Cost 2 vext3 LHS, <3,4,5,6> - 2771619126U, // <3,3,4,6>: Cost 3 vuzpl <3,3,3,3>, RHS - 4034086032U, // <3,3,4,7>: Cost 4 vzipr <1,2,3,4>, <1,5,3,7> - 1611893277U, // <3,3,4,u>: Cost 2 vext3 LHS, <3,4,u,6> - 2558361702U, // <3,3,5,0>: Cost 3 vext1 <1,3,3,5>, LHS - 2558362604U, // <3,3,5,1>: Cost 3 vext1 <1,3,3,5>, <1,3,3,5> - 2558363342U, // <3,3,5,2>: Cost 3 vext1 <1,3,3,5>, <2,3,4,5> - 2732968512U, // <3,3,5,3>: Cost 3 vext3 LHS, <3,5,3,5> - 2558364982U, // <3,3,5,4>: Cost 3 vext1 <1,3,3,5>, RHS - 3101279950U, // <3,3,5,5>: Cost 3 vtrnr <2,3,4,5>, <2,3,4,5> - 2665934946U, // <3,3,5,6>: Cost 3 vext2 , <5,6,7,0> - 2826636598U, // <3,3,5,7>: Cost 3 vuzpr <1,3,1,3>, RHS - 2826636599U, // <3,3,5,u>: Cost 3 vuzpr <1,3,1,3>, RHS - 2732968568U, // <3,3,6,0>: Cost 3 vext3 LHS, <3,6,0,7> - 3763579521U, // <3,3,6,1>: Cost 4 vext3 LHS, <3,6,1,7> - 2732968586U, // <3,3,6,2>: Cost 3 vext3 LHS, <3,6,2,7> - 2732968595U, // <3,3,6,3>: Cost 3 vext3 LHS, <3,6,3,7> - 2732968604U, // <3,3,6,4>: Cost 3 vext3 LHS, <3,6,4,7> - 3763579557U, // <3,3,6,5>: Cost 4 vext3 LHS, <3,6,5,7> - 2732968621U, // <3,3,6,6>: Cost 3 vext3 LHS, <3,6,6,6> - 2657973099U, // <3,3,6,7>: Cost 3 vext2 <6,7,3,3>, <6,7,3,3> - 
2658636732U, // <3,3,6,u>: Cost 3 vext2 <6,u,3,3>, <6,u,3,3> - 2558378086U, // <3,3,7,0>: Cost 3 vext1 <1,3,3,7>, LHS - 2558378990U, // <3,3,7,1>: Cost 3 vext1 <1,3,3,7>, <1,3,3,7> - 2564351687U, // <3,3,7,2>: Cost 3 vext1 <2,3,3,7>, <2,3,3,7> - 2661291264U, // <3,3,7,3>: Cost 3 vext2 <7,3,3,3>, <7,3,3,3> - 2558381366U, // <3,3,7,4>: Cost 3 vext1 <1,3,3,7>, RHS - 2732968694U, // <3,3,7,5>: Cost 3 vext3 LHS, <3,7,5,7> - 3781126907U, // <3,3,7,6>: Cost 4 vext3 <3,7,6,3>, <3,7,6,3> - 3095397376U, // <3,3,7,7>: Cost 3 vtrnr <1,3,5,7>, <1,3,5,7> - 2558383918U, // <3,3,7,u>: Cost 3 vext1 <1,3,3,7>, LHS - 1496547430U, // <3,3,u,0>: Cost 2 vext1 <3,3,3,3>, LHS - 1611893534U, // <3,3,u,1>: Cost 2 vext3 LHS, <3,u,1,2> - 1592858504U, // <3,3,u,2>: Cost 2 vext2 , - 336380006U, // <3,3,u,3>: Cost 1 vdup3 LHS - 1496550710U, // <3,3,u,4>: Cost 2 vext1 <3,3,3,3>, RHS - 1611893574U, // <3,3,u,5>: Cost 2 vext3 LHS, <3,u,5,6> - 2690280268U, // <3,3,u,6>: Cost 3 vext3 LHS, <3,u,6,3> - 2826636841U, // <3,3,u,7>: Cost 3 vuzpr <1,3,1,3>, RHS - 336380006U, // <3,3,u,u>: Cost 1 vdup3 LHS - 2624798720U, // <3,4,0,0>: Cost 3 vext2 <1,2,3,4>, <0,0,0,0> - 1551056998U, // <3,4,0,1>: Cost 2 vext2 <1,2,3,4>, LHS - 2624798884U, // <3,4,0,2>: Cost 3 vext2 <1,2,3,4>, <0,2,0,2> - 3693232384U, // <3,4,0,3>: Cost 4 vext2 <0,3,3,4>, <0,3,1,4> - 2624799058U, // <3,4,0,4>: Cost 3 vext2 <1,2,3,4>, <0,4,1,5> - 1659227026U, // <3,4,0,5>: Cost 2 vext3 LHS, <4,0,5,1> - 1659227036U, // <3,4,0,6>: Cost 2 vext3 LHS, <4,0,6,2> - 3667973382U, // <3,4,0,7>: Cost 4 vext1 <7,3,4,0>, <7,3,4,0> - 1551057565U, // <3,4,0,u>: Cost 2 vext2 <1,2,3,4>, LHS - 2624799478U, // <3,4,1,0>: Cost 3 vext2 <1,2,3,4>, <1,0,3,2> - 2624799540U, // <3,4,1,1>: Cost 3 vext2 <1,2,3,4>, <1,1,1,1> - 1551057818U, // <3,4,1,2>: Cost 2 vext2 <1,2,3,4>, <1,2,3,4> - 2624799704U, // <3,4,1,3>: Cost 3 vext2 <1,2,3,4>, <1,3,1,3> - 2564377910U, // <3,4,1,4>: Cost 3 vext1 <2,3,4,1>, RHS - 2689838050U, // <3,4,1,5>: Cost 3 vext3 LHS, <4,1,5,0> - 2689838062U, // <3,4,1,6>: Cost 3 vext3 LHS, <4,1,6,3> - 2628117807U, // <3,4,1,7>: Cost 3 vext2 <1,7,3,4>, <1,7,3,4> - 1555039616U, // <3,4,1,u>: Cost 2 vext2 <1,u,3,4>, <1,u,3,4> - 3626180710U, // <3,4,2,0>: Cost 4 vext1 <0,3,4,2>, LHS - 2624800298U, // <3,4,2,1>: Cost 3 vext2 <1,2,3,4>, <2,1,4,3> - 2624800360U, // <3,4,2,2>: Cost 3 vext2 <1,2,3,4>, <2,2,2,2> - 2624800422U, // <3,4,2,3>: Cost 3 vext2 <1,2,3,4>, <2,3,0,1> - 2624800514U, // <3,4,2,4>: Cost 3 vext2 <1,2,3,4>, <2,4,1,3> - 2709965878U, // <3,4,2,5>: Cost 3 vext3 <4,2,5,3>, <4,2,5,3> - 2689838140U, // <3,4,2,6>: Cost 3 vext3 LHS, <4,2,6,0> - 2634090504U, // <3,4,2,7>: Cost 3 vext2 <2,7,3,4>, <2,7,3,4> - 2689838158U, // <3,4,2,u>: Cost 3 vext3 LHS, <4,2,u,0> - 2624800918U, // <3,4,3,0>: Cost 3 vext2 <1,2,3,4>, <3,0,1,2> - 2636081403U, // <3,4,3,1>: Cost 3 vext2 <3,1,3,4>, <3,1,3,4> - 2636745036U, // <3,4,3,2>: Cost 3 vext2 <3,2,3,4>, <3,2,3,4> - 2624801180U, // <3,4,3,3>: Cost 3 vext2 <1,2,3,4>, <3,3,3,3> - 2624801232U, // <3,4,3,4>: Cost 3 vext2 <1,2,3,4>, <3,4,0,1> - 2905836854U, // <3,4,3,5>: Cost 3 vzipl <3,3,3,3>, RHS - 3040054582U, // <3,4,3,6>: Cost 3 vtrnl <3,3,3,3>, RHS - 3702524611U, // <3,4,3,7>: Cost 4 vext2 <1,u,3,4>, <3,7,0,1> - 2624801566U, // <3,4,3,u>: Cost 3 vext2 <1,2,3,4>, <3,u,1,2> - 2564399206U, // <3,4,4,0>: Cost 3 vext1 <2,3,4,4>, LHS - 2564400026U, // <3,4,4,1>: Cost 3 vext1 <2,3,4,4>, <1,2,3,4> - 2564400845U, // <3,4,4,2>: Cost 3 vext1 <2,3,4,4>, <2,3,4,4> - 2570373542U, // <3,4,4,3>: Cost 3 vext1 <3,3,4,4>, <3,3,4,4> - 1659227344U, // <3,4,4,4>: 
Cost 2 vext3 LHS, <4,4,4,4> - 1551060278U, // <3,4,4,5>: Cost 2 vext2 <1,2,3,4>, RHS - 1659227364U, // <3,4,4,6>: Cost 2 vext3 LHS, <4,4,6,6> - 3668006154U, // <3,4,4,7>: Cost 4 vext1 <7,3,4,4>, <7,3,4,4> - 1551060521U, // <3,4,4,u>: Cost 2 vext2 <1,2,3,4>, RHS - 1490665574U, // <3,4,5,0>: Cost 2 vext1 <2,3,4,5>, LHS - 2689838341U, // <3,4,5,1>: Cost 3 vext3 LHS, <4,5,1,3> - 1490667214U, // <3,4,5,2>: Cost 2 vext1 <2,3,4,5>, <2,3,4,5> - 2564409494U, // <3,4,5,3>: Cost 3 vext1 <2,3,4,5>, <3,0,1,2> - 1490668854U, // <3,4,5,4>: Cost 2 vext1 <2,3,4,5>, RHS - 2689838381U, // <3,4,5,5>: Cost 3 vext3 LHS, <4,5,5,7> - 537709878U, // <3,4,5,6>: Cost 1 vext3 LHS, RHS - 2594272523U, // <3,4,5,7>: Cost 3 vext1 <7,3,4,5>, <7,3,4,5> - 537709896U, // <3,4,5,u>: Cost 1 vext3 LHS, RHS - 2689838411U, // <3,4,6,0>: Cost 3 vext3 LHS, <4,6,0,1> - 2558444534U, // <3,4,6,1>: Cost 3 vext1 <1,3,4,6>, <1,3,4,6> - 2666607098U, // <3,4,6,2>: Cost 3 vext2 , <6,2,7,3> - 2558446082U, // <3,4,6,3>: Cost 3 vext1 <1,3,4,6>, <3,4,5,6> - 1659227508U, // <3,4,6,4>: Cost 2 vext3 LHS, <4,6,4,6> - 2689838462U, // <3,4,6,5>: Cost 3 vext3 LHS, <4,6,5,7> - 2689838471U, // <3,4,6,6>: Cost 3 vext3 LHS, <4,6,6,7> - 2657981292U, // <3,4,6,7>: Cost 3 vext2 <6,7,3,4>, <6,7,3,4> - 1659227540U, // <3,4,6,u>: Cost 2 vext3 LHS, <4,6,u,2> - 2666607610U, // <3,4,7,0>: Cost 3 vext2 , <7,0,1,2> - 3702527072U, // <3,4,7,1>: Cost 4 vext2 <1,u,3,4>, <7,1,3,5> - 2660635824U, // <3,4,7,2>: Cost 3 vext2 <7,2,3,4>, <7,2,3,4> - 3644139945U, // <3,4,7,3>: Cost 4 vext1 <3,3,4,7>, <3,3,4,7> - 2666607974U, // <3,4,7,4>: Cost 3 vext2 , <7,4,5,6> - 2732969416U, // <3,4,7,5>: Cost 3 vext3 LHS, <4,7,5,0> - 2732969425U, // <3,4,7,6>: Cost 3 vext3 LHS, <4,7,6,0> - 2666608236U, // <3,4,7,7>: Cost 3 vext2 , <7,7,7,7> - 2664617622U, // <3,4,7,u>: Cost 3 vext2 <7,u,3,4>, <7,u,3,4> - 1490690150U, // <3,4,u,0>: Cost 2 vext1 <2,3,4,u>, LHS - 1551062830U, // <3,4,u,1>: Cost 2 vext2 <1,2,3,4>, LHS - 1490691793U, // <3,4,u,2>: Cost 2 vext1 <2,3,4,u>, <2,3,4,u> - 2624804796U, // <3,4,u,3>: Cost 3 vext2 <1,2,3,4>, - 1490693430U, // <3,4,u,4>: Cost 2 vext1 <2,3,4,u>, RHS - 1551063194U, // <3,4,u,5>: Cost 2 vext2 <1,2,3,4>, RHS - 537710121U, // <3,4,u,6>: Cost 1 vext3 LHS, RHS - 2594297102U, // <3,4,u,7>: Cost 3 vext1 <7,3,4,u>, <7,3,4,u> - 537710139U, // <3,4,u,u>: Cost 1 vext3 LHS, RHS - 3692576768U, // <3,5,0,0>: Cost 4 vext2 <0,2,3,5>, <0,0,0,0> - 2618835046U, // <3,5,0,1>: Cost 3 vext2 <0,2,3,5>, LHS - 2618835138U, // <3,5,0,2>: Cost 3 vext2 <0,2,3,5>, <0,2,3,5> - 3692577024U, // <3,5,0,3>: Cost 4 vext2 <0,2,3,5>, <0,3,1,4> - 2689838690U, // <3,5,0,4>: Cost 3 vext3 LHS, <5,0,4,1> - 2732969579U, // <3,5,0,5>: Cost 3 vext3 LHS, <5,0,5,1> - 2732969588U, // <3,5,0,6>: Cost 3 vext3 LHS, <5,0,6,1> - 2246963055U, // <3,5,0,7>: Cost 3 vrev <5,3,7,0> - 2618835613U, // <3,5,0,u>: Cost 3 vext2 <0,2,3,5>, LHS - 2594308198U, // <3,5,1,0>: Cost 3 vext1 <7,3,5,1>, LHS - 3692577588U, // <3,5,1,1>: Cost 4 vext2 <0,2,3,5>, <1,1,1,1> - 2624807835U, // <3,5,1,2>: Cost 3 vext2 <1,2,3,5>, <1,2,3,5> - 2625471468U, // <3,5,1,3>: Cost 3 vext2 <1,3,3,5>, <1,3,3,5> - 2626135101U, // <3,5,1,4>: Cost 3 vext2 <1,4,3,5>, <1,4,3,5> - 2594311888U, // <3,5,1,5>: Cost 3 vext1 <7,3,5,1>, <5,1,7,3> - 3699877107U, // <3,5,1,6>: Cost 4 vext2 <1,4,3,5>, <1,6,5,7> - 1641680592U, // <3,5,1,7>: Cost 2 vext3 <5,1,7,3>, <5,1,7,3> - 1641754329U, // <3,5,1,u>: Cost 2 vext3 <5,1,u,3>, <5,1,u,3> - 3692578274U, // <3,5,2,0>: Cost 4 vext2 <0,2,3,5>, <2,0,5,3> - 2630116899U, // <3,5,2,1>: Cost 3 vext2 <2,1,3,5>, <2,1,3,5> 
- 3692578408U, // <3,5,2,2>: Cost 4 vext2 <0,2,3,5>, <2,2,2,2> - 2625472206U, // <3,5,2,3>: Cost 3 vext2 <1,3,3,5>, <2,3,4,5> - 2632107798U, // <3,5,2,4>: Cost 3 vext2 <2,4,3,5>, <2,4,3,5> - 2715938575U, // <3,5,2,5>: Cost 3 vext3 <5,2,5,3>, <5,2,5,3> - 3692578746U, // <3,5,2,6>: Cost 4 vext2 <0,2,3,5>, <2,6,3,7> - 2716086049U, // <3,5,2,7>: Cost 3 vext3 <5,2,7,3>, <5,2,7,3> - 2634762330U, // <3,5,2,u>: Cost 3 vext2 <2,u,3,5>, <2,u,3,5> - 3692578966U, // <3,5,3,0>: Cost 4 vext2 <0,2,3,5>, <3,0,1,2> - 2636089596U, // <3,5,3,1>: Cost 3 vext2 <3,1,3,5>, <3,1,3,5> - 3699214668U, // <3,5,3,2>: Cost 4 vext2 <1,3,3,5>, <3,2,3,4> - 2638080412U, // <3,5,3,3>: Cost 3 vext2 <3,4,3,5>, <3,3,3,3> - 2618837506U, // <3,5,3,4>: Cost 3 vext2 <0,2,3,5>, <3,4,5,6> - 2832844494U, // <3,5,3,5>: Cost 3 vuzpr <2,3,4,5>, <2,3,4,5> - 4033415682U, // <3,5,3,6>: Cost 4 vzipr <1,1,3,3>, <3,4,5,6> - 3095072054U, // <3,5,3,7>: Cost 3 vtrnr <1,3,1,3>, RHS - 3095072055U, // <3,5,3,u>: Cost 3 vtrnr <1,3,1,3>, RHS - 2600304742U, // <3,5,4,0>: Cost 3 vext1 , LHS - 3763580815U, // <3,5,4,1>: Cost 4 vext3 LHS, <5,4,1,5> - 2564474582U, // <3,5,4,2>: Cost 3 vext1 <2,3,5,4>, <2,3,5,4> - 3699879044U, // <3,5,4,3>: Cost 4 vext2 <1,4,3,5>, <4,3,5,0> - 2600308022U, // <3,5,4,4>: Cost 3 vext1 , RHS - 2618838326U, // <3,5,4,5>: Cost 3 vext2 <0,2,3,5>, RHS - 2772454710U, // <3,5,4,6>: Cost 3 vuzpl <3,4,5,6>, RHS - 1659228102U, // <3,5,4,7>: Cost 2 vext3 LHS, <5,4,7,6> - 1659228111U, // <3,5,4,u>: Cost 2 vext3 LHS, <5,4,u,6> - 2570453094U, // <3,5,5,0>: Cost 3 vext1 <3,3,5,5>, LHS - 2624810704U, // <3,5,5,1>: Cost 3 vext2 <1,2,3,5>, <5,1,7,3> - 2570454734U, // <3,5,5,2>: Cost 3 vext1 <3,3,5,5>, <2,3,4,5> - 2570455472U, // <3,5,5,3>: Cost 3 vext1 <3,3,5,5>, <3,3,5,5> - 2570456374U, // <3,5,5,4>: Cost 3 vext1 <3,3,5,5>, RHS - 1659228164U, // <3,5,5,5>: Cost 2 vext3 LHS, <5,5,5,5> - 2732969998U, // <3,5,5,6>: Cost 3 vext3 LHS, <5,5,6,6> - 1659228184U, // <3,5,5,7>: Cost 2 vext3 LHS, <5,5,7,7> - 1659228193U, // <3,5,5,u>: Cost 2 vext3 LHS, <5,5,u,7> - 2732970020U, // <3,5,6,0>: Cost 3 vext3 LHS, <5,6,0,1> - 2732970035U, // <3,5,6,1>: Cost 3 vext3 LHS, <5,6,1,7> - 2564490968U, // <3,5,6,2>: Cost 3 vext1 <2,3,5,6>, <2,3,5,6> - 2732970050U, // <3,5,6,3>: Cost 3 vext3 LHS, <5,6,3,4> - 2732970060U, // <3,5,6,4>: Cost 3 vext3 LHS, <5,6,4,5> - 2732970071U, // <3,5,6,5>: Cost 3 vext3 LHS, <5,6,5,7> - 2732970080U, // <3,5,6,6>: Cost 3 vext3 LHS, <5,6,6,7> - 1659228258U, // <3,5,6,7>: Cost 2 vext3 LHS, <5,6,7,0> - 1659228267U, // <3,5,6,u>: Cost 2 vext3 LHS, <5,6,u,0> - 1484783718U, // <3,5,7,0>: Cost 2 vext1 <1,3,5,7>, LHS - 1484784640U, // <3,5,7,1>: Cost 2 vext1 <1,3,5,7>, <1,3,5,7> - 2558527080U, // <3,5,7,2>: Cost 3 vext1 <1,3,5,7>, <2,2,2,2> - 2558527638U, // <3,5,7,3>: Cost 3 vext1 <1,3,5,7>, <3,0,1,2> - 1484786998U, // <3,5,7,4>: Cost 2 vext1 <1,3,5,7>, RHS - 1659228328U, // <3,5,7,5>: Cost 2 vext3 LHS, <5,7,5,7> - 2732970154U, // <3,5,7,6>: Cost 3 vext3 LHS, <5,7,6,0> - 2558531180U, // <3,5,7,7>: Cost 3 vext1 <1,3,5,7>, <7,7,7,7> - 1484789550U, // <3,5,7,u>: Cost 2 vext1 <1,3,5,7>, LHS - 1484791910U, // <3,5,u,0>: Cost 2 vext1 <1,3,5,u>, LHS - 1484792833U, // <3,5,u,1>: Cost 2 vext1 <1,3,5,u>, <1,3,5,u> - 2558535272U, // <3,5,u,2>: Cost 3 vext1 <1,3,5,u>, <2,2,2,2> - 2558535830U, // <3,5,u,3>: Cost 3 vext1 <1,3,5,u>, <3,0,1,2> - 1484795190U, // <3,5,u,4>: Cost 2 vext1 <1,3,5,u>, RHS - 1659228409U, // <3,5,u,5>: Cost 2 vext3 LHS, <5,u,5,7> - 2772457626U, // <3,5,u,6>: Cost 3 vuzpl <3,4,5,6>, RHS - 1646326023U, // <3,5,u,7>: Cost 2 vext3 
<5,u,7,3>, <5,u,7,3> - 1484797742U, // <3,5,u,u>: Cost 2 vext1 <1,3,5,u>, LHS - 2558541926U, // <3,6,0,0>: Cost 3 vext1 <1,3,6,0>, LHS - 2689839393U, // <3,6,0,1>: Cost 3 vext3 LHS, <6,0,1,2> - 2689839404U, // <3,6,0,2>: Cost 3 vext3 LHS, <6,0,2,4> - 3706519808U, // <3,6,0,3>: Cost 4 vext2 <2,5,3,6>, <0,3,1,4> - 2689839420U, // <3,6,0,4>: Cost 3 vext3 LHS, <6,0,4,2> - 2732970314U, // <3,6,0,5>: Cost 3 vext3 LHS, <6,0,5,7> - 2732970316U, // <3,6,0,6>: Cost 3 vext3 LHS, <6,0,6,0> - 2960313654U, // <3,6,0,7>: Cost 3 vzipr <1,2,3,0>, RHS - 2689839456U, // <3,6,0,u>: Cost 3 vext3 LHS, <6,0,u,2> - 3763581290U, // <3,6,1,0>: Cost 4 vext3 LHS, <6,1,0,3> - 3763581297U, // <3,6,1,1>: Cost 4 vext3 LHS, <6,1,1,1> - 2624816028U, // <3,6,1,2>: Cost 3 vext2 <1,2,3,6>, <1,2,3,6> - 3763581315U, // <3,6,1,3>: Cost 4 vext3 LHS, <6,1,3,1> - 2626143294U, // <3,6,1,4>: Cost 3 vext2 <1,4,3,6>, <1,4,3,6> - 3763581335U, // <3,6,1,5>: Cost 4 vext3 LHS, <6,1,5,3> - 2721321376U, // <3,6,1,6>: Cost 3 vext3 <6,1,6,3>, <6,1,6,3> - 2721395113U, // <3,6,1,7>: Cost 3 vext3 <6,1,7,3>, <6,1,7,3> - 2628797826U, // <3,6,1,u>: Cost 3 vext2 <1,u,3,6>, <1,u,3,6> - 2594390118U, // <3,6,2,0>: Cost 3 vext1 <7,3,6,2>, LHS - 2721616324U, // <3,6,2,1>: Cost 3 vext3 <6,2,1,3>, <6,2,1,3> - 2630788725U, // <3,6,2,2>: Cost 3 vext2 <2,2,3,6>, <2,2,3,6> - 3763581395U, // <3,6,2,3>: Cost 4 vext3 LHS, <6,2,3,0> - 2632115991U, // <3,6,2,4>: Cost 3 vext2 <2,4,3,6>, <2,4,3,6> - 2632779624U, // <3,6,2,5>: Cost 3 vext2 <2,5,3,6>, <2,5,3,6> - 2594394618U, // <3,6,2,6>: Cost 3 vext1 <7,3,6,2>, <6,2,7,3> - 1648316922U, // <3,6,2,7>: Cost 2 vext3 <6,2,7,3>, <6,2,7,3> - 1648390659U, // <3,6,2,u>: Cost 2 vext3 <6,2,u,3>, <6,2,u,3> - 3693914262U, // <3,6,3,0>: Cost 4 vext2 <0,4,3,6>, <3,0,1,2> - 3638281176U, // <3,6,3,1>: Cost 4 vext1 <2,3,6,3>, <1,3,1,3> - 3696568678U, // <3,6,3,2>: Cost 4 vext2 <0,u,3,6>, <3,2,6,3> - 2638088604U, // <3,6,3,3>: Cost 3 vext2 <3,4,3,6>, <3,3,3,3> - 2632780290U, // <3,6,3,4>: Cost 3 vext2 <2,5,3,6>, <3,4,5,6> - 3712494145U, // <3,6,3,5>: Cost 4 vext2 <3,5,3,6>, <3,5,3,6> - 3698559612U, // <3,6,3,6>: Cost 4 vext2 <1,2,3,6>, <3,6,1,2> - 2959674678U, // <3,6,3,7>: Cost 3 vzipr <1,1,3,3>, RHS - 2959674679U, // <3,6,3,u>: Cost 3 vzipr <1,1,3,3>, RHS - 3763581536U, // <3,6,4,0>: Cost 4 vext3 LHS, <6,4,0,6> - 2722943590U, // <3,6,4,1>: Cost 3 vext3 <6,4,1,3>, <6,4,1,3> - 2732970609U, // <3,6,4,2>: Cost 3 vext3 LHS, <6,4,2,5> - 3698560147U, // <3,6,4,3>: Cost 4 vext2 <1,2,3,6>, <4,3,6,6> - 2732970628U, // <3,6,4,4>: Cost 3 vext3 LHS, <6,4,4,6> - 2689839757U, // <3,6,4,5>: Cost 3 vext3 LHS, <6,4,5,6> - 2732970640U, // <3,6,4,6>: Cost 3 vext3 LHS, <6,4,6,0> - 2960346422U, // <3,6,4,7>: Cost 3 vzipr <1,2,3,4>, RHS - 2689839784U, // <3,6,4,u>: Cost 3 vext3 LHS, <6,4,u,6> - 2576498790U, // <3,6,5,0>: Cost 3 vext1 <4,3,6,5>, LHS - 3650241270U, // <3,6,5,1>: Cost 4 vext1 <4,3,6,5>, <1,0,3,2> - 2732970692U, // <3,6,5,2>: Cost 3 vext3 LHS, <6,5,2,7> - 2576501250U, // <3,6,5,3>: Cost 3 vext1 <4,3,6,5>, <3,4,5,6> - 2576501906U, // <3,6,5,4>: Cost 3 vext1 <4,3,6,5>, <4,3,6,5> - 3650244622U, // <3,6,5,5>: Cost 4 vext1 <4,3,6,5>, <5,5,6,6> - 4114633528U, // <3,6,5,6>: Cost 4 vtrnl <3,4,5,6>, <6,6,6,6> - 2732970735U, // <3,6,5,7>: Cost 3 vext3 LHS, <6,5,7,5> - 2576504622U, // <3,6,5,u>: Cost 3 vext1 <4,3,6,5>, LHS - 2732970749U, // <3,6,6,0>: Cost 3 vext3 LHS, <6,6,0,1> - 2724270856U, // <3,6,6,1>: Cost 3 vext3 <6,6,1,3>, <6,6,1,3> - 2624819706U, // <3,6,6,2>: Cost 3 vext2 <1,2,3,6>, <6,2,7,3> - 3656223234U, // <3,6,6,3>: Cost 4 vext1 <5,3,6,6>, 
<3,4,5,6> - 2732970788U, // <3,6,6,4>: Cost 3 vext3 LHS, <6,6,4,4> - 2732970800U, // <3,6,6,5>: Cost 3 vext3 LHS, <6,6,5,7> - 1659228984U, // <3,6,6,6>: Cost 2 vext3 LHS, <6,6,6,6> - 1659228994U, // <3,6,6,7>: Cost 2 vext3 LHS, <6,6,7,7> - 1659229003U, // <3,6,6,u>: Cost 2 vext3 LHS, <6,6,u,7> - 1659229006U, // <3,6,7,0>: Cost 2 vext3 LHS, <6,7,0,1> - 2558600201U, // <3,6,7,1>: Cost 3 vext1 <1,3,6,7>, <1,3,6,7> - 2558601146U, // <3,6,7,2>: Cost 3 vext1 <1,3,6,7>, <2,6,3,7> - 2725081963U, // <3,6,7,3>: Cost 3 vext3 <6,7,3,3>, <6,7,3,3> - 1659229046U, // <3,6,7,4>: Cost 2 vext3 LHS, <6,7,4,5> - 2715423611U, // <3,6,7,5>: Cost 3 vext3 <5,1,7,3>, <6,7,5,1> - 2722059141U, // <3,6,7,6>: Cost 3 vext3 <6,2,7,3>, <6,7,6,2> - 2962361654U, // <3,6,7,7>: Cost 3 vzipr <1,5,3,7>, RHS - 1659229078U, // <3,6,7,u>: Cost 2 vext3 LHS, <6,7,u,1> - 1659229087U, // <3,6,u,0>: Cost 2 vext3 LHS, <6,u,0,1> - 2689840041U, // <3,6,u,1>: Cost 3 vext3 LHS, <6,u,1,2> - 2558609339U, // <3,6,u,2>: Cost 3 vext1 <1,3,6,u>, <2,6,3,u> - 2576525853U, // <3,6,u,3>: Cost 3 vext1 <4,3,6,u>, <3,4,u,6> - 1659229127U, // <3,6,u,4>: Cost 2 vext3 LHS, <6,u,4,5> - 2689840081U, // <3,6,u,5>: Cost 3 vext3 LHS, <6,u,5,6> - 1659228984U, // <3,6,u,6>: Cost 2 vext3 LHS, <6,6,6,6> - 1652298720U, // <3,6,u,7>: Cost 2 vext3 <6,u,7,3>, <6,u,7,3> - 1659229159U, // <3,6,u,u>: Cost 2 vext3 LHS, <6,u,u,1> - 2626813952U, // <3,7,0,0>: Cost 3 vext2 <1,5,3,7>, <0,0,0,0> - 1553072230U, // <3,7,0,1>: Cost 2 vext2 <1,5,3,7>, LHS - 2626814116U, // <3,7,0,2>: Cost 3 vext2 <1,5,3,7>, <0,2,0,2> - 3700556028U, // <3,7,0,3>: Cost 4 vext2 <1,5,3,7>, <0,3,1,0> - 2626814290U, // <3,7,0,4>: Cost 3 vext2 <1,5,3,7>, <0,4,1,5> - 2582507375U, // <3,7,0,5>: Cost 3 vext1 <5,3,7,0>, <5,3,7,0> - 2588480072U, // <3,7,0,6>: Cost 3 vext1 <6,3,7,0>, <6,3,7,0> - 2732971055U, // <3,7,0,7>: Cost 3 vext3 LHS, <7,0,7,1> - 1553072797U, // <3,7,0,u>: Cost 2 vext2 <1,5,3,7>, LHS - 2626814710U, // <3,7,1,0>: Cost 3 vext2 <1,5,3,7>, <1,0,3,2> - 2626814772U, // <3,7,1,1>: Cost 3 vext2 <1,5,3,7>, <1,1,1,1> - 2626814870U, // <3,7,1,2>: Cost 3 vext2 <1,5,3,7>, <1,2,3,0> - 2625487854U, // <3,7,1,3>: Cost 3 vext2 <1,3,3,7>, <1,3,3,7> - 2582514998U, // <3,7,1,4>: Cost 3 vext1 <5,3,7,1>, RHS - 1553073296U, // <3,7,1,5>: Cost 2 vext2 <1,5,3,7>, <1,5,3,7> - 2627478753U, // <3,7,1,6>: Cost 3 vext2 <1,6,3,7>, <1,6,3,7> - 2727367810U, // <3,7,1,7>: Cost 3 vext3 <7,1,7,3>, <7,1,7,3> - 1555064195U, // <3,7,1,u>: Cost 2 vext2 <1,u,3,7>, <1,u,3,7> - 2588491878U, // <3,7,2,0>: Cost 3 vext1 <6,3,7,2>, LHS - 3700557318U, // <3,7,2,1>: Cost 4 vext2 <1,5,3,7>, <2,1,0,3> - 2626815592U, // <3,7,2,2>: Cost 3 vext2 <1,5,3,7>, <2,2,2,2> - 2626815654U, // <3,7,2,3>: Cost 3 vext2 <1,5,3,7>, <2,3,0,1> - 2588495158U, // <3,7,2,4>: Cost 3 vext1 <6,3,7,2>, RHS - 2632787817U, // <3,7,2,5>: Cost 3 vext2 <2,5,3,7>, <2,5,3,7> - 1559709626U, // <3,7,2,6>: Cost 2 vext2 <2,6,3,7>, <2,6,3,7> - 2728031443U, // <3,7,2,7>: Cost 3 vext3 <7,2,7,3>, <7,2,7,3> - 1561036892U, // <3,7,2,u>: Cost 2 vext2 <2,u,3,7>, <2,u,3,7> - 2626816150U, // <3,7,3,0>: Cost 3 vext2 <1,5,3,7>, <3,0,1,2> - 2626816268U, // <3,7,3,1>: Cost 3 vext2 <1,5,3,7>, <3,1,5,3> - 2633451878U, // <3,7,3,2>: Cost 3 vext2 <2,6,3,7>, <3,2,6,3> - 2626816412U, // <3,7,3,3>: Cost 3 vext2 <1,5,3,7>, <3,3,3,3> - 2626816514U, // <3,7,3,4>: Cost 3 vext2 <1,5,3,7>, <3,4,5,6> - 2638760514U, // <3,7,3,5>: Cost 3 vext2 <3,5,3,7>, <3,5,3,7> - 2639424147U, // <3,7,3,6>: Cost 3 vext2 <3,6,3,7>, <3,6,3,7> - 2826961920U, // <3,7,3,7>: Cost 3 vuzpr <1,3,5,7>, <1,3,5,7> - 2626816798U, 
// <3,7,3,u>: Cost 3 vext2 <1,5,3,7>, <3,u,1,2> - 2582536294U, // <3,7,4,0>: Cost 3 vext1 <5,3,7,4>, LHS - 2582537360U, // <3,7,4,1>: Cost 3 vext1 <5,3,7,4>, <1,5,3,7> - 2588510138U, // <3,7,4,2>: Cost 3 vext1 <6,3,7,4>, <2,6,3,7> - 3700558996U, // <3,7,4,3>: Cost 4 vext2 <1,5,3,7>, <4,3,6,7> - 2582539574U, // <3,7,4,4>: Cost 3 vext1 <5,3,7,4>, RHS - 1553075510U, // <3,7,4,5>: Cost 2 vext2 <1,5,3,7>, RHS - 2588512844U, // <3,7,4,6>: Cost 3 vext1 <6,3,7,4>, <6,3,7,4> - 2564625766U, // <3,7,4,7>: Cost 3 vext1 <2,3,7,4>, <7,4,5,6> - 1553075753U, // <3,7,4,u>: Cost 2 vext2 <1,5,3,7>, RHS - 2732971398U, // <3,7,5,0>: Cost 3 vext3 LHS, <7,5,0,2> - 2626817744U, // <3,7,5,1>: Cost 3 vext2 <1,5,3,7>, <5,1,7,3> - 3700559649U, // <3,7,5,2>: Cost 4 vext2 <1,5,3,7>, <5,2,7,3> - 2626817903U, // <3,7,5,3>: Cost 3 vext2 <1,5,3,7>, <5,3,7,0> - 2258728203U, // <3,7,5,4>: Cost 3 vrev <7,3,4,5> - 2732971446U, // <3,7,5,5>: Cost 3 vext3 LHS, <7,5,5,5> - 2732971457U, // <3,7,5,6>: Cost 3 vext3 LHS, <7,5,6,7> - 2826964278U, // <3,7,5,7>: Cost 3 vuzpr <1,3,5,7>, RHS - 2826964279U, // <3,7,5,u>: Cost 3 vuzpr <1,3,5,7>, RHS - 2732971478U, // <3,7,6,0>: Cost 3 vext3 LHS, <7,6,0,1> - 2732971486U, // <3,7,6,1>: Cost 3 vext3 LHS, <7,6,1,0> - 2633454074U, // <3,7,6,2>: Cost 3 vext2 <2,6,3,7>, <6,2,7,3> - 2633454152U, // <3,7,6,3>: Cost 3 vext2 <2,6,3,7>, <6,3,7,0> - 2732971518U, // <3,7,6,4>: Cost 3 vext3 LHS, <7,6,4,5> - 2732971526U, // <3,7,6,5>: Cost 3 vext3 LHS, <7,6,5,4> - 2732971537U, // <3,7,6,6>: Cost 3 vext3 LHS, <7,6,6,6> - 2732971540U, // <3,7,6,7>: Cost 3 vext3 LHS, <7,6,7,0> - 2726041124U, // <3,7,6,u>: Cost 3 vext3 <6,u,7,3>, <7,6,u,7> - 2570616934U, // <3,7,7,0>: Cost 3 vext1 <3,3,7,7>, LHS - 2570617856U, // <3,7,7,1>: Cost 3 vext1 <3,3,7,7>, <1,3,5,7> - 2564646635U, // <3,7,7,2>: Cost 3 vext1 <2,3,7,7>, <2,3,7,7> - 2570619332U, // <3,7,7,3>: Cost 3 vext1 <3,3,7,7>, <3,3,7,7> - 2570620214U, // <3,7,7,4>: Cost 3 vext1 <3,3,7,7>, RHS - 2582564726U, // <3,7,7,5>: Cost 3 vext1 <5,3,7,7>, <5,3,7,7> - 2588537423U, // <3,7,7,6>: Cost 3 vext1 <6,3,7,7>, <6,3,7,7> - 1659229804U, // <3,7,7,7>: Cost 2 vext3 LHS, <7,7,7,7> - 1659229804U, // <3,7,7,u>: Cost 2 vext3 LHS, <7,7,7,7> - 2626819795U, // <3,7,u,0>: Cost 3 vext2 <1,5,3,7>, - 1553078062U, // <3,7,u,1>: Cost 2 vext2 <1,5,3,7>, LHS - 2626819973U, // <3,7,u,2>: Cost 3 vext2 <1,5,3,7>, - 2826961565U, // <3,7,u,3>: Cost 3 vuzpr <1,3,5,7>, LHS - 2626820159U, // <3,7,u,4>: Cost 3 vext2 <1,5,3,7>, - 1553078426U, // <3,7,u,5>: Cost 2 vext2 <1,5,3,7>, RHS - 1595545808U, // <3,7,u,6>: Cost 2 vext2 , - 1659229804U, // <3,7,u,7>: Cost 2 vext3 LHS, <7,7,7,7> - 1553078629U, // <3,7,u,u>: Cost 2 vext2 <1,5,3,7>, LHS - 1611448320U, // <3,u,0,0>: Cost 2 vext3 LHS, <0,0,0,0> - 1611896531U, // <3,u,0,1>: Cost 2 vext3 LHS, - 1659672284U, // <3,u,0,2>: Cost 2 vext3 LHS, - 1616099045U, // <3,u,0,3>: Cost 2 vext3 LHS, - 2685638381U, // <3,u,0,4>: Cost 3 vext3 LHS, - 1663874806U, // <3,u,0,5>: Cost 2 vext3 LHS, - 1663874816U, // <3,u,0,6>: Cost 2 vext3 LHS, - 2960313672U, // <3,u,0,7>: Cost 3 vzipr <1,2,3,0>, RHS - 1611896594U, // <3,u,0,u>: Cost 2 vext3 LHS, - 1549763324U, // <3,u,1,0>: Cost 2 vext2 <1,0,3,u>, <1,0,3,u> - 1550426957U, // <3,u,1,1>: Cost 2 vext2 <1,1,3,u>, <1,1,3,u> - 537712430U, // <3,u,1,2>: Cost 1 vext3 LHS, LHS - 1616541495U, // <3,u,1,3>: Cost 2 vext3 LHS, - 1490930998U, // <3,u,1,4>: Cost 2 vext1 <2,3,u,1>, RHS - 1553081489U, // <3,u,1,5>: Cost 2 vext2 <1,5,3,u>, <1,5,3,u> - 2627486946U, // <3,u,1,6>: Cost 3 vext2 <1,6,3,u>, <1,6,3,u> - 1659230043U, // 
<3,u,1,7>: Cost 2 vext3 LHS, - 537712484U, // <3,u,1,u>: Cost 1 vext3 LHS, LHS - 1611890852U, // <3,u,2,0>: Cost 2 vext3 LHS, <0,2,0,2> - 2624833102U, // <3,u,2,1>: Cost 3 vext2 <1,2,3,u>, <2,1,u,3> - 1557063287U, // <3,u,2,2>: Cost 2 vext2 <2,2,3,u>, <2,2,3,u> - 1616099205U, // <3,u,2,3>: Cost 2 vext3 LHS, - 1611890892U, // <3,u,2,4>: Cost 2 vext3 LHS, <0,2,4,6> - 2689841054U, // <3,u,2,5>: Cost 3 vext3 LHS, - 1559717819U, // <3,u,2,6>: Cost 2 vext2 <2,6,3,u>, <2,6,3,u> - 1659230124U, // <3,u,2,7>: Cost 2 vext3 LHS, - 1616541618U, // <3,u,2,u>: Cost 2 vext3 LHS, - 1611896764U, // <3,u,3,0>: Cost 2 vext3 LHS, - 1484973079U, // <3,u,3,1>: Cost 2 vext1 <1,3,u,3>, <1,3,u,3> - 2685638607U, // <3,u,3,2>: Cost 3 vext3 LHS, - 336380006U, // <3,u,3,3>: Cost 1 vdup3 LHS - 1611896804U, // <3,u,3,4>: Cost 2 vext3 LHS, - 1616541679U, // <3,u,3,5>: Cost 2 vext3 LHS, - 2690283512U, // <3,u,3,6>: Cost 3 vext3 LHS, - 2959674696U, // <3,u,3,7>: Cost 3 vzipr <1,1,3,3>, RHS - 336380006U, // <3,u,3,u>: Cost 1 vdup3 LHS - 2558722150U, // <3,u,4,0>: Cost 3 vext1 <1,3,u,4>, LHS - 1659672602U, // <3,u,4,1>: Cost 2 vext3 LHS, - 1659672612U, // <3,u,4,2>: Cost 2 vext3 LHS, - 2689841196U, // <3,u,4,3>: Cost 3 vext3 LHS, - 1659227344U, // <3,u,4,4>: Cost 2 vext3 LHS, <4,4,4,4> - 1611896895U, // <3,u,4,5>: Cost 2 vext3 LHS, - 1663875144U, // <3,u,4,6>: Cost 2 vext3 LHS, - 1659230289U, // <3,u,4,7>: Cost 2 vext3 LHS, - 1611896922U, // <3,u,4,u>: Cost 2 vext3 LHS, - 1490960486U, // <3,u,5,0>: Cost 2 vext1 <2,3,u,5>, LHS - 2689841261U, // <3,u,5,1>: Cost 3 vext3 LHS, - 1490962162U, // <3,u,5,2>: Cost 2 vext1 <2,3,u,5>, <2,3,u,5> - 1616541823U, // <3,u,5,3>: Cost 2 vext3 LHS, - 1490963766U, // <3,u,5,4>: Cost 2 vext1 <2,3,u,5>, RHS - 1659228164U, // <3,u,5,5>: Cost 2 vext3 LHS, <5,5,5,5> - 537712794U, // <3,u,5,6>: Cost 1 vext3 LHS, RHS - 1659230371U, // <3,u,5,7>: Cost 2 vext3 LHS, - 537712812U, // <3,u,5,u>: Cost 1 vext3 LHS, RHS - 2689841327U, // <3,u,6,0>: Cost 3 vext3 LHS, - 2558739482U, // <3,u,6,1>: Cost 3 vext1 <1,3,u,6>, <1,3,u,6> - 2689841351U, // <3,u,6,2>: Cost 3 vext3 LHS, - 1616099536U, // <3,u,6,3>: Cost 2 vext3 LHS, - 1659227508U, // <3,u,6,4>: Cost 2 vext3 LHS, <4,6,4,6> - 2690283746U, // <3,u,6,5>: Cost 3 vext3 LHS, - 1659228984U, // <3,u,6,6>: Cost 2 vext3 LHS, <6,6,6,6> - 1659230445U, // <3,u,6,7>: Cost 2 vext3 LHS, - 1616099581U, // <3,u,6,u>: Cost 2 vext3 LHS, - 1485004902U, // <3,u,7,0>: Cost 2 vext1 <1,3,u,7>, LHS - 1485005851U, // <3,u,7,1>: Cost 2 vext1 <1,3,u,7>, <1,3,u,7> - 2558748264U, // <3,u,7,2>: Cost 3 vext1 <1,3,u,7>, <2,2,2,2> - 3095397021U, // <3,u,7,3>: Cost 3 vtrnr <1,3,5,7>, LHS - 1485008182U, // <3,u,7,4>: Cost 2 vext1 <1,3,u,7>, RHS - 1659228328U, // <3,u,7,5>: Cost 2 vext3 LHS, <5,7,5,7> - 2722060599U, // <3,u,7,6>: Cost 3 vext3 <6,2,7,3>, - 1659229804U, // <3,u,7,7>: Cost 2 vext3 LHS, <7,7,7,7> - 1485010734U, // <3,u,7,u>: Cost 2 vext1 <1,3,u,7>, LHS - 1616099665U, // <3,u,u,0>: Cost 2 vext3 LHS, - 1611897179U, // <3,u,u,1>: Cost 2 vext3 LHS, - 537712997U, // <3,u,u,2>: Cost 1 vext3 LHS, LHS - 336380006U, // <3,u,u,3>: Cost 1 vdup3 LHS - 1616099705U, // <3,u,u,4>: Cost 2 vext3 LHS, - 1611897219U, // <3,u,u,5>: Cost 2 vext3 LHS, - 537713037U, // <3,u,u,6>: Cost 1 vext3 LHS, RHS - 1659230607U, // <3,u,u,7>: Cost 2 vext3 LHS, - 537713051U, // <3,u,u,u>: Cost 1 vext3 LHS, LHS - 2691907584U, // <4,0,0,0>: Cost 3 vext3 <1,2,3,4>, <0,0,0,0> - 2691907594U, // <4,0,0,1>: Cost 3 vext3 <1,2,3,4>, <0,0,1,1> - 2691907604U, // <4,0,0,2>: Cost 3 vext3 <1,2,3,4>, <0,0,2,2> - 3709862144U, // 
<4,0,0,3>: Cost 4 vext2 <3,1,4,0>, <0,3,1,4> - 2684682280U, // <4,0,0,4>: Cost 3 vext3 <0,0,4,4>, <0,0,4,4> - 3694600633U, // <4,0,0,5>: Cost 4 vext2 <0,5,4,0>, <0,5,4,0> - 3291431290U, // <4,0,0,6>: Cost 4 vrev <0,4,6,0> - 3668342067U, // <4,0,0,7>: Cost 4 vext1 <7,4,0,0>, <7,4,0,0> - 2691907657U, // <4,0,0,u>: Cost 3 vext3 <1,2,3,4>, <0,0,u,1> - 2570715238U, // <4,0,1,0>: Cost 3 vext1 <3,4,0,1>, LHS - 2570716058U, // <4,0,1,1>: Cost 3 vext1 <3,4,0,1>, <1,2,3,4> - 1618165862U, // <4,0,1,2>: Cost 2 vext3 <1,2,3,4>, LHS - 2570717648U, // <4,0,1,3>: Cost 3 vext1 <3,4,0,1>, <3,4,0,1> - 2570718518U, // <4,0,1,4>: Cost 3 vext1 <3,4,0,1>, RHS - 2594607206U, // <4,0,1,5>: Cost 3 vext1 <7,4,0,1>, <5,6,7,4> - 3662377563U, // <4,0,1,6>: Cost 4 vext1 <6,4,0,1>, <6,4,0,1> - 2594608436U, // <4,0,1,7>: Cost 3 vext1 <7,4,0,1>, <7,4,0,1> - 1618165916U, // <4,0,1,u>: Cost 2 vext3 <1,2,3,4>, LHS - 2685714598U, // <4,0,2,0>: Cost 3 vext3 <0,2,0,4>, <0,2,0,4> - 3759530159U, // <4,0,2,1>: Cost 4 vext3 <0,2,1,4>, <0,2,1,4> - 2685862072U, // <4,0,2,2>: Cost 3 vext3 <0,2,2,4>, <0,2,2,4> - 2631476937U, // <4,0,2,3>: Cost 3 vext2 <2,3,4,0>, <2,3,4,0> - 2685714636U, // <4,0,2,4>: Cost 3 vext3 <0,2,0,4>, <0,2,4,6> - 3765649622U, // <4,0,2,5>: Cost 4 vext3 <1,2,3,4>, <0,2,5,7> - 2686157020U, // <4,0,2,6>: Cost 3 vext3 <0,2,6,4>, <0,2,6,4> - 3668358453U, // <4,0,2,7>: Cost 4 vext1 <7,4,0,2>, <7,4,0,2> - 2686304494U, // <4,0,2,u>: Cost 3 vext3 <0,2,u,4>, <0,2,u,4> - 3632529510U, // <4,0,3,0>: Cost 4 vext1 <1,4,0,3>, LHS - 2686451968U, // <4,0,3,1>: Cost 3 vext3 <0,3,1,4>, <0,3,1,4> - 2686525705U, // <4,0,3,2>: Cost 3 vext3 <0,3,2,4>, <0,3,2,4> - 3760341266U, // <4,0,3,3>: Cost 4 vext3 <0,3,3,4>, <0,3,3,4> - 3632532790U, // <4,0,3,4>: Cost 4 vext1 <1,4,0,3>, RHS - 3913254606U, // <4,0,3,5>: Cost 4 vuzpr <3,4,5,0>, <2,3,4,5> - 3705219740U, // <4,0,3,6>: Cost 4 vext2 <2,3,4,0>, <3,6,4,7> - 3713845990U, // <4,0,3,7>: Cost 4 vext2 <3,7,4,0>, <3,7,4,0> - 2686451968U, // <4,0,3,u>: Cost 3 vext3 <0,3,1,4>, <0,3,1,4> - 2552823910U, // <4,0,4,0>: Cost 3 vext1 <0,4,0,4>, LHS - 2691907922U, // <4,0,4,1>: Cost 3 vext3 <1,2,3,4>, <0,4,1,5> - 2691907932U, // <4,0,4,2>: Cost 3 vext3 <1,2,3,4>, <0,4,2,6> - 3626567830U, // <4,0,4,3>: Cost 4 vext1 <0,4,0,4>, <3,0,1,2> - 2552827190U, // <4,0,4,4>: Cost 3 vext1 <0,4,0,4>, RHS - 2631478582U, // <4,0,4,5>: Cost 3 vext2 <2,3,4,0>, RHS - 3626570017U, // <4,0,4,6>: Cost 4 vext1 <0,4,0,4>, <6,0,1,2> - 3668374839U, // <4,0,4,7>: Cost 4 vext1 <7,4,0,4>, <7,4,0,4> - 2552829742U, // <4,0,4,u>: Cost 3 vext1 <0,4,0,4>, LHS - 2558804070U, // <4,0,5,0>: Cost 3 vext1 <1,4,0,5>, LHS - 1839644774U, // <4,0,5,1>: Cost 2 vzipl RHS, LHS - 2913386660U, // <4,0,5,2>: Cost 3 vzipl RHS, <0,2,0,2> - 2570750420U, // <4,0,5,3>: Cost 3 vext1 <3,4,0,5>, <3,4,0,5> - 2558807350U, // <4,0,5,4>: Cost 3 vext1 <1,4,0,5>, RHS - 3987128750U, // <4,0,5,5>: Cost 4 vzipl RHS, <0,5,2,7> - 3987128822U, // <4,0,5,6>: Cost 4 vzipl RHS, <0,6,1,7> - 2594641208U, // <4,0,5,7>: Cost 3 vext1 <7,4,0,5>, <7,4,0,5> - 1839645341U, // <4,0,5,u>: Cost 2 vzipl RHS, LHS - 2552840294U, // <4,0,6,0>: Cost 3 vext1 <0,4,0,6>, LHS - 3047604234U, // <4,0,6,1>: Cost 3 vtrnl RHS, <0,0,1,1> - 1973862502U, // <4,0,6,2>: Cost 2 vtrnl RHS, LHS - 2570758613U, // <4,0,6,3>: Cost 3 vext1 <3,4,0,6>, <3,4,0,6> - 2552843574U, // <4,0,6,4>: Cost 3 vext1 <0,4,0,6>, RHS - 2217664887U, // <4,0,6,5>: Cost 3 vrev <0,4,5,6> - 3662418528U, // <4,0,6,6>: Cost 4 vext1 <6,4,0,6>, <6,4,0,6> - 2658022257U, // <4,0,6,7>: Cost 3 vext2 <6,7,4,0>, <6,7,4,0> - 1973862556U, // 
<4,0,6,u>: Cost 2 vtrnl RHS, LHS - 3731764218U, // <4,0,7,0>: Cost 4 vext2 <6,7,4,0>, <7,0,1,2> - 3988324454U, // <4,0,7,1>: Cost 4 vzipl <4,7,5,0>, LHS - 4122034278U, // <4,0,7,2>: Cost 4 vtrnl <4,6,7,1>, LHS - 3735082246U, // <4,0,7,3>: Cost 4 vext2 <7,3,4,0>, <7,3,4,0> - 3731764536U, // <4,0,7,4>: Cost 4 vext2 <6,7,4,0>, <7,4,0,5> - 3937145718U, // <4,0,7,5>: Cost 4 vuzpr <7,4,5,0>, <6,7,4,5> - 3737073145U, // <4,0,7,6>: Cost 4 vext2 <7,6,4,0>, <7,6,4,0> - 3731764844U, // <4,0,7,7>: Cost 4 vext2 <6,7,4,0>, <7,7,7,7> - 4122034332U, // <4,0,7,u>: Cost 4 vtrnl <4,6,7,1>, LHS - 2552856678U, // <4,0,u,0>: Cost 3 vext1 <0,4,0,u>, LHS - 1841635430U, // <4,0,u,1>: Cost 2 vzipl RHS, LHS - 1618166429U, // <4,0,u,2>: Cost 2 vext3 <1,2,3,4>, LHS - 2570774999U, // <4,0,u,3>: Cost 3 vext1 <3,4,0,u>, <3,4,0,u> - 2552859958U, // <4,0,u,4>: Cost 3 vext1 <0,4,0,u>, RHS - 2631481498U, // <4,0,u,5>: Cost 3 vext2 <2,3,4,0>, RHS - 2686157020U, // <4,0,u,6>: Cost 3 vext3 <0,2,6,4>, <0,2,6,4> - 2594665787U, // <4,0,u,7>: Cost 3 vext1 <7,4,0,u>, <7,4,0,u> - 1618166483U, // <4,0,u,u>: Cost 2 vext3 <1,2,3,4>, LHS - 2617548837U, // <4,1,0,0>: Cost 3 vext2 <0,0,4,1>, <0,0,4,1> - 2622857318U, // <4,1,0,1>: Cost 3 vext2 <0,u,4,1>, LHS - 3693281484U, // <4,1,0,2>: Cost 4 vext2 <0,3,4,1>, <0,2,4,6> - 2691908342U, // <4,1,0,3>: Cost 3 vext3 <1,2,3,4>, <1,0,3,2> - 2622857554U, // <4,1,0,4>: Cost 3 vext2 <0,u,4,1>, <0,4,1,5> - 3764470538U, // <4,1,0,5>: Cost 4 vext3 <1,0,5,4>, <1,0,5,4> - 3695272459U, // <4,1,0,6>: Cost 4 vext2 <0,6,4,1>, <0,6,4,1> - 3733094980U, // <4,1,0,7>: Cost 4 vext2 <7,0,4,1>, <0,7,1,4> - 2622857885U, // <4,1,0,u>: Cost 3 vext2 <0,u,4,1>, LHS - 3696599798U, // <4,1,1,0>: Cost 4 vext2 <0,u,4,1>, <1,0,3,2> - 2691097399U, // <4,1,1,1>: Cost 3 vext3 <1,1,1,4>, <1,1,1,4> - 2631484314U, // <4,1,1,2>: Cost 3 vext2 <2,3,4,1>, <1,2,3,4> - 2691908424U, // <4,1,1,3>: Cost 3 vext3 <1,2,3,4>, <1,1,3,3> - 3696600125U, // <4,1,1,4>: Cost 4 vext2 <0,u,4,1>, <1,4,3,5> - 3696600175U, // <4,1,1,5>: Cost 4 vext2 <0,u,4,1>, <1,5,0,1> - 3696600307U, // <4,1,1,6>: Cost 4 vext2 <0,u,4,1>, <1,6,5,7> - 3668423997U, // <4,1,1,7>: Cost 4 vext1 <7,4,1,1>, <7,4,1,1> - 2691908469U, // <4,1,1,u>: Cost 3 vext3 <1,2,3,4>, <1,1,u,3> - 2570797158U, // <4,1,2,0>: Cost 3 vext1 <3,4,1,2>, LHS - 2570797978U, // <4,1,2,1>: Cost 3 vext1 <3,4,1,2>, <1,2,3,4> - 3696600680U, // <4,1,2,2>: Cost 4 vext2 <0,u,4,1>, <2,2,2,2> - 1618166682U, // <4,1,2,3>: Cost 2 vext3 <1,2,3,4>, <1,2,3,4> - 2570800438U, // <4,1,2,4>: Cost 3 vext1 <3,4,1,2>, RHS - 3765650347U, // <4,1,2,5>: Cost 4 vext3 <1,2,3,4>, <1,2,5,3> - 3696601018U, // <4,1,2,6>: Cost 4 vext2 <0,u,4,1>, <2,6,3,7> - 3668432190U, // <4,1,2,7>: Cost 4 vext1 <7,4,1,2>, <7,4,1,2> - 1618535367U, // <4,1,2,u>: Cost 2 vext3 <1,2,u,4>, <1,2,u,4> - 2564833382U, // <4,1,3,0>: Cost 3 vext1 <2,4,1,3>, LHS - 2691908568U, // <4,1,3,1>: Cost 3 vext3 <1,2,3,4>, <1,3,1,3> - 2691908578U, // <4,1,3,2>: Cost 3 vext3 <1,2,3,4>, <1,3,2,4> - 2692572139U, // <4,1,3,3>: Cost 3 vext3 <1,3,3,4>, <1,3,3,4> - 2564836662U, // <4,1,3,4>: Cost 3 vext1 <2,4,1,3>, RHS - 2691908608U, // <4,1,3,5>: Cost 3 vext3 <1,2,3,4>, <1,3,5,7> - 2588725862U, // <4,1,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3> - 3662468090U, // <4,1,3,7>: Cost 4 vext1 <6,4,1,3>, <7,0,1,2> - 2691908631U, // <4,1,3,u>: Cost 3 vext3 <1,2,3,4>, <1,3,u,3> - 3760194590U, // <4,1,4,0>: Cost 4 vext3 <0,3,1,4>, <1,4,0,1> - 3693947874U, // <4,1,4,1>: Cost 4 vext2 <0,4,4,1>, <4,1,5,0> - 3765650484U, // <4,1,4,2>: Cost 4 vext3 <1,2,3,4>, <1,4,2,5> - 3113877606U, // 
<4,1,4,3>: Cost 3 vtrnr <4,4,4,4>, LHS - 3760194630U, // <4,1,4,4>: Cost 4 vext3 <0,3,1,4>, <1,4,4,5> - 2622860598U, // <4,1,4,5>: Cost 3 vext2 <0,u,4,1>, RHS - 3297436759U, // <4,1,4,6>: Cost 4 vrev <1,4,6,4> - 3800007772U, // <4,1,4,7>: Cost 4 vext3 <7,0,1,4>, <1,4,7,0> - 2622860841U, // <4,1,4,u>: Cost 3 vext2 <0,u,4,1>, RHS - 1479164006U, // <4,1,5,0>: Cost 2 vext1 <0,4,1,5>, LHS - 2552906486U, // <4,1,5,1>: Cost 3 vext1 <0,4,1,5>, <1,0,3,2> - 2552907299U, // <4,1,5,2>: Cost 3 vext1 <0,4,1,5>, <2,1,3,5> - 2552907926U, // <4,1,5,3>: Cost 3 vext1 <0,4,1,5>, <3,0,1,2> - 1479167286U, // <4,1,5,4>: Cost 2 vext1 <0,4,1,5>, RHS - 2913387664U, // <4,1,5,5>: Cost 3 vzipl RHS, <1,5,3,7> - 2600686074U, // <4,1,5,6>: Cost 3 vext1 , <6,2,7,3> - 2600686586U, // <4,1,5,7>: Cost 3 vext1 , <7,0,1,2> - 1479169838U, // <4,1,5,u>: Cost 2 vext1 <0,4,1,5>, LHS - 2552914022U, // <4,1,6,0>: Cost 3 vext1 <0,4,1,6>, LHS - 2558886708U, // <4,1,6,1>: Cost 3 vext1 <1,4,1,6>, <1,1,1,1> - 4028205206U, // <4,1,6,2>: Cost 4 vzipr <0,2,4,6>, <3,0,1,2> - 3089858662U, // <4,1,6,3>: Cost 3 vtrnr <0,4,2,6>, LHS - 2552917302U, // <4,1,6,4>: Cost 3 vext1 <0,4,1,6>, RHS - 2223637584U, // <4,1,6,5>: Cost 3 vrev <1,4,5,6> - 4121347081U, // <4,1,6,6>: Cost 4 vtrnl RHS, <1,3,6,7> - 3721155406U, // <4,1,6,7>: Cost 4 vext2 <5,0,4,1>, <6,7,0,1> - 2552919854U, // <4,1,6,u>: Cost 3 vext1 <0,4,1,6>, LHS - 2659357716U, // <4,1,7,0>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1> - 3733763173U, // <4,1,7,1>: Cost 4 vext2 <7,1,4,1>, <7,1,4,1> - 3734426806U, // <4,1,7,2>: Cost 4 vext2 <7,2,4,1>, <7,2,4,1> - 2695226671U, // <4,1,7,3>: Cost 3 vext3 <1,7,3,4>, <1,7,3,4> - 3721155942U, // <4,1,7,4>: Cost 4 vext2 <5,0,4,1>, <7,4,5,6> - 3721155976U, // <4,1,7,5>: Cost 4 vext2 <5,0,4,1>, <7,5,0,4> - 3662500458U, // <4,1,7,6>: Cost 4 vext1 <6,4,1,7>, <6,4,1,7> - 3721156204U, // <4,1,7,7>: Cost 4 vext2 <5,0,4,1>, <7,7,7,7> - 2659357716U, // <4,1,7,u>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1> - 1479188582U, // <4,1,u,0>: Cost 2 vext1 <0,4,1,u>, LHS - 2552931062U, // <4,1,u,1>: Cost 3 vext1 <0,4,1,u>, <1,0,3,2> - 2552931944U, // <4,1,u,2>: Cost 3 vext1 <0,4,1,u>, <2,2,2,2> - 1622148480U, // <4,1,u,3>: Cost 2 vext3 <1,u,3,4>, <1,u,3,4> - 1479191862U, // <4,1,u,4>: Cost 2 vext1 <0,4,1,u>, RHS - 2622863514U, // <4,1,u,5>: Cost 3 vext2 <0,u,4,1>, RHS - 2588725862U, // <4,1,u,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3> - 2600686586U, // <4,1,u,7>: Cost 3 vext1 , <7,0,1,2> - 1479194414U, // <4,1,u,u>: Cost 2 vext1 <0,4,1,u>, LHS - 2617557030U, // <4,2,0,0>: Cost 3 vext2 <0,0,4,2>, <0,0,4,2> - 2622865510U, // <4,2,0,1>: Cost 3 vext2 <0,u,4,2>, LHS - 2622865612U, // <4,2,0,2>: Cost 3 vext2 <0,u,4,2>, <0,2,4,6> - 3693289753U, // <4,2,0,3>: Cost 4 vext2 <0,3,4,2>, <0,3,4,2> - 2635473244U, // <4,2,0,4>: Cost 3 vext2 <3,0,4,2>, <0,4,2,6> - 3765650918U, // <4,2,0,5>: Cost 4 vext3 <1,2,3,4>, <2,0,5,7> - 2696775148U, // <4,2,0,6>: Cost 3 vext3 <2,0,6,4>, <2,0,6,4> - 3695944285U, // <4,2,0,7>: Cost 4 vext2 <0,7,4,2>, <0,7,4,2> - 2622866077U, // <4,2,0,u>: Cost 3 vext2 <0,u,4,2>, LHS - 3696607990U, // <4,2,1,0>: Cost 4 vext2 <0,u,4,2>, <1,0,3,2> - 3696608052U, // <4,2,1,1>: Cost 4 vext2 <0,u,4,2>, <1,1,1,1> - 3696608150U, // <4,2,1,2>: Cost 4 vext2 <0,u,4,2>, <1,2,3,0> - 3895574630U, // <4,2,1,3>: Cost 4 vuzpr <0,4,u,2>, LHS - 2691909162U, // <4,2,1,4>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3> - 3696608400U, // <4,2,1,5>: Cost 4 vext2 <0,u,4,2>, <1,5,3,7> - 3760784956U, // <4,2,1,6>: Cost 4 vext3 <0,4,0,4>, <2,1,6,3> - 3773908549U, // <4,2,1,7>: Cost 5 vext3 <2,5,7,4>, <2,1,7,3> - 2691909162U, // 
<4,2,1,u>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3> - 3696608748U, // <4,2,2,0>: Cost 4 vext2 <0,u,4,2>, <2,0,6,4> - 3696608828U, // <4,2,2,1>: Cost 4 vext2 <0,u,4,2>, <2,1,6,3> - 2691909224U, // <4,2,2,2>: Cost 3 vext3 <1,2,3,4>, <2,2,2,2> - 2691909234U, // <4,2,2,3>: Cost 3 vext3 <1,2,3,4>, <2,2,3,3> - 3759605368U, // <4,2,2,4>: Cost 4 vext3 <0,2,2,4>, <2,2,4,0> - 3696609156U, // <4,2,2,5>: Cost 4 vext2 <0,u,4,2>, <2,5,6,7> - 3760785040U, // <4,2,2,6>: Cost 4 vext3 <0,4,0,4>, <2,2,6,6> - 3668505927U, // <4,2,2,7>: Cost 4 vext1 <7,4,2,2>, <7,4,2,2> - 2691909279U, // <4,2,2,u>: Cost 3 vext3 <1,2,3,4>, <2,2,u,3> - 2691909286U, // <4,2,3,0>: Cost 3 vext3 <1,2,3,4>, <2,3,0,1> - 3764840111U, // <4,2,3,1>: Cost 4 vext3 <1,1,1,4>, <2,3,1,1> - 3765651129U, // <4,2,3,2>: Cost 4 vext3 <1,2,3,4>, <2,3,2,2> - 2698544836U, // <4,2,3,3>: Cost 3 vext3 <2,3,3,4>, <2,3,3,4> - 2685863630U, // <4,2,3,4>: Cost 3 vext3 <0,2,2,4>, <2,3,4,5> - 2698692310U, // <4,2,3,5>: Cost 3 vext3 <2,3,5,4>, <2,3,5,4> - 3772507871U, // <4,2,3,6>: Cost 4 vext3 <2,3,6,4>, <2,3,6,4> - 2698839784U, // <4,2,3,7>: Cost 3 vext3 <2,3,7,4>, <2,3,7,4> - 2691909358U, // <4,2,3,u>: Cost 3 vext3 <1,2,3,4>, <2,3,u,1> - 2564915302U, // <4,2,4,0>: Cost 3 vext1 <2,4,2,4>, LHS - 2564916122U, // <4,2,4,1>: Cost 3 vext1 <2,4,2,4>, <1,2,3,4> - 2564917004U, // <4,2,4,2>: Cost 3 vext1 <2,4,2,4>, <2,4,2,4> - 2699208469U, // <4,2,4,3>: Cost 3 vext3 <2,4,3,4>, <2,4,3,4> - 2564918582U, // <4,2,4,4>: Cost 3 vext1 <2,4,2,4>, RHS - 2622868790U, // <4,2,4,5>: Cost 3 vext2 <0,u,4,2>, RHS - 2229667632U, // <4,2,4,6>: Cost 3 vrev <2,4,6,4> - 3800082229U, // <4,2,4,7>: Cost 4 vext3 <7,0,2,4>, <2,4,7,0> - 2622869033U, // <4,2,4,u>: Cost 3 vext2 <0,u,4,2>, RHS - 2552979558U, // <4,2,5,0>: Cost 3 vext1 <0,4,2,5>, LHS - 2558952342U, // <4,2,5,1>: Cost 3 vext1 <1,4,2,5>, <1,2,3,0> - 2564925032U, // <4,2,5,2>: Cost 3 vext1 <2,4,2,5>, <2,2,2,2> - 2967060582U, // <4,2,5,3>: Cost 3 vzipr <2,3,4,5>, LHS - 2552982838U, // <4,2,5,4>: Cost 3 vext1 <0,4,2,5>, RHS - 3987130190U, // <4,2,5,5>: Cost 4 vzipl RHS, <2,5,0,7> - 2913388474U, // <4,2,5,6>: Cost 3 vzipl RHS, <2,6,3,7> - 3895577910U, // <4,2,5,7>: Cost 4 vuzpr <0,4,u,2>, RHS - 2552985390U, // <4,2,5,u>: Cost 3 vext1 <0,4,2,5>, LHS - 1479245926U, // <4,2,6,0>: Cost 2 vext1 <0,4,2,6>, LHS - 2552988406U, // <4,2,6,1>: Cost 3 vext1 <0,4,2,6>, <1,0,3,2> - 2552989288U, // <4,2,6,2>: Cost 3 vext1 <0,4,2,6>, <2,2,2,2> - 2954461286U, // <4,2,6,3>: Cost 3 vzipr <0,2,4,6>, LHS - 1479249206U, // <4,2,6,4>: Cost 2 vext1 <0,4,2,6>, RHS - 2229610281U, // <4,2,6,5>: Cost 3 vrev <2,4,5,6> - 2600767994U, // <4,2,6,6>: Cost 3 vext1 , <6,2,7,3> - 2600768506U, // <4,2,6,7>: Cost 3 vext1 , <7,0,1,2> - 1479251758U, // <4,2,6,u>: Cost 2 vext1 <0,4,2,6>, LHS - 2659365909U, // <4,2,7,0>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2> - 3733771366U, // <4,2,7,1>: Cost 4 vext2 <7,1,4,2>, <7,1,4,2> - 3734434999U, // <4,2,7,2>: Cost 4 vext2 <7,2,4,2>, <7,2,4,2> - 2701199368U, // <4,2,7,3>: Cost 3 vext3 <2,7,3,4>, <2,7,3,4> - 4175774618U, // <4,2,7,4>: Cost 4 vtrnr <2,4,5,7>, <1,2,3,4> - 3303360298U, // <4,2,7,5>: Cost 4 vrev <2,4,5,7> - 3727136217U, // <4,2,7,6>: Cost 4 vext2 <6,0,4,2>, <7,6,0,4> - 3727136364U, // <4,2,7,7>: Cost 4 vext2 <6,0,4,2>, <7,7,7,7> - 2659365909U, // <4,2,7,u>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2> - 1479262310U, // <4,2,u,0>: Cost 2 vext1 <0,4,2,u>, LHS - 2553004790U, // <4,2,u,1>: Cost 3 vext1 <0,4,2,u>, <1,0,3,2> - 2553005672U, // <4,2,u,2>: Cost 3 vext1 <0,4,2,u>, <2,2,2,2> - 2954477670U, // <4,2,u,3>: Cost 3 vzipr <0,2,4,u>, LHS - 
1479265590U, // <4,2,u,4>: Cost 2 vext1 <0,4,2,u>, RHS - 2622871706U, // <4,2,u,5>: Cost 3 vext2 <0,u,4,2>, RHS - 2229700404U, // <4,2,u,6>: Cost 3 vrev <2,4,6,u> - 2600784890U, // <4,2,u,7>: Cost 3 vext1 , <7,0,1,2> - 1479268142U, // <4,2,u,u>: Cost 2 vext1 <0,4,2,u>, LHS - 3765651595U, // <4,3,0,0>: Cost 4 vext3 <1,2,3,4>, <3,0,0,0> - 2691909782U, // <4,3,0,1>: Cost 3 vext3 <1,2,3,4>, <3,0,1,2> - 2702452897U, // <4,3,0,2>: Cost 3 vext3 <3,0,2,4>, <3,0,2,4> - 3693297946U, // <4,3,0,3>: Cost 4 vext2 <0,3,4,3>, <0,3,4,3> - 3760711856U, // <4,3,0,4>: Cost 4 vext3 <0,3,u,4>, <3,0,4,1> - 2235533820U, // <4,3,0,5>: Cost 3 vrev <3,4,5,0> - 3309349381U, // <4,3,0,6>: Cost 4 vrev <3,4,6,0> - 3668563278U, // <4,3,0,7>: Cost 4 vext1 <7,4,3,0>, <7,4,3,0> - 2691909845U, // <4,3,0,u>: Cost 3 vext3 <1,2,3,4>, <3,0,u,2> - 2235173328U, // <4,3,1,0>: Cost 3 vrev <3,4,0,1> - 3764840678U, // <4,3,1,1>: Cost 4 vext3 <1,1,1,4>, <3,1,1,1> - 2630173594U, // <4,3,1,2>: Cost 3 vext2 <2,1,4,3>, <1,2,3,4> - 2703190267U, // <4,3,1,3>: Cost 3 vext3 <3,1,3,4>, <3,1,3,4> - 3760195840U, // <4,3,1,4>: Cost 4 vext3 <0,3,1,4>, <3,1,4,0> - 3765651724U, // <4,3,1,5>: Cost 4 vext3 <1,2,3,4>, <3,1,5,3> - 3309357574U, // <4,3,1,6>: Cost 4 vrev <3,4,6,1> - 3769633054U, // <4,3,1,7>: Cost 4 vext3 <1,u,3,4>, <3,1,7,3> - 2703558952U, // <4,3,1,u>: Cost 3 vext3 <3,1,u,4>, <3,1,u,4> - 3626770534U, // <4,3,2,0>: Cost 4 vext1 <0,4,3,2>, LHS - 2630174250U, // <4,3,2,1>: Cost 3 vext2 <2,1,4,3>, <2,1,4,3> - 3765651777U, // <4,3,2,2>: Cost 4 vext3 <1,2,3,4>, <3,2,2,2> - 2703853900U, // <4,3,2,3>: Cost 3 vext3 <3,2,3,4>, <3,2,3,4> - 3626773814U, // <4,3,2,4>: Cost 4 vext1 <0,4,3,2>, RHS - 2704001374U, // <4,3,2,5>: Cost 3 vext3 <3,2,5,4>, <3,2,5,4> - 3765651814U, // <4,3,2,6>: Cost 4 vext3 <1,2,3,4>, <3,2,6,3> - 3769633135U, // <4,3,2,7>: Cost 4 vext3 <1,u,3,4>, <3,2,7,3> - 2634819681U, // <4,3,2,u>: Cost 3 vext2 <2,u,4,3>, <2,u,4,3> - 3765651839U, // <4,3,3,0>: Cost 4 vext3 <1,2,3,4>, <3,3,0,1> - 3765651848U, // <4,3,3,1>: Cost 4 vext3 <1,2,3,4>, <3,3,1,1> - 3710552404U, // <4,3,3,2>: Cost 4 vext2 <3,2,4,3>, <3,2,4,3> - 2691910044U, // <4,3,3,3>: Cost 3 vext3 <1,2,3,4>, <3,3,3,3> - 2704591270U, // <4,3,3,4>: Cost 3 vext3 <3,3,4,4>, <3,3,4,4> - 3769633202U, // <4,3,3,5>: Cost 4 vext3 <1,u,3,4>, <3,3,5,7> - 3703917212U, // <4,3,3,6>: Cost 4 vext2 <2,1,4,3>, <3,6,4,7> - 3769633220U, // <4,3,3,7>: Cost 4 vext3 <1,u,3,4>, <3,3,7,7> - 2691910044U, // <4,3,3,u>: Cost 3 vext3 <1,2,3,4>, <3,3,3,3> - 2691910096U, // <4,3,4,0>: Cost 3 vext3 <1,2,3,4>, <3,4,0,1> - 2691910106U, // <4,3,4,1>: Cost 3 vext3 <1,2,3,4>, <3,4,1,2> - 2564990741U, // <4,3,4,2>: Cost 3 vext1 <2,4,3,4>, <2,4,3,4> - 3765651946U, // <4,3,4,3>: Cost 4 vext3 <1,2,3,4>, <3,4,3,0> - 2691910136U, // <4,3,4,4>: Cost 3 vext3 <1,2,3,4>, <3,4,4,5> - 2686454274U, // <4,3,4,5>: Cost 3 vext3 <0,3,1,4>, <3,4,5,6> - 2235640329U, // <4,3,4,6>: Cost 3 vrev <3,4,6,4> - 3801483792U, // <4,3,4,7>: Cost 4 vext3 <7,2,3,4>, <3,4,7,2> - 2691910168U, // <4,3,4,u>: Cost 3 vext3 <1,2,3,4>, <3,4,u,1> - 2559025254U, // <4,3,5,0>: Cost 3 vext1 <1,4,3,5>, LHS - 2559026237U, // <4,3,5,1>: Cost 3 vext1 <1,4,3,5>, <1,4,3,5> - 2564998862U, // <4,3,5,2>: Cost 3 vext1 <2,4,3,5>, <2,3,4,5> - 2570971548U, // <4,3,5,3>: Cost 3 vext1 <3,4,3,5>, <3,3,3,3> - 2559028534U, // <4,3,5,4>: Cost 3 vext1 <1,4,3,5>, RHS - 4163519477U, // <4,3,5,5>: Cost 4 vtrnr <0,4,1,5>, <1,3,4,5> - 3309390346U, // <4,3,5,6>: Cost 4 vrev <3,4,6,5> - 2706139747U, // <4,3,5,7>: Cost 3 vext3 <3,5,7,4>, <3,5,7,4> - 2559031086U, // <4,3,5,u>: Cost 3 
vext1 <1,4,3,5>, LHS - 2559033446U, // <4,3,6,0>: Cost 3 vext1 <1,4,3,6>, LHS - 2559034430U, // <4,3,6,1>: Cost 3 vext1 <1,4,3,6>, <1,4,3,6> - 2565007127U, // <4,3,6,2>: Cost 3 vext1 <2,4,3,6>, <2,4,3,6> - 2570979740U, // <4,3,6,3>: Cost 3 vext1 <3,4,3,6>, <3,3,3,3> - 2559036726U, // <4,3,6,4>: Cost 3 vext1 <1,4,3,6>, RHS - 1161841154U, // <4,3,6,5>: Cost 2 vrev <3,4,5,6> - 4028203932U, // <4,3,6,6>: Cost 4 vzipr <0,2,4,6>, <1,2,3,6> - 2706803380U, // <4,3,6,7>: Cost 3 vext3 <3,6,7,4>, <3,6,7,4> - 1162062365U, // <4,3,6,u>: Cost 2 vrev <3,4,u,6> - 3769633475U, // <4,3,7,0>: Cost 4 vext3 <1,u,3,4>, <3,7,0,1> - 3769633488U, // <4,3,7,1>: Cost 4 vext3 <1,u,3,4>, <3,7,1,5> - 3638757144U, // <4,3,7,2>: Cost 4 vext1 <2,4,3,7>, <2,4,3,7> - 3769633508U, // <4,3,7,3>: Cost 4 vext3 <1,u,3,4>, <3,7,3,7> - 3769633515U, // <4,3,7,4>: Cost 4 vext3 <1,u,3,4>, <3,7,4,5> - 3769633526U, // <4,3,7,5>: Cost 4 vext3 <1,u,3,4>, <3,7,5,7> - 3662647932U, // <4,3,7,6>: Cost 4 vext1 <6,4,3,7>, <6,4,3,7> - 3781208837U, // <4,3,7,7>: Cost 4 vext3 <3,7,7,4>, <3,7,7,4> - 3769633547U, // <4,3,7,u>: Cost 4 vext3 <1,u,3,4>, <3,7,u,1> - 2559049830U, // <4,3,u,0>: Cost 3 vext1 <1,4,3,u>, LHS - 2691910430U, // <4,3,u,1>: Cost 3 vext3 <1,2,3,4>, <3,u,1,2> - 2565023513U, // <4,3,u,2>: Cost 3 vext1 <2,4,3,u>, <2,4,3,u> - 2707835698U, // <4,3,u,3>: Cost 3 vext3 <3,u,3,4>, <3,u,3,4> - 2559053110U, // <4,3,u,4>: Cost 3 vext1 <1,4,3,u>, RHS - 1161857540U, // <4,3,u,5>: Cost 2 vrev <3,4,5,u> - 2235673101U, // <4,3,u,6>: Cost 3 vrev <3,4,6,u> - 2708130646U, // <4,3,u,7>: Cost 3 vext3 <3,u,7,4>, <3,u,7,4> - 1162078751U, // <4,3,u,u>: Cost 2 vrev <3,4,u,u> - 2617573416U, // <4,4,0,0>: Cost 3 vext2 <0,0,4,4>, <0,0,4,4> - 1570373734U, // <4,4,0,1>: Cost 2 vext2 <4,4,4,4>, LHS - 2779676774U, // <4,4,0,2>: Cost 3 vuzpl <4,6,4,6>, LHS - 3760196480U, // <4,4,0,3>: Cost 4 vext3 <0,3,1,4>, <4,0,3,1> - 2576977100U, // <4,4,0,4>: Cost 3 vext1 <4,4,4,0>, <4,4,4,0> - 2718747538U, // <4,4,0,5>: Cost 3 vext3 <5,6,7,4>, <4,0,5,1> - 2718747548U, // <4,4,0,6>: Cost 3 vext3 <5,6,7,4>, <4,0,6,2> - 3668637015U, // <4,4,0,7>: Cost 4 vext1 <7,4,4,0>, <7,4,4,0> - 1570374301U, // <4,4,0,u>: Cost 2 vext2 <4,4,4,4>, LHS - 2644116214U, // <4,4,1,0>: Cost 3 vext2 <4,4,4,4>, <1,0,3,2> - 2644116276U, // <4,4,1,1>: Cost 3 vext2 <4,4,4,4>, <1,1,1,1> - 2691910602U, // <4,4,1,2>: Cost 3 vext3 <1,2,3,4>, <4,1,2,3> - 2644116440U, // <4,4,1,3>: Cost 3 vext2 <4,4,4,4>, <1,3,1,3> - 2711227356U, // <4,4,1,4>: Cost 3 vext3 <4,4,4,4>, <4,1,4,3> - 2709310438U, // <4,4,1,5>: Cost 3 vext3 <4,1,5,4>, <4,1,5,4> - 3765652462U, // <4,4,1,6>: Cost 4 vext3 <1,2,3,4>, <4,1,6,3> - 3768970231U, // <4,4,1,7>: Cost 4 vext3 <1,7,3,4>, <4,1,7,3> - 2695891968U, // <4,4,1,u>: Cost 3 vext3 <1,u,3,4>, <4,1,u,3> - 3703260634U, // <4,4,2,0>: Cost 4 vext2 <2,0,4,4>, <2,0,4,4> - 3765652499U, // <4,4,2,1>: Cost 4 vext3 <1,2,3,4>, <4,2,1,4> - 2644117096U, // <4,4,2,2>: Cost 3 vext2 <4,4,4,4>, <2,2,2,2> - 2631509709U, // <4,4,2,3>: Cost 3 vext2 <2,3,4,4>, <2,3,4,4> - 2644117269U, // <4,4,2,4>: Cost 3 vext2 <4,4,4,4>, <2,4,3,4> - 3705251698U, // <4,4,2,5>: Cost 4 vext2 <2,3,4,4>, <2,5,4,7> - 2710047808U, // <4,4,2,6>: Cost 3 vext3 <4,2,6,4>, <4,2,6,4> - 3783863369U, // <4,4,2,7>: Cost 4 vext3 <4,2,7,4>, <4,2,7,4> - 2634827874U, // <4,4,2,u>: Cost 3 vext2 <2,u,4,4>, <2,u,4,4> - 2644117654U, // <4,4,3,0>: Cost 3 vext2 <4,4,4,4>, <3,0,1,2> - 3638797210U, // <4,4,3,1>: Cost 4 vext1 <2,4,4,3>, <1,2,3,4> - 3638798082U, // <4,4,3,2>: Cost 4 vext1 <2,4,4,3>, <2,4,1,3> - 2637482406U, // <4,4,3,3>: Cost 3 vext2 
<3,3,4,4>, <3,3,4,4> - 2638146039U, // <4,4,3,4>: Cost 3 vext2 <3,4,4,4>, <3,4,4,4> - 3913287374U, // <4,4,3,5>: Cost 4 vuzpr <3,4,5,4>, <2,3,4,5> - 3765652625U, // <4,4,3,6>: Cost 4 vext3 <1,2,3,4>, <4,3,6,4> - 3713878762U, // <4,4,3,7>: Cost 4 vext2 <3,7,4,4>, <3,7,4,4> - 2637482406U, // <4,4,3,u>: Cost 3 vext2 <3,3,4,4>, <3,3,4,4> - 1503264870U, // <4,4,4,0>: Cost 2 vext1 <4,4,4,4>, LHS - 2577007514U, // <4,4,4,1>: Cost 3 vext1 <4,4,4,4>, <1,2,3,4> - 2577008232U, // <4,4,4,2>: Cost 3 vext1 <4,4,4,4>, <2,2,2,2> - 2571037175U, // <4,4,4,3>: Cost 3 vext1 <3,4,4,4>, <3,4,4,4> - 161926454U, // <4,4,4,4>: Cost 1 vdup0 RHS - 1570377014U, // <4,4,4,5>: Cost 2 vext2 <4,4,4,4>, RHS - 2779680054U, // <4,4,4,6>: Cost 3 vuzpl <4,6,4,6>, RHS - 2594927963U, // <4,4,4,7>: Cost 3 vext1 <7,4,4,4>, <7,4,4,4> - 161926454U, // <4,4,4,u>: Cost 1 vdup0 RHS - 2571042918U, // <4,4,5,0>: Cost 3 vext1 <3,4,4,5>, LHS - 2571043738U, // <4,4,5,1>: Cost 3 vext1 <3,4,4,5>, <1,2,3,4> - 3638814495U, // <4,4,5,2>: Cost 4 vext1 <2,4,4,5>, <2,4,4,5> - 2571045368U, // <4,4,5,3>: Cost 3 vext1 <3,4,4,5>, <3,4,4,5> - 2571046198U, // <4,4,5,4>: Cost 3 vext1 <3,4,4,5>, RHS - 1839648054U, // <4,4,5,5>: Cost 2 vzipl RHS, RHS - 1618169142U, // <4,4,5,6>: Cost 2 vext3 <1,2,3,4>, RHS - 2594936156U, // <4,4,5,7>: Cost 3 vext1 <7,4,4,5>, <7,4,4,5> - 1618169160U, // <4,4,5,u>: Cost 2 vext3 <1,2,3,4>, RHS - 2553135206U, // <4,4,6,0>: Cost 3 vext1 <0,4,4,6>, LHS - 3626877686U, // <4,4,6,1>: Cost 4 vext1 <0,4,4,6>, <1,0,3,2> - 2565080782U, // <4,4,6,2>: Cost 3 vext1 <2,4,4,6>, <2,3,4,5> - 2571053561U, // <4,4,6,3>: Cost 3 vext1 <3,4,4,6>, <3,4,4,6> - 2553138486U, // <4,4,6,4>: Cost 3 vext1 <0,4,4,6>, RHS - 2241555675U, // <4,4,6,5>: Cost 3 vrev <4,4,5,6> - 1973865782U, // <4,4,6,6>: Cost 2 vtrnl RHS, RHS - 2658055029U, // <4,4,6,7>: Cost 3 vext2 <6,7,4,4>, <6,7,4,4> - 1973865800U, // <4,4,6,u>: Cost 2 vtrnl RHS, RHS - 2644120570U, // <4,4,7,0>: Cost 3 vext2 <4,4,4,4>, <7,0,1,2> - 3638829978U, // <4,4,7,1>: Cost 4 vext1 <2,4,4,7>, <1,2,3,4> - 3638830881U, // <4,4,7,2>: Cost 4 vext1 <2,4,4,7>, <2,4,4,7> - 3735115018U, // <4,4,7,3>: Cost 4 vext2 <7,3,4,4>, <7,3,4,4> - 2662036827U, // <4,4,7,4>: Cost 3 vext2 <7,4,4,4>, <7,4,4,4> - 2713292236U, // <4,4,7,5>: Cost 3 vext3 <4,7,5,4>, <4,7,5,4> - 2713365973U, // <4,4,7,6>: Cost 3 vext3 <4,7,6,4>, <4,7,6,4> - 2644121196U, // <4,4,7,7>: Cost 3 vext2 <4,4,4,4>, <7,7,7,7> - 2662036827U, // <4,4,7,u>: Cost 3 vext2 <7,4,4,4>, <7,4,4,4> - 1503297638U, // <4,4,u,0>: Cost 2 vext1 <4,4,4,u>, LHS - 1570379566U, // <4,4,u,1>: Cost 2 vext2 <4,4,4,4>, LHS - 2779682606U, // <4,4,u,2>: Cost 3 vuzpl <4,6,4,6>, LHS - 2571069947U, // <4,4,u,3>: Cost 3 vext1 <3,4,4,u>, <3,4,4,u> - 161926454U, // <4,4,u,4>: Cost 1 vdup0 RHS - 1841638710U, // <4,4,u,5>: Cost 2 vzipl RHS, RHS - 1618169385U, // <4,4,u,6>: Cost 2 vext3 <1,2,3,4>, RHS - 2594960735U, // <4,4,u,7>: Cost 3 vext1 <7,4,4,u>, <7,4,4,u> - 161926454U, // <4,4,u,u>: Cost 1 vdup0 RHS - 2631516160U, // <4,5,0,0>: Cost 3 vext2 <2,3,4,5>, <0,0,0,0> - 1557774438U, // <4,5,0,1>: Cost 2 vext2 <2,3,4,5>, LHS - 2618908875U, // <4,5,0,2>: Cost 3 vext2 <0,2,4,5>, <0,2,4,5> - 2571078140U, // <4,5,0,3>: Cost 3 vext1 <3,4,5,0>, <3,4,5,0> - 2626871634U, // <4,5,0,4>: Cost 3 vext2 <1,5,4,5>, <0,4,1,5> - 3705258414U, // <4,5,0,5>: Cost 4 vext2 <2,3,4,5>, <0,5,2,7> - 2594968438U, // <4,5,0,6>: Cost 3 vext1 <7,4,5,0>, <6,7,4,5> - 2594968928U, // <4,5,0,7>: Cost 3 vext1 <7,4,5,0>, <7,4,5,0> - 1557775005U, // <4,5,0,u>: Cost 2 vext2 <2,3,4,5>, LHS - 2631516918U, // <4,5,1,0>: Cost 3 
vext2 <2,3,4,5>, <1,0,3,2> - 2624217939U, // <4,5,1,1>: Cost 3 vext2 <1,1,4,5>, <1,1,4,5> - 2631517078U, // <4,5,1,2>: Cost 3 vext2 <2,3,4,5>, <1,2,3,0> - 2821341286U, // <4,5,1,3>: Cost 3 vuzpr <0,4,1,5>, LHS - 3895086054U, // <4,5,1,4>: Cost 4 vuzpr <0,4,1,5>, <4,1,5,4> - 2626872471U, // <4,5,1,5>: Cost 3 vext2 <1,5,4,5>, <1,5,4,5> - 3895083131U, // <4,5,1,6>: Cost 4 vuzpr <0,4,1,5>, <0,1,4,6> - 2718748368U, // <4,5,1,7>: Cost 3 vext3 <5,6,7,4>, <5,1,7,3> - 2821341291U, // <4,5,1,u>: Cost 3 vuzpr <0,4,1,5>, LHS - 2571092070U, // <4,5,2,0>: Cost 3 vext1 <3,4,5,2>, LHS - 3699287585U, // <4,5,2,1>: Cost 4 vext2 <1,3,4,5>, <2,1,3,3> - 2630854269U, // <4,5,2,2>: Cost 3 vext2 <2,2,4,5>, <2,2,4,5> - 1557776078U, // <4,5,2,3>: Cost 2 vext2 <2,3,4,5>, <2,3,4,5> - 2631517974U, // <4,5,2,4>: Cost 3 vext2 <2,3,4,5>, <2,4,3,5> - 3692652384U, // <4,5,2,5>: Cost 4 vext2 <0,2,4,5>, <2,5,2,7> - 2631518138U, // <4,5,2,6>: Cost 3 vext2 <2,3,4,5>, <2,6,3,7> - 4164013366U, // <4,5,2,7>: Cost 4 vtrnr <0,4,u,2>, RHS - 1561094243U, // <4,5,2,u>: Cost 2 vext2 <2,u,4,5>, <2,u,4,5> - 2631518358U, // <4,5,3,0>: Cost 3 vext2 <2,3,4,5>, <3,0,1,2> - 3895084710U, // <4,5,3,1>: Cost 4 vuzpr <0,4,1,5>, <2,3,0,1> - 2631518540U, // <4,5,3,2>: Cost 3 vext2 <2,3,4,5>, <3,2,3,4> - 2631518620U, // <4,5,3,3>: Cost 3 vext2 <2,3,4,5>, <3,3,3,3> - 2631518716U, // <4,5,3,4>: Cost 3 vext2 <2,3,4,5>, <3,4,5,0> - 2631518784U, // <4,5,3,5>: Cost 3 vext2 <2,3,4,5>, <3,5,3,5> - 2658060980U, // <4,5,3,6>: Cost 3 vext2 <6,7,4,5>, <3,6,7,4> - 2640145131U, // <4,5,3,7>: Cost 3 vext2 <3,7,4,5>, <3,7,4,5> - 2631519006U, // <4,5,3,u>: Cost 3 vext2 <2,3,4,5>, <3,u,1,2> - 2571108454U, // <4,5,4,0>: Cost 3 vext1 <3,4,5,4>, LHS - 3632907342U, // <4,5,4,1>: Cost 4 vext1 <1,4,5,4>, <1,4,5,4> - 2571110094U, // <4,5,4,2>: Cost 3 vext1 <3,4,5,4>, <2,3,4,5> - 2571110912U, // <4,5,4,3>: Cost 3 vext1 <3,4,5,4>, <3,4,5,4> - 2571111734U, // <4,5,4,4>: Cost 3 vext1 <3,4,5,4>, RHS - 1557777718U, // <4,5,4,5>: Cost 2 vext2 <2,3,4,5>, RHS - 2645454195U, // <4,5,4,6>: Cost 3 vext2 <4,6,4,5>, <4,6,4,5> - 2718748614U, // <4,5,4,7>: Cost 3 vext3 <5,6,7,4>, <5,4,7,6> - 1557777961U, // <4,5,4,u>: Cost 2 vext2 <2,3,4,5>, RHS - 1503346790U, // <4,5,5,0>: Cost 2 vext1 <4,4,5,5>, LHS - 2913398480U, // <4,5,5,1>: Cost 3 vzipl RHS, <5,1,7,3> - 2631519998U, // <4,5,5,2>: Cost 3 vext2 <2,3,4,5>, <5,2,3,4> - 2577090710U, // <4,5,5,3>: Cost 3 vext1 <4,4,5,5>, <3,0,1,2> - 1503349978U, // <4,5,5,4>: Cost 2 vext1 <4,4,5,5>, <4,4,5,5> - 2631520260U, // <4,5,5,5>: Cost 3 vext2 <2,3,4,5>, <5,5,5,5> - 2913390690U, // <4,5,5,6>: Cost 3 vzipl RHS, <5,6,7,0> - 2821344566U, // <4,5,5,7>: Cost 3 vuzpr <0,4,1,5>, RHS - 1503352622U, // <4,5,5,u>: Cost 2 vext1 <4,4,5,5>, LHS - 1497383014U, // <4,5,6,0>: Cost 2 vext1 <3,4,5,6>, LHS - 2559181904U, // <4,5,6,1>: Cost 3 vext1 <1,4,5,6>, <1,4,5,6> - 2565154601U, // <4,5,6,2>: Cost 3 vext1 <2,4,5,6>, <2,4,5,6> - 1497385474U, // <4,5,6,3>: Cost 2 vext1 <3,4,5,6>, <3,4,5,6> - 1497386294U, // <4,5,6,4>: Cost 2 vext1 <3,4,5,6>, RHS - 3047608324U, // <4,5,6,5>: Cost 3 vtrnl RHS, <5,5,5,5> - 2571129656U, // <4,5,6,6>: Cost 3 vext1 <3,4,5,6>, <6,6,6,6> - 27705344U, // <4,5,6,7>: Cost 0 copy RHS - 27705344U, // <4,5,6,u>: Cost 0 copy RHS - 2565161062U, // <4,5,7,0>: Cost 3 vext1 <2,4,5,7>, LHS - 2565161882U, // <4,5,7,1>: Cost 3 vext1 <2,4,5,7>, <1,2,3,4> - 2565162794U, // <4,5,7,2>: Cost 3 vext1 <2,4,5,7>, <2,4,5,7> - 2661381387U, // <4,5,7,3>: Cost 3 vext2 <7,3,4,5>, <7,3,4,5> - 2565164342U, // <4,5,7,4>: Cost 3 vext1 <2,4,5,7>, RHS - 2718748840U, // 
<4,5,7,5>: Cost 3 vext3 <5,6,7,4>, <5,7,5,7> - 2718748846U, // <4,5,7,6>: Cost 3 vext3 <5,6,7,4>, <5,7,6,4> - 2719412407U, // <4,5,7,7>: Cost 3 vext3 <5,7,7,4>, <5,7,7,4> - 2565166894U, // <4,5,7,u>: Cost 3 vext1 <2,4,5,7>, LHS - 1497399398U, // <4,5,u,0>: Cost 2 vext1 <3,4,5,u>, LHS - 1557780270U, // <4,5,u,1>: Cost 2 vext2 <2,3,4,5>, LHS - 2631522181U, // <4,5,u,2>: Cost 3 vext2 <2,3,4,5>, - 1497401860U, // <4,5,u,3>: Cost 2 vext1 <3,4,5,u>, <3,4,5,u> - 1497402678U, // <4,5,u,4>: Cost 2 vext1 <3,4,5,u>, RHS - 1557780634U, // <4,5,u,5>: Cost 2 vext2 <2,3,4,5>, RHS - 2631522512U, // <4,5,u,6>: Cost 3 vext2 <2,3,4,5>, - 27705344U, // <4,5,u,7>: Cost 0 copy RHS - 27705344U, // <4,5,u,u>: Cost 0 copy RHS - 2618916864U, // <4,6,0,0>: Cost 3 vext2 <0,2,4,6>, <0,0,0,0> - 1545175142U, // <4,6,0,1>: Cost 2 vext2 <0,2,4,6>, LHS - 1545175244U, // <4,6,0,2>: Cost 2 vext2 <0,2,4,6>, <0,2,4,6> - 3692658940U, // <4,6,0,3>: Cost 4 vext2 <0,2,4,6>, <0,3,1,0> - 2618917202U, // <4,6,0,4>: Cost 3 vext2 <0,2,4,6>, <0,4,1,5> - 3852910806U, // <4,6,0,5>: Cost 4 vuzpl RHS, <0,2,5,7> - 2253525648U, // <4,6,0,6>: Cost 3 vrev <6,4,6,0> - 4040764726U, // <4,6,0,7>: Cost 4 vzipr <2,3,4,0>, RHS - 1545175709U, // <4,6,0,u>: Cost 2 vext2 <0,2,4,6>, LHS - 2618917622U, // <4,6,1,0>: Cost 3 vext2 <0,2,4,6>, <1,0,3,2> - 2618917684U, // <4,6,1,1>: Cost 3 vext2 <0,2,4,6>, <1,1,1,1> - 2618917782U, // <4,6,1,2>: Cost 3 vext2 <0,2,4,6>, <1,2,3,0> - 2618917848U, // <4,6,1,3>: Cost 3 vext2 <0,2,4,6>, <1,3,1,3> - 3692659773U, // <4,6,1,4>: Cost 4 vext2 <0,2,4,6>, <1,4,3,5> - 2618918032U, // <4,6,1,5>: Cost 3 vext2 <0,2,4,6>, <1,5,3,7> - 3692659937U, // <4,6,1,6>: Cost 4 vext2 <0,2,4,6>, <1,6,3,7> - 4032146742U, // <4,6,1,7>: Cost 4 vzipr <0,u,4,1>, RHS - 2618918253U, // <4,6,1,u>: Cost 3 vext2 <0,2,4,6>, <1,u,1,3> - 2618918380U, // <4,6,2,0>: Cost 3 vext2 <0,2,4,6>, <2,0,6,4> - 2618918460U, // <4,6,2,1>: Cost 3 vext2 <0,2,4,6>, <2,1,6,3> - 2618918504U, // <4,6,2,2>: Cost 3 vext2 <0,2,4,6>, <2,2,2,2> - 2618918566U, // <4,6,2,3>: Cost 3 vext2 <0,2,4,6>, <2,3,0,1> - 2618918679U, // <4,6,2,4>: Cost 3 vext2 <0,2,4,6>, <2,4,3,6> - 2618918788U, // <4,6,2,5>: Cost 3 vext2 <0,2,4,6>, <2,5,6,7> - 2618918842U, // <4,6,2,6>: Cost 3 vext2 <0,2,4,6>, <2,6,3,7> - 2718749178U, // <4,6,2,7>: Cost 3 vext3 <5,6,7,4>, <6,2,7,3> - 2618918971U, // <4,6,2,u>: Cost 3 vext2 <0,2,4,6>, <2,u,0,1> - 2618919062U, // <4,6,3,0>: Cost 3 vext2 <0,2,4,6>, <3,0,1,2> - 2636171526U, // <4,6,3,1>: Cost 3 vext2 <3,1,4,6>, <3,1,4,6> - 3692661057U, // <4,6,3,2>: Cost 4 vext2 <0,2,4,6>, <3,2,2,2> - 2618919324U, // <4,6,3,3>: Cost 3 vext2 <0,2,4,6>, <3,3,3,3> - 2618919426U, // <4,6,3,4>: Cost 3 vext2 <0,2,4,6>, <3,4,5,6> - 2638826058U, // <4,6,3,5>: Cost 3 vext2 <3,5,4,6>, <3,5,4,6> - 3913303030U, // <4,6,3,6>: Cost 4 vuzpr <3,4,5,6>, <1,3,4,6> - 2722730572U, // <4,6,3,7>: Cost 3 vext3 <6,3,7,4>, <6,3,7,4> - 2618919710U, // <4,6,3,u>: Cost 3 vext2 <0,2,4,6>, <3,u,1,2> - 2565210214U, // <4,6,4,0>: Cost 3 vext1 <2,4,6,4>, LHS - 2718749286U, // <4,6,4,1>: Cost 3 vext3 <5,6,7,4>, <6,4,1,3> - 2565211952U, // <4,6,4,2>: Cost 3 vext1 <2,4,6,4>, <2,4,6,4> - 2571184649U, // <4,6,4,3>: Cost 3 vext1 <3,4,6,4>, <3,4,6,4> - 2565213494U, // <4,6,4,4>: Cost 3 vext1 <2,4,6,4>, RHS - 1545178422U, // <4,6,4,5>: Cost 2 vext2 <0,2,4,6>, RHS - 1705430326U, // <4,6,4,6>: Cost 2 vuzpl RHS, RHS - 2595075437U, // <4,6,4,7>: Cost 3 vext1 <7,4,6,4>, <7,4,6,4> - 1545178665U, // <4,6,4,u>: Cost 2 vext2 <0,2,4,6>, RHS - 2565218406U, // <4,6,5,0>: Cost 3 vext1 <2,4,6,5>, LHS - 2645462736U, // <4,6,5,1>: 
Cost 3 vext2 <4,6,4,6>, <5,1,7,3> - 2913399290U, // <4,6,5,2>: Cost 3 vzipl RHS, <6,2,7,3> - 3913305394U, // <4,6,5,3>: Cost 4 vuzpr <3,4,5,6>, <4,5,6,3> - 2645462982U, // <4,6,5,4>: Cost 3 vext2 <4,6,4,6>, <5,4,7,6> - 2779172868U, // <4,6,5,5>: Cost 3 vuzpl RHS, <5,5,5,5> - 2913391416U, // <4,6,5,6>: Cost 3 vzipl RHS, <6,6,6,6> - 2821426486U, // <4,6,5,7>: Cost 3 vuzpr <0,4,2,6>, RHS - 2821426487U, // <4,6,5,u>: Cost 3 vuzpr <0,4,2,6>, RHS - 1503428710U, // <4,6,6,0>: Cost 2 vext1 <4,4,6,6>, LHS - 2577171190U, // <4,6,6,1>: Cost 3 vext1 <4,4,6,6>, <1,0,3,2> - 2645463546U, // <4,6,6,2>: Cost 3 vext2 <4,6,4,6>, <6,2,7,3> - 2577172630U, // <4,6,6,3>: Cost 3 vext1 <4,4,6,6>, <3,0,1,2> - 1503431908U, // <4,6,6,4>: Cost 2 vext1 <4,4,6,6>, <4,4,6,6> - 2253501069U, // <4,6,6,5>: Cost 3 vrev <6,4,5,6> - 2618921784U, // <4,6,6,6>: Cost 3 vext2 <0,2,4,6>, <6,6,6,6> - 2954464566U, // <4,6,6,7>: Cost 3 vzipr <0,2,4,6>, RHS - 1503434542U, // <4,6,6,u>: Cost 2 vext1 <4,4,6,6>, LHS - 2645464058U, // <4,6,7,0>: Cost 3 vext2 <4,6,4,6>, <7,0,1,2> - 2779173882U, // <4,6,7,1>: Cost 3 vuzpl RHS, <7,0,1,2> - 3638978355U, // <4,6,7,2>: Cost 4 vext1 <2,4,6,7>, <2,4,6,7> - 2725090156U, // <4,6,7,3>: Cost 3 vext3 <6,7,3,4>, <6,7,3,4> - 2645464422U, // <4,6,7,4>: Cost 3 vext2 <4,6,4,6>, <7,4,5,6> - 2779174246U, // <4,6,7,5>: Cost 3 vuzpl RHS, <7,4,5,6> - 3852915914U, // <4,6,7,6>: Cost 4 vuzpl RHS, <7,2,6,3> - 2779174508U, // <4,6,7,7>: Cost 3 vuzpl RHS, <7,7,7,7> - 2779173945U, // <4,6,7,u>: Cost 3 vuzpl RHS, <7,0,u,2> - 1503445094U, // <4,6,u,0>: Cost 2 vext1 <4,4,6,u>, LHS - 1545180974U, // <4,6,u,1>: Cost 2 vext2 <0,2,4,6>, LHS - 1705432878U, // <4,6,u,2>: Cost 2 vuzpl RHS, LHS - 2618922940U, // <4,6,u,3>: Cost 3 vext2 <0,2,4,6>, - 1503448294U, // <4,6,u,4>: Cost 2 vext1 <4,4,6,u>, <4,4,6,u> - 1545181338U, // <4,6,u,5>: Cost 2 vext2 <0,2,4,6>, RHS - 1705433242U, // <4,6,u,6>: Cost 2 vuzpl RHS, RHS - 2954480950U, // <4,6,u,7>: Cost 3 vzipr <0,2,4,u>, RHS - 1545181541U, // <4,6,u,u>: Cost 2 vext2 <0,2,4,6>, LHS - 3706601472U, // <4,7,0,0>: Cost 4 vext2 <2,5,4,7>, <0,0,0,0> - 2632859750U, // <4,7,0,1>: Cost 3 vext2 <2,5,4,7>, LHS - 2726343685U, // <4,7,0,2>: Cost 3 vext3 <7,0,2,4>, <7,0,2,4> - 3701293312U, // <4,7,0,3>: Cost 4 vext2 <1,6,4,7>, <0,3,1,4> - 3706601810U, // <4,7,0,4>: Cost 4 vext2 <2,5,4,7>, <0,4,1,5> - 2259424608U, // <4,7,0,5>: Cost 3 vrev <7,4,5,0> - 3695321617U, // <4,7,0,6>: Cost 4 vext2 <0,6,4,7>, <0,6,4,7> - 3800454194U, // <4,7,0,7>: Cost 4 vext3 <7,0,7,4>, <7,0,7,4> - 2632860317U, // <4,7,0,u>: Cost 3 vext2 <2,5,4,7>, LHS - 2259064116U, // <4,7,1,0>: Cost 3 vrev <7,4,0,1> - 3700630324U, // <4,7,1,1>: Cost 4 vext2 <1,5,4,7>, <1,1,1,1> - 2632860570U, // <4,7,1,2>: Cost 3 vext2 <2,5,4,7>, <1,2,3,4> - 3769635936U, // <4,7,1,3>: Cost 4 vext3 <1,u,3,4>, <7,1,3,5> - 3656920374U, // <4,7,1,4>: Cost 4 vext1 <5,4,7,1>, RHS - 3700630681U, // <4,7,1,5>: Cost 4 vext2 <1,5,4,7>, <1,5,4,7> - 3701294314U, // <4,7,1,6>: Cost 4 vext2 <1,6,4,7>, <1,6,4,7> - 3793818754U, // <4,7,1,7>: Cost 4 vext3 <5,u,7,4>, <7,1,7,3> - 2259654012U, // <4,7,1,u>: Cost 3 vrev <7,4,u,1> - 3656925286U, // <4,7,2,0>: Cost 4 vext1 <5,4,7,2>, LHS - 3706603050U, // <4,7,2,1>: Cost 4 vext2 <2,5,4,7>, <2,1,4,3> - 3706603112U, // <4,7,2,2>: Cost 4 vext2 <2,5,4,7>, <2,2,2,2> - 2727744688U, // <4,7,2,3>: Cost 3 vext3 <7,2,3,4>, <7,2,3,4> - 3705939745U, // <4,7,2,4>: Cost 4 vext2 <2,4,4,7>, <2,4,4,7> - 2632861554U, // <4,7,2,5>: Cost 3 vext2 <2,5,4,7>, <2,5,4,7> - 3706603450U, // <4,7,2,6>: Cost 4 vext2 <2,5,4,7>, <2,6,3,7> - 3792491731U, // 
<4,7,2,7>: Cost 4 vext3 <5,6,7,4>, <7,2,7,3> - 2634852453U, // <4,7,2,u>: Cost 3 vext2 <2,u,4,7>, <2,u,4,7> - 3706603670U, // <4,7,3,0>: Cost 4 vext2 <2,5,4,7>, <3,0,1,2> - 3662906266U, // <4,7,3,1>: Cost 4 vext1 <6,4,7,3>, <1,2,3,4> - 3725183326U, // <4,7,3,2>: Cost 4 vext2 <5,6,4,7>, <3,2,5,4> - 3706603932U, // <4,7,3,3>: Cost 4 vext2 <2,5,4,7>, <3,3,3,3> - 3701295618U, // <4,7,3,4>: Cost 4 vext2 <1,6,4,7>, <3,4,5,6> - 2638834251U, // <4,7,3,5>: Cost 3 vext2 <3,5,4,7>, <3,5,4,7> - 2639497884U, // <4,7,3,6>: Cost 3 vext2 <3,6,4,7>, <3,6,4,7> - 3802445093U, // <4,7,3,7>: Cost 4 vext3 <7,3,7,4>, <7,3,7,4> - 2640825150U, // <4,7,3,u>: Cost 3 vext2 <3,u,4,7>, <3,u,4,7> - 2718750004U, // <4,7,4,0>: Cost 3 vext3 <5,6,7,4>, <7,4,0,1> - 3706604490U, // <4,7,4,1>: Cost 4 vext2 <2,5,4,7>, <4,1,2,3> - 3656943474U, // <4,7,4,2>: Cost 4 vext1 <5,4,7,4>, <2,5,4,7> - 3779884371U, // <4,7,4,3>: Cost 4 vext3 <3,5,7,4>, <7,4,3,5> - 2259383643U, // <4,7,4,4>: Cost 3 vrev <7,4,4,4> - 2632863030U, // <4,7,4,5>: Cost 3 vext2 <2,5,4,7>, RHS - 2259531117U, // <4,7,4,6>: Cost 3 vrev <7,4,6,4> - 3907340074U, // <4,7,4,7>: Cost 4 vuzpr <2,4,5,7>, <2,4,5,7> - 2632863273U, // <4,7,4,u>: Cost 3 vext2 <2,5,4,7>, RHS - 2913391610U, // <4,7,5,0>: Cost 3 vzipl RHS, <7,0,1,2> - 3645006848U, // <4,7,5,1>: Cost 4 vext1 <3,4,7,5>, <1,3,5,7> - 2589181646U, // <4,7,5,2>: Cost 3 vext1 <6,4,7,5>, <2,3,4,5> - 3645008403U, // <4,7,5,3>: Cost 4 vext1 <3,4,7,5>, <3,4,7,5> - 2913391974U, // <4,7,5,4>: Cost 3 vzipl RHS, <7,4,5,6> - 2583211973U, // <4,7,5,5>: Cost 3 vext1 <5,4,7,5>, <5,4,7,5> - 2589184670U, // <4,7,5,6>: Cost 3 vext1 <6,4,7,5>, <6,4,7,5> - 2913392236U, // <4,7,5,7>: Cost 3 vzipl RHS, <7,7,7,7> - 2913392258U, // <4,7,5,u>: Cost 3 vzipl RHS, <7,u,1,2> - 1509474406U, // <4,7,6,0>: Cost 2 vext1 <5,4,7,6>, LHS - 3047609338U, // <4,7,6,1>: Cost 3 vtrnl RHS, <7,0,1,2> - 2583217768U, // <4,7,6,2>: Cost 3 vext1 <5,4,7,6>, <2,2,2,2> - 2583218326U, // <4,7,6,3>: Cost 3 vext1 <5,4,7,6>, <3,0,1,2> - 1509477686U, // <4,7,6,4>: Cost 2 vext1 <5,4,7,6>, RHS - 1509478342U, // <4,7,6,5>: Cost 2 vext1 <5,4,7,6>, <5,4,7,6> - 2583220730U, // <4,7,6,6>: Cost 3 vext1 <5,4,7,6>, <6,2,7,3> - 3047609964U, // <4,7,6,7>: Cost 3 vtrnl RHS, <7,7,7,7> - 1509480238U, // <4,7,6,u>: Cost 2 vext1 <5,4,7,6>, LHS - 3650994278U, // <4,7,7,0>: Cost 4 vext1 <4,4,7,7>, LHS - 3650995098U, // <4,7,7,1>: Cost 4 vext1 <4,4,7,7>, <1,2,3,4> - 3650996010U, // <4,7,7,2>: Cost 4 vext1 <4,4,7,7>, <2,4,5,7> - 3804804677U, // <4,7,7,3>: Cost 4 vext3 <7,7,3,4>, <7,7,3,4> - 3650997486U, // <4,7,7,4>: Cost 4 vext1 <4,4,7,7>, <4,4,7,7> - 2662725039U, // <4,7,7,5>: Cost 3 vext2 <7,5,4,7>, <7,5,4,7> - 3662942880U, // <4,7,7,6>: Cost 4 vext1 <6,4,7,7>, <6,4,7,7> - 2718750316U, // <4,7,7,7>: Cost 3 vext3 <5,6,7,4>, <7,7,7,7> - 2664715938U, // <4,7,7,u>: Cost 3 vext2 <7,u,4,7>, <7,u,4,7> - 1509490790U, // <4,7,u,0>: Cost 2 vext1 <5,4,7,u>, LHS - 2632865582U, // <4,7,u,1>: Cost 3 vext2 <2,5,4,7>, LHS - 2583234152U, // <4,7,u,2>: Cost 3 vext1 <5,4,7,u>, <2,2,2,2> - 2583234710U, // <4,7,u,3>: Cost 3 vext1 <5,4,7,u>, <3,0,1,2> - 1509494070U, // <4,7,u,4>: Cost 2 vext1 <5,4,7,u>, RHS - 1509494728U, // <4,7,u,5>: Cost 2 vext1 <5,4,7,u>, <5,4,7,u> - 2583237114U, // <4,7,u,6>: Cost 3 vext1 <5,4,7,u>, <6,2,7,3> - 3047757420U, // <4,7,u,7>: Cost 3 vtrnl RHS, <7,7,7,7> - 1509496622U, // <4,7,u,u>: Cost 2 vext1 <5,4,7,u>, LHS - 2618933248U, // <4,u,0,0>: Cost 3 vext2 <0,2,4,u>, <0,0,0,0> - 1545191526U, // <4,u,0,1>: Cost 2 vext2 <0,2,4,u>, LHS - 1545191630U, // <4,u,0,2>: Cost 2 vext2 
<0,2,4,u>, <0,2,4,u> - 2691913445U, // <4,u,0,3>: Cost 3 vext3 <1,2,3,4>, - 2618933586U, // <4,u,0,4>: Cost 3 vext2 <0,2,4,u>, <0,4,1,5> - 2265397305U, // <4,u,0,5>: Cost 3 vrev - 2595189625U, // <4,u,0,6>: Cost 3 vext1 <7,4,u,0>, <6,7,4,u> - 2595190139U, // <4,u,0,7>: Cost 3 vext1 <7,4,u,0>, <7,4,u,0> - 1545192093U, // <4,u,0,u>: Cost 2 vext2 <0,2,4,u>, LHS - 2618934006U, // <4,u,1,0>: Cost 3 vext2 <0,2,4,u>, <1,0,3,2> - 2618934068U, // <4,u,1,1>: Cost 3 vext2 <0,2,4,u>, <1,1,1,1> - 1618171694U, // <4,u,1,2>: Cost 2 vext3 <1,2,3,4>, LHS - 2618934232U, // <4,u,1,3>: Cost 3 vext2 <0,2,4,u>, <1,3,1,3> - 2695894848U, // <4,u,1,4>: Cost 3 vext3 <1,u,3,4>, - 2618934416U, // <4,u,1,5>: Cost 3 vext2 <0,2,4,u>, <1,5,3,7> - 3692676321U, // <4,u,1,6>: Cost 4 vext2 <0,2,4,u>, <1,6,3,7> - 2718750555U, // <4,u,1,7>: Cost 3 vext3 <5,6,7,4>, - 1618171748U, // <4,u,1,u>: Cost 2 vext3 <1,2,3,4>, LHS - 2553397350U, // <4,u,2,0>: Cost 3 vext1 <0,4,u,2>, LHS - 2630215215U, // <4,u,2,1>: Cost 3 vext2 <2,1,4,u>, <2,1,4,u> - 2618934888U, // <4,u,2,2>: Cost 3 vext2 <0,2,4,u>, <2,2,2,2> - 1557800657U, // <4,u,2,3>: Cost 2 vext2 <2,3,4,u>, <2,3,4,u> - 2618935065U, // <4,u,2,4>: Cost 3 vext2 <0,2,4,u>, <2,4,3,u> - 2733864859U, // <4,u,2,5>: Cost 3 vext3 , - 2618935226U, // <4,u,2,6>: Cost 3 vext2 <0,2,4,u>, <2,6,3,7> - 2718750636U, // <4,u,2,7>: Cost 3 vext3 <5,6,7,4>, - 1561118822U, // <4,u,2,u>: Cost 2 vext2 <2,u,4,u>, <2,u,4,u> - 2618935446U, // <4,u,3,0>: Cost 3 vext2 <0,2,4,u>, <3,0,1,2> - 2779318422U, // <4,u,3,1>: Cost 3 vuzpl RHS, <3,0,1,2> - 2636851545U, // <4,u,3,2>: Cost 3 vext2 <3,2,4,u>, <3,2,4,u> - 2618935708U, // <4,u,3,3>: Cost 3 vext2 <0,2,4,u>, <3,3,3,3> - 2618935810U, // <4,u,3,4>: Cost 3 vext2 <0,2,4,u>, <3,4,5,6> - 2691913711U, // <4,u,3,5>: Cost 3 vext3 <1,2,3,4>, - 2588725862U, // <4,u,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3> - 2640169710U, // <4,u,3,7>: Cost 3 vext2 <3,7,4,u>, <3,7,4,u> - 2618936094U, // <4,u,3,u>: Cost 3 vext2 <0,2,4,u>, <3,u,1,2> - 1503559782U, // <4,u,4,0>: Cost 2 vext1 <4,4,u,4>, LHS - 2692282391U, // <4,u,4,1>: Cost 3 vext3 <1,2,u,4>, - 2565359426U, // <4,u,4,2>: Cost 3 vext1 <2,4,u,4>, <2,4,u,4> - 2571332123U, // <4,u,4,3>: Cost 3 vext1 <3,4,u,4>, <3,4,u,4> - 161926454U, // <4,u,4,4>: Cost 1 vdup0 RHS - 1545194806U, // <4,u,4,5>: Cost 2 vext2 <0,2,4,u>, RHS - 1705577782U, // <4,u,4,6>: Cost 2 vuzpl RHS, RHS - 2718750801U, // <4,u,4,7>: Cost 3 vext3 <5,6,7,4>, - 161926454U, // <4,u,4,u>: Cost 1 vdup0 RHS - 1479164006U, // <4,u,5,0>: Cost 2 vext1 <0,4,1,5>, LHS - 1839650606U, // <4,u,5,1>: Cost 2 vzipl RHS, LHS - 2565367502U, // <4,u,5,2>: Cost 3 vext1 <2,4,u,5>, <2,3,4,5> - 3089777309U, // <4,u,5,3>: Cost 3 vtrnr <0,4,1,5>, LHS - 1479167286U, // <4,u,5,4>: Cost 2 vext1 <0,4,1,5>, RHS - 1839650970U, // <4,u,5,5>: Cost 2 vzipl RHS, RHS - 1618172058U, // <4,u,5,6>: Cost 2 vext3 <1,2,3,4>, RHS - 3089780265U, // <4,u,5,7>: Cost 3 vtrnr <0,4,1,5>, RHS - 1618172076U, // <4,u,5,u>: Cost 2 vext3 <1,2,3,4>, RHS - 1479688294U, // <4,u,6,0>: Cost 2 vext1 <0,4,u,6>, LHS - 2553430774U, // <4,u,6,1>: Cost 3 vext1 <0,4,u,6>, <1,0,3,2> - 1973868334U, // <4,u,6,2>: Cost 2 vtrnl RHS, LHS - 1497606685U, // <4,u,6,3>: Cost 2 vext1 <3,4,u,6>, <3,4,u,6> - 1479691574U, // <4,u,6,4>: Cost 2 vext1 <0,4,u,6>, RHS - 1509552079U, // <4,u,6,5>: Cost 2 vext1 <5,4,u,6>, <5,4,u,6> - 1973868698U, // <4,u,6,6>: Cost 2 vtrnl RHS, RHS - 27705344U, // <4,u,6,7>: Cost 0 copy RHS - 27705344U, // <4,u,6,u>: Cost 0 copy RHS - 2565382246U, // <4,u,7,0>: Cost 3 vext1 <2,4,u,7>, LHS - 2565383066U, // <4,u,7,1>: Cost 
3 vext1 <2,4,u,7>, <1,2,3,4> - 2565384005U, // <4,u,7,2>: Cost 3 vext1 <2,4,u,7>, <2,4,u,7> - 2661405966U, // <4,u,7,3>: Cost 3 vext2 <7,3,4,u>, <7,3,4,u> - 2565385526U, // <4,u,7,4>: Cost 3 vext1 <2,4,u,7>, RHS - 2779321702U, // <4,u,7,5>: Cost 3 vuzpl RHS, <7,4,5,6> - 2589274793U, // <4,u,7,6>: Cost 3 vext1 <6,4,u,7>, <6,4,u,7> - 2779321964U, // <4,u,7,7>: Cost 3 vuzpl RHS, <7,7,7,7> - 2565388078U, // <4,u,7,u>: Cost 3 vext1 <2,4,u,7>, LHS - 1479704678U, // <4,u,u,0>: Cost 2 vext1 <0,4,u,u>, LHS - 1545197358U, // <4,u,u,1>: Cost 2 vext2 <0,2,4,u>, LHS - 1618172261U, // <4,u,u,2>: Cost 2 vext3 <1,2,3,4>, LHS - 1497623071U, // <4,u,u,3>: Cost 2 vext1 <3,4,u,u>, <3,4,u,u> - 161926454U, // <4,u,u,4>: Cost 1 vdup0 RHS - 1545197722U, // <4,u,u,5>: Cost 2 vext2 <0,2,4,u>, RHS - 1618172301U, // <4,u,u,6>: Cost 2 vext3 <1,2,3,4>, RHS - 27705344U, // <4,u,u,7>: Cost 0 copy RHS - 27705344U, // <4,u,u,u>: Cost 0 copy RHS - 2687123456U, // <5,0,0,0>: Cost 3 vext3 <0,4,1,5>, <0,0,0,0> - 2687123466U, // <5,0,0,1>: Cost 3 vext3 <0,4,1,5>, <0,0,1,1> - 2687123476U, // <5,0,0,2>: Cost 3 vext3 <0,4,1,5>, <0,0,2,2> - 3710599434U, // <5,0,0,3>: Cost 4 vext2 <3,2,5,0>, <0,3,2,5> - 2642166098U, // <5,0,0,4>: Cost 3 vext2 <4,1,5,0>, <0,4,1,5> - 3657060306U, // <5,0,0,5>: Cost 4 vext1 <5,5,0,0>, <5,5,0,0> - 3292094923U, // <5,0,0,6>: Cost 4 vrev <0,5,6,0> - 3669005700U, // <5,0,0,7>: Cost 4 vext1 <7,5,0,0>, <7,5,0,0> - 2687123530U, // <5,0,0,u>: Cost 3 vext3 <0,4,1,5>, <0,0,u,2> - 2559434854U, // <5,0,1,0>: Cost 3 vext1 <1,5,0,1>, LHS - 2559435887U, // <5,0,1,1>: Cost 3 vext1 <1,5,0,1>, <1,5,0,1> - 1613381734U, // <5,0,1,2>: Cost 2 vext3 <0,4,1,5>, LHS - 3698656256U, // <5,0,1,3>: Cost 4 vext2 <1,2,5,0>, <1,3,5,7> - 2559438134U, // <5,0,1,4>: Cost 3 vext1 <1,5,0,1>, RHS - 2583326675U, // <5,0,1,5>: Cost 3 vext1 <5,5,0,1>, <5,5,0,1> - 3715908851U, // <5,0,1,6>: Cost 4 vext2 <4,1,5,0>, <1,6,5,7> - 3657069562U, // <5,0,1,7>: Cost 4 vext1 <5,5,0,1>, <7,0,1,2> - 1613381788U, // <5,0,1,u>: Cost 2 vext3 <0,4,1,5>, LHS - 2686017700U, // <5,0,2,0>: Cost 3 vext3 <0,2,4,5>, <0,2,0,2> - 2685796528U, // <5,0,2,1>: Cost 3 vext3 <0,2,1,5>, <0,2,1,5> - 2698625208U, // <5,0,2,2>: Cost 3 vext3 <2,3,4,5>, <0,2,2,4> - 2685944002U, // <5,0,2,3>: Cost 3 vext3 <0,2,3,5>, <0,2,3,5> - 2686017739U, // <5,0,2,4>: Cost 3 vext3 <0,2,4,5>, <0,2,4,5> - 2686091476U, // <5,0,2,5>: Cost 3 vext3 <0,2,5,5>, <0,2,5,5> - 2725167324U, // <5,0,2,6>: Cost 3 vext3 <6,7,4,5>, <0,2,6,4> - 2595280230U, // <5,0,2,7>: Cost 3 vext1 <7,5,0,2>, <7,4,5,6> - 2686312687U, // <5,0,2,u>: Cost 3 vext3 <0,2,u,5>, <0,2,u,5> - 3760128248U, // <5,0,3,0>: Cost 4 vext3 <0,3,0,5>, <0,3,0,5> - 3759685888U, // <5,0,3,1>: Cost 4 vext3 <0,2,3,5>, <0,3,1,4> - 2686533898U, // <5,0,3,2>: Cost 3 vext3 <0,3,2,5>, <0,3,2,5> - 3760349459U, // <5,0,3,3>: Cost 4 vext3 <0,3,3,5>, <0,3,3,5> - 2638187004U, // <5,0,3,4>: Cost 3 vext2 <3,4,5,0>, <3,4,5,0> - 3776348452U, // <5,0,3,5>: Cost 4 vext3 <3,0,4,5>, <0,3,5,4> - 3713256094U, // <5,0,3,6>: Cost 4 vext2 <3,6,5,0>, <3,6,5,0> - 3914064896U, // <5,0,3,7>: Cost 4 vuzpr <3,5,7,0>, <1,3,5,7> - 2686976320U, // <5,0,3,u>: Cost 3 vext3 <0,3,u,5>, <0,3,u,5> - 2559459430U, // <5,0,4,0>: Cost 3 vext1 <1,5,0,4>, LHS - 1613381970U, // <5,0,4,1>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5> - 2687123804U, // <5,0,4,2>: Cost 3 vext3 <0,4,1,5>, <0,4,2,6> - 3761013092U, // <5,0,4,3>: Cost 4 vext3 <0,4,3,5>, <0,4,3,5> - 2559462710U, // <5,0,4,4>: Cost 3 vext1 <1,5,0,4>, RHS - 2638187830U, // <5,0,4,5>: Cost 3 vext2 <3,4,5,0>, RHS - 3761234303U, // <5,0,4,6>: Cost 4 
vext3 <0,4,6,5>, <0,4,6,5> - 2646150600U, // <5,0,4,7>: Cost 3 vext2 <4,7,5,0>, <4,7,5,0> - 1613381970U, // <5,0,4,u>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5> - 3766763926U, // <5,0,5,0>: Cost 4 vext3 <1,4,0,5>, <0,5,0,1> - 2919268454U, // <5,0,5,1>: Cost 3 vzipl <5,5,5,5>, LHS - 3053486182U, // <5,0,5,2>: Cost 3 vtrnl <5,5,5,5>, LHS - 3723210589U, // <5,0,5,3>: Cost 4 vext2 <5,3,5,0>, <5,3,5,0> - 3766763966U, // <5,0,5,4>: Cost 4 vext3 <1,4,0,5>, <0,5,4,5> - 2650796031U, // <5,0,5,5>: Cost 3 vext2 <5,5,5,0>, <5,5,5,0> - 3719893090U, // <5,0,5,6>: Cost 4 vext2 <4,7,5,0>, <5,6,7,0> - 3914067254U, // <5,0,5,7>: Cost 4 vuzpr <3,5,7,0>, RHS - 2919269021U, // <5,0,5,u>: Cost 3 vzipl <5,5,5,5>, LHS - 4047519744U, // <5,0,6,0>: Cost 4 vzipr <3,4,5,6>, <0,0,0,0> - 2920038502U, // <5,0,6,1>: Cost 3 vzipl <5,6,7,0>, LHS - 3759759871U, // <5,0,6,2>: Cost 4 vext3 <0,2,4,5>, <0,6,2,7> - 3645164070U, // <5,0,6,3>: Cost 4 vext1 <3,5,0,6>, <3,5,0,6> - 3762414095U, // <5,0,6,4>: Cost 4 vext3 <0,6,4,5>, <0,6,4,5> - 3993780690U, // <5,0,6,5>: Cost 4 vzipl <5,6,7,0>, <0,5,6,7> - 3719893816U, // <5,0,6,6>: Cost 4 vext2 <4,7,5,0>, <6,6,6,6> - 2662077302U, // <5,0,6,7>: Cost 3 vext2 <7,4,5,0>, <6,7,4,5> - 2920039069U, // <5,0,6,u>: Cost 3 vzipl <5,6,7,0>, LHS - 2565455974U, // <5,0,7,0>: Cost 3 vext1 <2,5,0,7>, LHS - 2565456790U, // <5,0,7,1>: Cost 3 vext1 <2,5,0,7>, <1,2,3,0> - 2565457742U, // <5,0,7,2>: Cost 3 vext1 <2,5,0,7>, <2,5,0,7> - 3639199894U, // <5,0,7,3>: Cost 4 vext1 <2,5,0,7>, <3,0,1,2> - 2565459254U, // <5,0,7,4>: Cost 3 vext1 <2,5,0,7>, RHS - 2589347938U, // <5,0,7,5>: Cost 3 vext1 <6,5,0,7>, <5,6,7,0> - 2589348530U, // <5,0,7,6>: Cost 3 vext1 <6,5,0,7>, <6,5,0,7> - 4188456422U, // <5,0,7,7>: Cost 4 vtrnr RHS, <2,0,5,7> - 2565461806U, // <5,0,7,u>: Cost 3 vext1 <2,5,0,7>, LHS - 2687124106U, // <5,0,u,0>: Cost 3 vext3 <0,4,1,5>, <0,u,0,2> - 1616036502U, // <5,0,u,1>: Cost 2 vext3 <0,u,1,5>, <0,u,1,5> - 1613382301U, // <5,0,u,2>: Cost 2 vext3 <0,4,1,5>, LHS - 2689925800U, // <5,0,u,3>: Cost 3 vext3 <0,u,3,5>, <0,u,3,5> - 2687124146U, // <5,0,u,4>: Cost 3 vext3 <0,4,1,5>, <0,u,4,6> - 2638190746U, // <5,0,u,5>: Cost 3 vext2 <3,4,5,0>, RHS - 2589356723U, // <5,0,u,6>: Cost 3 vext1 <6,5,0,u>, <6,5,0,u> - 2595280230U, // <5,0,u,7>: Cost 3 vext1 <7,5,0,2>, <7,4,5,6> - 1613382355U, // <5,0,u,u>: Cost 2 vext3 <0,4,1,5>, LHS - 2646818816U, // <5,1,0,0>: Cost 3 vext2 <4,u,5,1>, <0,0,0,0> - 1573077094U, // <5,1,0,1>: Cost 2 vext2 <4,u,5,1>, LHS - 2646818980U, // <5,1,0,2>: Cost 3 vext2 <4,u,5,1>, <0,2,0,2> - 2687124214U, // <5,1,0,3>: Cost 3 vext3 <0,4,1,5>, <1,0,3,2> - 2641510738U, // <5,1,0,4>: Cost 3 vext2 <4,0,5,1>, <0,4,1,5> - 2641510814U, // <5,1,0,5>: Cost 3 vext2 <4,0,5,1>, <0,5,1,0> - 3720561142U, // <5,1,0,6>: Cost 4 vext2 <4,u,5,1>, <0,6,1,7> - 3298141357U, // <5,1,0,7>: Cost 4 vrev <1,5,7,0> - 1573077661U, // <5,1,0,u>: Cost 2 vext2 <4,u,5,1>, LHS - 2223891567U, // <5,1,1,0>: Cost 3 vrev <1,5,0,1> - 2687124276U, // <5,1,1,1>: Cost 3 vext3 <0,4,1,5>, <1,1,1,1> - 2646819734U, // <5,1,1,2>: Cost 3 vext2 <4,u,5,1>, <1,2,3,0> - 2687124296U, // <5,1,1,3>: Cost 3 vext3 <0,4,1,5>, <1,1,3,3> - 2691326803U, // <5,1,1,4>: Cost 3 vext3 <1,1,4,5>, <1,1,4,5> - 2691400540U, // <5,1,1,5>: Cost 3 vext3 <1,1,5,5>, <1,1,5,5> - 3765216101U, // <5,1,1,6>: Cost 4 vext3 <1,1,6,5>, <1,1,6,5> - 3765289838U, // <5,1,1,7>: Cost 4 vext3 <1,1,7,5>, <1,1,7,5> - 2687124341U, // <5,1,1,u>: Cost 3 vext3 <0,4,1,5>, <1,1,u,3> - 3297641584U, // <5,1,2,0>: Cost 4 vrev <1,5,0,2> - 3763520391U, // <5,1,2,1>: Cost 4 vext3 <0,u,1,5>, <1,2,1,3> 
- 2646820456U, // <5,1,2,2>: Cost 3 vext2 <4,u,5,1>, <2,2,2,2> - 2687124374U, // <5,1,2,3>: Cost 3 vext3 <0,4,1,5>, <1,2,3,0> - 2691990436U, // <5,1,2,4>: Cost 3 vext3 <1,2,4,5>, <1,2,4,5> - 2687124395U, // <5,1,2,5>: Cost 3 vext3 <0,4,1,5>, <1,2,5,3> - 2646820794U, // <5,1,2,6>: Cost 3 vext2 <4,u,5,1>, <2,6,3,7> - 3808199610U, // <5,1,2,7>: Cost 4 vext3 , <1,2,7,0> - 2687124419U, // <5,1,2,u>: Cost 3 vext3 <0,4,1,5>, <1,2,u,0> - 2577440870U, // <5,1,3,0>: Cost 3 vext1 <4,5,1,3>, LHS - 2687124440U, // <5,1,3,1>: Cost 3 vext3 <0,4,1,5>, <1,3,1,3> - 3759686627U, // <5,1,3,2>: Cost 4 vext3 <0,2,3,5>, <1,3,2,5> - 2692580332U, // <5,1,3,3>: Cost 3 vext3 <1,3,3,5>, <1,3,3,5> - 2687124469U, // <5,1,3,4>: Cost 3 vext3 <0,4,1,5>, <1,3,4,5> - 2685207552U, // <5,1,3,5>: Cost 3 vext3 <0,1,2,5>, <1,3,5,7> - 3760866313U, // <5,1,3,6>: Cost 4 vext3 <0,4,1,5>, <1,3,6,7> - 2692875280U, // <5,1,3,7>: Cost 3 vext3 <1,3,7,5>, <1,3,7,5> - 2687124503U, // <5,1,3,u>: Cost 3 vext3 <0,4,1,5>, <1,3,u,3> - 1567771538U, // <5,1,4,0>: Cost 2 vext2 <4,0,5,1>, <4,0,5,1> - 2693096491U, // <5,1,4,1>: Cost 3 vext3 <1,4,1,5>, <1,4,1,5> - 2693170228U, // <5,1,4,2>: Cost 3 vext3 <1,4,2,5>, <1,4,2,5> - 2687124541U, // <5,1,4,3>: Cost 3 vext3 <0,4,1,5>, <1,4,3,5> - 2646822096U, // <5,1,4,4>: Cost 3 vext2 <4,u,5,1>, <4,4,4,4> - 1573080374U, // <5,1,4,5>: Cost 2 vext2 <4,u,5,1>, RHS - 2646822260U, // <5,1,4,6>: Cost 3 vext2 <4,u,5,1>, <4,6,4,6> - 3298174129U, // <5,1,4,7>: Cost 4 vrev <1,5,7,4> - 1573080602U, // <5,1,4,u>: Cost 2 vext2 <4,u,5,1>, <4,u,5,1> - 2687124591U, // <5,1,5,0>: Cost 3 vext3 <0,4,1,5>, <1,5,0,1> - 2646822543U, // <5,1,5,1>: Cost 3 vext2 <4,u,5,1>, <5,1,0,1> - 3760866433U, // <5,1,5,2>: Cost 4 vext3 <0,4,1,5>, <1,5,2,1> - 2687124624U, // <5,1,5,3>: Cost 3 vext3 <0,4,1,5>, <1,5,3,7> - 2687124631U, // <5,1,5,4>: Cost 3 vext3 <0,4,1,5>, <1,5,4,5> - 2646822916U, // <5,1,5,5>: Cost 3 vext2 <4,u,5,1>, <5,5,5,5> - 2646823010U, // <5,1,5,6>: Cost 3 vext2 <4,u,5,1>, <5,6,7,0> - 2646823080U, // <5,1,5,7>: Cost 3 vext2 <4,u,5,1>, <5,7,5,7> - 2687124663U, // <5,1,5,u>: Cost 3 vext3 <0,4,1,5>, <1,5,u,1> - 2553577574U, // <5,1,6,0>: Cost 3 vext1 <0,5,1,6>, LHS - 3763520719U, // <5,1,6,1>: Cost 4 vext3 <0,u,1,5>, <1,6,1,7> - 2646823418U, // <5,1,6,2>: Cost 3 vext2 <4,u,5,1>, <6,2,7,3> - 3760866529U, // <5,1,6,3>: Cost 4 vext3 <0,4,1,5>, <1,6,3,7> - 2553580854U, // <5,1,6,4>: Cost 3 vext1 <0,5,1,6>, RHS - 2687124723U, // <5,1,6,5>: Cost 3 vext3 <0,4,1,5>, <1,6,5,7> - 2646823736U, // <5,1,6,6>: Cost 3 vext2 <4,u,5,1>, <6,6,6,6> - 2646823758U, // <5,1,6,7>: Cost 3 vext2 <4,u,5,1>, <6,7,0,1> - 2646823839U, // <5,1,6,u>: Cost 3 vext2 <4,u,5,1>, <6,u,0,1> - 2559557734U, // <5,1,7,0>: Cost 3 vext1 <1,5,1,7>, LHS - 2559558452U, // <5,1,7,1>: Cost 3 vext1 <1,5,1,7>, <1,1,1,1> - 2571503270U, // <5,1,7,2>: Cost 3 vext1 <3,5,1,7>, <2,3,0,1> - 2040971366U, // <5,1,7,3>: Cost 2 vtrnr RHS, LHS - 2559561014U, // <5,1,7,4>: Cost 3 vext1 <1,5,1,7>, RHS - 2595393232U, // <5,1,7,5>: Cost 3 vext1 <7,5,1,7>, <5,1,7,3> - 4188455035U, // <5,1,7,6>: Cost 4 vtrnr RHS, <0,1,4,6> - 2646824556U, // <5,1,7,7>: Cost 3 vext2 <4,u,5,1>, <7,7,7,7> - 2040971371U, // <5,1,7,u>: Cost 2 vtrnr RHS, LHS - 1591662326U, // <5,1,u,0>: Cost 2 vext2 , - 1573082926U, // <5,1,u,1>: Cost 2 vext2 <4,u,5,1>, LHS - 2695824760U, // <5,1,u,2>: Cost 3 vext3 <1,u,2,5>, <1,u,2,5> - 2040979558U, // <5,1,u,3>: Cost 2 vtrnr RHS, LHS - 2687124874U, // <5,1,u,4>: Cost 3 vext3 <0,4,1,5>, <1,u,4,5> - 1573083290U, // <5,1,u,5>: Cost 2 vext2 <4,u,5,1>, RHS - 2646825168U, // <5,1,u,6>: 
Cost 3 vext2 <4,u,5,1>, - 2646825216U, // <5,1,u,7>: Cost 3 vext2 <4,u,5,1>, - 2040979563U, // <5,1,u,u>: Cost 2 vtrnr RHS, LHS - 3702652928U, // <5,2,0,0>: Cost 4 vext2 <1,u,5,2>, <0,0,0,0> - 2628911206U, // <5,2,0,1>: Cost 3 vext2 <1,u,5,2>, LHS - 2641518756U, // <5,2,0,2>: Cost 3 vext2 <4,0,5,2>, <0,2,0,2> - 3759760847U, // <5,2,0,3>: Cost 4 vext3 <0,2,4,5>, <2,0,3,2> - 3760866775U, // <5,2,0,4>: Cost 4 vext3 <0,4,1,5>, <2,0,4,1> - 3759539680U, // <5,2,0,5>: Cost 4 vext3 <0,2,1,5>, <2,0,5,1> - 3760866796U, // <5,2,0,6>: Cost 4 vext3 <0,4,1,5>, <2,0,6,4> - 3304114054U, // <5,2,0,7>: Cost 4 vrev <2,5,7,0> - 2628911773U, // <5,2,0,u>: Cost 3 vext2 <1,u,5,2>, LHS - 2623603464U, // <5,2,1,0>: Cost 3 vext2 <1,0,5,2>, <1,0,5,2> - 3698008921U, // <5,2,1,1>: Cost 4 vext2 <1,1,5,2>, <1,1,5,2> - 3633325603U, // <5,2,1,2>: Cost 4 vext1 <1,5,2,1>, <2,1,3,5> - 2687125027U, // <5,2,1,3>: Cost 3 vext3 <0,4,1,5>, <2,1,3,5> - 3633327414U, // <5,2,1,4>: Cost 4 vext1 <1,5,2,1>, RHS - 3759539760U, // <5,2,1,5>: Cost 4 vext3 <0,2,1,5>, <2,1,5,0> - 3760866876U, // <5,2,1,6>: Cost 4 vext3 <0,4,1,5>, <2,1,6,3> - 3304122247U, // <5,2,1,7>: Cost 4 vrev <2,5,7,1> - 2687125072U, // <5,2,1,u>: Cost 3 vext3 <0,4,1,5>, <2,1,u,5> - 3633332326U, // <5,2,2,0>: Cost 4 vext1 <1,5,2,2>, LHS - 3759760992U, // <5,2,2,1>: Cost 4 vext3 <0,2,4,5>, <2,2,1,3> - 2687125096U, // <5,2,2,2>: Cost 3 vext3 <0,4,1,5>, <2,2,2,2> - 2687125106U, // <5,2,2,3>: Cost 3 vext3 <0,4,1,5>, <2,2,3,3> - 2697963133U, // <5,2,2,4>: Cost 3 vext3 <2,2,4,5>, <2,2,4,5> - 3759466120U, // <5,2,2,5>: Cost 4 vext3 <0,2,0,5>, <2,2,5,7> - 3760866960U, // <5,2,2,6>: Cost 4 vext3 <0,4,1,5>, <2,2,6,6> - 3771926168U, // <5,2,2,7>: Cost 4 vext3 <2,2,7,5>, <2,2,7,5> - 2687125151U, // <5,2,2,u>: Cost 3 vext3 <0,4,1,5>, <2,2,u,3> - 2687125158U, // <5,2,3,0>: Cost 3 vext3 <0,4,1,5>, <2,3,0,1> - 2698405555U, // <5,2,3,1>: Cost 3 vext3 <2,3,1,5>, <2,3,1,5> - 2577516238U, // <5,2,3,2>: Cost 3 vext1 <4,5,2,3>, <2,3,4,5> - 3759687365U, // <5,2,3,3>: Cost 4 vext3 <0,2,3,5>, <2,3,3,5> - 1624884942U, // <5,2,3,4>: Cost 2 vext3 <2,3,4,5>, <2,3,4,5> - 2698700503U, // <5,2,3,5>: Cost 3 vext3 <2,3,5,5>, <2,3,5,5> - 3772368608U, // <5,2,3,6>: Cost 4 vext3 <2,3,4,5>, <2,3,6,5> - 3702655716U, // <5,2,3,7>: Cost 4 vext2 <1,u,5,2>, <3,7,3,7> - 1625179890U, // <5,2,3,u>: Cost 2 vext3 <2,3,u,5>, <2,3,u,5> - 2641521555U, // <5,2,4,0>: Cost 3 vext2 <4,0,5,2>, <4,0,5,2> - 3772368642U, // <5,2,4,1>: Cost 4 vext3 <2,3,4,5>, <2,4,1,3> - 2699142925U, // <5,2,4,2>: Cost 3 vext3 <2,4,2,5>, <2,4,2,5> - 2698626838U, // <5,2,4,3>: Cost 3 vext3 <2,3,4,5>, <2,4,3,5> - 2698626848U, // <5,2,4,4>: Cost 3 vext3 <2,3,4,5>, <2,4,4,6> - 2628914486U, // <5,2,4,5>: Cost 3 vext2 <1,u,5,2>, RHS - 2645503353U, // <5,2,4,6>: Cost 3 vext2 <4,6,5,2>, <4,6,5,2> - 3304146826U, // <5,2,4,7>: Cost 4 vrev <2,5,7,4> - 2628914729U, // <5,2,4,u>: Cost 3 vext2 <1,u,5,2>, RHS - 2553643110U, // <5,2,5,0>: Cost 3 vext1 <0,5,2,5>, LHS - 3758950227U, // <5,2,5,1>: Cost 4 vext3 <0,1,2,5>, <2,5,1,3> - 3759761248U, // <5,2,5,2>: Cost 4 vext3 <0,2,4,5>, <2,5,2,7> - 2982396006U, // <5,2,5,3>: Cost 3 vzipr <4,u,5,5>, LHS - 2553646390U, // <5,2,5,4>: Cost 3 vext1 <0,5,2,5>, RHS - 2553647108U, // <5,2,5,5>: Cost 3 vext1 <0,5,2,5>, <5,5,5,5> - 3760867204U, // <5,2,5,6>: Cost 4 vext3 <0,4,1,5>, <2,5,6,7> - 3702657141U, // <5,2,5,7>: Cost 4 vext2 <1,u,5,2>, <5,7,0,1> - 2982396011U, // <5,2,5,u>: Cost 3 vzipr <4,u,5,5>, LHS - 3627393126U, // <5,2,6,0>: Cost 4 vext1 <0,5,2,6>, LHS - 3760867236U, // <5,2,6,1>: Cost 4 vext3 <0,4,1,5>, <2,6,1,3> 
- 2645504506U, // <5,2,6,2>: Cost 3 vext2 <4,6,5,2>, <6,2,7,3> - 2687125434U, // <5,2,6,3>: Cost 3 vext3 <0,4,1,5>, <2,6,3,7> - 2700617665U, // <5,2,6,4>: Cost 3 vext3 <2,6,4,5>, <2,6,4,5> - 3760867276U, // <5,2,6,5>: Cost 4 vext3 <0,4,1,5>, <2,6,5,7> - 3763521493U, // <5,2,6,6>: Cost 4 vext3 <0,u,1,5>, <2,6,6,7> - 3719246670U, // <5,2,6,7>: Cost 4 vext2 <4,6,5,2>, <6,7,0,1> - 2687125479U, // <5,2,6,u>: Cost 3 vext3 <0,4,1,5>, <2,6,u,7> - 2565603430U, // <5,2,7,0>: Cost 3 vext1 <2,5,2,7>, LHS - 2553660150U, // <5,2,7,1>: Cost 3 vext1 <0,5,2,7>, <1,0,3,2> - 2565605216U, // <5,2,7,2>: Cost 3 vext1 <2,5,2,7>, <2,5,2,7> - 2961178726U, // <5,2,7,3>: Cost 3 vzipr <1,3,5,7>, LHS - 2565606710U, // <5,2,7,4>: Cost 3 vext1 <2,5,2,7>, RHS - 4034920552U, // <5,2,7,5>: Cost 4 vzipr <1,3,5,7>, <0,1,2,5> - 3114713292U, // <5,2,7,6>: Cost 3 vtrnr RHS, <0,2,4,6> - 3702658668U, // <5,2,7,7>: Cost 4 vext2 <1,u,5,2>, <7,7,7,7> - 2961178731U, // <5,2,7,u>: Cost 3 vzipr <1,3,5,7>, LHS - 2687125563U, // <5,2,u,0>: Cost 3 vext3 <0,4,1,5>, <2,u,0,1> - 2628917038U, // <5,2,u,1>: Cost 3 vext2 <1,u,5,2>, LHS - 2565613409U, // <5,2,u,2>: Cost 3 vext1 <2,5,2,u>, <2,5,2,u> - 2687125592U, // <5,2,u,3>: Cost 3 vext3 <0,4,1,5>, <2,u,3,3> - 1628203107U, // <5,2,u,4>: Cost 2 vext3 <2,u,4,5>, <2,u,4,5> - 2628917402U, // <5,2,u,5>: Cost 3 vext2 <1,u,5,2>, RHS - 2702092405U, // <5,2,u,6>: Cost 3 vext3 <2,u,6,5>, <2,u,6,5> - 3304179598U, // <5,2,u,7>: Cost 4 vrev <2,5,7,u> - 1628498055U, // <5,2,u,u>: Cost 2 vext3 <2,u,u,5>, <2,u,u,5> - 3760867467U, // <5,3,0,0>: Cost 4 vext3 <0,4,1,5>, <3,0,0,0> - 2687125654U, // <5,3,0,1>: Cost 3 vext3 <0,4,1,5>, <3,0,1,2> - 3759761565U, // <5,3,0,2>: Cost 4 vext3 <0,2,4,5>, <3,0,2,0> - 3633391766U, // <5,3,0,3>: Cost 4 vext1 <1,5,3,0>, <3,0,1,2> - 2687125680U, // <5,3,0,4>: Cost 3 vext3 <0,4,1,5>, <3,0,4,1> - 3760277690U, // <5,3,0,5>: Cost 4 vext3 <0,3,2,5>, <3,0,5,2> - 3310013014U, // <5,3,0,6>: Cost 4 vrev <3,5,6,0> - 2236344927U, // <5,3,0,7>: Cost 3 vrev <3,5,7,0> - 2687125717U, // <5,3,0,u>: Cost 3 vext3 <0,4,1,5>, <3,0,u,2> - 3760867551U, // <5,3,1,0>: Cost 4 vext3 <0,4,1,5>, <3,1,0,3> - 3760867558U, // <5,3,1,1>: Cost 4 vext3 <0,4,1,5>, <3,1,1,1> - 2624938923U, // <5,3,1,2>: Cost 3 vext2 <1,2,5,3>, <1,2,5,3> - 2703198460U, // <5,3,1,3>: Cost 3 vext3 <3,1,3,5>, <3,1,3,5> - 3760867587U, // <5,3,1,4>: Cost 4 vext3 <0,4,1,5>, <3,1,4,3> - 2636219536U, // <5,3,1,5>: Cost 3 vext2 <3,1,5,3>, <1,5,3,7> - 3698681075U, // <5,3,1,6>: Cost 4 vext2 <1,2,5,3>, <1,6,5,7> - 2703493408U, // <5,3,1,7>: Cost 3 vext3 <3,1,7,5>, <3,1,7,5> - 2628920721U, // <5,3,1,u>: Cost 3 vext2 <1,u,5,3>, <1,u,5,3> - 3766765870U, // <5,3,2,0>: Cost 4 vext3 <1,4,0,5>, <3,2,0,1> - 3698681379U, // <5,3,2,1>: Cost 4 vext2 <1,2,5,3>, <2,1,3,5> - 3760867649U, // <5,3,2,2>: Cost 4 vext3 <0,4,1,5>, <3,2,2,2> - 2698627404U, // <5,3,2,3>: Cost 3 vext3 <2,3,4,5>, <3,2,3,4> - 2703935830U, // <5,3,2,4>: Cost 3 vext3 <3,2,4,5>, <3,2,4,5> - 2698627422U, // <5,3,2,5>: Cost 3 vext3 <2,3,4,5>, <3,2,5,4> - 3760867686U, // <5,3,2,6>: Cost 4 vext3 <0,4,1,5>, <3,2,6,3> - 3769788783U, // <5,3,2,7>: Cost 4 vext3 <1,u,5,5>, <3,2,7,3> - 2701945209U, // <5,3,2,u>: Cost 3 vext3 <2,u,4,5>, <3,2,u,4> - 3760867711U, // <5,3,3,0>: Cost 4 vext3 <0,4,1,5>, <3,3,0,1> - 2636220684U, // <5,3,3,1>: Cost 3 vext2 <3,1,5,3>, <3,1,5,3> - 3772369298U, // <5,3,3,2>: Cost 4 vext3 <2,3,4,5>, <3,3,2,2> - 2687125916U, // <5,3,3,3>: Cost 3 vext3 <0,4,1,5>, <3,3,3,3> - 2704599463U, // <5,3,3,4>: Cost 3 vext3 <3,3,4,5>, <3,3,4,5> - 2704673200U, // <5,3,3,5>: Cost 3 vext3 
<3,3,5,5>, <3,3,5,5> - 3709962935U, // <5,3,3,6>: Cost 4 vext2 <3,1,5,3>, <3,6,7,7> - 3772369346U, // <5,3,3,7>: Cost 4 vext3 <2,3,4,5>, <3,3,7,5> - 2704894411U, // <5,3,3,u>: Cost 3 vext3 <3,3,u,5>, <3,3,u,5> - 2704968148U, // <5,3,4,0>: Cost 3 vext3 <3,4,0,5>, <3,4,0,5> - 3698682850U, // <5,3,4,1>: Cost 4 vext2 <1,2,5,3>, <4,1,5,0> - 2642857014U, // <5,3,4,2>: Cost 3 vext2 <4,2,5,3>, <4,2,5,3> - 2705189359U, // <5,3,4,3>: Cost 3 vext3 <3,4,3,5>, <3,4,3,5> - 2705263096U, // <5,3,4,4>: Cost 3 vext3 <3,4,4,5>, <3,4,4,5> - 2685946370U, // <5,3,4,5>: Cost 3 vext3 <0,2,3,5>, <3,4,5,6> - 3779152394U, // <5,3,4,6>: Cost 4 vext3 <3,4,6,5>, <3,4,6,5> - 2236377699U, // <5,3,4,7>: Cost 3 vrev <3,5,7,4> - 2687126045U, // <5,3,4,u>: Cost 3 vext3 <0,4,1,5>, <3,4,u,6> - 2571632742U, // <5,3,5,0>: Cost 3 vext1 <3,5,3,5>, LHS - 2559689870U, // <5,3,5,1>: Cost 3 vext1 <1,5,3,5>, <1,5,3,5> - 2571634382U, // <5,3,5,2>: Cost 3 vext1 <3,5,3,5>, <2,3,4,5> - 2571635264U, // <5,3,5,3>: Cost 3 vext1 <3,5,3,5>, <3,5,3,5> - 2571636022U, // <5,3,5,4>: Cost 3 vext1 <3,5,3,5>, RHS - 2559692804U, // <5,3,5,5>: Cost 3 vext1 <1,5,3,5>, <5,5,5,5> - 3720581218U, // <5,3,5,6>: Cost 4 vext2 <4,u,5,3>, <5,6,7,0> - 2236385892U, // <5,3,5,7>: Cost 3 vrev <3,5,7,5> - 2571638574U, // <5,3,5,u>: Cost 3 vext1 <3,5,3,5>, LHS - 2565668966U, // <5,3,6,0>: Cost 3 vext1 <2,5,3,6>, LHS - 3633439887U, // <5,3,6,1>: Cost 4 vext1 <1,5,3,6>, <1,5,3,6> - 2565670760U, // <5,3,6,2>: Cost 3 vext1 <2,5,3,6>, <2,5,3,6> - 2565671426U, // <5,3,6,3>: Cost 3 vext1 <2,5,3,6>, <3,4,5,6> - 2565672246U, // <5,3,6,4>: Cost 3 vext1 <2,5,3,6>, RHS - 3639414630U, // <5,3,6,5>: Cost 4 vext1 <2,5,3,6>, <5,3,6,0> - 4047521640U, // <5,3,6,6>: Cost 4 vzipr <3,4,5,6>, <2,5,3,6> - 2725169844U, // <5,3,6,7>: Cost 3 vext3 <6,7,4,5>, <3,6,7,4> - 2565674798U, // <5,3,6,u>: Cost 3 vext1 <2,5,3,6>, LHS - 1485963366U, // <5,3,7,0>: Cost 2 vext1 <1,5,3,7>, LHS - 1485964432U, // <5,3,7,1>: Cost 2 vext1 <1,5,3,7>, <1,5,3,7> - 2559706728U, // <5,3,7,2>: Cost 3 vext1 <1,5,3,7>, <2,2,2,2> - 2559707286U, // <5,3,7,3>: Cost 3 vext1 <1,5,3,7>, <3,0,1,2> - 1485966646U, // <5,3,7,4>: Cost 2 vext1 <1,5,3,7>, RHS - 2559708880U, // <5,3,7,5>: Cost 3 vext1 <1,5,3,7>, <5,1,7,3> - 2601513466U, // <5,3,7,6>: Cost 3 vext1 , <6,2,7,3> - 3114714112U, // <5,3,7,7>: Cost 3 vtrnr RHS, <1,3,5,7> - 1485969198U, // <5,3,7,u>: Cost 2 vext1 <1,5,3,7>, LHS - 1485971558U, // <5,3,u,0>: Cost 2 vext1 <1,5,3,u>, LHS - 1485972625U, // <5,3,u,1>: Cost 2 vext1 <1,5,3,u>, <1,5,3,u> - 2559714920U, // <5,3,u,2>: Cost 3 vext1 <1,5,3,u>, <2,2,2,2> - 2559715478U, // <5,3,u,3>: Cost 3 vext1 <1,5,3,u>, <3,0,1,2> - 1485974838U, // <5,3,u,4>: Cost 2 vext1 <1,5,3,u>, RHS - 2687126342U, // <5,3,u,5>: Cost 3 vext3 <0,4,1,5>, <3,u,5,6> - 2601521658U, // <5,3,u,6>: Cost 3 vext1 , <6,2,7,3> - 2236410471U, // <5,3,u,7>: Cost 3 vrev <3,5,7,u> - 1485977390U, // <5,3,u,u>: Cost 2 vext1 <1,5,3,u>, LHS - 3627491430U, // <5,4,0,0>: Cost 4 vext1 <0,5,4,0>, LHS - 2636890214U, // <5,4,0,1>: Cost 3 vext2 <3,2,5,4>, LHS - 3703333028U, // <5,4,0,2>: Cost 4 vext2 <2,0,5,4>, <0,2,0,2> - 3782249348U, // <5,4,0,3>: Cost 4 vext3 <4,0,3,5>, <4,0,3,5> - 2642198866U, // <5,4,0,4>: Cost 3 vext2 <4,1,5,4>, <0,4,1,5> - 2687126418U, // <5,4,0,5>: Cost 3 vext3 <0,4,1,5>, <4,0,5,1> - 2242243887U, // <5,4,0,6>: Cost 3 vrev <4,5,6,0> - 3316059448U, // <5,4,0,7>: Cost 4 vrev <4,5,7,0> - 2636890781U, // <5,4,0,u>: Cost 3 vext2 <3,2,5,4>, LHS - 2241809658U, // <5,4,1,0>: Cost 3 vrev <4,5,0,1> - 3698025307U, // <5,4,1,1>: Cost 4 vext2 <1,1,5,4>, <1,1,5,4> - 
3698688940U, // <5,4,1,2>: Cost 4 vext2 <1,2,5,4>, <1,2,5,4> - 3698689024U, // <5,4,1,3>: Cost 4 vext2 <1,2,5,4>, <1,3,5,7> - 3700016206U, // <5,4,1,4>: Cost 4 vext2 <1,4,5,4>, <1,4,5,4> - 2687126498U, // <5,4,1,5>: Cost 3 vext3 <0,4,1,5>, <4,1,5,0> - 3760868336U, // <5,4,1,6>: Cost 4 vext3 <0,4,1,5>, <4,1,6,5> - 3316067641U, // <5,4,1,7>: Cost 4 vrev <4,5,7,1> - 2242399554U, // <5,4,1,u>: Cost 3 vrev <4,5,u,1> - 3703334371U, // <5,4,2,0>: Cost 4 vext2 <2,0,5,4>, <2,0,5,4> - 3703998004U, // <5,4,2,1>: Cost 4 vext2 <2,1,5,4>, <2,1,5,4> - 3704661637U, // <5,4,2,2>: Cost 4 vext2 <2,2,5,4>, <2,2,5,4> - 2636891854U, // <5,4,2,3>: Cost 3 vext2 <3,2,5,4>, <2,3,4,5> - 3705988903U, // <5,4,2,4>: Cost 4 vext2 <2,4,5,4>, <2,4,5,4> - 2698628150U, // <5,4,2,5>: Cost 3 vext3 <2,3,4,5>, <4,2,5,3> - 3760868415U, // <5,4,2,6>: Cost 4 vext3 <0,4,1,5>, <4,2,6,3> - 3783871562U, // <5,4,2,7>: Cost 4 vext3 <4,2,7,5>, <4,2,7,5> - 2666752099U, // <5,4,2,u>: Cost 3 vext2 , <2,u,4,5> - 3639459942U, // <5,4,3,0>: Cost 4 vext1 <2,5,4,3>, LHS - 3709970701U, // <5,4,3,1>: Cost 4 vext2 <3,1,5,4>, <3,1,5,4> - 2636892510U, // <5,4,3,2>: Cost 3 vext2 <3,2,5,4>, <3,2,5,4> - 3710634396U, // <5,4,3,3>: Cost 4 vext2 <3,2,5,4>, <3,3,3,3> - 2638219776U, // <5,4,3,4>: Cost 3 vext2 <3,4,5,4>, <3,4,5,4> - 3766987908U, // <5,4,3,5>: Cost 4 vext3 <1,4,3,5>, <4,3,5,0> - 2710719634U, // <5,4,3,6>: Cost 3 vext3 <4,3,6,5>, <4,3,6,5> - 3914097664U, // <5,4,3,7>: Cost 4 vuzpr <3,5,7,4>, <1,3,5,7> - 2640874308U, // <5,4,3,u>: Cost 3 vext2 <3,u,5,4>, <3,u,5,4> - 2583642214U, // <5,4,4,0>: Cost 3 vext1 <5,5,4,4>, LHS - 2642201574U, // <5,4,4,1>: Cost 3 vext2 <4,1,5,4>, <4,1,5,4> - 3710635062U, // <5,4,4,2>: Cost 4 vext2 <3,2,5,4>, <4,2,5,3> - 3717270664U, // <5,4,4,3>: Cost 4 vext2 <4,3,5,4>, <4,3,5,4> - 2713963728U, // <5,4,4,4>: Cost 3 vext3 <4,u,5,5>, <4,4,4,4> - 1637567706U, // <5,4,4,5>: Cost 2 vext3 <4,4,5,5>, <4,4,5,5> - 2242276659U, // <5,4,4,6>: Cost 3 vrev <4,5,6,4> - 2646183372U, // <5,4,4,7>: Cost 3 vext2 <4,7,5,4>, <4,7,5,4> - 1637788917U, // <5,4,4,u>: Cost 2 vext3 <4,4,u,5>, <4,4,u,5> - 2559762534U, // <5,4,5,0>: Cost 3 vext1 <1,5,4,5>, LHS - 2559763607U, // <5,4,5,1>: Cost 3 vext1 <1,5,4,5>, <1,5,4,5> - 2698628366U, // <5,4,5,2>: Cost 3 vext3 <2,3,4,5>, <4,5,2,3> - 3633506454U, // <5,4,5,3>: Cost 4 vext1 <1,5,4,5>, <3,0,1,2> - 2559765814U, // <5,4,5,4>: Cost 3 vext1 <1,5,4,5>, RHS - 2583654395U, // <5,4,5,5>: Cost 3 vext1 <5,5,4,5>, <5,5,4,5> - 1613385014U, // <5,4,5,6>: Cost 2 vext3 <0,4,1,5>, RHS - 3901639990U, // <5,4,5,7>: Cost 4 vuzpr <1,5,0,4>, RHS - 1613385032U, // <5,4,5,u>: Cost 2 vext3 <0,4,1,5>, RHS - 2559770726U, // <5,4,6,0>: Cost 3 vext1 <1,5,4,6>, LHS - 2559771648U, // <5,4,6,1>: Cost 3 vext1 <1,5,4,6>, <1,3,5,7> - 3633514088U, // <5,4,6,2>: Cost 4 vext1 <1,5,4,6>, <2,2,2,2> - 2571717122U, // <5,4,6,3>: Cost 3 vext1 <3,5,4,6>, <3,4,5,6> - 2559774006U, // <5,4,6,4>: Cost 3 vext1 <1,5,4,6>, RHS - 2712636796U, // <5,4,6,5>: Cost 3 vext3 <4,6,5,5>, <4,6,5,5> - 3760868743U, // <5,4,6,6>: Cost 4 vext3 <0,4,1,5>, <4,6,6,7> - 2712784270U, // <5,4,6,7>: Cost 3 vext3 <4,6,7,5>, <4,6,7,5> - 2559776558U, // <5,4,6,u>: Cost 3 vext1 <1,5,4,6>, LHS - 2565750886U, // <5,4,7,0>: Cost 3 vext1 <2,5,4,7>, LHS - 2565751706U, // <5,4,7,1>: Cost 3 vext1 <2,5,4,7>, <1,2,3,4> - 2565752690U, // <5,4,7,2>: Cost 3 vext1 <2,5,4,7>, <2,5,4,7> - 2571725387U, // <5,4,7,3>: Cost 3 vext1 <3,5,4,7>, <3,5,4,7> - 2565754166U, // <5,4,7,4>: Cost 3 vext1 <2,5,4,7>, RHS - 3114713426U, // <5,4,7,5>: Cost 3 vtrnr RHS, <0,4,1,5> - 94817590U, // 
<5,4,7,6>: Cost 1 vrev RHS - 2595616175U, // <5,4,7,7>: Cost 3 vext1 <7,5,4,7>, <7,5,4,7> - 94965064U, // <5,4,7,u>: Cost 1 vrev RHS - 2559787110U, // <5,4,u,0>: Cost 3 vext1 <1,5,4,u>, LHS - 2559788186U, // <5,4,u,1>: Cost 3 vext1 <1,5,4,u>, <1,5,4,u> - 2242014483U, // <5,4,u,2>: Cost 3 vrev <4,5,2,u> - 2667419628U, // <5,4,u,3>: Cost 3 vext2 , - 2559790390U, // <5,4,u,4>: Cost 3 vext1 <1,5,4,u>, RHS - 1640222238U, // <5,4,u,5>: Cost 2 vext3 <4,u,5,5>, <4,u,5,5> - 94825783U, // <5,4,u,6>: Cost 1 vrev RHS - 2714111536U, // <5,4,u,7>: Cost 3 vext3 <4,u,7,5>, <4,u,7,5> - 94973257U, // <5,4,u,u>: Cost 1 vrev RHS - 2646851584U, // <5,5,0,0>: Cost 3 vext2 <4,u,5,5>, <0,0,0,0> - 1573109862U, // <5,5,0,1>: Cost 2 vext2 <4,u,5,5>, LHS - 2646851748U, // <5,5,0,2>: Cost 3 vext2 <4,u,5,5>, <0,2,0,2> - 3760279130U, // <5,5,0,3>: Cost 4 vext3 <0,3,2,5>, <5,0,3,2> - 2687127138U, // <5,5,0,4>: Cost 3 vext3 <0,4,1,5>, <5,0,4,1> - 2248142847U, // <5,5,0,5>: Cost 3 vrev <5,5,5,0> - 3720593910U, // <5,5,0,6>: Cost 4 vext2 <4,u,5,5>, <0,6,1,7> - 4182502710U, // <5,5,0,7>: Cost 4 vtrnr <3,5,7,0>, RHS - 1573110429U, // <5,5,0,u>: Cost 2 vext2 <4,u,5,5>, LHS - 2646852342U, // <5,5,1,0>: Cost 3 vext2 <4,u,5,5>, <1,0,3,2> - 2624291676U, // <5,5,1,1>: Cost 3 vext2 <1,1,5,5>, <1,1,5,5> - 2646852502U, // <5,5,1,2>: Cost 3 vext2 <4,u,5,5>, <1,2,3,0> - 2646852568U, // <5,5,1,3>: Cost 3 vext2 <4,u,5,5>, <1,3,1,3> - 2715217591U, // <5,5,1,4>: Cost 3 vext3 <5,1,4,5>, <5,1,4,5> - 2628936848U, // <5,5,1,5>: Cost 3 vext2 <1,u,5,5>, <1,5,3,7> - 3698033907U, // <5,5,1,6>: Cost 4 vext2 <1,1,5,5>, <1,6,5,7> - 2713964240U, // <5,5,1,7>: Cost 3 vext3 <4,u,5,5>, <5,1,7,3> - 2628937107U, // <5,5,1,u>: Cost 3 vext2 <1,u,5,5>, <1,u,5,5> - 3645497446U, // <5,5,2,0>: Cost 4 vext1 <3,5,5,2>, LHS - 3760869099U, // <5,5,2,1>: Cost 4 vext3 <0,4,1,5>, <5,2,1,3> - 2646853224U, // <5,5,2,2>: Cost 3 vext2 <4,u,5,5>, <2,2,2,2> - 2698628862U, // <5,5,2,3>: Cost 3 vext3 <2,3,4,5>, <5,2,3,4> - 3772370694U, // <5,5,2,4>: Cost 4 vext3 <2,3,4,5>, <5,2,4,3> - 2713964303U, // <5,5,2,5>: Cost 3 vext3 <4,u,5,5>, <5,2,5,3> - 2646853562U, // <5,5,2,6>: Cost 3 vext2 <4,u,5,5>, <2,6,3,7> - 4038198272U, // <5,5,2,7>: Cost 4 vzipr <1,u,5,2>, <1,3,5,7> - 2701946667U, // <5,5,2,u>: Cost 3 vext3 <2,u,4,5>, <5,2,u,4> - 2646853782U, // <5,5,3,0>: Cost 3 vext2 <4,u,5,5>, <3,0,1,2> - 3698034922U, // <5,5,3,1>: Cost 4 vext2 <1,1,5,5>, <3,1,1,5> - 3702679919U, // <5,5,3,2>: Cost 4 vext2 <1,u,5,5>, <3,2,7,3> - 2637564336U, // <5,5,3,3>: Cost 3 vext2 <3,3,5,5>, <3,3,5,5> - 2646854146U, // <5,5,3,4>: Cost 3 vext2 <4,u,5,5>, <3,4,5,6> - 2638891602U, // <5,5,3,5>: Cost 3 vext2 <3,5,5,5>, <3,5,5,5> - 3702680247U, // <5,5,3,6>: Cost 4 vext2 <1,u,5,5>, <3,6,7,7> - 3702680259U, // <5,5,3,7>: Cost 4 vext2 <1,u,5,5>, <3,7,0,1> - 2646854430U, // <5,5,3,u>: Cost 3 vext2 <4,u,5,5>, <3,u,1,2> - 2646854546U, // <5,5,4,0>: Cost 3 vext2 <4,u,5,5>, <4,0,5,1> - 2642209767U, // <5,5,4,1>: Cost 3 vext2 <4,1,5,5>, <4,1,5,5> - 3711306806U, // <5,5,4,2>: Cost 4 vext2 <3,3,5,5>, <4,2,5,3> - 3645516369U, // <5,5,4,3>: Cost 4 vext1 <3,5,5,4>, <3,5,5,4> - 1570458842U, // <5,5,4,4>: Cost 2 vext2 <4,4,5,5>, <4,4,5,5> - 1573113142U, // <5,5,4,5>: Cost 2 vext2 <4,u,5,5>, RHS - 2645527932U, // <5,5,4,6>: Cost 3 vext2 <4,6,5,5>, <4,6,5,5> - 2713964486U, // <5,5,4,7>: Cost 3 vext3 <4,u,5,5>, <5,4,7,6> - 1573113374U, // <5,5,4,u>: Cost 2 vext2 <4,u,5,5>, <4,u,5,5> - 1509982310U, // <5,5,5,0>: Cost 2 vext1 <5,5,5,5>, LHS - 2646855376U, // <5,5,5,1>: Cost 3 vext2 <4,u,5,5>, <5,1,7,3> - 2583725672U, // 
<5,5,5,2>: Cost 3 vext1 <5,5,5,5>, <2,2,2,2> - 2583726230U, // <5,5,5,3>: Cost 3 vext1 <5,5,5,5>, <3,0,1,2> - 1509985590U, // <5,5,5,4>: Cost 2 vext1 <5,5,5,5>, RHS - 229035318U, // <5,5,5,5>: Cost 1 vdup1 RHS - 2646855778U, // <5,5,5,6>: Cost 3 vext2 <4,u,5,5>, <5,6,7,0> - 2646855848U, // <5,5,5,7>: Cost 3 vext2 <4,u,5,5>, <5,7,5,7> - 229035318U, // <5,5,5,u>: Cost 1 vdup1 RHS - 2577760358U, // <5,5,6,0>: Cost 3 vext1 <4,5,5,6>, LHS - 3633587361U, // <5,5,6,1>: Cost 4 vext1 <1,5,5,6>, <1,5,5,6> - 2646856186U, // <5,5,6,2>: Cost 3 vext2 <4,u,5,5>, <6,2,7,3> - 3633588738U, // <5,5,6,3>: Cost 4 vext1 <1,5,5,6>, <3,4,5,6> - 2718535756U, // <5,5,6,4>: Cost 3 vext3 <5,6,4,5>, <5,6,4,5> - 2644202223U, // <5,5,6,5>: Cost 3 vext2 <4,4,5,5>, <6,5,7,5> - 2973780482U, // <5,5,6,6>: Cost 3 vzipr <3,4,5,6>, <3,4,5,6> - 2646856526U, // <5,5,6,7>: Cost 3 vext2 <4,u,5,5>, <6,7,0,1> - 2646856607U, // <5,5,6,u>: Cost 3 vext2 <4,u,5,5>, <6,u,0,1> - 2571796582U, // <5,5,7,0>: Cost 3 vext1 <3,5,5,7>, LHS - 3633595392U, // <5,5,7,1>: Cost 4 vext1 <1,5,5,7>, <1,3,5,7> - 2571798222U, // <5,5,7,2>: Cost 3 vext1 <3,5,5,7>, <2,3,4,5> - 2571799124U, // <5,5,7,3>: Cost 3 vext1 <3,5,5,7>, <3,5,5,7> - 2571799862U, // <5,5,7,4>: Cost 3 vext1 <3,5,5,7>, RHS - 3114717188U, // <5,5,7,5>: Cost 3 vtrnr RHS, <5,5,5,5> - 4034923010U, // <5,5,7,6>: Cost 4 vzipr <1,3,5,7>, <3,4,5,6> - 2040974646U, // <5,5,7,7>: Cost 2 vtrnr RHS, RHS - 2040974647U, // <5,5,7,u>: Cost 2 vtrnr RHS, RHS - 1509982310U, // <5,5,u,0>: Cost 2 vext1 <5,5,5,5>, LHS - 1573115694U, // <5,5,u,1>: Cost 2 vext2 <4,u,5,5>, LHS - 2571806414U, // <5,5,u,2>: Cost 3 vext1 <3,5,5,u>, <2,3,4,5> - 2571807317U, // <5,5,u,3>: Cost 3 vext1 <3,5,5,u>, <3,5,5,u> - 1509985590U, // <5,5,u,4>: Cost 2 vext1 <5,5,5,5>, RHS - 229035318U, // <5,5,u,5>: Cost 1 vdup1 RHS - 2646857936U, // <5,5,u,6>: Cost 3 vext2 <4,u,5,5>, - 2040982838U, // <5,5,u,7>: Cost 2 vtrnr RHS, RHS - 229035318U, // <5,5,u,u>: Cost 1 vdup1 RHS - 2638233600U, // <5,6,0,0>: Cost 3 vext2 <3,4,5,6>, <0,0,0,0> - 1564491878U, // <5,6,0,1>: Cost 2 vext2 <3,4,5,6>, LHS - 2632261796U, // <5,6,0,2>: Cost 3 vext2 <2,4,5,6>, <0,2,0,2> - 2638233856U, // <5,6,0,3>: Cost 3 vext2 <3,4,5,6>, <0,3,1,4> - 2638233938U, // <5,6,0,4>: Cost 3 vext2 <3,4,5,6>, <0,4,1,5> - 3706003885U, // <5,6,0,5>: Cost 4 vext2 <2,4,5,6>, <0,5,2,6> - 3706003967U, // <5,6,0,6>: Cost 4 vext2 <2,4,5,6>, <0,6,2,7> - 4047473974U, // <5,6,0,7>: Cost 4 vzipr <3,4,5,0>, RHS - 1564492445U, // <5,6,0,u>: Cost 2 vext2 <3,4,5,6>, LHS - 2638234358U, // <5,6,1,0>: Cost 3 vext2 <3,4,5,6>, <1,0,3,2> - 2638234420U, // <5,6,1,1>: Cost 3 vext2 <3,4,5,6>, <1,1,1,1> - 2638234518U, // <5,6,1,2>: Cost 3 vext2 <3,4,5,6>, <1,2,3,0> - 2638234584U, // <5,6,1,3>: Cost 3 vext2 <3,4,5,6>, <1,3,1,3> - 2626290768U, // <5,6,1,4>: Cost 3 vext2 <1,4,5,6>, <1,4,5,6> - 2638234768U, // <5,6,1,5>: Cost 3 vext2 <3,4,5,6>, <1,5,3,7> - 3700032719U, // <5,6,1,6>: Cost 4 vext2 <1,4,5,6>, <1,6,1,7> - 2982366518U, // <5,6,1,7>: Cost 3 vzipr <4,u,5,1>, RHS - 2628945300U, // <5,6,1,u>: Cost 3 vext2 <1,u,5,6>, <1,u,5,6> - 3706004925U, // <5,6,2,0>: Cost 4 vext2 <2,4,5,6>, <2,0,1,2> - 3711976966U, // <5,6,2,1>: Cost 4 vext2 <3,4,5,6>, <2,1,0,3> - 2638235240U, // <5,6,2,2>: Cost 3 vext2 <3,4,5,6>, <2,2,2,2> - 2638235302U, // <5,6,2,3>: Cost 3 vext2 <3,4,5,6>, <2,3,0,1> - 2632263465U, // <5,6,2,4>: Cost 3 vext2 <2,4,5,6>, <2,4,5,6> - 2638235496U, // <5,6,2,5>: Cost 3 vext2 <3,4,5,6>, <2,5,3,6> - 2638235578U, // <5,6,2,6>: Cost 3 vext2 <3,4,5,6>, <2,6,3,7> - 2713965050U, // <5,6,2,7>: Cost 3 vext3 
<4,u,5,5>, <6,2,7,3> - 2634917997U, // <5,6,2,u>: Cost 3 vext2 <2,u,5,6>, <2,u,5,6> - 2638235798U, // <5,6,3,0>: Cost 3 vext2 <3,4,5,6>, <3,0,1,2> - 3711977695U, // <5,6,3,1>: Cost 4 vext2 <3,4,5,6>, <3,1,0,3> - 3710650720U, // <5,6,3,2>: Cost 4 vext2 <3,2,5,6>, <3,2,5,6> - 2638236060U, // <5,6,3,3>: Cost 3 vext2 <3,4,5,6>, <3,3,3,3> - 1564494338U, // <5,6,3,4>: Cost 2 vext2 <3,4,5,6>, <3,4,5,6> - 2638236234U, // <5,6,3,5>: Cost 3 vext2 <3,4,5,6>, <3,5,4,6> - 3711978104U, // <5,6,3,6>: Cost 4 vext2 <3,4,5,6>, <3,6,0,7> - 4034227510U, // <5,6,3,7>: Cost 4 vzipr <1,2,5,3>, RHS - 1567148870U, // <5,6,3,u>: Cost 2 vext2 <3,u,5,6>, <3,u,5,6> - 2577817702U, // <5,6,4,0>: Cost 3 vext1 <4,5,6,4>, LHS - 3700034544U, // <5,6,4,1>: Cost 4 vext2 <1,4,5,6>, <4,1,6,5> - 2723033713U, // <5,6,4,2>: Cost 3 vext3 <6,4,2,5>, <6,4,2,5> - 2638236818U, // <5,6,4,3>: Cost 3 vext2 <3,4,5,6>, <4,3,6,5> - 2644208859U, // <5,6,4,4>: Cost 3 vext2 <4,4,5,6>, <4,4,5,6> - 1564495158U, // <5,6,4,5>: Cost 2 vext2 <3,4,5,6>, RHS - 2645536125U, // <5,6,4,6>: Cost 3 vext2 <4,6,5,6>, <4,6,5,6> - 2723402398U, // <5,6,4,7>: Cost 3 vext3 <6,4,7,5>, <6,4,7,5> - 1564495401U, // <5,6,4,u>: Cost 2 vext2 <3,4,5,6>, RHS - 2577825894U, // <5,6,5,0>: Cost 3 vext1 <4,5,6,5>, LHS - 2662125264U, // <5,6,5,1>: Cost 3 vext2 <7,4,5,6>, <5,1,7,3> - 3775836867U, // <5,6,5,2>: Cost 4 vext3 <2,u,6,5>, <6,5,2,6> - 3711979343U, // <5,6,5,3>: Cost 4 vext2 <3,4,5,6>, <5,3,3,4> - 2650181556U, // <5,6,5,4>: Cost 3 vext2 <5,4,5,6>, <5,4,5,6> - 2662125572U, // <5,6,5,5>: Cost 3 vext2 <7,4,5,6>, <5,5,5,5> - 2638237732U, // <5,6,5,6>: Cost 3 vext2 <3,4,5,6>, <5,6,0,1> - 2982399286U, // <5,6,5,7>: Cost 3 vzipr <4,u,5,5>, RHS - 2982399287U, // <5,6,5,u>: Cost 3 vzipr <4,u,5,5>, RHS - 2583806054U, // <5,6,6,0>: Cost 3 vext1 <5,5,6,6>, LHS - 3711979910U, // <5,6,6,1>: Cost 4 vext2 <3,4,5,6>, <6,1,3,4> - 2662126074U, // <5,6,6,2>: Cost 3 vext2 <7,4,5,6>, <6,2,7,3> - 2583808514U, // <5,6,6,3>: Cost 3 vext1 <5,5,6,6>, <3,4,5,6> - 2583809334U, // <5,6,6,4>: Cost 3 vext1 <5,5,6,6>, RHS - 2583810062U, // <5,6,6,5>: Cost 3 vext1 <5,5,6,6>, <5,5,6,6> - 2638238520U, // <5,6,6,6>: Cost 3 vext2 <3,4,5,6>, <6,6,6,6> - 2973781302U, // <5,6,6,7>: Cost 3 vzipr <3,4,5,6>, RHS - 2973781303U, // <5,6,6,u>: Cost 3 vzipr <3,4,5,6>, RHS - 430358630U, // <5,6,7,0>: Cost 1 vext1 RHS, LHS - 1504101110U, // <5,6,7,1>: Cost 2 vext1 RHS, <1,0,3,2> - 1504101992U, // <5,6,7,2>: Cost 2 vext1 RHS, <2,2,2,2> - 1504102550U, // <5,6,7,3>: Cost 2 vext1 RHS, <3,0,1,2> - 430361910U, // <5,6,7,4>: Cost 1 vext1 RHS, RHS - 1504104390U, // <5,6,7,5>: Cost 2 vext1 RHS, <5,4,7,6> - 1504105272U, // <5,6,7,6>: Cost 2 vext1 RHS, <6,6,6,6> - 1504106092U, // <5,6,7,7>: Cost 2 vext1 RHS, <7,7,7,7> - 430364462U, // <5,6,7,u>: Cost 1 vext1 RHS, LHS - 430366822U, // <5,6,u,0>: Cost 1 vext1 RHS, LHS - 1564497710U, // <5,6,u,1>: Cost 2 vext2 <3,4,5,6>, LHS - 1504110184U, // <5,6,u,2>: Cost 2 vext1 RHS, <2,2,2,2> - 1504110742U, // <5,6,u,3>: Cost 2 vext1 RHS, <3,0,1,2> - 430370103U, // <5,6,u,4>: Cost 1 vext1 RHS, RHS - 1564498074U, // <5,6,u,5>: Cost 2 vext2 <3,4,5,6>, RHS - 1504113146U, // <5,6,u,6>: Cost 2 vext1 RHS, <6,2,7,3> - 1504113658U, // <5,6,u,7>: Cost 2 vext1 RHS, <7,0,1,2> - 430372654U, // <5,6,u,u>: Cost 1 vext1 RHS, LHS - 2625634304U, // <5,7,0,0>: Cost 3 vext2 <1,3,5,7>, <0,0,0,0> - 1551892582U, // <5,7,0,1>: Cost 2 vext2 <1,3,5,7>, LHS - 2625634468U, // <5,7,0,2>: Cost 3 vext2 <1,3,5,7>, <0,2,0,2> - 2571889247U, // <5,7,0,3>: Cost 3 vext1 <3,5,7,0>, <3,5,7,0> - 2625634642U, // <5,7,0,4>: Cost 3 
vext2 <1,3,5,7>, <0,4,1,5> - 2595778728U, // <5,7,0,5>: Cost 3 vext1 <7,5,7,0>, <5,7,5,7> - 3699376639U, // <5,7,0,6>: Cost 4 vext2 <1,3,5,7>, <0,6,2,7> - 2260235715U, // <5,7,0,7>: Cost 3 vrev <7,5,7,0> - 1551893149U, // <5,7,0,u>: Cost 2 vext2 <1,3,5,7>, LHS - 2625635062U, // <5,7,1,0>: Cost 3 vext2 <1,3,5,7>, <1,0,3,2> - 2624308020U, // <5,7,1,1>: Cost 3 vext2 <1,1,5,7>, <1,1,1,1> - 2625635222U, // <5,7,1,2>: Cost 3 vext2 <1,3,5,7>, <1,2,3,0> - 1551893504U, // <5,7,1,3>: Cost 2 vext2 <1,3,5,7>, <1,3,5,7> - 2571898166U, // <5,7,1,4>: Cost 3 vext1 <3,5,7,1>, RHS - 2625635472U, // <5,7,1,5>: Cost 3 vext2 <1,3,5,7>, <1,5,3,7> - 2627626227U, // <5,7,1,6>: Cost 3 vext2 <1,6,5,7>, <1,6,5,7> - 3702031684U, // <5,7,1,7>: Cost 4 vext2 <1,7,5,7>, <1,7,5,7> - 1555211669U, // <5,7,1,u>: Cost 2 vext2 <1,u,5,7>, <1,u,5,7> - 2629617126U, // <5,7,2,0>: Cost 3 vext2 <2,0,5,7>, <2,0,5,7> - 3699377670U, // <5,7,2,1>: Cost 4 vext2 <1,3,5,7>, <2,1,0,3> - 2625635944U, // <5,7,2,2>: Cost 3 vext2 <1,3,5,7>, <2,2,2,2> - 2625636006U, // <5,7,2,3>: Cost 3 vext2 <1,3,5,7>, <2,3,0,1> - 2632271658U, // <5,7,2,4>: Cost 3 vext2 <2,4,5,7>, <2,4,5,7> - 2625636201U, // <5,7,2,5>: Cost 3 vext2 <1,3,5,7>, <2,5,3,7> - 2625636282U, // <5,7,2,6>: Cost 3 vext2 <1,3,5,7>, <2,6,3,7> - 3708004381U, // <5,7,2,7>: Cost 4 vext2 <2,7,5,7>, <2,7,5,7> - 2625636411U, // <5,7,2,u>: Cost 3 vext2 <1,3,5,7>, <2,u,0,1> - 2625636502U, // <5,7,3,0>: Cost 3 vext2 <1,3,5,7>, <3,0,1,2> - 2625636604U, // <5,7,3,1>: Cost 3 vext2 <1,3,5,7>, <3,1,3,5> - 3699378478U, // <5,7,3,2>: Cost 4 vext2 <1,3,5,7>, <3,2,0,1> - 2625636764U, // <5,7,3,3>: Cost 3 vext2 <1,3,5,7>, <3,3,3,3> - 2625636866U, // <5,7,3,4>: Cost 3 vext2 <1,3,5,7>, <3,4,5,6> - 2625636959U, // <5,7,3,5>: Cost 3 vext2 <1,3,5,7>, <3,5,7,0> - 3699378808U, // <5,7,3,6>: Cost 4 vext2 <1,3,5,7>, <3,6,0,7> - 2640235254U, // <5,7,3,7>: Cost 3 vext2 <3,7,5,7>, <3,7,5,7> - 2625637150U, // <5,7,3,u>: Cost 3 vext2 <1,3,5,7>, <3,u,1,2> - 2571919462U, // <5,7,4,0>: Cost 3 vext1 <3,5,7,4>, LHS - 2571920384U, // <5,7,4,1>: Cost 3 vext1 <3,5,7,4>, <1,3,5,7> - 3699379260U, // <5,7,4,2>: Cost 4 vext2 <1,3,5,7>, <4,2,6,0> - 2571922019U, // <5,7,4,3>: Cost 3 vext1 <3,5,7,4>, <3,5,7,4> - 2571922742U, // <5,7,4,4>: Cost 3 vext1 <3,5,7,4>, RHS - 1551895862U, // <5,7,4,5>: Cost 2 vext2 <1,3,5,7>, RHS - 2846277980U, // <5,7,4,6>: Cost 3 vuzpr RHS, <0,4,2,6> - 2646207951U, // <5,7,4,7>: Cost 3 vext2 <4,7,5,7>, <4,7,5,7> - 1551896105U, // <5,7,4,u>: Cost 2 vext2 <1,3,5,7>, RHS - 2583871590U, // <5,7,5,0>: Cost 3 vext1 <5,5,7,5>, LHS - 2652180176U, // <5,7,5,1>: Cost 3 vext2 <5,7,5,7>, <5,1,7,3> - 2625638177U, // <5,7,5,2>: Cost 3 vext2 <1,3,5,7>, <5,2,7,3> - 2625638262U, // <5,7,5,3>: Cost 3 vext2 <1,3,5,7>, <5,3,7,7> - 2583874870U, // <5,7,5,4>: Cost 3 vext1 <5,5,7,5>, RHS - 2846281732U, // <5,7,5,5>: Cost 3 vuzpr RHS, <5,5,5,5> - 2651517015U, // <5,7,5,6>: Cost 3 vext2 <5,6,5,7>, <5,6,5,7> - 1772539190U, // <5,7,5,7>: Cost 2 vuzpr RHS, RHS - 1772539191U, // <5,7,5,u>: Cost 2 vuzpr RHS, RHS - 2846281826U, // <5,7,6,0>: Cost 3 vuzpr RHS, <5,6,7,0> - 3699380615U, // <5,7,6,1>: Cost 4 vext2 <1,3,5,7>, <6,1,3,5> - 2846281108U, // <5,7,6,2>: Cost 3 vuzpr RHS, <4,6,u,2> - 2589854210U, // <5,7,6,3>: Cost 3 vext1 <6,5,7,6>, <3,4,5,6> - 2846281830U, // <5,7,6,4>: Cost 3 vuzpr RHS, <5,6,7,4> - 2725467658U, // <5,7,6,5>: Cost 3 vext3 <6,7,u,5>, <7,6,5,u> - 2846281076U, // <5,7,6,6>: Cost 3 vuzpr RHS, <4,6,4,6> - 2846279610U, // <5,7,6,7>: Cost 3 vuzpr RHS, <2,6,3,7> - 2846279611U, // <5,7,6,u>: Cost 3 vuzpr RHS, <2,6,3,u> - 
1510146150U, // <5,7,7,0>: Cost 2 vext1 <5,5,7,7>, LHS - 2846282574U, // <5,7,7,1>: Cost 3 vuzpr RHS, <6,7,0,1> - 2583889512U, // <5,7,7,2>: Cost 3 vext1 <5,5,7,7>, <2,2,2,2> - 2846281919U, // <5,7,7,3>: Cost 3 vuzpr RHS, <5,7,u,3> - 1510149430U, // <5,7,7,4>: Cost 2 vext1 <5,5,7,7>, RHS - 1510150168U, // <5,7,7,5>: Cost 2 vext1 <5,5,7,7>, <5,5,7,7> - 2583892474U, // <5,7,7,6>: Cost 3 vext1 <5,5,7,7>, <6,2,7,3> - 2625640044U, // <5,7,7,7>: Cost 3 vext2 <1,3,5,7>, <7,7,7,7> - 1510151982U, // <5,7,7,u>: Cost 2 vext1 <5,5,7,7>, LHS - 1510154342U, // <5,7,u,0>: Cost 2 vext1 <5,5,7,u>, LHS - 1551898414U, // <5,7,u,1>: Cost 2 vext2 <1,3,5,7>, LHS - 2625640325U, // <5,7,u,2>: Cost 3 vext2 <1,3,5,7>, - 1772536477U, // <5,7,u,3>: Cost 2 vuzpr RHS, LHS - 1510157622U, // <5,7,u,4>: Cost 2 vext1 <5,5,7,u>, RHS - 1551898778U, // <5,7,u,5>: Cost 2 vext2 <1,3,5,7>, RHS - 2625640656U, // <5,7,u,6>: Cost 3 vext2 <1,3,5,7>, - 1772539433U, // <5,7,u,7>: Cost 2 vuzpr RHS, RHS - 1551898981U, // <5,7,u,u>: Cost 2 vext2 <1,3,5,7>, LHS - 2625642496U, // <5,u,0,0>: Cost 3 vext2 <1,3,5,u>, <0,0,0,0> - 1551900774U, // <5,u,0,1>: Cost 2 vext2 <1,3,5,u>, LHS - 2625642660U, // <5,u,0,2>: Cost 3 vext2 <1,3,5,u>, <0,2,0,2> - 2698630885U, // <5,u,0,3>: Cost 3 vext3 <2,3,4,5>, - 2687129325U, // <5,u,0,4>: Cost 3 vext3 <0,4,1,5>, - 2689783542U, // <5,u,0,5>: Cost 3 vext3 <0,u,1,5>, - 2266134675U, // <5,u,0,6>: Cost 3 vrev - 2595853772U, // <5,u,0,7>: Cost 3 vext1 <7,5,u,0>, <7,5,u,0> - 1551901341U, // <5,u,0,u>: Cost 2 vext2 <1,3,5,u>, LHS - 2625643254U, // <5,u,1,0>: Cost 3 vext2 <1,3,5,u>, <1,0,3,2> - 2625643316U, // <5,u,1,1>: Cost 3 vext2 <1,3,5,u>, <1,1,1,1> - 1613387566U, // <5,u,1,2>: Cost 2 vext3 <0,4,1,5>, LHS - 1551901697U, // <5,u,1,3>: Cost 2 vext2 <1,3,5,u>, <1,3,5,u> - 2626307154U, // <5,u,1,4>: Cost 3 vext2 <1,4,5,u>, <1,4,5,u> - 2689783622U, // <5,u,1,5>: Cost 3 vext3 <0,u,1,5>, - 2627634420U, // <5,u,1,6>: Cost 3 vext2 <1,6,5,u>, <1,6,5,u> - 2982366536U, // <5,u,1,7>: Cost 3 vzipr <4,u,5,1>, RHS - 1613387620U, // <5,u,1,u>: Cost 2 vext3 <0,4,1,5>, LHS - 2846286742U, // <5,u,2,0>: Cost 3 vuzpr RHS, <1,2,3,0> - 2685796528U, // <5,u,2,1>: Cost 3 vext3 <0,2,1,5>, <0,2,1,5> - 2625644136U, // <5,u,2,2>: Cost 3 vext2 <1,3,5,u>, <2,2,2,2> - 2687129480U, // <5,u,2,3>: Cost 3 vext3 <0,4,1,5>, - 2632279851U, // <5,u,2,4>: Cost 3 vext2 <2,4,5,u>, <2,4,5,u> - 2625644394U, // <5,u,2,5>: Cost 3 vext2 <1,3,5,u>, <2,5,3,u> - 2625644474U, // <5,u,2,6>: Cost 3 vext2 <1,3,5,u>, <2,6,3,7> - 2713966508U, // <5,u,2,7>: Cost 3 vext3 <4,u,5,5>, - 2625644603U, // <5,u,2,u>: Cost 3 vext2 <1,3,5,u>, <2,u,0,1> - 2687129532U, // <5,u,3,0>: Cost 3 vext3 <0,4,1,5>, - 2636261649U, // <5,u,3,1>: Cost 3 vext2 <3,1,5,u>, <3,1,5,u> - 2636925282U, // <5,u,3,2>: Cost 3 vext2 <3,2,5,u>, <3,2,5,u> - 2625644956U, // <5,u,3,3>: Cost 3 vext2 <1,3,5,u>, <3,3,3,3> - 1564510724U, // <5,u,3,4>: Cost 2 vext2 <3,4,5,u>, <3,4,5,u> - 2625645160U, // <5,u,3,5>: Cost 3 vext2 <1,3,5,u>, <3,5,u,0> - 2734610422U, // <5,u,3,6>: Cost 3 vext3 , - 2640243447U, // <5,u,3,7>: Cost 3 vext2 <3,7,5,u>, <3,7,5,u> - 1567165256U, // <5,u,3,u>: Cost 2 vext2 <3,u,5,u>, <3,u,5,u> - 1567828889U, // <5,u,4,0>: Cost 2 vext2 <4,0,5,u>, <4,0,5,u> - 1661163546U, // <5,u,4,1>: Cost 2 vext3 , - 2734463012U, // <5,u,4,2>: Cost 3 vext3 , - 2698631212U, // <5,u,4,3>: Cost 3 vext3 <2,3,4,5>, - 1570458842U, // <5,u,4,4>: Cost 2 vext2 <4,4,5,5>, <4,4,5,5> - 1551904054U, // <5,u,4,5>: Cost 2 vext2 <1,3,5,u>, RHS - 2846286172U, // <5,u,4,6>: Cost 3 vuzpr RHS, <0,4,2,6> - 2646216144U, // 
<5,u,4,7>: Cost 3 vext2 <4,7,5,u>, <4,7,5,u> - 1551904297U, // <5,u,4,u>: Cost 2 vext2 <1,3,5,u>, RHS - 1509982310U, // <5,u,5,0>: Cost 2 vext1 <5,5,5,5>, LHS - 2560058555U, // <5,u,5,1>: Cost 3 vext1 <1,5,u,5>, <1,5,u,5> - 2698926194U, // <5,u,5,2>: Cost 3 vext3 <2,3,u,5>, - 2698631295U, // <5,u,5,3>: Cost 3 vext3 <2,3,4,5>, - 1509985590U, // <5,u,5,4>: Cost 2 vext1 <5,5,5,5>, RHS - 229035318U, // <5,u,5,5>: Cost 1 vdup1 RHS - 1613387930U, // <5,u,5,6>: Cost 2 vext3 <0,4,1,5>, RHS - 1772547382U, // <5,u,5,7>: Cost 2 vuzpr RHS, RHS - 229035318U, // <5,u,5,u>: Cost 1 vdup1 RHS - 2566037606U, // <5,u,6,0>: Cost 3 vext1 <2,5,u,6>, LHS - 2920044334U, // <5,u,6,1>: Cost 3 vzipl <5,6,7,0>, LHS - 2566039445U, // <5,u,6,2>: Cost 3 vext1 <2,5,u,6>, <2,5,u,6> - 2687129808U, // <5,u,6,3>: Cost 3 vext3 <0,4,1,5>, - 2566040886U, // <5,u,6,4>: Cost 3 vext1 <2,5,u,6>, RHS - 2920044698U, // <5,u,6,5>: Cost 3 vzipl <5,6,7,0>, RHS - 2846289268U, // <5,u,6,6>: Cost 3 vuzpr RHS, <4,6,4,6> - 2973781320U, // <5,u,6,7>: Cost 3 vzipr <3,4,5,6>, RHS - 2687129853U, // <5,u,6,u>: Cost 3 vext3 <0,4,1,5>, - 430506086U, // <5,u,7,0>: Cost 1 vext1 RHS, LHS - 1486333117U, // <5,u,7,1>: Cost 2 vext1 <1,5,u,7>, <1,5,u,7> - 1504249448U, // <5,u,7,2>: Cost 2 vext1 RHS, <2,2,2,2> - 2040971933U, // <5,u,7,3>: Cost 2 vtrnr RHS, LHS - 430509384U, // <5,u,7,4>: Cost 1 vext1 RHS, RHS - 1504251600U, // <5,u,7,5>: Cost 2 vext1 RHS, <5,1,7,3> - 118708378U, // <5,u,7,6>: Cost 1 vrev RHS - 2040974889U, // <5,u,7,7>: Cost 2 vtrnr RHS, RHS - 430511918U, // <5,u,7,u>: Cost 1 vext1 RHS, LHS - 430514278U, // <5,u,u,0>: Cost 1 vext1 RHS, LHS - 1551906606U, // <5,u,u,1>: Cost 2 vext2 <1,3,5,u>, LHS - 1613388133U, // <5,u,u,2>: Cost 2 vext3 <0,4,1,5>, LHS - 1772544669U, // <5,u,u,3>: Cost 2 vuzpr RHS, LHS - 430517577U, // <5,u,u,4>: Cost 1 vext1 RHS, RHS - 229035318U, // <5,u,u,5>: Cost 1 vdup1 RHS - 118716571U, // <5,u,u,6>: Cost 1 vrev RHS - 1772547625U, // <5,u,u,7>: Cost 2 vuzpr RHS, RHS - 430520110U, // <5,u,u,u>: Cost 1 vext1 RHS, LHS - 2686025728U, // <6,0,0,0>: Cost 3 vext3 <0,2,4,6>, <0,0,0,0> - 2686025738U, // <6,0,0,1>: Cost 3 vext3 <0,2,4,6>, <0,0,1,1> - 2686025748U, // <6,0,0,2>: Cost 3 vext3 <0,2,4,6>, <0,0,2,2> - 3779084320U, // <6,0,0,3>: Cost 4 vext3 <3,4,5,6>, <0,0,3,5> - 2642903388U, // <6,0,0,4>: Cost 3 vext2 <4,2,6,0>, <0,4,2,6> - 3657723939U, // <6,0,0,5>: Cost 4 vext1 <5,6,0,0>, <5,6,0,0> - 3926676514U, // <6,0,0,6>: Cost 4 vuzpr <5,6,7,0>, <7,0,5,6> - 3926675786U, // <6,0,0,7>: Cost 4 vuzpr <5,6,7,0>, <6,0,5,7> - 2686025802U, // <6,0,0,u>: Cost 3 vext3 <0,2,4,6>, <0,0,u,2> - 2566070374U, // <6,0,1,0>: Cost 3 vext1 <2,6,0,1>, LHS - 3759767642U, // <6,0,1,1>: Cost 4 vext3 <0,2,4,6>, <0,1,1,0> - 1612284006U, // <6,0,1,2>: Cost 2 vext3 <0,2,4,6>, LHS - 2583988738U, // <6,0,1,3>: Cost 3 vext1 <5,6,0,1>, <3,4,5,6> - 2566073654U, // <6,0,1,4>: Cost 3 vext1 <2,6,0,1>, RHS - 2583990308U, // <6,0,1,5>: Cost 3 vext1 <5,6,0,1>, <5,6,0,1> - 2589963005U, // <6,0,1,6>: Cost 3 vext1 <6,6,0,1>, <6,6,0,1> - 2595935702U, // <6,0,1,7>: Cost 3 vext1 <7,6,0,1>, <7,6,0,1> - 1612284060U, // <6,0,1,u>: Cost 2 vext3 <0,2,4,6>, LHS - 2686025892U, // <6,0,2,0>: Cost 3 vext3 <0,2,4,6>, <0,2,0,2> - 2685804721U, // <6,0,2,1>: Cost 3 vext3 <0,2,1,6>, <0,2,1,6> - 3759620282U, // <6,0,2,2>: Cost 4 vext3 <0,2,2,6>, <0,2,2,6> - 2705342658U, // <6,0,2,3>: Cost 3 vext3 <3,4,5,6>, <0,2,3,5> - 1612284108U, // <6,0,2,4>: Cost 2 vext3 <0,2,4,6>, <0,2,4,6> - 3706029956U, // <6,0,2,5>: Cost 4 vext2 <2,4,6,0>, <2,5,6,7> - 2686173406U, // <6,0,2,6>: Cost 3 vext3 
<0,2,6,6>, <0,2,6,6> - 3651769338U, // <6,0,2,7>: Cost 4 vext1 <4,6,0,2>, <7,0,1,2> - 1612579056U, // <6,0,2,u>: Cost 2 vext3 <0,2,u,6>, <0,2,u,6> - 3706030230U, // <6,0,3,0>: Cost 4 vext2 <2,4,6,0>, <3,0,1,2> - 2705342720U, // <6,0,3,1>: Cost 3 vext3 <3,4,5,6>, <0,3,1,4> - 2705342730U, // <6,0,3,2>: Cost 3 vext3 <3,4,5,6>, <0,3,2,5> - 3706030492U, // <6,0,3,3>: Cost 4 vext2 <2,4,6,0>, <3,3,3,3> - 2644896258U, // <6,0,3,4>: Cost 3 vext2 <4,5,6,0>, <3,4,5,6> - 3718638154U, // <6,0,3,5>: Cost 4 vext2 <4,5,6,0>, <3,5,4,6> - 3729918619U, // <6,0,3,6>: Cost 4 vext2 <6,4,6,0>, <3,6,4,6> - 3926672384U, // <6,0,3,7>: Cost 4 vuzpr <5,6,7,0>, <1,3,5,7> - 2705342784U, // <6,0,3,u>: Cost 3 vext3 <3,4,5,6>, <0,3,u,5> - 2687058250U, // <6,0,4,0>: Cost 3 vext3 <0,4,0,6>, <0,4,0,6> - 2686026066U, // <6,0,4,1>: Cost 3 vext3 <0,2,4,6>, <0,4,1,5> - 1613463900U, // <6,0,4,2>: Cost 2 vext3 <0,4,2,6>, <0,4,2,6> - 3761021285U, // <6,0,4,3>: Cost 4 vext3 <0,4,3,6>, <0,4,3,6> - 2687353198U, // <6,0,4,4>: Cost 3 vext3 <0,4,4,6>, <0,4,4,6> - 2632289590U, // <6,0,4,5>: Cost 3 vext2 <2,4,6,0>, RHS - 2645560704U, // <6,0,4,6>: Cost 3 vext2 <4,6,6,0>, <4,6,6,0> - 2646224337U, // <6,0,4,7>: Cost 3 vext2 <4,7,6,0>, <4,7,6,0> - 1613906322U, // <6,0,4,u>: Cost 2 vext3 <0,4,u,6>, <0,4,u,6> - 3651788902U, // <6,0,5,0>: Cost 4 vext1 <4,6,0,5>, LHS - 2687795620U, // <6,0,5,1>: Cost 3 vext3 <0,5,1,6>, <0,5,1,6> - 3761611181U, // <6,0,5,2>: Cost 4 vext3 <0,5,2,6>, <0,5,2,6> - 3723284326U, // <6,0,5,3>: Cost 4 vext2 <5,3,6,0>, <5,3,6,0> - 2646224838U, // <6,0,5,4>: Cost 3 vext2 <4,7,6,0>, <5,4,7,6> - 3718639630U, // <6,0,5,5>: Cost 4 vext2 <4,5,6,0>, <5,5,6,6> - 2652196962U, // <6,0,5,6>: Cost 3 vext2 <5,7,6,0>, <5,6,7,0> - 2852932918U, // <6,0,5,7>: Cost 3 vuzpr <5,6,7,0>, RHS - 2852932919U, // <6,0,5,u>: Cost 3 vuzpr <5,6,7,0>, RHS - 2852933730U, // <6,0,6,0>: Cost 3 vuzpr <5,6,7,0>, <5,6,7,0> - 2925985894U, // <6,0,6,1>: Cost 3 vzipl <6,6,6,6>, LHS - 3060203622U, // <6,0,6,2>: Cost 3 vtrnl <6,6,6,6>, LHS - 3718640178U, // <6,0,6,3>: Cost 4 vext2 <4,5,6,0>, <6,3,4,5> - 2656178832U, // <6,0,6,4>: Cost 3 vext2 <6,4,6,0>, <6,4,6,0> - 3725939378U, // <6,0,6,5>: Cost 4 vext2 <5,7,6,0>, <6,5,0,7> - 2657506098U, // <6,0,6,6>: Cost 3 vext2 <6,6,6,0>, <6,6,6,0> - 2619020110U, // <6,0,6,7>: Cost 3 vext2 <0,2,6,0>, <6,7,0,1> - 2925986461U, // <6,0,6,u>: Cost 3 vzipl <6,6,6,6>, LHS - 2572091494U, // <6,0,7,0>: Cost 3 vext1 <3,6,0,7>, LHS - 2572092310U, // <6,0,7,1>: Cost 3 vext1 <3,6,0,7>, <1,2,3,0> - 2980495524U, // <6,0,7,2>: Cost 3 vzipr RHS, <0,2,0,2> - 2572094072U, // <6,0,7,3>: Cost 3 vext1 <3,6,0,7>, <3,6,0,7> - 2572094774U, // <6,0,7,4>: Cost 3 vext1 <3,6,0,7>, RHS - 4054238242U, // <6,0,7,5>: Cost 4 vzipr RHS, <1,4,0,5> - 3645837653U, // <6,0,7,6>: Cost 4 vext1 <3,6,0,7>, <6,0,7,0> - 4054239054U, // <6,0,7,7>: Cost 4 vzipr RHS, <2,5,0,7> - 2572097326U, // <6,0,7,u>: Cost 3 vext1 <3,6,0,7>, LHS - 2686026378U, // <6,0,u,0>: Cost 3 vext3 <0,2,4,6>, <0,u,0,2> - 2686026386U, // <6,0,u,1>: Cost 3 vext3 <0,2,4,6>, <0,u,1,1> - 1612284573U, // <6,0,u,2>: Cost 2 vext3 <0,2,4,6>, LHS - 2705343144U, // <6,0,u,3>: Cost 3 vext3 <3,4,5,6>, <0,u,3,5> - 1616265906U, // <6,0,u,4>: Cost 2 vext3 <0,u,4,6>, <0,u,4,6> - 2632292506U, // <6,0,u,5>: Cost 3 vext2 <2,4,6,0>, RHS - 2590020356U, // <6,0,u,6>: Cost 3 vext1 <6,6,0,u>, <6,6,0,u> - 2852933161U, // <6,0,u,7>: Cost 3 vuzpr <5,6,7,0>, RHS - 1612284627U, // <6,0,u,u>: Cost 2 vext3 <0,2,4,6>, LHS - 2595995750U, // <6,1,0,0>: Cost 3 vext1 <7,6,1,0>, LHS - 2646229094U, // <6,1,0,1>: Cost 3 vext2 
<4,7,6,1>, LHS - 3694092492U, // <6,1,0,2>: Cost 4 vext2 <0,4,6,1>, <0,2,4,6> - 2686026486U, // <6,1,0,3>: Cost 3 vext3 <0,2,4,6>, <1,0,3,2> - 2595999030U, // <6,1,0,4>: Cost 3 vext1 <7,6,1,0>, RHS - 3767730952U, // <6,1,0,5>: Cost 4 vext3 <1,5,4,6>, <1,0,5,2> - 2596000590U, // <6,1,0,6>: Cost 3 vext1 <7,6,1,0>, <6,7,0,1> - 2596001246U, // <6,1,0,7>: Cost 3 vext1 <7,6,1,0>, <7,6,1,0> - 2686026531U, // <6,1,0,u>: Cost 3 vext3 <0,2,4,6>, <1,0,u,2> - 3763602219U, // <6,1,1,0>: Cost 4 vext3 <0,u,2,6>, <1,1,0,1> - 2686026548U, // <6,1,1,1>: Cost 3 vext3 <0,2,4,6>, <1,1,1,1> - 3764929346U, // <6,1,1,2>: Cost 4 vext3 <1,1,2,6>, <1,1,2,6> - 2686026568U, // <6,1,1,3>: Cost 3 vext3 <0,2,4,6>, <1,1,3,3> - 2691334996U, // <6,1,1,4>: Cost 3 vext3 <1,1,4,6>, <1,1,4,6> - 3760874332U, // <6,1,1,5>: Cost 4 vext3 <0,4,1,6>, <1,1,5,5> - 3765224294U, // <6,1,1,6>: Cost 4 vext3 <1,1,6,6>, <1,1,6,6> - 3669751263U, // <6,1,1,7>: Cost 4 vext1 <7,6,1,1>, <7,6,1,1> - 2686026613U, // <6,1,1,u>: Cost 3 vext3 <0,2,4,6>, <1,1,u,3> - 2554208358U, // <6,1,2,0>: Cost 3 vext1 <0,6,1,2>, LHS - 3763602311U, // <6,1,2,1>: Cost 4 vext3 <0,u,2,6>, <1,2,1,3> - 3639895971U, // <6,1,2,2>: Cost 4 vext1 <2,6,1,2>, <2,6,1,2> - 2686026646U, // <6,1,2,3>: Cost 3 vext3 <0,2,4,6>, <1,2,3,0> - 2554211638U, // <6,1,2,4>: Cost 3 vext1 <0,6,1,2>, RHS - 3760874411U, // <6,1,2,5>: Cost 4 vext3 <0,4,1,6>, <1,2,5,3> - 2554212858U, // <6,1,2,6>: Cost 3 vext1 <0,6,1,2>, <6,2,7,3> - 3802973114U, // <6,1,2,7>: Cost 4 vext3 <7,4,5,6>, <1,2,7,0> - 2686026691U, // <6,1,2,u>: Cost 3 vext3 <0,2,4,6>, <1,2,u,0> - 2566160486U, // <6,1,3,0>: Cost 3 vext1 <2,6,1,3>, LHS - 2686026712U, // <6,1,3,1>: Cost 3 vext3 <0,2,4,6>, <1,3,1,3> - 2686026724U, // <6,1,3,2>: Cost 3 vext3 <0,2,4,6>, <1,3,2,6> - 3759768552U, // <6,1,3,3>: Cost 4 vext3 <0,2,4,6>, <1,3,3,1> - 2692662262U, // <6,1,3,4>: Cost 3 vext3 <1,3,4,6>, <1,3,4,6> - 2686026752U, // <6,1,3,5>: Cost 3 vext3 <0,2,4,6>, <1,3,5,7> - 2590053128U, // <6,1,3,6>: Cost 3 vext1 <6,6,1,3>, <6,6,1,3> - 3663795194U, // <6,1,3,7>: Cost 4 vext1 <6,6,1,3>, <7,0,1,2> - 2686026775U, // <6,1,3,u>: Cost 3 vext3 <0,2,4,6>, <1,3,u,3> - 2641587099U, // <6,1,4,0>: Cost 3 vext2 <4,0,6,1>, <4,0,6,1> - 2693104684U, // <6,1,4,1>: Cost 3 vext3 <1,4,1,6>, <1,4,1,6> - 3639912357U, // <6,1,4,2>: Cost 4 vext1 <2,6,1,4>, <2,6,1,4> - 2687206462U, // <6,1,4,3>: Cost 3 vext3 <0,4,2,6>, <1,4,3,6> - 3633941814U, // <6,1,4,4>: Cost 4 vext1 <1,6,1,4>, RHS - 2693399632U, // <6,1,4,5>: Cost 3 vext3 <1,4,5,6>, <1,4,5,6> - 3765077075U, // <6,1,4,6>: Cost 4 vext3 <1,1,4,6>, <1,4,6,0> - 2646232530U, // <6,1,4,7>: Cost 3 vext2 <4,7,6,1>, <4,7,6,1> - 2687206507U, // <6,1,4,u>: Cost 3 vext3 <0,4,2,6>, <1,4,u,6> - 2647559796U, // <6,1,5,0>: Cost 3 vext2 <5,0,6,1>, <5,0,6,1> - 3765077118U, // <6,1,5,1>: Cost 4 vext3 <1,1,4,6>, <1,5,1,7> - 3767583878U, // <6,1,5,2>: Cost 4 vext3 <1,5,2,6>, <1,5,2,6> - 2686026896U, // <6,1,5,3>: Cost 3 vext3 <0,2,4,6>, <1,5,3,7> - 2693989528U, // <6,1,5,4>: Cost 3 vext3 <1,5,4,6>, <1,5,4,6> - 3767805089U, // <6,1,5,5>: Cost 4 vext3 <1,5,5,6>, <1,5,5,6> - 2652868706U, // <6,1,5,6>: Cost 3 vext2 <5,u,6,1>, <5,6,7,0> - 3908250934U, // <6,1,5,7>: Cost 4 vuzpr <2,6,0,1>, RHS - 2686026941U, // <6,1,5,u>: Cost 3 vext3 <0,2,4,6>, <1,5,u,7> - 2554241126U, // <6,1,6,0>: Cost 3 vext1 <0,6,1,6>, LHS - 3763602639U, // <6,1,6,1>: Cost 4 vext3 <0,u,2,6>, <1,6,1,7> - 3759547607U, // <6,1,6,2>: Cost 4 vext3 <0,2,1,6>, <1,6,2,6> - 3115221094U, // <6,1,6,3>: Cost 3 vtrnr <4,6,4,6>, LHS - 2554244406U, // <6,1,6,4>: Cost 3 vext1 <0,6,1,6>, RHS - 
3760874739U, // <6,1,6,5>: Cost 4 vext3 <0,4,1,6>, <1,6,5,7> - 2554245944U, // <6,1,6,6>: Cost 3 vext1 <0,6,1,6>, <6,6,6,6> - 3719975758U, // <6,1,6,7>: Cost 4 vext2 <4,7,6,1>, <6,7,0,1> - 3115221099U, // <6,1,6,u>: Cost 3 vtrnr <4,6,4,6>, LHS - 2560221286U, // <6,1,7,0>: Cost 3 vext1 <1,6,1,7>, LHS - 2560222415U, // <6,1,7,1>: Cost 3 vext1 <1,6,1,7>, <1,6,1,7> - 2980497558U, // <6,1,7,2>: Cost 3 vzipr RHS, <3,0,1,2> - 3103211622U, // <6,1,7,3>: Cost 3 vtrnr <2,6,3,7>, LHS - 2560224566U, // <6,1,7,4>: Cost 3 vext1 <1,6,1,7>, RHS - 2980495698U, // <6,1,7,5>: Cost 3 vzipr RHS, <0,4,1,5> - 3633967526U, // <6,1,7,6>: Cost 4 vext1 <1,6,1,7>, <6,1,7,0> - 4054237686U, // <6,1,7,7>: Cost 4 vzipr RHS, <0,6,1,7> - 2560227118U, // <6,1,7,u>: Cost 3 vext1 <1,6,1,7>, LHS - 2560229478U, // <6,1,u,0>: Cost 3 vext1 <1,6,1,u>, LHS - 2686027117U, // <6,1,u,1>: Cost 3 vext3 <0,2,4,6>, <1,u,1,3> - 2686027129U, // <6,1,u,2>: Cost 3 vext3 <0,2,4,6>, <1,u,2,6> - 2686027132U, // <6,1,u,3>: Cost 3 vext3 <0,2,4,6>, <1,u,3,0> - 2687206795U, // <6,1,u,4>: Cost 3 vext3 <0,4,2,6>, <1,u,4,6> - 2686027157U, // <6,1,u,5>: Cost 3 vext3 <0,2,4,6>, <1,u,5,7> - 2590094093U, // <6,1,u,6>: Cost 3 vext1 <6,6,1,u>, <6,6,1,u> - 2596066790U, // <6,1,u,7>: Cost 3 vext1 <7,6,1,u>, <7,6,1,u> - 2686027177U, // <6,1,u,u>: Cost 3 vext3 <0,2,4,6>, <1,u,u,0> - 2646900736U, // <6,2,0,0>: Cost 3 vext2 <4,u,6,2>, <0,0,0,0> - 1573159014U, // <6,2,0,1>: Cost 2 vext2 <4,u,6,2>, LHS - 2646900900U, // <6,2,0,2>: Cost 3 vext2 <4,u,6,2>, <0,2,0,2> - 3759769037U, // <6,2,0,3>: Cost 4 vext3 <0,2,4,6>, <2,0,3,0> - 2641592668U, // <6,2,0,4>: Cost 3 vext2 <4,0,6,2>, <0,4,2,6> - 3779085794U, // <6,2,0,5>: Cost 4 vext3 <3,4,5,6>, <2,0,5,3> - 2686027244U, // <6,2,0,6>: Cost 3 vext3 <0,2,4,6>, <2,0,6,4> - 3669816807U, // <6,2,0,7>: Cost 4 vext1 <7,6,2,0>, <7,6,2,0> - 1573159581U, // <6,2,0,u>: Cost 2 vext2 <4,u,6,2>, LHS - 2230527897U, // <6,2,1,0>: Cost 3 vrev <2,6,0,1> - 2646901556U, // <6,2,1,1>: Cost 3 vext2 <4,u,6,2>, <1,1,1,1> - 2646901654U, // <6,2,1,2>: Cost 3 vext2 <4,u,6,2>, <1,2,3,0> - 2847047782U, // <6,2,1,3>: Cost 3 vuzpr <4,6,u,2>, LHS - 3771049517U, // <6,2,1,4>: Cost 4 vext3 <2,1,4,6>, <2,1,4,6> - 2646901904U, // <6,2,1,5>: Cost 3 vext2 <4,u,6,2>, <1,5,3,7> - 2686027324U, // <6,2,1,6>: Cost 3 vext3 <0,2,4,6>, <2,1,6,3> - 3669825000U, // <6,2,1,7>: Cost 4 vext1 <7,6,2,1>, <7,6,2,1> - 2231117793U, // <6,2,1,u>: Cost 3 vrev <2,6,u,1> - 3763603029U, // <6,2,2,0>: Cost 4 vext3 <0,u,2,6>, <2,2,0,1> - 3759769184U, // <6,2,2,1>: Cost 4 vext3 <0,2,4,6>, <2,2,1,3> - 2686027368U, // <6,2,2,2>: Cost 3 vext3 <0,2,4,6>, <2,2,2,2> - 2686027378U, // <6,2,2,3>: Cost 3 vext3 <0,2,4,6>, <2,2,3,3> - 2697971326U, // <6,2,2,4>: Cost 3 vext3 <2,2,4,6>, <2,2,4,6> - 3759769224U, // <6,2,2,5>: Cost 4 vext3 <0,2,4,6>, <2,2,5,7> - 2698118800U, // <6,2,2,6>: Cost 3 vext3 <2,2,6,6>, <2,2,6,6> - 3920794092U, // <6,2,2,7>: Cost 4 vuzpr <4,6,u,2>, <6,2,5,7> - 2686027423U, // <6,2,2,u>: Cost 3 vext3 <0,2,4,6>, <2,2,u,3> - 2686027430U, // <6,2,3,0>: Cost 3 vext3 <0,2,4,6>, <2,3,0,1> - 3759769262U, // <6,2,3,1>: Cost 4 vext3 <0,2,4,6>, <2,3,1,0> - 2698487485U, // <6,2,3,2>: Cost 3 vext3 <2,3,2,6>, <2,3,2,6> - 2705344196U, // <6,2,3,3>: Cost 3 vext3 <3,4,5,6>, <2,3,3,4> - 2686027470U, // <6,2,3,4>: Cost 3 vext3 <0,2,4,6>, <2,3,4,5> - 2698708696U, // <6,2,3,5>: Cost 3 vext3 <2,3,5,6>, <2,3,5,6> - 2724660961U, // <6,2,3,6>: Cost 3 vext3 <6,6,6,6>, <2,3,6,6> - 2729232104U, // <6,2,3,7>: Cost 3 vext3 <7,4,5,6>, <2,3,7,4> - 2686027502U, // <6,2,3,u>: Cost 3 vext3 <0,2,4,6>, 
<2,3,u,1> - 1567853468U, // <6,2,4,0>: Cost 2 vext2 <4,0,6,2>, <4,0,6,2> - 3759769351U, // <6,2,4,1>: Cost 4 vext3 <0,2,4,6>, <2,4,1,u> - 2699151118U, // <6,2,4,2>: Cost 3 vext3 <2,4,2,6>, <2,4,2,6> - 2686027543U, // <6,2,4,3>: Cost 3 vext3 <0,2,4,6>, <2,4,3,6> - 2699298592U, // <6,2,4,4>: Cost 3 vext3 <2,4,4,6>, <2,4,4,6> - 1573162294U, // <6,2,4,5>: Cost 2 vext2 <4,u,6,2>, RHS - 2686027564U, // <6,2,4,6>: Cost 3 vext3 <0,2,4,6>, <2,4,6,0> - 3719982547U, // <6,2,4,7>: Cost 4 vext2 <4,7,6,2>, <4,7,6,2> - 1573162532U, // <6,2,4,u>: Cost 2 vext2 <4,u,6,2>, <4,u,6,2> - 3779086154U, // <6,2,5,0>: Cost 4 vext3 <3,4,5,6>, <2,5,0,3> - 2646904528U, // <6,2,5,1>: Cost 3 vext2 <4,u,6,2>, <5,1,7,3> - 3759769440U, // <6,2,5,2>: Cost 4 vext3 <0,2,4,6>, <2,5,2,7> - 2699888488U, // <6,2,5,3>: Cost 3 vext3 <2,5,3,6>, <2,5,3,6> - 2230855617U, // <6,2,5,4>: Cost 3 vrev <2,6,4,5> - 2646904836U, // <6,2,5,5>: Cost 3 vext2 <4,u,6,2>, <5,5,5,5> - 2646904930U, // <6,2,5,6>: Cost 3 vext2 <4,u,6,2>, <5,6,7,0> - 2847051062U, // <6,2,5,7>: Cost 3 vuzpr <4,6,u,2>, RHS - 2700257173U, // <6,2,5,u>: Cost 3 vext3 <2,5,u,6>, <2,5,u,6> - 2687207321U, // <6,2,6,0>: Cost 3 vext3 <0,4,2,6>, <2,6,0,1> - 2686027684U, // <6,2,6,1>: Cost 3 vext3 <0,2,4,6>, <2,6,1,3> - 2566260656U, // <6,2,6,2>: Cost 3 vext1 <2,6,2,6>, <2,6,2,6> - 2685806522U, // <6,2,6,3>: Cost 3 vext3 <0,2,1,6>, <2,6,3,7> - 2687207361U, // <6,2,6,4>: Cost 3 vext3 <0,4,2,6>, <2,6,4,5> - 2686027724U, // <6,2,6,5>: Cost 3 vext3 <0,2,4,6>, <2,6,5,7> - 2646905656U, // <6,2,6,6>: Cost 3 vext2 <4,u,6,2>, <6,6,6,6> - 2646905678U, // <6,2,6,7>: Cost 3 vext2 <4,u,6,2>, <6,7,0,1> - 2686027751U, // <6,2,6,u>: Cost 3 vext3 <0,2,4,6>, <2,6,u,7> - 2554323046U, // <6,2,7,0>: Cost 3 vext1 <0,6,2,7>, LHS - 2572239606U, // <6,2,7,1>: Cost 3 vext1 <3,6,2,7>, <1,0,3,2> - 2566268849U, // <6,2,7,2>: Cost 3 vext1 <2,6,2,7>, <2,6,2,7> - 1906753638U, // <6,2,7,3>: Cost 2 vzipr RHS, LHS - 2554326326U, // <6,2,7,4>: Cost 3 vext1 <0,6,2,7>, RHS - 3304687564U, // <6,2,7,5>: Cost 4 vrev <2,6,5,7> - 2980495708U, // <6,2,7,6>: Cost 3 vzipr RHS, <0,4,2,6> - 2646906476U, // <6,2,7,7>: Cost 3 vext2 <4,u,6,2>, <7,7,7,7> - 1906753643U, // <6,2,7,u>: Cost 2 vzipr RHS, LHS - 1591744256U, // <6,2,u,0>: Cost 2 vext2 , - 1573164846U, // <6,2,u,1>: Cost 2 vext2 <4,u,6,2>, LHS - 2701805650U, // <6,2,u,2>: Cost 3 vext3 <2,u,2,6>, <2,u,2,6> - 1906761830U, // <6,2,u,3>: Cost 2 vzipr RHS, LHS - 2686027875U, // <6,2,u,4>: Cost 3 vext3 <0,2,4,6>, <2,u,4,5> - 1573165210U, // <6,2,u,5>: Cost 2 vext2 <4,u,6,2>, RHS - 2686322800U, // <6,2,u,6>: Cost 3 vext3 <0,2,u,6>, <2,u,6,0> - 2847051305U, // <6,2,u,7>: Cost 3 vuzpr <4,6,u,2>, RHS - 1906761835U, // <6,2,u,u>: Cost 2 vzipr RHS, LHS - 3759769739U, // <6,3,0,0>: Cost 4 vext3 <0,2,4,6>, <3,0,0,0> - 2686027926U, // <6,3,0,1>: Cost 3 vext3 <0,2,4,6>, <3,0,1,2> - 2686027937U, // <6,3,0,2>: Cost 3 vext3 <0,2,4,6>, <3,0,2,4> - 3640027286U, // <6,3,0,3>: Cost 4 vext1 <2,6,3,0>, <3,0,1,2> - 2687207601U, // <6,3,0,4>: Cost 3 vext3 <0,4,2,6>, <3,0,4,2> - 2705344698U, // <6,3,0,5>: Cost 3 vext3 <3,4,5,6>, <3,0,5,2> - 3663917847U, // <6,3,0,6>: Cost 4 vext1 <6,6,3,0>, <6,6,3,0> - 2237008560U, // <6,3,0,7>: Cost 3 vrev <3,6,7,0> - 2686027989U, // <6,3,0,u>: Cost 3 vext3 <0,2,4,6>, <3,0,u,2> - 3759769823U, // <6,3,1,0>: Cost 4 vext3 <0,2,4,6>, <3,1,0,3> - 3759769830U, // <6,3,1,1>: Cost 4 vext3 <0,2,4,6>, <3,1,1,1> - 3759769841U, // <6,3,1,2>: Cost 4 vext3 <0,2,4,6>, <3,1,2,3> - 3759769848U, // <6,3,1,3>: Cost 4 vext3 <0,2,4,6>, <3,1,3,1> - 2703280390U, // <6,3,1,4>: Cost 3 vext3 
<3,1,4,6>, <3,1,4,6> - 3759769868U, // <6,3,1,5>: Cost 4 vext3 <0,2,4,6>, <3,1,5,3> - 3704063194U, // <6,3,1,6>: Cost 4 vext2 <2,1,6,3>, <1,6,3,0> - 3767732510U, // <6,3,1,7>: Cost 4 vext3 <1,5,4,6>, <3,1,7,3> - 2703280390U, // <6,3,1,u>: Cost 3 vext3 <3,1,4,6>, <3,1,4,6> - 3704063468U, // <6,3,2,0>: Cost 4 vext2 <2,1,6,3>, <2,0,6,4> - 2630321724U, // <6,3,2,1>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3> - 3759769921U, // <6,3,2,2>: Cost 4 vext3 <0,2,4,6>, <3,2,2,2> - 3759769928U, // <6,3,2,3>: Cost 4 vext3 <0,2,4,6>, <3,2,3,0> - 3704063767U, // <6,3,2,4>: Cost 4 vext2 <2,1,6,3>, <2,4,3,6> - 3704063876U, // <6,3,2,5>: Cost 4 vext2 <2,1,6,3>, <2,5,6,7> - 2636957626U, // <6,3,2,6>: Cost 3 vext2 <3,2,6,3>, <2,6,3,7> - 3777907058U, // <6,3,2,7>: Cost 4 vext3 <3,2,7,6>, <3,2,7,6> - 2630321724U, // <6,3,2,u>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3> - 3759769983U, // <6,3,3,0>: Cost 4 vext3 <0,2,4,6>, <3,3,0,1> - 3710036245U, // <6,3,3,1>: Cost 4 vext2 <3,1,6,3>, <3,1,6,3> - 2636958054U, // <6,3,3,2>: Cost 3 vext2 <3,2,6,3>, <3,2,6,3> - 2686028188U, // <6,3,3,3>: Cost 3 vext3 <0,2,4,6>, <3,3,3,3> - 2704607656U, // <6,3,3,4>: Cost 3 vext3 <3,3,4,6>, <3,3,4,6> - 3773041072U, // <6,3,3,5>: Cost 4 vext3 <2,4,4,6>, <3,3,5,5> - 3711363731U, // <6,3,3,6>: Cost 4 vext2 <3,3,6,3>, <3,6,3,7> - 3767732676U, // <6,3,3,7>: Cost 4 vext3 <1,5,4,6>, <3,3,7,7> - 2707999179U, // <6,3,3,u>: Cost 3 vext3 <3,u,5,6>, <3,3,u,5> - 2584232038U, // <6,3,4,0>: Cost 3 vext1 <5,6,3,4>, LHS - 2642267118U, // <6,3,4,1>: Cost 3 vext2 <4,1,6,3>, <4,1,6,3> - 2642930751U, // <6,3,4,2>: Cost 3 vext2 <4,2,6,3>, <4,2,6,3> - 2705197552U, // <6,3,4,3>: Cost 3 vext3 <3,4,3,6>, <3,4,3,6> - 2584235318U, // <6,3,4,4>: Cost 3 vext1 <5,6,3,4>, RHS - 1631603202U, // <6,3,4,5>: Cost 2 vext3 <3,4,5,6>, <3,4,5,6> - 2654211444U, // <6,3,4,6>: Cost 3 vext2 <6,1,6,3>, <4,6,4,6> - 2237041332U, // <6,3,4,7>: Cost 3 vrev <3,6,7,4> - 1631824413U, // <6,3,4,u>: Cost 2 vext3 <3,4,u,6>, <3,4,u,6> - 3640066150U, // <6,3,5,0>: Cost 4 vext1 <2,6,3,5>, LHS - 3772746288U, // <6,3,5,1>: Cost 4 vext3 <2,4,0,6>, <3,5,1,7> - 3640067790U, // <6,3,5,2>: Cost 4 vext1 <2,6,3,5>, <2,3,4,5> - 3773041216U, // <6,3,5,3>: Cost 4 vext3 <2,4,4,6>, <3,5,3,5> - 2705934922U, // <6,3,5,4>: Cost 3 vext3 <3,5,4,6>, <3,5,4,6> - 3773041236U, // <6,3,5,5>: Cost 4 vext3 <2,4,4,6>, <3,5,5,7> - 3779086940U, // <6,3,5,6>: Cost 4 vext3 <3,4,5,6>, <3,5,6,6> - 3767732831U, // <6,3,5,7>: Cost 4 vext3 <1,5,4,6>, <3,5,7,0> - 2706229870U, // <6,3,5,u>: Cost 3 vext3 <3,5,u,6>, <3,5,u,6> - 2602164326U, // <6,3,6,0>: Cost 3 vext1 , LHS - 2654212512U, // <6,3,6,1>: Cost 3 vext2 <6,1,6,3>, <6,1,6,3> - 2566334393U, // <6,3,6,2>: Cost 3 vext1 <2,6,3,6>, <2,6,3,6> - 3704066588U, // <6,3,6,3>: Cost 4 vext2 <2,1,6,3>, <6,3,2,1> - 2602167524U, // <6,3,6,4>: Cost 3 vext1 , <4,4,6,6> - 3710702321U, // <6,3,6,5>: Cost 4 vext2 <3,2,6,3>, <6,5,7,7> - 2724661933U, // <6,3,6,6>: Cost 3 vext3 <6,6,6,6>, <3,6,6,6> - 3710702465U, // <6,3,6,7>: Cost 4 vext2 <3,2,6,3>, <6,7,5,7> - 2602170158U, // <6,3,6,u>: Cost 3 vext1 , LHS - 1492598886U, // <6,3,7,0>: Cost 2 vext1 <2,6,3,7>, LHS - 2560369889U, // <6,3,7,1>: Cost 3 vext1 <1,6,3,7>, <1,6,3,7> - 1492600762U, // <6,3,7,2>: Cost 2 vext1 <2,6,3,7>, <2,6,3,7> - 2566342806U, // <6,3,7,3>: Cost 3 vext1 <2,6,3,7>, <3,0,1,2> - 1492602166U, // <6,3,7,4>: Cost 2 vext1 <2,6,3,7>, RHS - 2602176208U, // <6,3,7,5>: Cost 3 vext1 , <5,1,7,3> - 2566345210U, // <6,3,7,6>: Cost 3 vext1 <2,6,3,7>, <6,2,7,3> - 2980496528U, // <6,3,7,7>: Cost 3 vzipr RHS, <1,5,3,7> - 1492604718U, // <6,3,7,u>: Cost 2 
vext1 <2,6,3,7>, LHS - 1492607078U, // <6,3,u,0>: Cost 2 vext1 <2,6,3,u>, LHS - 2686028574U, // <6,3,u,1>: Cost 3 vext3 <0,2,4,6>, <3,u,1,2> - 1492608955U, // <6,3,u,2>: Cost 2 vext1 <2,6,3,u>, <2,6,3,u> - 2566350998U, // <6,3,u,3>: Cost 3 vext1 <2,6,3,u>, <3,0,1,2> - 1492610358U, // <6,3,u,4>: Cost 2 vext1 <2,6,3,u>, RHS - 1634257734U, // <6,3,u,5>: Cost 2 vext3 <3,u,5,6>, <3,u,5,6> - 2566353489U, // <6,3,u,6>: Cost 3 vext1 <2,6,3,u>, <6,3,u,0> - 2980504720U, // <6,3,u,7>: Cost 3 vzipr RHS, <1,5,3,7> - 1492612910U, // <6,3,u,u>: Cost 2 vext1 <2,6,3,u>, LHS - 3703406592U, // <6,4,0,0>: Cost 4 vext2 <2,0,6,4>, <0,0,0,0> - 2629664870U, // <6,4,0,1>: Cost 3 vext2 <2,0,6,4>, LHS - 2629664972U, // <6,4,0,2>: Cost 3 vext2 <2,0,6,4>, <0,2,4,6> - 3779087232U, // <6,4,0,3>: Cost 4 vext3 <3,4,5,6>, <4,0,3,1> - 2642936156U, // <6,4,0,4>: Cost 3 vext2 <4,2,6,4>, <0,4,2,6> - 2712570770U, // <6,4,0,5>: Cost 3 vext3 <4,6,4,6>, <4,0,5,1> - 2687208348U, // <6,4,0,6>: Cost 3 vext3 <0,4,2,6>, <4,0,6,2> - 3316723081U, // <6,4,0,7>: Cost 4 vrev <4,6,7,0> - 2629665437U, // <6,4,0,u>: Cost 3 vext2 <2,0,6,4>, LHS - 2242473291U, // <6,4,1,0>: Cost 3 vrev <4,6,0,1> - 3700089652U, // <6,4,1,1>: Cost 4 vext2 <1,4,6,4>, <1,1,1,1> - 3703407510U, // <6,4,1,2>: Cost 4 vext2 <2,0,6,4>, <1,2,3,0> - 2852962406U, // <6,4,1,3>: Cost 3 vuzpr <5,6,7,4>, LHS - 3628166454U, // <6,4,1,4>: Cost 4 vext1 <0,6,4,1>, RHS - 3760876514U, // <6,4,1,5>: Cost 4 vext3 <0,4,1,6>, <4,1,5,0> - 2687208430U, // <6,4,1,6>: Cost 3 vext3 <0,4,2,6>, <4,1,6,3> - 3316731274U, // <6,4,1,7>: Cost 4 vrev <4,6,7,1> - 2243063187U, // <6,4,1,u>: Cost 3 vrev <4,6,u,1> - 2629666284U, // <6,4,2,0>: Cost 3 vext2 <2,0,6,4>, <2,0,6,4> - 3703408188U, // <6,4,2,1>: Cost 4 vext2 <2,0,6,4>, <2,1,6,3> - 3703408232U, // <6,4,2,2>: Cost 4 vext2 <2,0,6,4>, <2,2,2,2> - 3703408294U, // <6,4,2,3>: Cost 4 vext2 <2,0,6,4>, <2,3,0,1> - 2632320816U, // <6,4,2,4>: Cost 3 vext2 <2,4,6,4>, <2,4,6,4> - 2923384118U, // <6,4,2,5>: Cost 3 vzipl <6,2,7,3>, RHS - 2687208508U, // <6,4,2,6>: Cost 3 vext3 <0,4,2,6>, <4,2,6,0> - 3760950341U, // <6,4,2,7>: Cost 4 vext3 <0,4,2,6>, <4,2,7,0> - 2634975348U, // <6,4,2,u>: Cost 3 vext2 <2,u,6,4>, <2,u,6,4> - 3703408790U, // <6,4,3,0>: Cost 4 vext2 <2,0,6,4>, <3,0,1,2> - 3316305238U, // <6,4,3,1>: Cost 4 vrev <4,6,1,3> - 3703408947U, // <6,4,3,2>: Cost 4 vext2 <2,0,6,4>, <3,2,0,6> - 3703409052U, // <6,4,3,3>: Cost 4 vext2 <2,0,6,4>, <3,3,3,3> - 2644929026U, // <6,4,3,4>: Cost 3 vext2 <4,5,6,4>, <3,4,5,6> - 3718670922U, // <6,4,3,5>: Cost 4 vext2 <4,5,6,4>, <3,5,4,6> - 2705345682U, // <6,4,3,6>: Cost 3 vext3 <3,4,5,6>, <4,3,6,5> - 3926705152U, // <6,4,3,7>: Cost 4 vuzpr <5,6,7,4>, <1,3,5,7> - 2668817222U, // <6,4,3,u>: Cost 3 vext2 , <3,u,5,6> - 2590277734U, // <6,4,4,0>: Cost 3 vext1 <6,6,4,4>, LHS - 3716017135U, // <6,4,4,1>: Cost 4 vext2 <4,1,6,4>, <4,1,6,4> - 2642938944U, // <6,4,4,2>: Cost 3 vext2 <4,2,6,4>, <4,2,6,4> - 3717344401U, // <6,4,4,3>: Cost 4 vext2 <4,3,6,4>, <4,3,6,4> - 2712571088U, // <6,4,4,4>: Cost 3 vext3 <4,6,4,6>, <4,4,4,4> - 2629668150U, // <6,4,4,5>: Cost 3 vext2 <2,0,6,4>, RHS - 1637649636U, // <6,4,4,6>: Cost 2 vext3 <4,4,6,6>, <4,4,6,6> - 2646257109U, // <6,4,4,7>: Cost 3 vext2 <4,7,6,4>, <4,7,6,4> - 1637649636U, // <6,4,4,u>: Cost 2 vext3 <4,4,6,6>, <4,4,6,6> - 2566398054U, // <6,4,5,0>: Cost 3 vext1 <2,6,4,5>, LHS - 3760876805U, // <6,4,5,1>: Cost 4 vext3 <0,4,1,6>, <4,5,1,3> - 2566399937U, // <6,4,5,2>: Cost 3 vext1 <2,6,4,5>, <2,6,4,5> - 2584316418U, // <6,4,5,3>: Cost 3 vext1 <5,6,4,5>, <3,4,5,6> - 2566401334U, // 
<6,4,5,4>: Cost 3 vext1 <2,6,4,5>, RHS - 2584318028U, // <6,4,5,5>: Cost 3 vext1 <5,6,4,5>, <5,6,4,5> - 1612287286U, // <6,4,5,6>: Cost 2 vext3 <0,2,4,6>, RHS - 2852965686U, // <6,4,5,7>: Cost 3 vuzpr <5,6,7,4>, RHS - 1612287304U, // <6,4,5,u>: Cost 2 vext3 <0,2,4,6>, RHS - 1504608358U, // <6,4,6,0>: Cost 2 vext1 <4,6,4,6>, LHS - 2578350838U, // <6,4,6,1>: Cost 3 vext1 <4,6,4,6>, <1,0,3,2> - 2578351720U, // <6,4,6,2>: Cost 3 vext1 <4,6,4,6>, <2,2,2,2> - 2578352278U, // <6,4,6,3>: Cost 3 vext1 <4,6,4,6>, <3,0,1,2> - 1504611638U, // <6,4,6,4>: Cost 2 vext1 <4,6,4,6>, RHS - 2578353872U, // <6,4,6,5>: Cost 3 vext1 <4,6,4,6>, <5,1,7,3> - 2578354682U, // <6,4,6,6>: Cost 3 vext1 <4,6,4,6>, <6,2,7,3> - 2578355194U, // <6,4,6,7>: Cost 3 vext1 <4,6,4,6>, <7,0,1,2> - 1504614190U, // <6,4,6,u>: Cost 2 vext1 <4,6,4,6>, LHS - 2572386406U, // <6,4,7,0>: Cost 3 vext1 <3,6,4,7>, LHS - 2572387226U, // <6,4,7,1>: Cost 3 vext1 <3,6,4,7>, <1,2,3,4> - 3640157902U, // <6,4,7,2>: Cost 4 vext1 <2,6,4,7>, <2,3,4,5> - 2572389020U, // <6,4,7,3>: Cost 3 vext1 <3,6,4,7>, <3,6,4,7> - 2572389686U, // <6,4,7,4>: Cost 3 vext1 <3,6,4,7>, RHS - 2980497102U, // <6,4,7,5>: Cost 3 vzipr RHS, <2,3,4,5> - 2980495564U, // <6,4,7,6>: Cost 3 vzipr RHS, <0,2,4,6> - 4054239090U, // <6,4,7,7>: Cost 4 vzipr RHS, <2,5,4,7> - 2572392238U, // <6,4,7,u>: Cost 3 vext1 <3,6,4,7>, LHS - 1504608358U, // <6,4,u,0>: Cost 2 vext1 <4,6,4,6>, LHS - 2629670702U, // <6,4,u,1>: Cost 3 vext2 <2,0,6,4>, LHS - 2566424516U, // <6,4,u,2>: Cost 3 vext1 <2,6,4,u>, <2,6,4,u> - 2584340994U, // <6,4,u,3>: Cost 3 vext1 <5,6,4,u>, <3,4,5,6> - 1640156694U, // <6,4,u,4>: Cost 2 vext3 <4,u,4,6>, <4,u,4,6> - 2629671066U, // <6,4,u,5>: Cost 3 vext2 <2,0,6,4>, RHS - 1612287529U, // <6,4,u,6>: Cost 2 vext3 <0,2,4,6>, RHS - 2852965929U, // <6,4,u,7>: Cost 3 vuzpr <5,6,7,4>, RHS - 1612287547U, // <6,4,u,u>: Cost 2 vext3 <0,2,4,6>, RHS - 3708723200U, // <6,5,0,0>: Cost 4 vext2 <2,u,6,5>, <0,0,0,0> - 2634981478U, // <6,5,0,1>: Cost 3 vext2 <2,u,6,5>, LHS - 3694125260U, // <6,5,0,2>: Cost 4 vext2 <0,4,6,5>, <0,2,4,6> - 3779087962U, // <6,5,0,3>: Cost 4 vext3 <3,4,5,6>, <5,0,3,2> - 3760877154U, // <6,5,0,4>: Cost 4 vext3 <0,4,1,6>, <5,0,4,1> - 4195110916U, // <6,5,0,5>: Cost 4 vtrnr <5,6,7,0>, <5,5,5,5> - 3696779775U, // <6,5,0,6>: Cost 4 vext2 <0,u,6,5>, <0,6,2,7> - 1175212130U, // <6,5,0,7>: Cost 2 vrev <5,6,7,0> - 1175285867U, // <6,5,0,u>: Cost 2 vrev <5,6,u,0> - 2248445988U, // <6,5,1,0>: Cost 3 vrev <5,6,0,1> - 3698107237U, // <6,5,1,1>: Cost 4 vext2 <1,1,6,5>, <1,1,6,5> - 3708724118U, // <6,5,1,2>: Cost 4 vext2 <2,u,6,5>, <1,2,3,0> - 3908575334U, // <6,5,1,3>: Cost 4 vuzpr <2,6,4,5>, LHS - 3716023376U, // <6,5,1,4>: Cost 4 vext2 <4,1,6,5>, <1,4,5,6> - 3708724368U, // <6,5,1,5>: Cost 4 vext2 <2,u,6,5>, <1,5,3,7> - 3767733960U, // <6,5,1,6>: Cost 4 vext3 <1,5,4,6>, <5,1,6,4> - 2712571600U, // <6,5,1,7>: Cost 3 vext3 <4,6,4,6>, <5,1,7,3> - 2712571609U, // <6,5,1,u>: Cost 3 vext3 <4,6,4,6>, <5,1,u,3> - 2578391142U, // <6,5,2,0>: Cost 3 vext1 <4,6,5,2>, LHS - 3704079934U, // <6,5,2,1>: Cost 4 vext2 <2,1,6,5>, <2,1,6,5> - 3708724840U, // <6,5,2,2>: Cost 4 vext2 <2,u,6,5>, <2,2,2,2> - 3705407182U, // <6,5,2,3>: Cost 4 vext2 <2,3,6,5>, <2,3,4,5> - 2578394422U, // <6,5,2,4>: Cost 3 vext1 <4,6,5,2>, RHS - 3717351272U, // <6,5,2,5>: Cost 4 vext2 <4,3,6,5>, <2,5,3,6> - 2634983354U, // <6,5,2,6>: Cost 3 vext2 <2,u,6,5>, <2,6,3,7> - 3115486518U, // <6,5,2,7>: Cost 3 vtrnr <4,6,u,2>, RHS - 2634983541U, // <6,5,2,u>: Cost 3 vext2 <2,u,6,5>, <2,u,6,5> - 3708725398U, // <6,5,3,0>: Cost 4 
vext2 <2,u,6,5>, <3,0,1,2> - 3710052631U, // <6,5,3,1>: Cost 4 vext2 <3,1,6,5>, <3,1,6,5> - 3708725606U, // <6,5,3,2>: Cost 4 vext2 <2,u,6,5>, <3,2,6,3> - 3708725660U, // <6,5,3,3>: Cost 4 vext2 <2,u,6,5>, <3,3,3,3> - 2643610114U, // <6,5,3,4>: Cost 3 vext2 <4,3,6,5>, <3,4,5,6> - 3717352010U, // <6,5,3,5>: Cost 4 vext2 <4,3,6,5>, <3,5,4,6> - 3773632358U, // <6,5,3,6>: Cost 4 vext3 <2,5,3,6>, <5,3,6,0> - 2248978533U, // <6,5,3,7>: Cost 3 vrev <5,6,7,3> - 2249052270U, // <6,5,3,u>: Cost 3 vrev <5,6,u,3> - 2596323430U, // <6,5,4,0>: Cost 3 vext1 <7,6,5,4>, LHS - 3716025328U, // <6,5,4,1>: Cost 4 vext2 <4,1,6,5>, <4,1,6,5> - 3716688961U, // <6,5,4,2>: Cost 4 vext2 <4,2,6,5>, <4,2,6,5> - 2643610770U, // <6,5,4,3>: Cost 3 vext2 <4,3,6,5>, <4,3,6,5> - 2596326710U, // <6,5,4,4>: Cost 3 vext1 <7,6,5,4>, RHS - 2634984758U, // <6,5,4,5>: Cost 3 vext2 <2,u,6,5>, RHS - 3767734199U, // <6,5,4,6>: Cost 4 vext3 <1,5,4,6>, <5,4,6,0> - 1643696070U, // <6,5,4,7>: Cost 2 vext3 <5,4,7,6>, <5,4,7,6> - 1643769807U, // <6,5,4,u>: Cost 2 vext3 <5,4,u,6>, <5,4,u,6> - 2578415718U, // <6,5,5,0>: Cost 3 vext1 <4,6,5,5>, LHS - 3652158198U, // <6,5,5,1>: Cost 4 vext1 <4,6,5,5>, <1,0,3,2> - 3652159080U, // <6,5,5,2>: Cost 4 vext1 <4,6,5,5>, <2,2,2,2> - 3652159638U, // <6,5,5,3>: Cost 4 vext1 <4,6,5,5>, <3,0,1,2> - 2578418998U, // <6,5,5,4>: Cost 3 vext1 <4,6,5,5>, RHS - 2712571908U, // <6,5,5,5>: Cost 3 vext3 <4,6,4,6>, <5,5,5,5> - 2718027790U, // <6,5,5,6>: Cost 3 vext3 <5,5,6,6>, <5,5,6,6> - 2712571928U, // <6,5,5,7>: Cost 3 vext3 <4,6,4,6>, <5,5,7,7> - 2712571937U, // <6,5,5,u>: Cost 3 vext3 <4,6,4,6>, <5,5,u,7> - 2705346596U, // <6,5,6,0>: Cost 3 vext3 <3,4,5,6>, <5,6,0,1> - 3767144496U, // <6,5,6,1>: Cost 4 vext3 <1,4,5,6>, <5,6,1,4> - 3773116473U, // <6,5,6,2>: Cost 4 vext3 <2,4,5,6>, <5,6,2,4> - 2705346626U, // <6,5,6,3>: Cost 3 vext3 <3,4,5,6>, <5,6,3,4> - 2705346636U, // <6,5,6,4>: Cost 3 vext3 <3,4,5,6>, <5,6,4,5> - 3908577217U, // <6,5,6,5>: Cost 4 vuzpr <2,6,4,5>, <2,6,4,5> - 2578428728U, // <6,5,6,6>: Cost 3 vext1 <4,6,5,6>, <6,6,6,6> - 2712572002U, // <6,5,6,7>: Cost 3 vext3 <4,6,4,6>, <5,6,7,0> - 2705346668U, // <6,5,6,u>: Cost 3 vext3 <3,4,5,6>, <5,6,u,1> - 2560516198U, // <6,5,7,0>: Cost 3 vext1 <1,6,5,7>, LHS - 2560517363U, // <6,5,7,1>: Cost 3 vext1 <1,6,5,7>, <1,6,5,7> - 2566490060U, // <6,5,7,2>: Cost 3 vext1 <2,6,5,7>, <2,6,5,7> - 3634260118U, // <6,5,7,3>: Cost 4 vext1 <1,6,5,7>, <3,0,1,2> - 2560519478U, // <6,5,7,4>: Cost 3 vext1 <1,6,5,7>, RHS - 2980498650U, // <6,5,7,5>: Cost 3 vzipr RHS, <4,4,5,5> - 2980497922U, // <6,5,7,6>: Cost 3 vzipr RHS, <3,4,5,6> - 3103214902U, // <6,5,7,7>: Cost 3 vtrnr <2,6,3,7>, RHS - 2560522030U, // <6,5,7,u>: Cost 3 vext1 <1,6,5,7>, LHS - 2560524390U, // <6,5,u,0>: Cost 3 vext1 <1,6,5,u>, LHS - 2560525556U, // <6,5,u,1>: Cost 3 vext1 <1,6,5,u>, <1,6,5,u> - 2566498253U, // <6,5,u,2>: Cost 3 vext1 <2,6,5,u>, <2,6,5,u> - 2646931439U, // <6,5,u,3>: Cost 3 vext2 <4,u,6,5>, - 2560527670U, // <6,5,u,4>: Cost 3 vext1 <1,6,5,u>, RHS - 2634987674U, // <6,5,u,5>: Cost 3 vext2 <2,u,6,5>, RHS - 2980506114U, // <6,5,u,6>: Cost 3 vzipr RHS, <3,4,5,6> - 1175277674U, // <6,5,u,7>: Cost 2 vrev <5,6,7,u> - 1175351411U, // <6,5,u,u>: Cost 2 vrev <5,6,u,u> - 2578448486U, // <6,6,0,0>: Cost 3 vext1 <4,6,6,0>, LHS - 1573191782U, // <6,6,0,1>: Cost 2 vext2 <4,u,6,6>, LHS - 2686030124U, // <6,6,0,2>: Cost 3 vext3 <0,2,4,6>, <6,0,2,4> - 3779088690U, // <6,6,0,3>: Cost 4 vext3 <3,4,5,6>, <6,0,3,1> - 2687209788U, // <6,6,0,4>: Cost 3 vext3 <0,4,2,6>, <6,0,4,2> - 3652194000U, // <6,6,0,5>: Cost 
4 vext1 <4,6,6,0>, <5,1,7,3> - 2254852914U, // <6,6,0,6>: Cost 3 vrev <6,6,6,0> - 4041575734U, // <6,6,0,7>: Cost 4 vzipr <2,4,6,0>, RHS - 1573192349U, // <6,6,0,u>: Cost 2 vext2 <4,u,6,6>, LHS - 2646934262U, // <6,6,1,0>: Cost 3 vext2 <4,u,6,6>, <1,0,3,2> - 2646934324U, // <6,6,1,1>: Cost 3 vext2 <4,u,6,6>, <1,1,1,1> - 2646934422U, // <6,6,1,2>: Cost 3 vext2 <4,u,6,6>, <1,2,3,0> - 2846785638U, // <6,6,1,3>: Cost 3 vuzpr <4,6,4,6>, LHS - 3760951694U, // <6,6,1,4>: Cost 4 vext3 <0,4,2,6>, <6,1,4,3> - 2646934672U, // <6,6,1,5>: Cost 3 vext2 <4,u,6,6>, <1,5,3,7> - 2712572320U, // <6,6,1,6>: Cost 3 vext3 <4,6,4,6>, <6,1,6,3> - 3775549865U, // <6,6,1,7>: Cost 4 vext3 <2,u,2,6>, <6,1,7,3> - 2846785643U, // <6,6,1,u>: Cost 3 vuzpr <4,6,4,6>, LHS - 3759772094U, // <6,6,2,0>: Cost 4 vext3 <0,2,4,6>, <6,2,0,6> - 3704751676U, // <6,6,2,1>: Cost 4 vext2 <2,2,6,6>, <2,1,6,3> - 2631009936U, // <6,6,2,2>: Cost 3 vext2 <2,2,6,6>, <2,2,6,6> - 2646935206U, // <6,6,2,3>: Cost 3 vext2 <4,u,6,6>, <2,3,0,1> - 3759772127U, // <6,6,2,4>: Cost 4 vext3 <0,2,4,6>, <6,2,4,3> - 3704752004U, // <6,6,2,5>: Cost 4 vext2 <2,2,6,6>, <2,5,6,7> - 2646935482U, // <6,6,2,6>: Cost 3 vext2 <4,u,6,6>, <2,6,3,7> - 2712572410U, // <6,6,2,7>: Cost 3 vext3 <4,6,4,6>, <6,2,7,3> - 2712572419U, // <6,6,2,u>: Cost 3 vext3 <4,6,4,6>, <6,2,u,3> - 2646935702U, // <6,6,3,0>: Cost 3 vext2 <4,u,6,6>, <3,0,1,2> - 3777024534U, // <6,6,3,1>: Cost 4 vext3 <3,1,4,6>, <6,3,1,4> - 3704752453U, // <6,6,3,2>: Cost 4 vext2 <2,2,6,6>, <3,2,2,6> - 2646935964U, // <6,6,3,3>: Cost 3 vext2 <4,u,6,6>, <3,3,3,3> - 2705347122U, // <6,6,3,4>: Cost 3 vext3 <3,4,5,6>, <6,3,4,5> - 3779678778U, // <6,6,3,5>: Cost 4 vext3 <3,5,4,6>, <6,3,5,4> - 2657553069U, // <6,6,3,6>: Cost 3 vext2 <6,6,6,6>, <3,6,6,6> - 4039609654U, // <6,6,3,7>: Cost 4 vzipr <2,1,6,3>, RHS - 2708001366U, // <6,6,3,u>: Cost 3 vext3 <3,u,5,6>, <6,3,u,5> - 2578481254U, // <6,6,4,0>: Cost 3 vext1 <4,6,6,4>, LHS - 3652223734U, // <6,6,4,1>: Cost 4 vext1 <4,6,6,4>, <1,0,3,2> - 3760951922U, // <6,6,4,2>: Cost 4 vext3 <0,4,2,6>, <6,4,2,6> - 3779089019U, // <6,6,4,3>: Cost 4 vext3 <3,4,5,6>, <6,4,3,6> - 1570540772U, // <6,6,4,4>: Cost 2 vext2 <4,4,6,6>, <4,4,6,6> - 1573195062U, // <6,6,4,5>: Cost 2 vext2 <4,u,6,6>, RHS - 2712572560U, // <6,6,4,6>: Cost 3 vext3 <4,6,4,6>, <6,4,6,0> - 2723410591U, // <6,6,4,7>: Cost 3 vext3 <6,4,7,6>, <6,4,7,6> - 1573195304U, // <6,6,4,u>: Cost 2 vext2 <4,u,6,6>, <4,u,6,6> - 3640287334U, // <6,6,5,0>: Cost 4 vext1 <2,6,6,5>, LHS - 2646937296U, // <6,6,5,1>: Cost 3 vext2 <4,u,6,6>, <5,1,7,3> - 3640289235U, // <6,6,5,2>: Cost 4 vext1 <2,6,6,5>, <2,6,6,5> - 3720679279U, // <6,6,5,3>: Cost 4 vext2 <4,u,6,6>, <5,3,7,0> - 2646937542U, // <6,6,5,4>: Cost 3 vext2 <4,u,6,6>, <5,4,7,6> - 2646937604U, // <6,6,5,5>: Cost 3 vext2 <4,u,6,6>, <5,5,5,5> - 2646937698U, // <6,6,5,6>: Cost 3 vext2 <4,u,6,6>, <5,6,7,0> - 2846788918U, // <6,6,5,7>: Cost 3 vuzpr <4,6,4,6>, RHS - 2846788919U, // <6,6,5,u>: Cost 3 vuzpr <4,6,4,6>, RHS - 1516699750U, // <6,6,6,0>: Cost 2 vext1 <6,6,6,6>, LHS - 2590442230U, // <6,6,6,1>: Cost 3 vext1 <6,6,6,6>, <1,0,3,2> - 2646938106U, // <6,6,6,2>: Cost 3 vext2 <4,u,6,6>, <6,2,7,3> - 2590443670U, // <6,6,6,3>: Cost 3 vext1 <6,6,6,6>, <3,0,1,2> - 1516703030U, // <6,6,6,4>: Cost 2 vext1 <6,6,6,6>, RHS - 2590445264U, // <6,6,6,5>: Cost 3 vext1 <6,6,6,6>, <5,1,7,3> - 296144182U, // <6,6,6,6>: Cost 1 vdup2 RHS - 2712572738U, // <6,6,6,7>: Cost 3 vext3 <4,6,4,6>, <6,6,7,7> - 296144182U, // <6,6,6,u>: Cost 1 vdup2 RHS - 2566561894U, // <6,6,7,0>: Cost 3 vext1 <2,6,6,7>, 
LHS - 3634332924U, // <6,6,7,1>: Cost 4 vext1 <1,6,6,7>, <1,6,6,7> - 2566563797U, // <6,6,7,2>: Cost 3 vext1 <2,6,6,7>, <2,6,6,7> - 2584480258U, // <6,6,7,3>: Cost 3 vext1 <5,6,6,7>, <3,4,5,6> - 2566565174U, // <6,6,7,4>: Cost 3 vext1 <2,6,6,7>, RHS - 2717438846U, // <6,6,7,5>: Cost 3 vext3 <5,4,7,6>, <6,7,5,4> - 2980500280U, // <6,6,7,6>: Cost 3 vzipr RHS, <6,6,6,6> - 1906756918U, // <6,6,7,7>: Cost 2 vzipr RHS, RHS - 1906756919U, // <6,6,7,u>: Cost 2 vzipr RHS, RHS - 1516699750U, // <6,6,u,0>: Cost 2 vext1 <6,6,6,6>, LHS - 1573197614U, // <6,6,u,1>: Cost 2 vext2 <4,u,6,6>, LHS - 2566571990U, // <6,6,u,2>: Cost 3 vext1 <2,6,6,u>, <2,6,6,u> - 2846786205U, // <6,6,u,3>: Cost 3 vuzpr <4,6,4,6>, LHS - 1516703030U, // <6,6,u,4>: Cost 2 vext1 <6,6,6,6>, RHS - 1573197978U, // <6,6,u,5>: Cost 2 vext2 <4,u,6,6>, RHS - 296144182U, // <6,6,u,6>: Cost 1 vdup2 RHS - 1906765110U, // <6,6,u,7>: Cost 2 vzipr RHS, RHS - 296144182U, // <6,6,u,u>: Cost 1 vdup2 RHS - 1571209216U, // <6,7,0,0>: Cost 2 vext2 RHS, <0,0,0,0> - 497467494U, // <6,7,0,1>: Cost 1 vext2 RHS, LHS - 1571209380U, // <6,7,0,2>: Cost 2 vext2 RHS, <0,2,0,2> - 2644951292U, // <6,7,0,3>: Cost 3 vext2 RHS, <0,3,1,0> - 1571209554U, // <6,7,0,4>: Cost 2 vext2 RHS, <0,4,1,5> - 1510756450U, // <6,7,0,5>: Cost 2 vext1 <5,6,7,0>, <5,6,7,0> - 2644951542U, // <6,7,0,6>: Cost 3 vext2 RHS, <0,6,1,7> - 2584499194U, // <6,7,0,7>: Cost 3 vext1 <5,6,7,0>, <7,0,1,2> - 497468061U, // <6,7,0,u>: Cost 1 vext2 RHS, LHS - 1571209974U, // <6,7,1,0>: Cost 2 vext2 RHS, <1,0,3,2> - 1571210036U, // <6,7,1,1>: Cost 2 vext2 RHS, <1,1,1,1> - 1571210134U, // <6,7,1,2>: Cost 2 vext2 RHS, <1,2,3,0> - 1571210200U, // <6,7,1,3>: Cost 2 vext2 RHS, <1,3,1,3> - 2644952098U, // <6,7,1,4>: Cost 3 vext2 RHS, <1,4,0,5> - 1571210384U, // <6,7,1,5>: Cost 2 vext2 RHS, <1,5,3,7> - 2644952271U, // <6,7,1,6>: Cost 3 vext2 RHS, <1,6,1,7> - 2578535418U, // <6,7,1,7>: Cost 3 vext1 <4,6,7,1>, <7,0,1,2> - 1571210605U, // <6,7,1,u>: Cost 2 vext2 RHS, <1,u,1,3> - 2644952509U, // <6,7,2,0>: Cost 3 vext2 RHS, <2,0,1,2> - 2644952582U, // <6,7,2,1>: Cost 3 vext2 RHS, <2,1,0,3> - 1571210856U, // <6,7,2,2>: Cost 2 vext2 RHS, <2,2,2,2> - 1571210918U, // <6,7,2,3>: Cost 2 vext2 RHS, <2,3,0,1> - 2644952828U, // <6,7,2,4>: Cost 3 vext2 RHS, <2,4,0,6> - 2633009028U, // <6,7,2,5>: Cost 3 vext2 <2,5,6,7>, <2,5,6,7> - 1571211194U, // <6,7,2,6>: Cost 2 vext2 RHS, <2,6,3,7> - 2668840938U, // <6,7,2,7>: Cost 3 vext2 RHS, <2,7,0,1> - 1571211323U, // <6,7,2,u>: Cost 2 vext2 RHS, <2,u,0,1> - 1571211414U, // <6,7,3,0>: Cost 2 vext2 RHS, <3,0,1,2> - 2644953311U, // <6,7,3,1>: Cost 3 vext2 RHS, <3,1,0,3> - 2644953390U, // <6,7,3,2>: Cost 3 vext2 RHS, <3,2,0,1> - 1571211676U, // <6,7,3,3>: Cost 2 vext2 RHS, <3,3,3,3> - 1571211778U, // <6,7,3,4>: Cost 2 vext2 RHS, <3,4,5,6> - 2644953648U, // <6,7,3,5>: Cost 3 vext2 RHS, <3,5,1,7> - 2644953720U, // <6,7,3,6>: Cost 3 vext2 RHS, <3,6,0,7> - 2644953795U, // <6,7,3,7>: Cost 3 vext2 RHS, <3,7,0,1> - 1571212062U, // <6,7,3,u>: Cost 2 vext2 RHS, <3,u,1,2> - 1573202834U, // <6,7,4,0>: Cost 2 vext2 RHS, <4,0,5,1> - 2644954058U, // <6,7,4,1>: Cost 3 vext2 RHS, <4,1,2,3> - 2644954166U, // <6,7,4,2>: Cost 3 vext2 RHS, <4,2,5,3> - 2644954258U, // <6,7,4,3>: Cost 3 vext2 RHS, <4,3,6,5> - 1571212496U, // <6,7,4,4>: Cost 2 vext2 RHS, <4,4,4,4> - 497470774U, // <6,7,4,5>: Cost 1 vext2 RHS, RHS - 1573203316U, // <6,7,4,6>: Cost 2 vext2 RHS, <4,6,4,6> - 2646281688U, // <6,7,4,7>: Cost 3 vext2 <4,7,6,7>, <4,7,6,7> - 497471017U, // <6,7,4,u>: Cost 1 vext2 RHS, RHS - 2644954696U, // 
[Elided: a long run of consecutive removed ("-") lines from the auto-generated perfect-shuffle cost table, 32-bit entries of the form

-  1573203664U, // <6,7,5,1>: Cost 2 vext2 RHS, <5,1,7,3>

running from the <6,7,5,0> entry through the <7,...> block and into the <u,...> block. The extracted page had collapsed these entries onto a few lines and dropped the descriptors that begin with "<u", so the table body is not reproduced here.]
: Cost 2 vext2 <4,6,u,6>, <4,6,u,6> - 2960715062U, // : Cost 3 vzipr <1,2,u,4>, RHS - 1545473577U, // : Cost 2 vext2 <0,2,u,6>, RHS - 2567872614U, // : Cost 3 vext1 <2,u,6,5>, LHS - 2645757648U, // : Cost 3 vext2 <4,6,u,6>, <5,1,7,3> - 2567874490U, // : Cost 3 vext1 <2,u,6,5>, <2,6,3,7> - 2576501250U, // : Cost 3 vext1 <4,3,6,5>, <3,4,5,6> - 1576660943U, // : Cost 2 vext2 <5,4,u,6>, <5,4,u,6> - 2645757956U, // : Cost 3 vext2 <4,6,u,6>, <5,5,5,5> - 2645758050U, // : Cost 3 vext2 <4,6,u,6>, <5,6,7,0> - 2824080694U, // : Cost 3 vuzpr <0,u,2,6>, RHS - 1182626795U, // : Cost 2 vrev <6,u,u,5> - 1506082918U, // : Cost 2 vext1 <4,u,6,6>, LHS - 2579825398U, // : Cost 3 vext1 <4,u,6,6>, <1,0,3,2> - 2645758458U, // : Cost 3 vext2 <4,6,u,6>, <6,2,7,3> - 2579826838U, // : Cost 3 vext1 <4,u,6,6>, <3,0,1,2> - 1506086198U, // : Cost 2 vext1 <4,u,6,6>, RHS - 2579828432U, // : Cost 3 vext1 <4,u,6,6>, <5,1,7,3> - 296144182U, // : Cost 1 vdup2 RHS - 1638331202U, // : Cost 2 vext3 RHS, <6,6,7,7> - 296144182U, // : Cost 1 vdup2 RHS - 432349286U, // : Cost 1 vext1 RHS, LHS - 1506091766U, // : Cost 2 vext1 RHS, <1,0,3,2> - 1506092648U, // : Cost 2 vext1 RHS, <2,2,2,2> - 1506093206U, // : Cost 2 vext1 RHS, <3,0,1,2> - 432352809U, // : Cost 1 vext1 RHS, RHS - 1506094800U, // : Cost 2 vext1 RHS, <5,1,7,3> - 1506095610U, // : Cost 2 vext1 RHS, <6,2,7,3> - 1906904374U, // : Cost 2 vzipr RHS, RHS - 432355118U, // : Cost 1 vext1 RHS, LHS - 432357478U, // : Cost 1 vext1 RHS, LHS - 1545475886U, // : Cost 2 vext2 <0,2,u,6>, LHS - 1506100840U, // : Cost 2 vext1 RHS, <2,2,2,2> - 1506101398U, // : Cost 2 vext1 RHS, <3,0,1,2> - 432361002U, // : Cost 1 vext1 RHS, RHS - 1545476250U, // : Cost 2 vext2 <0,2,u,6>, RHS - 296144182U, // : Cost 1 vdup2 RHS - 1880370486U, // : Cost 2 vzipr LHS, RHS - 432363310U, // : Cost 1 vext1 RHS, LHS - 1571356672U, // : Cost 2 vext2 RHS, <0,0,0,0> - 497614950U, // : Cost 1 vext2 RHS, LHS - 1571356836U, // : Cost 2 vext2 RHS, <0,2,0,2> - 2573880146U, // : Cost 3 vext1 <3,u,7,0>, <3,u,7,0> - 1571357010U, // : Cost 2 vext2 RHS, <0,4,1,5> - 1512083716U, // : Cost 2 vext1 <5,u,7,0>, <5,u,7,0> - 2621874741U, // : Cost 3 vext2 <0,6,u,7>, <0,6,u,7> - 2585826298U, // : Cost 3 vext1 <5,u,7,0>, <7,0,1,2> - 497615517U, // : Cost 1 vext2 RHS, LHS - 1571357430U, // : Cost 2 vext2 RHS, <1,0,3,2> - 1571357492U, // : Cost 2 vext2 RHS, <1,1,1,1> - 1571357590U, // : Cost 2 vext2 RHS, <1,2,3,0> - 1552114715U, // : Cost 2 vext2 <1,3,u,7>, <1,3,u,7> - 2573888822U, // : Cost 3 vext1 <3,u,7,1>, RHS - 1553441981U, // : Cost 2 vext2 <1,5,u,7>, <1,5,u,7> - 2627847438U, // : Cost 3 vext2 <1,6,u,7>, <1,6,u,7> - 2727408775U, // : Cost 3 vext3 <7,1,7,u>, <7,1,7,u> - 1555432880U, // : Cost 2 vext2 <1,u,u,7>, <1,u,u,7> - 2629838337U, // : Cost 3 vext2 <2,0,u,7>, <2,0,u,7> - 1188058754U, // : Cost 2 vrev <7,u,1,2> - 1571358312U, // : Cost 2 vext2 RHS, <2,2,2,2> - 1571358374U, // : Cost 2 vext2 RHS, <2,3,0,1> - 2632492869U, // : Cost 3 vext2 <2,4,u,7>, <2,4,u,7> - 2633156502U, // : Cost 3 vext2 <2,5,u,7>, <2,5,u,7> - 1560078311U, // : Cost 2 vext2 <2,6,u,7>, <2,6,u,7> - 2728072408U, // : Cost 3 vext3 <7,2,7,u>, <7,2,7,u> - 1561405577U, // : Cost 2 vext2 <2,u,u,7>, <2,u,u,7> - 1571358870U, // : Cost 2 vext2 RHS, <3,0,1,2> - 2627184913U, // : Cost 3 vext2 <1,5,u,7>, <3,1,5,u> - 2633820523U, // : Cost 3 vext2 <2,6,u,7>, <3,2,6,u> - 1571359132U, // : Cost 2 vext2 RHS, <3,3,3,3> - 1571359234U, // : Cost 2 vext2 RHS, <3,4,5,6> - 1512108295U, // : Cost 2 vext1 <5,u,7,3>, <5,u,7,3> - 1518080992U, // : Cost 2 vext1 <6,u,7,3>, <6,u,7,3> - 
2640456465U, // : Cost 3 vext2 <3,7,u,7>, <3,7,u,7> - 1571359518U, // : Cost 2 vext2 RHS, <3,u,1,2> - 1571359634U, // : Cost 2 vext2 RHS, <4,0,5,1> - 2573911067U, // : Cost 3 vext1 <3,u,7,4>, <1,3,u,7> - 2645101622U, // : Cost 3 vext2 RHS, <4,2,5,3> - 2573912918U, // : Cost 3 vext1 <3,u,7,4>, <3,u,7,4> - 1571359952U, // : Cost 2 vext2 RHS, <4,4,4,4> - 497618248U, // : Cost 1 vext2 RHS, RHS - 1571360116U, // : Cost 2 vext2 RHS, <4,6,4,6> - 2645102024U, // : Cost 3 vext2 RHS, <4,7,5,0> - 497618473U, // : Cost 1 vext2 RHS, RHS - 2645102152U, // : Cost 3 vext2 RHS, <5,0,1,2> - 1571360464U, // : Cost 2 vext2 RHS, <5,1,7,3> - 2645102334U, // : Cost 3 vext2 RHS, <5,2,3,4> - 2645102447U, // : Cost 3 vext2 RHS, <5,3,7,0> - 1571360710U, // : Cost 2 vext2 RHS, <5,4,7,6> - 1571360772U, // : Cost 2 vext2 RHS, <5,5,5,5> - 1571360866U, // : Cost 2 vext2 RHS, <5,6,7,0> - 1571360936U, // : Cost 2 vext2 RHS, <5,7,5,7> - 1571361017U, // : Cost 2 vext2 RHS, <5,u,5,7> - 1530044518U, // : Cost 2 vext1 , LHS - 2645103016U, // : Cost 3 vext2 RHS, <6,1,7,2> - 1571361274U, // : Cost 2 vext2 RHS, <6,2,7,3> - 2645103154U, // : Cost 3 vext2 RHS, <6,3,4,5> - 1530047798U, // : Cost 2 vext1 , RHS - 1188386474U, // : Cost 2 vrev <7,u,5,6> - 1571361592U, // : Cost 2 vext2 RHS, <6,6,6,6> - 1571361614U, // : Cost 2 vext2 RHS, <6,7,0,1> - 1571361695U, // : Cost 2 vext2 RHS, <6,u,0,1> - 1571361786U, // : Cost 2 vext2 RHS, <7,0,1,2> - 2573935616U, // : Cost 3 vext1 <3,u,7,7>, <1,3,5,7> - 2645103781U, // : Cost 3 vext2 RHS, <7,2,2,2> - 2573937497U, // : Cost 3 vext1 <3,u,7,7>, <3,u,7,7> - 1571362150U, // : Cost 2 vext2 RHS, <7,4,5,6> - 1512141067U, // : Cost 2 vext1 <5,u,7,7>, <5,u,7,7> - 1518113764U, // : Cost 2 vext1 <6,u,7,7>, <6,u,7,7> - 363253046U, // : Cost 1 vdup3 RHS - 363253046U, // : Cost 1 vdup3 RHS - 1571362515U, // : Cost 2 vext2 RHS, - 497620782U, // : Cost 1 vext2 RHS, LHS - 1571362693U, // : Cost 2 vext2 RHS, - 1571362748U, // : Cost 2 vext2 RHS, - 1571362879U, // : Cost 2 vext2 RHS, - 497621146U, // : Cost 1 vext2 RHS, RHS - 1571363024U, // : Cost 2 vext2 RHS, - 363253046U, // : Cost 1 vdup3 RHS - 497621349U, // : Cost 1 vext2 RHS, LHS - 135053414U, // : Cost 1 vdup0 LHS - 471081121U, // : Cost 1 vext2 LHS, LHS - 1544822948U, // : Cost 2 vext2 LHS, <0,2,0,2> - 1616140005U, // : Cost 2 vext3 LHS, - 1544823122U, // : Cost 2 vext2 LHS, <0,4,1,5> - 1512157453U, // : Cost 2 vext1 <5,u,u,0>, <5,u,u,0> - 1662220032U, // : Cost 2 vext3 RHS, - 1194457487U, // : Cost 2 vrev - 471081629U, // : Cost 1 vext2 LHS, LHS - 1544823542U, // : Cost 2 vext2 LHS, <1,0,3,2> - 202162278U, // : Cost 1 vdup1 LHS - 537753390U, // : Cost 1 vext3 LHS, LHS - 1544823768U, // : Cost 2 vext2 LHS, <1,3,1,3> - 1494248758U, // : Cost 2 vext1 <2,u,u,1>, RHS - 1544823952U, // : Cost 2 vext2 LHS, <1,5,3,7> - 1518138343U, // : Cost 2 vext1 <6,u,u,1>, <6,u,u,1> - 1640322907U, // : Cost 2 vext3 RHS, - 537753444U, // : Cost 1 vext3 LHS, LHS - 1482309734U, // : Cost 2 vext1 <0,u,u,2>, LHS - 1194031451U, // : Cost 2 vrev - 269271142U, // : Cost 1 vdup2 LHS - 835584U, // : Cost 0 copy LHS - 1482313014U, // : Cost 2 vext1 <0,u,u,2>, RHS - 2618566504U, // : Cost 3 vext2 LHS, <2,5,3,6> - 1544824762U, // : Cost 2 vext2 LHS, <2,6,3,7> - 1638479788U, // : Cost 2 vext3 RHS, - 835584U, // : Cost 0 copy LHS - 408576723U, // : Cost 1 vext1 LHS, LHS - 1482318582U, // : Cost 2 vext1 LHS, <1,0,3,2> - 120371557U, // : Cost 1 vrev LHS - 336380006U, // : Cost 1 vdup3 LHS - 408579382U, // : Cost 1 vext1 LHS, RHS - 1616140271U, // : Cost 2 vext3 LHS, - 1530098170U, // : Cost 
2 vext1 LHS, <6,2,7,3> - 1880329544U, // : Cost 2 vzipr LHS, RHS - 408581934U, // : Cost 1 vext1 LHS, LHS - 1488298086U, // : Cost 2 vext1 <1,u,u,4>, LHS - 1488299437U, // : Cost 2 vext1 <1,u,u,4>, <1,u,u,4> - 1659271204U, // : Cost 2 vext3 LHS, - 1194195311U, // : Cost 2 vrev - 161926454U, // : Cost 1 vdup0 RHS - 471084342U, // : Cost 1 vext2 LHS, RHS - 1571368308U, // : Cost 2 vext2 RHS, <4,6,4,6> - 1640323153U, // : Cost 2 vext3 RHS, - 471084585U, // : Cost 1 vext2 LHS, RHS - 1494278246U, // : Cost 2 vext1 <2,u,u,5>, LHS - 1571368656U, // : Cost 2 vext2 RHS, <5,1,7,3> - 1494280327U, // : Cost 2 vext1 <2,u,u,5>, <2,u,u,5> - 1616140415U, // : Cost 2 vext3 LHS, - 1494281526U, // : Cost 2 vext1 <2,u,u,5>, RHS - 229035318U, // : Cost 1 vdup1 RHS - 537753754U, // : Cost 1 vext3 LHS, RHS - 1750355254U, // : Cost 2 vuzpr LHS, RHS - 537753772U, // : Cost 1 vext3 LHS, RHS - 1482342502U, // : Cost 2 vext1 <0,u,u,6>, LHS - 2556084982U, // : Cost 3 vext1 <0,u,u,6>, <1,0,3,2> - 1571369466U, // : Cost 2 vext2 RHS, <6,2,7,3> - 1611938000U, // : Cost 2 vext3 LHS, - 1482345782U, // : Cost 2 vext1 <0,u,u,6>, RHS - 1194359171U, // : Cost 2 vrev - 296144182U, // : Cost 1 vdup2 RHS - 27705344U, // : Cost 0 copy RHS - 27705344U, // : Cost 0 copy RHS - 432496742U, // : Cost 1 vext1 RHS, LHS - 1488324016U, // : Cost 2 vext1 <1,u,u,7>, <1,u,u,7> - 1494296713U, // : Cost 2 vext1 <2,u,u,7>, <2,u,u,7> - 1906901148U, // : Cost 2 vzipr RHS, LHS - 432500283U, // : Cost 1 vext1 RHS, RHS - 1506242256U, // : Cost 2 vext1 RHS, <5,1,7,3> - 120699277U, // : Cost 1 vrev RHS - 363253046U, // : Cost 1 vdup3 RHS - 432502574U, // : Cost 1 vext1 RHS, LHS - 408617688U, // : Cost 1 vext1 LHS, LHS - 471086894U, // : Cost 1 vext2 LHS, LHS - 537753957U, // : Cost 1 vext3 LHS, LHS - 835584U, // : Cost 0 copy LHS - 408620342U, // : Cost 1 vext1 LHS, RHS - 471087258U, // : Cost 1 vext2 LHS, RHS - 537753997U, // : Cost 1 vext3 LHS, RHS - 27705344U, // : Cost 0 copy RHS - 835584U, // : Cost 0 copy LHS - 0 -}; diff --git a/lib/Target/ARM64/ARM64PromoteConstant.cpp b/lib/Target/ARM64/ARM64PromoteConstant.cpp deleted file mode 100644 index e61a62262d3..00000000000 --- a/lib/Target/ARM64/ARM64PromoteConstant.cpp +++ /dev/null @@ -1,580 +0,0 @@ - -//===-- ARM64PromoteConstant.cpp --- Promote constant to global for ARM64 -===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements the ARM64PromoteConstant pass which promotes constants -// to global variables when this is likely to be more efficient. Currently only -// types related to constant vector (i.e., constant vector, array of constant -// vectors, constant structure with a constant vector field, etc.) are promoted -// to global variables. Constant vectors are likely to be lowered in target -// constant pool during instruction selection already; therefore, the access -// will remain the same (memory load), but the structure types are not split -// into different constant pool accesses for each field. A bonus side effect is -// that created globals may be merged by the global merge pass. -// -// FIXME: This pass may be useful for other targets too. 
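For context, a minimal sketch (not part of this patch) of how a target pass pipeline would typically schedule the pass implemented in this file. The hook name and the option gating are assumptions; only createARM64PromoteConstantPass() is taken from the file itself.

// Hypothetical scheduling sketch -- not from this patch.
bool ARM64PassConfig::addPreISel() {
  if (TM->getOptLevel() != CodeGenOpt::None)
    addPass(createARM64PromoteConstantPass()); // promote vector constants to globals
  return false;
}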
-//===----------------------------------------------------------------------===// - -#include "ARM64.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/SmallSet.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/Dominators.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/GlobalVariable.h" -#include "llvm/IR/InlineAsm.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/Module.h" -#include "llvm/Pass.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" - -using namespace llvm; - -#define DEBUG_TYPE "arm64-promote-const" - -// Stress testing mode - disable heuristics. -static cl::opt Stress("arm64-stress-promote-const", cl::Hidden, - cl::desc("Promote all vector constants")); - -STATISTIC(NumPromoted, "Number of promoted constants"); -STATISTIC(NumPromotedUses, "Number of promoted constants uses"); - -//===----------------------------------------------------------------------===// -// ARM64PromoteConstant -//===----------------------------------------------------------------------===// - -namespace { -/// Promotes interesting constant into global variables. -/// The motivating example is: -/// static const uint16_t TableA[32] = { -/// 41944, 40330, 38837, 37450, 36158, 34953, 33826, 32768, -/// 31776, 30841, 29960, 29128, 28340, 27595, 26887, 26215, -/// 25576, 24967, 24386, 23832, 23302, 22796, 22311, 21846, -/// 21400, 20972, 20561, 20165, 19785, 19419, 19066, 18725, -/// }; -/// -/// uint8x16x4_t LoadStatic(void) { -/// uint8x16x4_t ret; -/// ret.val[0] = vld1q_u16(TableA + 0); -/// ret.val[1] = vld1q_u16(TableA + 8); -/// ret.val[2] = vld1q_u16(TableA + 16); -/// ret.val[3] = vld1q_u16(TableA + 24); -/// return ret; -/// } -/// -/// The constants in this example are folded into the uses. Thus, 4 different -/// constants are created. -/// -/// As their type is vector the cheapest way to create them is to load them -/// for the memory. -/// -/// Therefore the final assembly final has 4 different loads. With this pass -/// enabled, only one load is issued for the constants. -class ARM64PromoteConstant : public ModulePass { - -public: - static char ID; - ARM64PromoteConstant() : ModulePass(ID) {} - - const char *getPassName() const override { return "ARM64 Promote Constant"; } - - /// Iterate over the functions and promote the interesting constants into - /// global variables with module scope. - bool runOnModule(Module &M) override { - DEBUG(dbgs() << getPassName() << '\n'); - bool Changed = false; - for (auto &MF : M) { - Changed |= runOnFunction(MF); - } - return Changed; - } - -private: - /// Look for interesting constants used within the given function. - /// Promote them into global variables, load these global variables within - /// the related function, so that the number of inserted load is minimal. - bool runOnFunction(Function &F); - - // This transformation requires dominator info - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - AU.addRequired(); - AU.addPreserved(); - } - - /// Type to store a list of User. - typedef SmallVector Users; - /// Map an insertion point to all the uses it dominates. - typedef DenseMap InsertionPoints; - /// Map a function to the required insertion point of load for a - /// global variable. - typedef DenseMap InsertionPointsPerFunc; - - /// Find the closest point that dominates the given Use. 
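The angle-bracketed template arguments in several declarations above (the cl::opt, the SmallVector/DenseMap typedefs, and the getAnalysisUsage calls) are missing from this rendering; a likely reconstruction follows. The element types are inferred from how the containers are used later in the file, and the inline SmallVector size is an assumption.

// Likely original forms (reconstruction; the inline size of 4 is an assumption).
static cl::opt<bool> Stress("arm64-stress-promote-const", cl::Hidden,
                            cl::desc("Promote all vector constants"));
typedef SmallVector<Value::user_iterator, 4> Users;
typedef DenseMap<Instruction *, Users> InsertionPoints;
typedef DenseMap<Function *, InsertionPoints> InsertionPointsPerFunc;
// In getAnalysisUsage():
//   AU.addRequired<DominatorTreeWrapperPass>();
//   AU.addPreserved<DominatorTreeWrapperPass>();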
- Instruction *findInsertionPoint(Value::user_iterator &Use); - - /// Check if the given insertion point is dominated by an existing - /// insertion point. - /// If true, the given use is added to the list of dominated uses for - /// the related existing point. - /// \param NewPt the insertion point to be checked - /// \param UseIt the use to be added into the list of dominated uses - /// \param InsertPts existing insertion points - /// \pre NewPt and all instruction in InsertPts belong to the same function - /// \return true if one of the insertion point in InsertPts dominates NewPt, - /// false otherwise - bool isDominated(Instruction *NewPt, Value::user_iterator &UseIt, - InsertionPoints &InsertPts); - - /// Check if the given insertion point can be merged with an existing - /// insertion point in a common dominator. - /// If true, the given use is added to the list of the created insertion - /// point. - /// \param NewPt the insertion point to be checked - /// \param UseIt the use to be added into the list of dominated uses - /// \param InsertPts existing insertion points - /// \pre NewPt and all instruction in InsertPts belong to the same function - /// \pre isDominated returns false for the exact same parameters. - /// \return true if it exists an insertion point in InsertPts that could - /// have been merged with NewPt in a common dominator, - /// false otherwise - bool tryAndMerge(Instruction *NewPt, Value::user_iterator &UseIt, - InsertionPoints &InsertPts); - - /// Compute the minimal insertion points to dominates all the interesting - /// uses of value. - /// Insertion points are group per function and each insertion point - /// contains a list of all the uses it dominates within the related function - /// \param Val constant to be examined - /// \param[out] InsPtsPerFunc output storage of the analysis - void computeInsertionPoints(Constant *Val, - InsertionPointsPerFunc &InsPtsPerFunc); - - /// Insert a definition of a new global variable at each point contained in - /// InsPtsPerFunc and update the related uses (also contained in - /// InsPtsPerFunc). - bool insertDefinitions(Constant *Cst, InsertionPointsPerFunc &InsPtsPerFunc); - - /// Compute the minimal insertion points to dominate all the interesting - /// uses of Val and insert a definition of a new global variable - /// at these points. - /// Also update the uses of Val accordingly. - /// Currently a use of Val is considered interesting if: - /// - Val is not UndefValue - /// - Val is not zeroinitialized - /// - Replacing Val per a load of a global variable is valid. - /// \see shouldConvert for more details - bool computeAndInsertDefinitions(Constant *Val); - - /// Promote the given constant into a global variable if it is expected to - /// be profitable. - /// \return true if Cst has been promoted - bool promoteConstant(Constant *Cst); - - /// Transfer the list of dominated uses of IPI to NewPt in InsertPts. - /// Append UseIt to this list and delete the entry of IPI in InsertPts. - static void appendAndTransferDominatedUses(Instruction *NewPt, - Value::user_iterator &UseIt, - InsertionPoints::iterator &IPI, - InsertionPoints &InsertPts) { - // Record the dominated use. - IPI->second.push_back(UseIt); - // Transfer the dominated uses of IPI to NewPt - // Inserting into the DenseMap may invalidate existing iterator. - // Keep a copy of the key to find the iterator to erase. - Instruction *OldInstr = IPI->first; - InsertPts.insert(InsertionPoints::value_type(NewPt, IPI->second)); - // Erase IPI. 
- IPI = InsertPts.find(OldInstr); - InsertPts.erase(IPI); - } -}; -} // end anonymous namespace - -char ARM64PromoteConstant::ID = 0; - -namespace llvm { -void initializeARM64PromoteConstantPass(PassRegistry &); -} - -INITIALIZE_PASS_BEGIN(ARM64PromoteConstant, "arm64-promote-const", - "ARM64 Promote Constant Pass", false, false) -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_END(ARM64PromoteConstant, "arm64-promote-const", - "ARM64 Promote Constant Pass", false, false) - -ModulePass *llvm::createARM64PromoteConstantPass() { - return new ARM64PromoteConstant(); -} - -/// Check if the given type uses a vector type. -static bool isConstantUsingVectorTy(const Type *CstTy) { - if (CstTy->isVectorTy()) - return true; - if (CstTy->isStructTy()) { - for (unsigned EltIdx = 0, EndEltIdx = CstTy->getStructNumElements(); - EltIdx < EndEltIdx; ++EltIdx) - if (isConstantUsingVectorTy(CstTy->getStructElementType(EltIdx))) - return true; - } else if (CstTy->isArrayTy()) - return isConstantUsingVectorTy(CstTy->getArrayElementType()); - return false; -} - -/// Check if the given use (Instruction + OpIdx) of Cst should be converted into -/// a load of a global variable initialized with Cst. -/// A use should be converted if it is legal to do so. -/// For instance, it is not legal to turn the mask operand of a shuffle vector -/// into a load of a global variable. -static bool shouldConvertUse(const Constant *Cst, const Instruction *Instr, - unsigned OpIdx) { - // shufflevector instruction expects a const for the mask argument, i.e., the - // third argument. Do not promote this use in that case. - if (isa(Instr) && OpIdx == 2) - return false; - - // extractvalue instruction expects a const idx. - if (isa(Instr) && OpIdx > 0) - return false; - - // extractvalue instruction expects a const idx. - if (isa(Instr) && OpIdx > 1) - return false; - - if (isa(Instr) && OpIdx > 0) - return false; - - // Alignment argument must be constant. - if (isa(Instr) && OpIdx > 0) - return false; - - // Alignment argument must be constant. - if (isa(Instr) && OpIdx > 1) - return false; - - // Index must be constant. - if (isa(Instr) && OpIdx > 0) - return false; - - // Personality function and filters must be constant. - // Give up on that instruction. - if (isa(Instr)) - return false; - - // Switch instruction expects constants to compare to. - if (isa(Instr)) - return false; - - // Expected address must be a constant. - if (isa(Instr)) - return false; - - // Do not mess with intrinsics. - if (isa(Instr)) - return false; - - // Do not mess with inline asm. - const CallInst *CI = dyn_cast(Instr); - if (CI && isa(CI->getCalledValue())) - return false; - - return true; -} - -/// Check if the given Cst should be converted into -/// a load of a global variable initialized with Cst. -/// A constant should be converted if it is likely that the materialization of -/// the constant will be tricky. Thus, we give up on zero or undef values. -/// -/// \todo Currently, accept only vector related types. -/// Also we give up on all simple vector type to keep the existing -/// behavior. Otherwise, we should push here all the check of the lowering of -/// BUILD_VECTOR. By giving up, we lose the potential benefit of merging -/// constant via global merge and the fact that the same constant is stored -/// only once with this method (versus, as many function that uses the constant -/// for the regular approach, even for float). 
-/// Again, the simplest solution would be to promote every -/// constant and rematerialize them when they are actually cheap to create. -static bool shouldConvert(const Constant *Cst) { - if (isa(Cst)) - return false; - - // FIXME: In some cases, it may be interesting to promote in memory - // a zero initialized constant. - // E.g., when the type of Cst require more instructions than the - // adrp/add/load sequence or when this sequence can be shared by several - // instances of Cst. - // Ideally, we could promote this into a global and rematerialize the constant - // when it was a bad idea. - if (Cst->isZeroValue()) - return false; - - if (Stress) - return true; - - // FIXME: see function \todo - if (Cst->getType()->isVectorTy()) - return false; - return isConstantUsingVectorTy(Cst->getType()); -} - -Instruction * -ARM64PromoteConstant::findInsertionPoint(Value::user_iterator &Use) { - // If this user is a phi, the insertion point is in the related - // incoming basic block. - PHINode *PhiInst = dyn_cast(*Use); - Instruction *InsertionPoint; - if (PhiInst) - InsertionPoint = - PhiInst->getIncomingBlock(Use.getOperandNo())->getTerminator(); - else - InsertionPoint = dyn_cast(*Use); - assert(InsertionPoint && "User is not an instruction!"); - return InsertionPoint; -} - -bool ARM64PromoteConstant::isDominated(Instruction *NewPt, - Value::user_iterator &UseIt, - InsertionPoints &InsertPts) { - - DominatorTree &DT = getAnalysis( - *NewPt->getParent()->getParent()).getDomTree(); - - // Traverse all the existing insertion points and check if one is dominating - // NewPt. If it is, remember that. - for (auto &IPI : InsertPts) { - if (NewPt == IPI.first || DT.dominates(IPI.first, NewPt) || - // When IPI.first is a terminator instruction, DT may think that - // the result is defined on the edge. - // Here we are testing the insertion point, not the definition. - (IPI.first->getParent() != NewPt->getParent() && - DT.dominates(IPI.first->getParent(), NewPt->getParent()))) { - // No need to insert this point. Just record the dominated use. - DEBUG(dbgs() << "Insertion point dominated by:\n"); - DEBUG(IPI.first->print(dbgs())); - DEBUG(dbgs() << '\n'); - IPI.second.push_back(UseIt); - return true; - } - } - return false; -} - -bool ARM64PromoteConstant::tryAndMerge(Instruction *NewPt, - Value::user_iterator &UseIt, - InsertionPoints &InsertPts) { - DominatorTree &DT = getAnalysis( - *NewPt->getParent()->getParent()).getDomTree(); - BasicBlock *NewBB = NewPt->getParent(); - - // Traverse all the existing insertion point and check if one is dominated by - // NewPt and thus useless or can be combined with NewPt into a common - // dominator. - for (InsertionPoints::iterator IPI = InsertPts.begin(), - EndIPI = InsertPts.end(); - IPI != EndIPI; ++IPI) { - BasicBlock *CurBB = IPI->first->getParent(); - if (NewBB == CurBB) { - // Instructions are in the same block. - // By construction, NewPt is dominating the other. - // Indeed, isDominated returned false with the exact same arguments. - DEBUG(dbgs() << "Merge insertion point with:\n"); - DEBUG(IPI->first->print(dbgs())); - DEBUG(dbgs() << "\nat considered insertion point.\n"); - appendAndTransferDominatedUses(NewPt, UseIt, IPI, InsertPts); - return true; - } - - // Look for a common dominator - BasicBlock *CommonDominator = DT.findNearestCommonDominator(NewBB, CurBB); - // If none exists, we cannot merge these two points. - if (!CommonDominator) - continue; - - if (CommonDominator != NewBB) { - // By construction, the CommonDominator cannot be CurBB. 
- assert(CommonDominator != CurBB && - "Instruction has not been rejected during isDominated check!"); - // Take the last instruction of the CommonDominator as insertion point - NewPt = CommonDominator->getTerminator(); - } - // else, CommonDominator is the block of NewBB, hence NewBB is the last - // possible insertion point in that block. - DEBUG(dbgs() << "Merge insertion point with:\n"); - DEBUG(IPI->first->print(dbgs())); - DEBUG(dbgs() << '\n'); - DEBUG(NewPt->print(dbgs())); - DEBUG(dbgs() << '\n'); - appendAndTransferDominatedUses(NewPt, UseIt, IPI, InsertPts); - return true; - } - return false; -} - -void ARM64PromoteConstant::computeInsertionPoints( - Constant *Val, InsertionPointsPerFunc &InsPtsPerFunc) { - DEBUG(dbgs() << "** Compute insertion points **\n"); - for (Value::user_iterator UseIt = Val->user_begin(), - EndUseIt = Val->user_end(); - UseIt != EndUseIt; ++UseIt) { - // If the user is not an Instruction, we cannot modify it. - if (!isa(*UseIt)) - continue; - - // Filter out uses that should not be converted. - if (!shouldConvertUse(Val, cast(*UseIt), UseIt.getOperandNo())) - continue; - - DEBUG(dbgs() << "Considered use, opidx " << UseIt.getOperandNo() << ":\n"); - DEBUG((*UseIt)->print(dbgs())); - DEBUG(dbgs() << '\n'); - - Instruction *InsertionPoint = findInsertionPoint(UseIt); - - DEBUG(dbgs() << "Considered insertion point:\n"); - DEBUG(InsertionPoint->print(dbgs())); - DEBUG(dbgs() << '\n'); - - // Check if the current insertion point is useless, i.e., it is dominated - // by another one. - InsertionPoints &InsertPts = - InsPtsPerFunc[InsertionPoint->getParent()->getParent()]; - if (isDominated(InsertionPoint, UseIt, InsertPts)) - continue; - // This insertion point is useful, check if we can merge some insertion - // point in a common dominator or if NewPt dominates an existing one. - if (tryAndMerge(InsertionPoint, UseIt, InsertPts)) - continue; - - DEBUG(dbgs() << "Keep considered insertion point\n"); - - // It is definitely useful by its own - InsertPts[InsertionPoint].push_back(UseIt); - } -} - -bool -ARM64PromoteConstant::insertDefinitions(Constant *Cst, - InsertionPointsPerFunc &InsPtsPerFunc) { - // We will create one global variable per Module. - DenseMap ModuleToMergedGV; - bool HasChanged = false; - - // Traverse all insertion points in all the function. - for (InsertionPointsPerFunc::iterator FctToInstPtsIt = InsPtsPerFunc.begin(), - EndIt = InsPtsPerFunc.end(); - FctToInstPtsIt != EndIt; ++FctToInstPtsIt) { - InsertionPoints &InsertPts = FctToInstPtsIt->second; -// Do more checking for debug purposes. -#ifndef NDEBUG - DominatorTree &DT = getAnalysis( - *FctToInstPtsIt->first).getDomTree(); -#endif - GlobalVariable *PromotedGV; - assert(!InsertPts.empty() && "Empty uses does not need a definition"); - - Module *M = FctToInstPtsIt->first->getParent(); - DenseMap::iterator MapIt = - ModuleToMergedGV.find(M); - if (MapIt == ModuleToMergedGV.end()) { - PromotedGV = new GlobalVariable( - *M, Cst->getType(), true, GlobalValue::InternalLinkage, nullptr, - "_PromotedConst", nullptr, GlobalVariable::NotThreadLocal); - PromotedGV->setInitializer(Cst); - ModuleToMergedGV[M] = PromotedGV; - DEBUG(dbgs() << "Global replacement: "); - DEBUG(PromotedGV->print(dbgs())); - DEBUG(dbgs() << '\n'); - ++NumPromoted; - HasChanged = true; - } else { - PromotedGV = MapIt->second; - } - - for (InsertionPoints::iterator IPI = InsertPts.begin(), - EndIPI = InsertPts.end(); - IPI != EndIPI; ++IPI) { - // Create the load of the global variable. 
- IRBuilder<> Builder(IPI->first->getParent(), IPI->first); - LoadInst *LoadedCst = Builder.CreateLoad(PromotedGV); - DEBUG(dbgs() << "**********\n"); - DEBUG(dbgs() << "New def: "); - DEBUG(LoadedCst->print(dbgs())); - DEBUG(dbgs() << '\n'); - - // Update the dominated uses. - Users &DominatedUsers = IPI->second; - for (Value::user_iterator Use : DominatedUsers) { -#ifndef NDEBUG - assert((DT.dominates(LoadedCst, cast(*Use)) || - (isa(*Use) && - DT.dominates(LoadedCst, findInsertionPoint(Use)))) && - "Inserted definition does not dominate all its uses!"); -#endif - DEBUG(dbgs() << "Use to update " << Use.getOperandNo() << ":"); - DEBUG(Use->print(dbgs())); - DEBUG(dbgs() << '\n'); - Use->setOperand(Use.getOperandNo(), LoadedCst); - ++NumPromotedUses; - } - } - } - return HasChanged; -} - -bool ARM64PromoteConstant::computeAndInsertDefinitions(Constant *Val) { - InsertionPointsPerFunc InsertPtsPerFunc; - computeInsertionPoints(Val, InsertPtsPerFunc); - return insertDefinitions(Val, InsertPtsPerFunc); -} - -bool ARM64PromoteConstant::promoteConstant(Constant *Cst) { - assert(Cst && "Given variable is not a valid constant."); - - if (!shouldConvert(Cst)) - return false; - - DEBUG(dbgs() << "******************************\n"); - DEBUG(dbgs() << "Candidate constant: "); - DEBUG(Cst->print(dbgs())); - DEBUG(dbgs() << '\n'); - - return computeAndInsertDefinitions(Cst); -} - -bool ARM64PromoteConstant::runOnFunction(Function &F) { - // Look for instructions using constant vector. Promote that constant to a - // global variable. Create as few loads of this variable as possible and - // update the uses accordingly. - bool LocalChange = false; - SmallSet AlreadyChecked; - - for (auto &MBB : F) { - for (auto &MI : MBB) { - // Traverse the operand, looking for constant vectors. Replace them by a - // load of a global variable of constant vector type. - for (unsigned OpIdx = 0, EndOpIdx = MI.getNumOperands(); - OpIdx != EndOpIdx; ++OpIdx) { - Constant *Cst = dyn_cast(MI.getOperand(OpIdx)); - // There is no point in promoting global values as they are already - // global. Do not promote constant expressions either, as they may - // require some code expansion. - if (Cst && !isa(Cst) && !isa(Cst) && - AlreadyChecked.insert(Cst)) - LocalChange |= promoteConstant(Cst); - } - } - } - return LocalChange; -} diff --git a/lib/Target/ARM64/ARM64RegisterInfo.cpp b/lib/Target/ARM64/ARM64RegisterInfo.cpp deleted file mode 100644 index d3c647bd90b..00000000000 --- a/lib/Target/ARM64/ARM64RegisterInfo.cpp +++ /dev/null @@ -1,400 +0,0 @@ -//===- ARM64RegisterInfo.cpp - ARM64 Register Information -----------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains the ARM64 implementation of the TargetRegisterInfo class. 
-// -//===----------------------------------------------------------------------===// - -#include "ARM64RegisterInfo.h" -#include "ARM64FrameLowering.h" -#include "ARM64InstrInfo.h" -#include "ARM64Subtarget.h" -#include "MCTargetDesc/ARM64AddressingModes.h" -#include "llvm/ADT/BitVector.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/RegisterScavenging.h" -#include "llvm/IR/Function.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetFrameLowering.h" -#include "llvm/Target/TargetOptions.h" - -using namespace llvm; - -#define GET_REGINFO_TARGET_DESC -#include "ARM64GenRegisterInfo.inc" - -ARM64RegisterInfo::ARM64RegisterInfo(const ARM64InstrInfo *tii, - const ARM64Subtarget *sti) - : ARM64GenRegisterInfo(ARM64::LR), TII(tii), STI(sti) {} - -const MCPhysReg * -ARM64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { - assert(MF && "Invalid MachineFunction pointer."); - if (MF->getFunction()->getCallingConv() == CallingConv::AnyReg) - return CSR_ARM64_AllRegs_SaveList; - else - return CSR_ARM64_AAPCS_SaveList; -} - -const uint32_t * -ARM64RegisterInfo::getCallPreservedMask(CallingConv::ID CC) const { - if (CC == CallingConv::AnyReg) - return CSR_ARM64_AllRegs_RegMask; - else - return CSR_ARM64_AAPCS_RegMask; -} - -const uint32_t *ARM64RegisterInfo::getTLSCallPreservedMask() const { - if (STI->isTargetDarwin()) - return CSR_ARM64_TLS_Darwin_RegMask; - - assert(STI->isTargetELF() && "only expect Darwin or ELF TLS"); - return CSR_ARM64_TLS_ELF_RegMask; -} - -const uint32_t * -ARM64RegisterInfo::getThisReturnPreservedMask(CallingConv::ID) const { - // This should return a register mask that is the same as that returned by - // getCallPreservedMask but that additionally preserves the register used for - // the first i64 argument (which must also be the register used to return a - // single i64 return value) - // - // In case that the calling convention does not use the same register for - // both, the function should return NULL (does not currently apply) - return CSR_ARM64_AAPCS_ThisReturn_RegMask; -} - -BitVector ARM64RegisterInfo::getReservedRegs(const MachineFunction &MF) const { - const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); - - // FIXME: avoid re-calculating this every time. 
- BitVector Reserved(getNumRegs()); - Reserved.set(ARM64::SP); - Reserved.set(ARM64::XZR); - Reserved.set(ARM64::WSP); - Reserved.set(ARM64::WZR); - - if (TFI->hasFP(MF) || STI->isTargetDarwin()) { - Reserved.set(ARM64::FP); - Reserved.set(ARM64::W29); - } - - if (STI->isTargetDarwin()) { - Reserved.set(ARM64::X18); // Platform register - Reserved.set(ARM64::W18); - } - - if (hasBasePointer(MF)) { - Reserved.set(ARM64::X19); - Reserved.set(ARM64::W19); - } - - return Reserved; -} - -bool ARM64RegisterInfo::isReservedReg(const MachineFunction &MF, - unsigned Reg) const { - const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); - - switch (Reg) { - default: - break; - case ARM64::SP: - case ARM64::XZR: - case ARM64::WSP: - case ARM64::WZR: - return true; - case ARM64::X18: - case ARM64::W18: - return STI->isTargetDarwin(); - case ARM64::FP: - case ARM64::W29: - return TFI->hasFP(MF) || STI->isTargetDarwin(); - case ARM64::W19: - case ARM64::X19: - return hasBasePointer(MF); - } - - return false; -} - -const TargetRegisterClass * -ARM64RegisterInfo::getPointerRegClass(const MachineFunction &MF, - unsigned Kind) const { - return &ARM64::GPR64RegClass; -} - -const TargetRegisterClass * -ARM64RegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const { - if (RC == &ARM64::CCRRegClass) - return nullptr; // Can't copy NZCV. - return RC; -} - -unsigned ARM64RegisterInfo::getBaseRegister() const { return ARM64::X19; } - -bool ARM64RegisterInfo::hasBasePointer(const MachineFunction &MF) const { - const MachineFrameInfo *MFI = MF.getFrameInfo(); - - // In the presence of variable sized objects, if the fixed stack size is - // large enough that referencing from the FP won't result in things being - // in range relatively often, we can use a base pointer to allow access - // from the other direction like the SP normally works. - if (MFI->hasVarSizedObjects()) { - // Conservatively estimate whether the negative offset from the frame - // pointer will be sufficient to reach. If a function has a smallish - // frame, it's less likely to have lots of spills and callee saved - // space, so it's all more likely to be within range of the frame pointer. - // If it's wrong, we'll materialize the constant and still get to the - // object; it's just suboptimal. Negative offsets use the unscaled - // load/store instructions, which have a 9-bit signed immediate. - if (MFI->getLocalFrameSize() < 256) - return false; - return true; - } - - return false; -} - -unsigned ARM64RegisterInfo::getFrameRegister(const MachineFunction &MF) const { - const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); - - return TFI->hasFP(MF) ? ARM64::FP : ARM64::SP; -} - -bool -ARM64RegisterInfo::requiresRegisterScavenging(const MachineFunction &MF) const { - return true; -} - -bool ARM64RegisterInfo::requiresVirtualBaseRegisters(const MachineFunction &MF) - const { - return true; -} - -bool -ARM64RegisterInfo::useFPForScavengingIndex(const MachineFunction &MF) const { - const MachineFrameInfo *MFI = MF.getFrameInfo(); - // ARM64FrameLowering::resolveFrameIndexReference() can always fall back - // to the stack pointer, so only put the emergency spill slot next to the - // FP when there's no better way to access it (SP or base pointer). 
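As an aside on the 256-byte check in hasBasePointer() above, a small worked sketch (not from the patch) of where that number comes from: negative frame-pointer offsets fall back to the unscaled LDUR/STUR forms, whose immediate field is 9 bits, signed.

// Illustrative only: a 9-bit signed immediate spans [-256, +255] bytes, so a
// local frame under 256 bytes stays reachable from the FP without a base pointer.
constexpr int UnscaledImmBits = 9;
constexpr int MinOff = -(1 << (UnscaledImmBits - 1));     // -256
constexpr int MaxOff = (1 << (UnscaledImmBits - 1)) - 1;  // +255
static_assert(MinOff == -256 && MaxOff == 255, "unscaled load/store range");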
- return MFI->hasVarSizedObjects() && !hasBasePointer(MF); -} - -bool ARM64RegisterInfo::requiresFrameIndexScavenging(const MachineFunction &MF) - const { - return true; -} - -bool ARM64RegisterInfo::cannotEliminateFrame(const MachineFunction &MF) const { - const MachineFrameInfo *MFI = MF.getFrameInfo(); - // Only consider eliminating leaf frames. - if (MFI->hasCalls() || (MF.getTarget().Options.DisableFramePointerElim(MF) && - MFI->adjustsStack())) - return true; - return MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken(); -} - -/// needsFrameBaseReg - Returns true if the instruction's frame index -/// reference would be better served by a base register other than FP -/// or SP. Used by LocalStackFrameAllocation to determine which frame index -/// references it should create new base registers for. -bool ARM64RegisterInfo::needsFrameBaseReg(MachineInstr *MI, - int64_t Offset) const { - for (unsigned i = 0; !MI->getOperand(i).isFI(); ++i) - assert(i < MI->getNumOperands() && - "Instr doesn't have FrameIndex operand!"); - - // It's the load/store FI references that cause issues, as it can be difficult - // to materialize the offset if it won't fit in the literal field. Estimate - // based on the size of the local frame and some conservative assumptions - // about the rest of the stack frame (note, this is pre-regalloc, so - // we don't know everything for certain yet) whether this offset is likely - // to be out of range of the immediate. Return true if so. - - // We only generate virtual base registers for loads and stores, so - // return false for everything else. - if (!MI->mayLoad() && !MI->mayStore()) - return false; - - // Without a virtual base register, if the function has variable sized - // objects, all fixed-size local references will be via the frame pointer, - // Approximate the offset and see if it's legal for the instruction. - // Note that the incoming offset is based on the SP value at function entry, - // so it'll be negative. - MachineFunction &MF = *MI->getParent()->getParent(); - const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); - MachineFrameInfo *MFI = MF.getFrameInfo(); - - // Estimate an offset from the frame pointer. - // Conservatively assume all GPR callee-saved registers get pushed. - // FP, LR, X19-X28, D8-D15. 64-bits each. - int64_t FPOffset = Offset - 16 * 20; - // Estimate an offset from the stack pointer. - // The incoming offset is relating to the SP at the start of the function, - // but when we access the local it'll be relative to the SP after local - // allocation, so adjust our SP-relative offset by that allocation size. - Offset += MFI->getLocalFrameSize(); - // Assume that we'll have at least some spill slots allocated. - // FIXME: This is a total SWAG number. We should run some statistics - // and pick a real one. - Offset += 128; // 128 bytes of spill slots - - // If there is a frame pointer, try using it. - // The FP is only available if there is no dynamic realignment. We - // don't know for sure yet whether we'll need that, so we guess based - // on whether there are any local variables that would trigger it. - if (TFI->hasFP(MF) && isFrameOffsetLegal(MI, FPOffset)) - return false; - - // If we can reference via the stack pointer or base pointer, try that. - // FIXME: This (and the code that resolves the references) can be improved - // to only disallow SP relative references in the live range of - // the VLA(s). In practice, it's unclear how much difference that - // would make, but it may be worth doing. 
- if (isFrameOffsetLegal(MI, Offset)) - return false; - - // The offset likely isn't legal; we want to allocate a virtual base register. - return true; -} - -bool ARM64RegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, - int64_t Offset) const { - assert(Offset <= INT_MAX && "Offset too big to fit in int."); - assert(MI && "Unable to get the legal offset for nil instruction."); - int SaveOffset = Offset; - return isARM64FrameOffsetLegal(*MI, SaveOffset) & ARM64FrameOffsetIsLegal; -} - -/// Insert defining instruction(s) for BaseReg to be a pointer to FrameIdx -/// at the beginning of the basic block. -void ARM64RegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, - unsigned BaseReg, - int FrameIdx, - int64_t Offset) const { - MachineBasicBlock::iterator Ins = MBB->begin(); - DebugLoc DL; // Defaults to "unknown" - if (Ins != MBB->end()) - DL = Ins->getDebugLoc(); - - const MCInstrDesc &MCID = TII->get(ARM64::ADDXri); - MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); - const MachineFunction &MF = *MBB->getParent(); - MRI.constrainRegClass(BaseReg, TII->getRegClass(MCID, 0, this, MF)); - unsigned Shifter = ARM64_AM::getShifterImm(ARM64_AM::LSL, 0); - - BuildMI(*MBB, Ins, DL, MCID, BaseReg) - .addFrameIndex(FrameIdx) - .addImm(Offset) - .addImm(Shifter); -} - -void ARM64RegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, - int64_t Offset) const { - int Off = Offset; // ARM doesn't need the general 64-bit offsets - unsigned i = 0; - - while (!MI.getOperand(i).isFI()) { - ++i; - assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!"); - } - bool Done = rewriteARM64FrameIndex(MI, i, BaseReg, Off, TII); - assert(Done && "Unable to resolve frame index!"); - (void)Done; -} - -void ARM64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, - int SPAdj, unsigned FIOperandNum, - RegScavenger *RS) const { - assert(SPAdj == 0 && "Unexpected"); - - MachineInstr &MI = *II; - MachineBasicBlock &MBB = *MI.getParent(); - MachineFunction &MF = *MBB.getParent(); - const ARM64FrameLowering *TFI = static_cast( - MF.getTarget().getFrameLowering()); - - int FrameIndex = MI.getOperand(FIOperandNum).getIndex(); - unsigned FrameReg; - int Offset; - - // Special handling of dbg_value, stackmap and patchpoint instructions. - if (MI.isDebugValue() || MI.getOpcode() == TargetOpcode::STACKMAP || - MI.getOpcode() == TargetOpcode::PATCHPOINT) { - Offset = TFI->resolveFrameIndexReference(MF, FrameIndex, FrameReg, - /*PreferFP=*/true); - Offset += MI.getOperand(FIOperandNum + 1).getImm(); - MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false /*isDef*/); - MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset); - return; - } - - // Modify MI as necessary to handle as much of 'Offset' as possible - Offset = TFI->resolveFrameIndexReference(MF, FrameIndex, FrameReg); - if (rewriteARM64FrameIndex(MI, FIOperandNum, FrameReg, Offset, TII)) - return; - - assert((!RS || !RS->isScavengingFrameIndex(FrameIndex)) && - "Emergency spill slot is out of reach"); - - // If we get here, the immediate doesn't fit into the instruction. We folded - // as much as possible above. Handle the rest, providing a register that is - // SP+LargeImm. 
- unsigned ScratchReg = - MF.getRegInfo().createVirtualRegister(&ARM64::GPR64RegClass); - emitFrameOffset(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, Offset, TII); - MI.getOperand(FIOperandNum).ChangeToRegister(ScratchReg, false, false, true); -} - -namespace llvm { - -unsigned ARM64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, - MachineFunction &MF) const { - const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); - - switch (RC->getID()) { - default: - return 0; - case ARM64::GPR32RegClassID: - case ARM64::GPR32spRegClassID: - case ARM64::GPR32allRegClassID: - case ARM64::GPR64spRegClassID: - case ARM64::GPR64allRegClassID: - case ARM64::GPR64RegClassID: - case ARM64::GPR32commonRegClassID: - case ARM64::GPR64commonRegClassID: - return 32 - 1 // XZR/SP - - (TFI->hasFP(MF) || STI->isTargetDarwin()) // FP - - STI->isTargetDarwin() // X18 reserved as platform register - - hasBasePointer(MF); // X19 - case ARM64::FPR8RegClassID: - case ARM64::FPR16RegClassID: - case ARM64::FPR32RegClassID: - case ARM64::FPR64RegClassID: - case ARM64::FPR128RegClassID: - return 32; - - case ARM64::DDRegClassID: - case ARM64::DDDRegClassID: - case ARM64::DDDDRegClassID: - case ARM64::QQRegClassID: - case ARM64::QQQRegClassID: - case ARM64::QQQQRegClassID: - return 32; - - case ARM64::FPR128_loRegClassID: - return 16; - } -} - -} // namespace llvm diff --git a/lib/Target/ARM64/ARM64RegisterInfo.h b/lib/Target/ARM64/ARM64RegisterInfo.h deleted file mode 100644 index 7691fadbcc8..00000000000 --- a/lib/Target/ARM64/ARM64RegisterInfo.h +++ /dev/null @@ -1,101 +0,0 @@ -//===- ARM64RegisterInfo.h - ARM64 Register Information Impl ----*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains the ARM64 implementation of the MRegisterInfo class. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_TARGET_ARM64REGISTERINFO_H -#define LLVM_TARGET_ARM64REGISTERINFO_H - -#define GET_REGINFO_HEADER -#include "ARM64GenRegisterInfo.inc" - -namespace llvm { - -class ARM64InstrInfo; -class ARM64Subtarget; -class MachineFunction; -class RegScavenger; -class TargetRegisterClass; - -struct ARM64RegisterInfo : public ARM64GenRegisterInfo { -private: - const ARM64InstrInfo *TII; - const ARM64Subtarget *STI; - -public: - ARM64RegisterInfo(const ARM64InstrInfo *tii, const ARM64Subtarget *sti); - - bool isReservedReg(const MachineFunction &MF, unsigned Reg) const; - - /// Code Generation virtual methods... - const MCPhysReg * - getCalleeSavedRegs(const MachineFunction *MF = nullptr) const override; - const uint32_t *getCallPreservedMask(CallingConv::ID) const override; - - unsigned getCSRFirstUseCost() const override { - // The cost will be compared against BlockFrequency where entry has the - // value of 1 << 14. A value of 5 will choose to spill or split really - // cold path instead of using a callee-saved register. - return 5; - } - - // Calls involved in thread-local variable lookup save more registers than - // normal calls, so they need a different mask to represent this. 
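A worked instance (not in the patch) of the GPR pressure-limit arithmetic in getRegPressureLimit() above, evaluated for a Darwin subtarget with a frame pointer and no base pointer:

// 32 GPRs, minus XZR/SP sharing encoding 31, minus the FP, minus the reserved
// Darwin platform register X18; X19 is only subtracted when a base pointer is needed.
unsigned Limit = 32 - 1 /*XZR or SP*/
                    - 1 /*FP reserved*/
                    - 1 /*X18 platform register*/
                    - 0 /*no base pointer*/;  // == 29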
- const uint32_t *getTLSCallPreservedMask() const; - - /// getThisReturnPreservedMask - Returns a call preserved mask specific to the - /// case that 'returned' is on an i64 first argument if the calling convention - /// is one that can (partially) model this attribute with a preserved mask - /// (i.e. it is a calling convention that uses the same register for the first - /// i64 argument and an i64 return value) - /// - /// Should return NULL in the case that the calling convention does not have - /// this property - const uint32_t *getThisReturnPreservedMask(CallingConv::ID) const; - - BitVector getReservedRegs(const MachineFunction &MF) const override; - const TargetRegisterClass * - getPointerRegClass(const MachineFunction &MF, - unsigned Kind = 0) const override; - const TargetRegisterClass * - getCrossCopyRegClass(const TargetRegisterClass *RC) const override; - - bool requiresRegisterScavenging(const MachineFunction &MF) const override; - bool useFPForScavengingIndex(const MachineFunction &MF) const override; - bool requiresFrameIndexScavenging(const MachineFunction &MF) const override; - - bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const override; - bool isFrameOffsetLegal(const MachineInstr *MI, - int64_t Offset) const override; - void materializeFrameBaseRegister(MachineBasicBlock *MBB, unsigned BaseReg, - int FrameIdx, - int64_t Offset) const override; - void resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, - int64_t Offset) const override; - void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, - unsigned FIOperandNum, - RegScavenger *RS = nullptr) const override; - bool cannotEliminateFrame(const MachineFunction &MF) const; - - bool requiresVirtualBaseRegisters(const MachineFunction &MF) const override; - bool hasBasePointer(const MachineFunction &MF) const; - unsigned getBaseRegister() const; - - // Debug information queries. - unsigned getFrameRegister(const MachineFunction &MF) const override; - - unsigned getRegPressureLimit(const TargetRegisterClass *RC, - MachineFunction &MF) const override; -}; - -} // end namespace llvm - -#endif // LLVM_TARGET_ARM64REGISTERINFO_H diff --git a/lib/Target/ARM64/ARM64RegisterInfo.td b/lib/Target/ARM64/ARM64RegisterInfo.td deleted file mode 100644 index 28d01809739..00000000000 --- a/lib/Target/ARM64/ARM64RegisterInfo.td +++ /dev/null @@ -1,593 +0,0 @@ -//===- ARM64RegisterInfo.td - Describe the ARM64 Regisers --*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - - -class ARM64Reg enc, string n, list subregs = [], - list altNames = []> - : Register { - let HWEncoding = enc; - let Namespace = "ARM64"; - let SubRegs = subregs; -} - -let Namespace = "ARM64" in { - def sub_32 : SubRegIndex<32>; - - def bsub : SubRegIndex<8>; - def hsub : SubRegIndex<16>; - def ssub : SubRegIndex<32>; - def dsub : SubRegIndex<32>; - def qhisub : SubRegIndex<64>; - def qsub : SubRegIndex<64>; - // Note: Code depends on these having consecutive numbers - def dsub0 : SubRegIndex<64>; - def dsub1 : SubRegIndex<64>; - def dsub2 : SubRegIndex<64>; - def dsub3 : SubRegIndex<64>; - // Note: Code depends on these having consecutive numbers - def qsub0 : SubRegIndex<128>; - def qsub1 : SubRegIndex<128>; - def qsub2 : SubRegIndex<128>; - def qsub3 : SubRegIndex<128>; -} - -let Namespace = "ARM64" in { - def vreg : RegAltNameIndex; - def vlist1 : RegAltNameIndex; -} - -//===----------------------------------------------------------------------===// -// Registers -//===----------------------------------------------------------------------===// -def W0 : ARM64Reg<0, "w0" >, DwarfRegNum<[0]>; -def W1 : ARM64Reg<1, "w1" >, DwarfRegNum<[1]>; -def W2 : ARM64Reg<2, "w2" >, DwarfRegNum<[2]>; -def W3 : ARM64Reg<3, "w3" >, DwarfRegNum<[3]>; -def W4 : ARM64Reg<4, "w4" >, DwarfRegNum<[4]>; -def W5 : ARM64Reg<5, "w5" >, DwarfRegNum<[5]>; -def W6 : ARM64Reg<6, "w6" >, DwarfRegNum<[6]>; -def W7 : ARM64Reg<7, "w7" >, DwarfRegNum<[7]>; -def W8 : ARM64Reg<8, "w8" >, DwarfRegNum<[8]>; -def W9 : ARM64Reg<9, "w9" >, DwarfRegNum<[9]>; -def W10 : ARM64Reg<10, "w10">, DwarfRegNum<[10]>; -def W11 : ARM64Reg<11, "w11">, DwarfRegNum<[11]>; -def W12 : ARM64Reg<12, "w12">, DwarfRegNum<[12]>; -def W13 : ARM64Reg<13, "w13">, DwarfRegNum<[13]>; -def W14 : ARM64Reg<14, "w14">, DwarfRegNum<[14]>; -def W15 : ARM64Reg<15, "w15">, DwarfRegNum<[15]>; -def W16 : ARM64Reg<16, "w16">, DwarfRegNum<[16]>; -def W17 : ARM64Reg<17, "w17">, DwarfRegNum<[17]>; -def W18 : ARM64Reg<18, "w18">, DwarfRegNum<[18]>; -def W19 : ARM64Reg<19, "w19">, DwarfRegNum<[19]>; -def W20 : ARM64Reg<20, "w20">, DwarfRegNum<[20]>; -def W21 : ARM64Reg<21, "w21">, DwarfRegNum<[21]>; -def W22 : ARM64Reg<22, "w22">, DwarfRegNum<[22]>; -def W23 : ARM64Reg<23, "w23">, DwarfRegNum<[23]>; -def W24 : ARM64Reg<24, "w24">, DwarfRegNum<[24]>; -def W25 : ARM64Reg<25, "w25">, DwarfRegNum<[25]>; -def W26 : ARM64Reg<26, "w26">, DwarfRegNum<[26]>; -def W27 : ARM64Reg<27, "w27">, DwarfRegNum<[27]>; -def W28 : ARM64Reg<28, "w28">, DwarfRegNum<[28]>; -def W29 : ARM64Reg<29, "w29">, DwarfRegNum<[29]>; -def W30 : ARM64Reg<30, "w30">, DwarfRegNum<[30]>; -def WSP : ARM64Reg<31, "wsp">, DwarfRegNum<[31]>; -def WZR : ARM64Reg<31, "wzr">, DwarfRegAlias; - -let SubRegIndices = [sub_32] in { -def X0 : ARM64Reg<0, "x0", [W0]>, DwarfRegAlias; -def X1 : ARM64Reg<1, "x1", [W1]>, DwarfRegAlias; -def X2 : ARM64Reg<2, "x2", [W2]>, DwarfRegAlias; -def X3 : ARM64Reg<3, "x3", [W3]>, DwarfRegAlias; -def X4 : ARM64Reg<4, "x4", [W4]>, DwarfRegAlias; -def X5 : ARM64Reg<5, "x5", [W5]>, DwarfRegAlias; -def X6 : ARM64Reg<6, "x6", [W6]>, DwarfRegAlias; -def X7 : ARM64Reg<7, "x7", [W7]>, DwarfRegAlias; -def X8 : ARM64Reg<8, "x8", [W8]>, DwarfRegAlias; -def X9 : ARM64Reg<9, "x9", [W9]>, DwarfRegAlias; -def X10 : ARM64Reg<10, "x10", [W10]>, DwarfRegAlias; -def X11 : ARM64Reg<11, "x11", [W11]>, DwarfRegAlias; -def X12 : 
ARM64Reg<12, "x12", [W12]>, DwarfRegAlias; -def X13 : ARM64Reg<13, "x13", [W13]>, DwarfRegAlias; -def X14 : ARM64Reg<14, "x14", [W14]>, DwarfRegAlias; -def X15 : ARM64Reg<15, "x15", [W15]>, DwarfRegAlias; -def X16 : ARM64Reg<16, "x16", [W16]>, DwarfRegAlias; -def X17 : ARM64Reg<17, "x17", [W17]>, DwarfRegAlias; -def X18 : ARM64Reg<18, "x18", [W18]>, DwarfRegAlias; -def X19 : ARM64Reg<19, "x19", [W19]>, DwarfRegAlias; -def X20 : ARM64Reg<20, "x20", [W20]>, DwarfRegAlias; -def X21 : ARM64Reg<21, "x21", [W21]>, DwarfRegAlias; -def X22 : ARM64Reg<22, "x22", [W22]>, DwarfRegAlias; -def X23 : ARM64Reg<23, "x23", [W23]>, DwarfRegAlias; -def X24 : ARM64Reg<24, "x24", [W24]>, DwarfRegAlias; -def X25 : ARM64Reg<25, "x25", [W25]>, DwarfRegAlias; -def X26 : ARM64Reg<26, "x26", [W26]>, DwarfRegAlias; -def X27 : ARM64Reg<27, "x27", [W27]>, DwarfRegAlias; -def X28 : ARM64Reg<28, "x28", [W28]>, DwarfRegAlias; -def FP : ARM64Reg<29, "x29", [W29]>, DwarfRegAlias; -def LR : ARM64Reg<30, "x30", [W30]>, DwarfRegAlias; -def SP : ARM64Reg<31, "sp", [WSP]>, DwarfRegAlias; -def XZR : ARM64Reg<31, "xzr", [WZR]>, DwarfRegAlias; -} - -// Condition code register. -def NZCV : ARM64Reg<0, "nzcv">; - -// GPR register classes with the intersections of GPR32/GPR32sp and -// GPR64/GPR64sp for use by the coalescer. -def GPR32common : RegisterClass<"ARM64", [i32], 32, (sequence "W%u", 0, 30)> { - let AltOrders = [(rotl GPR32common, 8)]; - let AltOrderSelect = [{ return 1; }]; -} -def GPR64common : RegisterClass<"ARM64", [i64], 64, - (add (sequence "X%u", 0, 28), FP, LR)> { - let AltOrders = [(rotl GPR64common, 8)]; - let AltOrderSelect = [{ return 1; }]; -} -// GPR register classes which exclude SP/WSP. -def GPR32 : RegisterClass<"ARM64", [i32], 32, (add GPR32common, WZR)> { - let AltOrders = [(rotl GPR32, 8)]; - let AltOrderSelect = [{ return 1; }]; -} -def GPR64 : RegisterClass<"ARM64", [i64], 64, (add GPR64common, XZR)> { - let AltOrders = [(rotl GPR64, 8)]; - let AltOrderSelect = [{ return 1; }]; -} - -// GPR register classes which include SP/WSP. -def GPR32sp : RegisterClass<"ARM64", [i32], 32, (add GPR32common, WSP)> { - let AltOrders = [(rotl GPR32sp, 8)]; - let AltOrderSelect = [{ return 1; }]; -} -def GPR64sp : RegisterClass<"ARM64", [i64], 64, (add GPR64common, SP)> { - let AltOrders = [(rotl GPR64sp, 8)]; - let AltOrderSelect = [{ return 1; }]; -} - -def GPR32sponly : RegisterClass<"ARM64", [i32], 32, (add WSP)>; -def GPR64sponly : RegisterClass<"ARM64", [i64], 64, (add SP)>; - -def GPR64spPlus0Operand : AsmOperandClass { - let Name = "GPR64sp0"; - let RenderMethod = "addRegOperands"; - let ParserMethod = "tryParseGPR64sp0Operand"; -} - -def GPR64sp0 : RegisterOperand { - let ParserMatchClass = GPR64spPlus0Operand; -} - -// GPR register classes which include WZR/XZR AND SP/WSP. This is not a -// constraint used by any instructions, it is used as a common super-class. -def GPR32all : RegisterClass<"ARM64", [i32], 32, (add GPR32common, WZR, WSP)>; -def GPR64all : RegisterClass<"ARM64", [i64], 64, (add GPR64common, XZR, SP)>; - -// For tail calls, we can't use callee-saved registers, as they are restored -// to the saved value before the tail call, which would clobber a call address. -// This is for indirect tail calls to store the address of the destination. 
-def tcGPR64 : RegisterClass<"ARM64", [i64], 64, (sub GPR64common, X19, X20, X21, - X22, X23, X24, X25, X26, - X27, X28)>; - -// GPR register classes for post increment amount of vector load/store that -// has alternate printing when Rm=31 and prints a constant immediate value -// equal to the total number of bytes transferred. - -// FIXME: TableGen *should* be able to do these itself now. There appears to be -// a bug in counting how many operands a Post-indexed MCInst should have which -// means the aliases don't trigger. -def GPR64pi1 : RegisterOperand">; -def GPR64pi2 : RegisterOperand">; -def GPR64pi3 : RegisterOperand">; -def GPR64pi4 : RegisterOperand">; -def GPR64pi6 : RegisterOperand">; -def GPR64pi8 : RegisterOperand">; -def GPR64pi12 : RegisterOperand">; -def GPR64pi16 : RegisterOperand">; -def GPR64pi24 : RegisterOperand">; -def GPR64pi32 : RegisterOperand">; -def GPR64pi48 : RegisterOperand">; -def GPR64pi64 : RegisterOperand">; - -// Condition code regclass. -def CCR : RegisterClass<"ARM64", [i32], 32, (add NZCV)> { - let CopyCost = -1; // Don't allow copying of status registers. - - // CCR is not allocatable. - let isAllocatable = 0; -} - -//===----------------------------------------------------------------------===// -// Floating Point Scalar Registers -//===----------------------------------------------------------------------===// - -def B0 : ARM64Reg<0, "b0">, DwarfRegNum<[64]>; -def B1 : ARM64Reg<1, "b1">, DwarfRegNum<[65]>; -def B2 : ARM64Reg<2, "b2">, DwarfRegNum<[66]>; -def B3 : ARM64Reg<3, "b3">, DwarfRegNum<[67]>; -def B4 : ARM64Reg<4, "b4">, DwarfRegNum<[68]>; -def B5 : ARM64Reg<5, "b5">, DwarfRegNum<[69]>; -def B6 : ARM64Reg<6, "b6">, DwarfRegNum<[70]>; -def B7 : ARM64Reg<7, "b7">, DwarfRegNum<[71]>; -def B8 : ARM64Reg<8, "b8">, DwarfRegNum<[72]>; -def B9 : ARM64Reg<9, "b9">, DwarfRegNum<[73]>; -def B10 : ARM64Reg<10, "b10">, DwarfRegNum<[74]>; -def B11 : ARM64Reg<11, "b11">, DwarfRegNum<[75]>; -def B12 : ARM64Reg<12, "b12">, DwarfRegNum<[76]>; -def B13 : ARM64Reg<13, "b13">, DwarfRegNum<[77]>; -def B14 : ARM64Reg<14, "b14">, DwarfRegNum<[78]>; -def B15 : ARM64Reg<15, "b15">, DwarfRegNum<[79]>; -def B16 : ARM64Reg<16, "b16">, DwarfRegNum<[80]>; -def B17 : ARM64Reg<17, "b17">, DwarfRegNum<[81]>; -def B18 : ARM64Reg<18, "b18">, DwarfRegNum<[82]>; -def B19 : ARM64Reg<19, "b19">, DwarfRegNum<[83]>; -def B20 : ARM64Reg<20, "b20">, DwarfRegNum<[84]>; -def B21 : ARM64Reg<21, "b21">, DwarfRegNum<[85]>; -def B22 : ARM64Reg<22, "b22">, DwarfRegNum<[86]>; -def B23 : ARM64Reg<23, "b23">, DwarfRegNum<[87]>; -def B24 : ARM64Reg<24, "b24">, DwarfRegNum<[88]>; -def B25 : ARM64Reg<25, "b25">, DwarfRegNum<[89]>; -def B26 : ARM64Reg<26, "b26">, DwarfRegNum<[90]>; -def B27 : ARM64Reg<27, "b27">, DwarfRegNum<[91]>; -def B28 : ARM64Reg<28, "b28">, DwarfRegNum<[92]>; -def B29 : ARM64Reg<29, "b29">, DwarfRegNum<[93]>; -def B30 : ARM64Reg<30, "b30">, DwarfRegNum<[94]>; -def B31 : ARM64Reg<31, "b31">, DwarfRegNum<[95]>; - -let SubRegIndices = [bsub] in { -def H0 : ARM64Reg<0, "h0", [B0]>, DwarfRegAlias; -def H1 : ARM64Reg<1, "h1", [B1]>, DwarfRegAlias; -def H2 : ARM64Reg<2, "h2", [B2]>, DwarfRegAlias; -def H3 : ARM64Reg<3, "h3", [B3]>, DwarfRegAlias; -def H4 : ARM64Reg<4, "h4", [B4]>, DwarfRegAlias; -def H5 : ARM64Reg<5, "h5", [B5]>, DwarfRegAlias; -def H6 : ARM64Reg<6, "h6", [B6]>, DwarfRegAlias; -def H7 : ARM64Reg<7, "h7", [B7]>, DwarfRegAlias; -def H8 : ARM64Reg<8, "h8", [B8]>, DwarfRegAlias; -def H9 : ARM64Reg<9, "h9", [B9]>, DwarfRegAlias; -def H10 : ARM64Reg<10, "h10", [B10]>, 
DwarfRegAlias; -def H11 : ARM64Reg<11, "h11", [B11]>, DwarfRegAlias; -def H12 : ARM64Reg<12, "h12", [B12]>, DwarfRegAlias; -def H13 : ARM64Reg<13, "h13", [B13]>, DwarfRegAlias; -def H14 : ARM64Reg<14, "h14", [B14]>, DwarfRegAlias; -def H15 : ARM64Reg<15, "h15", [B15]>, DwarfRegAlias; -def H16 : ARM64Reg<16, "h16", [B16]>, DwarfRegAlias; -def H17 : ARM64Reg<17, "h17", [B17]>, DwarfRegAlias; -def H18 : ARM64Reg<18, "h18", [B18]>, DwarfRegAlias; -def H19 : ARM64Reg<19, "h19", [B19]>, DwarfRegAlias; -def H20 : ARM64Reg<20, "h20", [B20]>, DwarfRegAlias; -def H21 : ARM64Reg<21, "h21", [B21]>, DwarfRegAlias; -def H22 : ARM64Reg<22, "h22", [B22]>, DwarfRegAlias; -def H23 : ARM64Reg<23, "h23", [B23]>, DwarfRegAlias; -def H24 : ARM64Reg<24, "h24", [B24]>, DwarfRegAlias; -def H25 : ARM64Reg<25, "h25", [B25]>, DwarfRegAlias; -def H26 : ARM64Reg<26, "h26", [B26]>, DwarfRegAlias; -def H27 : ARM64Reg<27, "h27", [B27]>, DwarfRegAlias; -def H28 : ARM64Reg<28, "h28", [B28]>, DwarfRegAlias; -def H29 : ARM64Reg<29, "h29", [B29]>, DwarfRegAlias; -def H30 : ARM64Reg<30, "h30", [B30]>, DwarfRegAlias; -def H31 : ARM64Reg<31, "h31", [B31]>, DwarfRegAlias; -} - -let SubRegIndices = [hsub] in { -def S0 : ARM64Reg<0, "s0", [H0]>, DwarfRegAlias; -def S1 : ARM64Reg<1, "s1", [H1]>, DwarfRegAlias; -def S2 : ARM64Reg<2, "s2", [H2]>, DwarfRegAlias; -def S3 : ARM64Reg<3, "s3", [H3]>, DwarfRegAlias; -def S4 : ARM64Reg<4, "s4", [H4]>, DwarfRegAlias; -def S5 : ARM64Reg<5, "s5", [H5]>, DwarfRegAlias; -def S6 : ARM64Reg<6, "s6", [H6]>, DwarfRegAlias; -def S7 : ARM64Reg<7, "s7", [H7]>, DwarfRegAlias; -def S8 : ARM64Reg<8, "s8", [H8]>, DwarfRegAlias; -def S9 : ARM64Reg<9, "s9", [H9]>, DwarfRegAlias; -def S10 : ARM64Reg<10, "s10", [H10]>, DwarfRegAlias; -def S11 : ARM64Reg<11, "s11", [H11]>, DwarfRegAlias; -def S12 : ARM64Reg<12, "s12", [H12]>, DwarfRegAlias; -def S13 : ARM64Reg<13, "s13", [H13]>, DwarfRegAlias; -def S14 : ARM64Reg<14, "s14", [H14]>, DwarfRegAlias; -def S15 : ARM64Reg<15, "s15", [H15]>, DwarfRegAlias; -def S16 : ARM64Reg<16, "s16", [H16]>, DwarfRegAlias; -def S17 : ARM64Reg<17, "s17", [H17]>, DwarfRegAlias; -def S18 : ARM64Reg<18, "s18", [H18]>, DwarfRegAlias; -def S19 : ARM64Reg<19, "s19", [H19]>, DwarfRegAlias; -def S20 : ARM64Reg<20, "s20", [H20]>, DwarfRegAlias; -def S21 : ARM64Reg<21, "s21", [H21]>, DwarfRegAlias; -def S22 : ARM64Reg<22, "s22", [H22]>, DwarfRegAlias; -def S23 : ARM64Reg<23, "s23", [H23]>, DwarfRegAlias; -def S24 : ARM64Reg<24, "s24", [H24]>, DwarfRegAlias; -def S25 : ARM64Reg<25, "s25", [H25]>, DwarfRegAlias; -def S26 : ARM64Reg<26, "s26", [H26]>, DwarfRegAlias; -def S27 : ARM64Reg<27, "s27", [H27]>, DwarfRegAlias; -def S28 : ARM64Reg<28, "s28", [H28]>, DwarfRegAlias; -def S29 : ARM64Reg<29, "s29", [H29]>, DwarfRegAlias; -def S30 : ARM64Reg<30, "s30", [H30]>, DwarfRegAlias; -def S31 : ARM64Reg<31, "s31", [H31]>, DwarfRegAlias; -} - -let SubRegIndices = [ssub], RegAltNameIndices = [vreg, vlist1] in { -def D0 : ARM64Reg<0, "d0", [S0], ["v0", ""]>, DwarfRegAlias; -def D1 : ARM64Reg<1, "d1", [S1], ["v1", ""]>, DwarfRegAlias; -def D2 : ARM64Reg<2, "d2", [S2], ["v2", ""]>, DwarfRegAlias; -def D3 : ARM64Reg<3, "d3", [S3], ["v3", ""]>, DwarfRegAlias; -def D4 : ARM64Reg<4, "d4", [S4], ["v4", ""]>, DwarfRegAlias; -def D5 : ARM64Reg<5, "d5", [S5], ["v5", ""]>, DwarfRegAlias; -def D6 : ARM64Reg<6, "d6", [S6], ["v6", ""]>, DwarfRegAlias; -def D7 : ARM64Reg<7, "d7", [S7], ["v7", ""]>, DwarfRegAlias; -def D8 : ARM64Reg<8, "d8", [S8], ["v8", ""]>, DwarfRegAlias; -def D9 : ARM64Reg<9, "d9", [S9], ["v9", ""]>, 
DwarfRegAlias; -def D10 : ARM64Reg<10, "d10", [S10], ["v10", ""]>, DwarfRegAlias; -def D11 : ARM64Reg<11, "d11", [S11], ["v11", ""]>, DwarfRegAlias; -def D12 : ARM64Reg<12, "d12", [S12], ["v12", ""]>, DwarfRegAlias; -def D13 : ARM64Reg<13, "d13", [S13], ["v13", ""]>, DwarfRegAlias; -def D14 : ARM64Reg<14, "d14", [S14], ["v14", ""]>, DwarfRegAlias; -def D15 : ARM64Reg<15, "d15", [S15], ["v15", ""]>, DwarfRegAlias; -def D16 : ARM64Reg<16, "d16", [S16], ["v16", ""]>, DwarfRegAlias; -def D17 : ARM64Reg<17, "d17", [S17], ["v17", ""]>, DwarfRegAlias; -def D18 : ARM64Reg<18, "d18", [S18], ["v18", ""]>, DwarfRegAlias; -def D19 : ARM64Reg<19, "d19", [S19], ["v19", ""]>, DwarfRegAlias; -def D20 : ARM64Reg<20, "d20", [S20], ["v20", ""]>, DwarfRegAlias; -def D21 : ARM64Reg<21, "d21", [S21], ["v21", ""]>, DwarfRegAlias; -def D22 : ARM64Reg<22, "d22", [S22], ["v22", ""]>, DwarfRegAlias; -def D23 : ARM64Reg<23, "d23", [S23], ["v23", ""]>, DwarfRegAlias; -def D24 : ARM64Reg<24, "d24", [S24], ["v24", ""]>, DwarfRegAlias; -def D25 : ARM64Reg<25, "d25", [S25], ["v25", ""]>, DwarfRegAlias; -def D26 : ARM64Reg<26, "d26", [S26], ["v26", ""]>, DwarfRegAlias; -def D27 : ARM64Reg<27, "d27", [S27], ["v27", ""]>, DwarfRegAlias; -def D28 : ARM64Reg<28, "d28", [S28], ["v28", ""]>, DwarfRegAlias; -def D29 : ARM64Reg<29, "d29", [S29], ["v29", ""]>, DwarfRegAlias; -def D30 : ARM64Reg<30, "d30", [S30], ["v30", ""]>, DwarfRegAlias; -def D31 : ARM64Reg<31, "d31", [S31], ["v31", ""]>, DwarfRegAlias; -} - -let SubRegIndices = [dsub], RegAltNameIndices = [vreg, vlist1] in { -def Q0 : ARM64Reg<0, "q0", [D0], ["v0", ""]>, DwarfRegAlias; -def Q1 : ARM64Reg<1, "q1", [D1], ["v1", ""]>, DwarfRegAlias; -def Q2 : ARM64Reg<2, "q2", [D2], ["v2", ""]>, DwarfRegAlias; -def Q3 : ARM64Reg<3, "q3", [D3], ["v3", ""]>, DwarfRegAlias; -def Q4 : ARM64Reg<4, "q4", [D4], ["v4", ""]>, DwarfRegAlias; -def Q5 : ARM64Reg<5, "q5", [D5], ["v5", ""]>, DwarfRegAlias; -def Q6 : ARM64Reg<6, "q6", [D6], ["v6", ""]>, DwarfRegAlias; -def Q7 : ARM64Reg<7, "q7", [D7], ["v7", ""]>, DwarfRegAlias; -def Q8 : ARM64Reg<8, "q8", [D8], ["v8", ""]>, DwarfRegAlias; -def Q9 : ARM64Reg<9, "q9", [D9], ["v9", ""]>, DwarfRegAlias; -def Q10 : ARM64Reg<10, "q10", [D10], ["v10", ""]>, DwarfRegAlias; -def Q11 : ARM64Reg<11, "q11", [D11], ["v11", ""]>, DwarfRegAlias; -def Q12 : ARM64Reg<12, "q12", [D12], ["v12", ""]>, DwarfRegAlias; -def Q13 : ARM64Reg<13, "q13", [D13], ["v13", ""]>, DwarfRegAlias; -def Q14 : ARM64Reg<14, "q14", [D14], ["v14", ""]>, DwarfRegAlias; -def Q15 : ARM64Reg<15, "q15", [D15], ["v15", ""]>, DwarfRegAlias; -def Q16 : ARM64Reg<16, "q16", [D16], ["v16", ""]>, DwarfRegAlias; -def Q17 : ARM64Reg<17, "q17", [D17], ["v17", ""]>, DwarfRegAlias; -def Q18 : ARM64Reg<18, "q18", [D18], ["v18", ""]>, DwarfRegAlias; -def Q19 : ARM64Reg<19, "q19", [D19], ["v19", ""]>, DwarfRegAlias; -def Q20 : ARM64Reg<20, "q20", [D20], ["v20", ""]>, DwarfRegAlias; -def Q21 : ARM64Reg<21, "q21", [D21], ["v21", ""]>, DwarfRegAlias; -def Q22 : ARM64Reg<22, "q22", [D22], ["v22", ""]>, DwarfRegAlias; -def Q23 : ARM64Reg<23, "q23", [D23], ["v23", ""]>, DwarfRegAlias; -def Q24 : ARM64Reg<24, "q24", [D24], ["v24", ""]>, DwarfRegAlias; -def Q25 : ARM64Reg<25, "q25", [D25], ["v25", ""]>, DwarfRegAlias; -def Q26 : ARM64Reg<26, "q26", [D26], ["v26", ""]>, DwarfRegAlias; -def Q27 : ARM64Reg<27, "q27", [D27], ["v27", ""]>, DwarfRegAlias; -def Q28 : ARM64Reg<28, "q28", [D28], ["v28", ""]>, DwarfRegAlias; -def Q29 : ARM64Reg<29, "q29", [D29], ["v29", ""]>, DwarfRegAlias; -def Q30 : ARM64Reg<30, "q30", 
[D30], ["v30", ""]>, DwarfRegAlias; -def Q31 : ARM64Reg<31, "q31", [D31], ["v31", ""]>, DwarfRegAlias; -} - -def FPR8 : RegisterClass<"ARM64", [untyped], 8, (sequence "B%u", 0, 31)> { - let Size = 8; -} -def FPR16 : RegisterClass<"ARM64", [f16], 16, (sequence "H%u", 0, 31)> { - let Size = 16; -} -def FPR32 : RegisterClass<"ARM64", [f32, i32], 32,(sequence "S%u", 0, 31)>; -def FPR64 : RegisterClass<"ARM64", [f64, i64, v2f32, v1f64, v8i8, v4i16, v2i32, - v1i64], - 64, (sequence "D%u", 0, 31)>; -// We don't (yet) have an f128 legal type, so don't use that here. We -// normalize 128-bit vectors to v2f64 for arg passing and such, so use -// that here. -def FPR128 : RegisterClass<"ARM64", - [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, f128], - 128, (sequence "Q%u", 0, 31)>; - -// The lower 16 vector registers. Some instructions can only take registers -// in this range. -def FPR128_lo : RegisterClass<"ARM64", - [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], - 128, (trunc FPR128, 16)>; - -// Pairs, triples, and quads of 64-bit vector registers. -def DSeqPairs : RegisterTuples<[dsub0, dsub1], [(rotl FPR64, 0), (rotl FPR64, 1)]>; -def DSeqTriples : RegisterTuples<[dsub0, dsub1, dsub2], - [(rotl FPR64, 0), (rotl FPR64, 1), - (rotl FPR64, 2)]>; -def DSeqQuads : RegisterTuples<[dsub0, dsub1, dsub2, dsub3], - [(rotl FPR64, 0), (rotl FPR64, 1), - (rotl FPR64, 2), (rotl FPR64, 3)]>; -def DD : RegisterClass<"ARM64", [untyped], 64, (add DSeqPairs)> { - let Size = 128; -} -def DDD : RegisterClass<"ARM64", [untyped], 64, (add DSeqTriples)> { - let Size = 196; -} -def DDDD : RegisterClass<"ARM64", [untyped], 64, (add DSeqQuads)> { - let Size = 256; -} - -// Pairs, triples, and quads of 128-bit vector registers. -def QSeqPairs : RegisterTuples<[qsub0, qsub1], [(rotl FPR128, 0), (rotl FPR128, 1)]>; -def QSeqTriples : RegisterTuples<[qsub0, qsub1, qsub2], - [(rotl FPR128, 0), (rotl FPR128, 1), - (rotl FPR128, 2)]>; -def QSeqQuads : RegisterTuples<[qsub0, qsub1, qsub2, qsub3], - [(rotl FPR128, 0), (rotl FPR128, 1), - (rotl FPR128, 2), (rotl FPR128, 3)]>; -def QQ : RegisterClass<"ARM64", [untyped], 128, (add QSeqPairs)> { - let Size = 256; -} -def QQQ : RegisterClass<"ARM64", [untyped], 128, (add QSeqTriples)> { - let Size = 384; -} -def QQQQ : RegisterClass<"ARM64", [untyped], 128, (add QSeqQuads)> { - let Size = 512; -} - - -// Vector operand versions of the FP registers. Alternate name printing and -// assmebler matching. -def VectorReg64AsmOperand : AsmOperandClass { - let Name = "VectorReg64"; - let PredicateMethod = "isVectorReg"; -} -def VectorReg128AsmOperand : AsmOperandClass { - let Name = "VectorReg128"; - let PredicateMethod = "isVectorReg"; -} - -def V64 : RegisterOperand { - let ParserMatchClass = VectorReg64AsmOperand; -} - -def V128 : RegisterOperand { - let ParserMatchClass = VectorReg128AsmOperand; -} - -def VectorRegLoAsmOperand : AsmOperandClass { let Name = "VectorRegLo"; } -def V128_lo : RegisterOperand { - let ParserMatchClass = VectorRegLoAsmOperand; -} - -class TypedVecListAsmOperand - : AsmOperandClass { - let Name = "TypedVectorList" # count # "_" # lanes # kind; - - let PredicateMethod - = "isTypedVectorList<" # count # ", " # lanes # ", '" # kind # "'>"; - let RenderMethod = "addVectorList" # regsize # "Operands<" # count # ">"; -} - -class TypedVecListRegOperand - : RegisterOperand">; - -multiclass VectorList { - // With implicit types (probably on instruction instead). E.g. 
{ v0, v1 } - def _64AsmOperand : AsmOperandClass { - let Name = NAME # "64"; - let PredicateMethod = "isImplicitlyTypedVectorList<" # count # ">"; - let RenderMethod = "addVectorList64Operands<" # count # ">"; - } - - def "64" : RegisterOperand { - let ParserMatchClass = !cast(NAME # "_64AsmOperand"); - } - - def _128AsmOperand : AsmOperandClass { - let Name = NAME # "128"; - let PredicateMethod = "isImplicitlyTypedVectorList<" # count # ">"; - let RenderMethod = "addVectorList128Operands<" # count # ">"; - } - - def "128" : RegisterOperand { - let ParserMatchClass = !cast(NAME # "_128AsmOperand"); - } - - // 64-bit register lists with explicit type. - - // { v0.8b, v1.8b } - def _8bAsmOperand : TypedVecListAsmOperand; - def "8b" : TypedVecListRegOperand { - let ParserMatchClass = !cast(NAME # "_8bAsmOperand"); - } - - // { v0.4h, v1.4h } - def _4hAsmOperand : TypedVecListAsmOperand; - def "4h" : TypedVecListRegOperand { - let ParserMatchClass = !cast(NAME # "_4hAsmOperand"); - } - - // { v0.2s, v1.2s } - def _2sAsmOperand : TypedVecListAsmOperand; - def "2s" : TypedVecListRegOperand { - let ParserMatchClass = !cast(NAME # "_2sAsmOperand"); - } - - // { v0.1d, v1.1d } - def _1dAsmOperand : TypedVecListAsmOperand; - def "1d" : TypedVecListRegOperand { - let ParserMatchClass = !cast(NAME # "_1dAsmOperand"); - } - - // 128-bit register lists with explicit type - - // { v0.16b, v1.16b } - def _16bAsmOperand : TypedVecListAsmOperand; - def "16b" : TypedVecListRegOperand { - let ParserMatchClass = !cast(NAME # "_16bAsmOperand"); - } - - // { v0.8h, v1.8h } - def _8hAsmOperand : TypedVecListAsmOperand; - def "8h" : TypedVecListRegOperand { - let ParserMatchClass = !cast(NAME # "_8hAsmOperand"); - } - - // { v0.4s, v1.4s } - def _4sAsmOperand : TypedVecListAsmOperand; - def "4s" : TypedVecListRegOperand { - let ParserMatchClass = !cast(NAME # "_4sAsmOperand"); - } - - // { v0.2d, v1.2d } - def _2dAsmOperand : TypedVecListAsmOperand; - def "2d" : TypedVecListRegOperand { - let ParserMatchClass = !cast(NAME # "_2dAsmOperand"); - } - - // { v0.b, v1.b } - def _bAsmOperand : TypedVecListAsmOperand; - def "b" : TypedVecListRegOperand { - let ParserMatchClass = !cast(NAME # "_bAsmOperand"); - } - - // { v0.h, v1.h } - def _hAsmOperand : TypedVecListAsmOperand; - def "h" : TypedVecListRegOperand { - let ParserMatchClass = !cast(NAME # "_hAsmOperand"); - } - - // { v0.s, v1.s } - def _sAsmOperand : TypedVecListAsmOperand; - def "s" : TypedVecListRegOperand { - let ParserMatchClass = !cast(NAME # "_sAsmOperand"); - } - - // { v0.d, v1.d } - def _dAsmOperand : TypedVecListAsmOperand; - def "d" : TypedVecListRegOperand { - let ParserMatchClass = !cast(NAME # "_dAsmOperand"); - } - - -} - -defm VecListOne : VectorList<1, FPR64, FPR128>; -defm VecListTwo : VectorList<2, DD, QQ>; -defm VecListThree : VectorList<3, DDD, QQQ>; -defm VecListFour : VectorList<4, DDDD, QQQQ>; - - -// Register operand versions of the scalar FP registers. -def FPR16Op : RegisterOperand; -def FPR32Op : RegisterOperand; -def FPR64Op : RegisterOperand; -def FPR128Op : RegisterOperand; diff --git a/lib/Target/ARM64/ARM64SchedA53.td b/lib/Target/ARM64/ARM64SchedA53.td deleted file mode 100644 index cf1a8202764..00000000000 --- a/lib/Target/ARM64/ARM64SchedA53.td +++ /dev/null @@ -1,291 +0,0 @@ -//=- ARM64SchedA53.td - ARM Cortex-A53 Scheduling Definitions -*- tablegen -*-=// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
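The register definitions removed above all follow one compact pattern: a thin Register wrapper that carries the hardware encoding and the target namespace, the 32-bit registers defined first, and the 64-bit registers layered on top as aliases through a sub-register index. A condensed sketch of that pattern, assuming the usual llvm/Target/Target.td context (parameter lists are abbreviated here, not copied verbatim from the deleted file):

    // Thin wrapper: hardware encoding plus the "ARM64" namespace.
    class ARM64Reg<bits<16> enc, string n, list<Register> subregs = []>
        : Register<n> {
      let HWEncoding = enc;
      let Namespace  = "ARM64";
      let SubRegs    = subregs;
    }

    let Namespace = "ARM64" in {
      def sub_32 : SubRegIndex<32>;   // the 32-bit half of a 64-bit GPR
    }

    def W0 : ARM64Reg<0, "w0">, DwarfRegNum<[0]>;
    let SubRegIndices = [sub_32] in
    def X0 : ARM64Reg<0, "x0", [W0]>, DwarfRegAlias<W0>;  // X0 aliases W0

    // Register classes are then just selections over these defs.
    def GPR32common : RegisterClass<"ARM64", [i32], 32, (sequence "W%u", 0, 30)>;

The FPR hierarchy (B/H/S/D/Q) and the D/Q tuple classes stack up the same way, one sub-register index per widening step.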
-// -//===----------------------------------------------------------------------===// -// -// This file defines the itinerary class data for the ARM Cortex A53 processors. -// -//===----------------------------------------------------------------------===// - -// ===---------------------------------------------------------------------===// -// The following definitions describe the simpler per-operand machine model. -// This works with MachineScheduler. See MCSchedModel.h for details. - -// Cortex-A53 machine model for scheduling and other instruction cost heuristics. -def CortexA53Model : SchedMachineModel { - let MicroOpBufferSize = 0; // Explicitly set to zero since A53 is in-order. - let IssueWidth = 2; // 2 micro-ops are dispatched per cycle. - let MinLatency = 1 ; // OperandCycles are interpreted as MinLatency. - let LoadLatency = 3; // Optimistic load latency assuming bypass. - // This is overriden by OperandCycles if the - // Itineraries are queried instead. - let MispredictPenalty = 9; // Based on "Cortex-A53 Software Optimisation - // Specification - Instruction Timings" - // v 1.0 Spreadsheet -} - - -//===----------------------------------------------------------------------===// -// Define each kind of processor resource and number available. - -// Modeling each pipeline as a ProcResource using the BufferSize = 0 since -// Cortex-A53 is in-order. - -def A53UnitALU : ProcResource<2> { let BufferSize = 0; } // Int ALU -def A53UnitMAC : ProcResource<1> { let BufferSize = 0; } // Int MAC -def A53UnitDiv : ProcResource<1> { let BufferSize = 0; } // Int Division -def A53UnitLdSt : ProcResource<1> { let BufferSize = 0; } // Load/Store -def A53UnitB : ProcResource<1> { let BufferSize = 0; } // Branch -def A53UnitFPALU : ProcResource<1> { let BufferSize = 0; } // FP ALU -def A53UnitFPMDS : ProcResource<1> { let BufferSize = 0; } // FP Mult/Div/Sqrt - - -//===----------------------------------------------------------------------===// -// Subtarget-specific SchedWrite types which both map the ProcResources and -// set the latency. - -let SchedModel = CortexA53Model in { - -// ALU - Despite having a full latency of 4, most of the ALU instructions can -// forward a cycle earlier and then two cycles earlier in the case of a -// shift-only instruction. These latencies will be incorrect when the -// result cannot be forwarded, but modeling isn't rocket surgery. -def : WriteRes { let Latency = 3; } -def : WriteRes { let Latency = 3; } -def : WriteRes { let Latency = 3; } -def : WriteRes { let Latency = 3; } -def : WriteRes { let Latency = 2; } -def : WriteRes { let Latency = 3; } - -// MAC -def : WriteRes { let Latency = 4; } -def : WriteRes { let Latency = 4; } - -// Div -def : WriteRes { let Latency = 4; } -def : WriteRes { let Latency = 4; } - -// Load -def : WriteRes { let Latency = 4; } -def : WriteRes { let Latency = 4; } -def : WriteRes { let Latency = 4; } - -// Vector Load - Vector loads take 1-5 cycles to issue. For the WriteVecLd -// below, choosing the median of 3 which makes the latency 6. -// May model this more carefully in the future. The remaining -// A53WriteVLD# types represent the 1-5 cycle issues explicitly. 
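In outline, the A53 model binds the generic write types to its units with WriteRes, and uses ResourceCycles to model multi-cycle issue on the in-order pipeline. The unit assignments below are an illustrative reading of the comments above, not a verbatim excerpt:

    let SchedModel = CortexA53Model in {

    // Plain ALU op: one A53UnitALU slot, result forwarded after 3 cycles.
    def : WriteRes<WriteI, [A53UnitALU]> { let Latency = 3; }

    // Integer load: one load/store slot, 4-cycle latency.
    def : WriteRes<WriteLD, [A53UnitLdSt]> { let Latency = 4; }

    // "Median" vector load: holds the load/store pipe for 3 issue cycles.
    def : WriteRes<WriteVLD, [A53UnitLdSt]> {
      let Latency = 6;
      let ResourceCycles = [3];
    }

    } // SchedModel = CortexA53Model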
-def : WriteRes { let Latency = 6; - let ResourceCycles = [3]; } -def A53WriteVLD1 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 4; } -def A53WriteVLD2 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 5; - let ResourceCycles = [2]; } -def A53WriteVLD3 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 6; - let ResourceCycles = [3]; } -def A53WriteVLD4 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 7; - let ResourceCycles = [4]; } -def A53WriteVLD5 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 8; - let ResourceCycles = [5]; } - -// Pre/Post Indexing - Performed as part of address generation which is already -// accounted for in the WriteST* latencies below -def : WriteRes { let Latency = 0; } - -// Store -def : WriteRes { let Latency = 4; } -def : WriteRes { let Latency = 4; } -def : WriteRes { let Latency = 4; } -def : WriteRes { let Latency = 4; } - -// Vector Store - Similar to vector loads, can take 1-3 cycles to issue. -def : WriteRes { let Latency = 5; - let ResourceCycles = [2];} -def A53WriteVST1 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 4; } -def A53WriteVST2 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 5; - let ResourceCycles = [2]; } -def A53WriteVST3 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 6; - let ResourceCycles = [3]; } - -// Branch -def : WriteRes; -def : WriteRes; -def : WriteRes; -def : WriteRes; -def : WriteRes; - -// FP ALU -def : WriteRes { let Latency = 6; } -def : WriteRes { let Latency = 6; } -def : WriteRes { let Latency = 6; } -def : WriteRes { let Latency = 6; } -def : WriteRes { let Latency = 6; } -def : WriteRes { let Latency = 6; } - -// FP Mul, Div, Sqrt -def : WriteRes { let Latency = 6; } -def : WriteRes { let Latency = 33; - let ResourceCycles = [29]; } -def A53WriteFMAC : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 10; } -def A53WriteFDivSP : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 18; - let ResourceCycles = [14]; } -def A53WriteFDivDP : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 33; - let ResourceCycles = [29]; } -def A53WriteFSqrtSP : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 17; - let ResourceCycles = [13]; } -def A53WriteFSqrtDP : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 32; - let ResourceCycles = [28]; } - -//===----------------------------------------------------------------------===// -// Subtarget-specific SchedRead types. - -// No forwarding for these reads. -def : ReadAdvance; -def : ReadAdvance; -def : ReadAdvance; - -// ALU - Most operands in the ALU pipes are not needed for two cycles. Shiftable -// operands are needed one cycle later if and only if they are to be -// shifted. Otherwise, they too are needed two cycle later. This same -// ReadAdvance applies to Extended registers as well, even though there is -// a seperate SchedPredicate for them. -def : ReadAdvance; -def A53ReadShifted : SchedReadAdvance<1, [WriteImm,WriteI, - WriteISReg, WriteIEReg,WriteIS, - WriteID32,WriteID64, - WriteIM32,WriteIM64]>; -def A53ReadNotShifted : SchedReadAdvance<2, [WriteImm,WriteI, - WriteISReg, WriteIEReg,WriteIS, - WriteID32,WriteID64, - WriteIM32,WriteIM64]>; -def A53ReadISReg : SchedReadVariant<[ - SchedVar, - SchedVar]>; -def : SchedAlias; - -def A53ReadIEReg : SchedReadVariant<[ - SchedVar, - SchedVar]>; -def : SchedAlias; - -// MAC - Operands are generally needed one cycle later in the MAC pipe. -// Accumulator operands are needed two cycles later. 
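A SchedReadVariant then picks between those two advances per instruction, keyed on whether the shiftable operand is actually shifted, and is bound to the generic ReadISReg type. A sketch of that wiring, consistent with the comments above (the exact predicate/read pairing is reconstructed, not quoted):

    def A53ReadISReg : SchedReadVariant<[
      SchedVar<RegShiftedPred, [A53ReadShifted]>,    // operand will be shifted
      SchedVar<NoSchedPred,    [A53ReadNotShifted]>  // plain register operand
    ]>;
    def : SchedAlias<ReadISReg, A53ReadISReg>;       // bind the generic read type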
-def : ReadAdvance; -def : ReadAdvance; - -// Div -def : ReadAdvance; - -//===----------------------------------------------------------------------===// -// Subtarget-specific InstRWs. - -//--- -// Miscellaneous -//--- -def : InstRW<[WriteI], (instrs COPY)>; - -//--- -// Vector Loads -//--- -def : InstRW<[A53WriteVLD1], (instregex "LD1i(8|16|32|64)$")>; -def : InstRW<[A53WriteVLD1], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[A53WriteVLD1], (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[A53WriteVLD2], (instregex "LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[A53WriteVLD3], (instregex "LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[A53WriteVLD4], (instregex "LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD1i(8|16|32|64)_POST$")>; -def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; -def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; -def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; -def : InstRW<[A53WriteVLD3, WriteAdr], (instregex "LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; -def : InstRW<[A53WriteVLD4, WriteAdr], (instregex "LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; - -def : InstRW<[A53WriteVLD1], (instregex "LD2i(8|16|32|64)$")>; -def : InstRW<[A53WriteVLD1], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[A53WriteVLD2], (instregex "LD2Twov(8b|4h|2s)$")>; -def : InstRW<[A53WriteVLD4], (instregex "LD2Twov(16b|8h|4s|2d)$")>; -def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD2i(8|16|32|64)(_POST)?$")>; -def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)(_POST)?$")>; -def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD2Twov(8b|4h|2s)(_POST)?$")>; -def : InstRW<[A53WriteVLD4, WriteAdr], (instregex "LD2Twov(16b|8h|4s|2d)(_POST)?$")>; - -def : InstRW<[A53WriteVLD2], (instregex "LD3i(8|16|32|64)$")>; -def : InstRW<[A53WriteVLD2], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[A53WriteVLD4], (instregex "LD3Threev(8b|4h|2s|1d|16b|8h|4s)$")>; -def : InstRW<[A53WriteVLD3], (instregex "LD3Threev(2d)$")>; -def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD3i(8|16|32|64)_POST$")>; -def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; -def : InstRW<[A53WriteVLD4, WriteAdr], (instregex "LD3Threev(8b|4h|2s|1d|16b|8h|4s)_POST$")>; -def : InstRW<[A53WriteVLD3, WriteAdr], (instregex "LD3Threev(2d)_POST$")>; - -def : InstRW<[A53WriteVLD2], (instregex "LD4i(8|16|32|64)$")>; -def : InstRW<[A53WriteVLD2], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[A53WriteVLD5], (instregex "LD4Fourv(8b|4h|2s|1d|16b|8h|4s)$")>; -def : InstRW<[A53WriteVLD4], (instregex "LD4Fourv(2d)$")>; -def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD4i(8|16|32|64)_POST$")>; -def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; -def : InstRW<[A53WriteVLD5, WriteAdr], (instregex "LD4Fourv(8b|4h|2s|1d|16b|8h|4s)_POST$")>; -def : InstRW<[A53WriteVLD4, WriteAdr], (instregex "LD4Fourv(2d)_POST$")>; - -//--- -// Vector Stores -//--- -def : InstRW<[A53WriteVST1], (instregex "ST1i(8|16|32|64)$")>; -def : InstRW<[A53WriteVST1], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[A53WriteVST1], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[A53WriteVST2], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def 
: InstRW<[A53WriteVST2], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[A53WriteVST1, WriteAdr], (instregex "ST1i(8|16|32|64)_POST$")>; -def : InstRW<[A53WriteVST1, WriteAdr], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; -def : InstRW<[A53WriteVST1, WriteAdr], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; -def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; -def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; - -def : InstRW<[A53WriteVST1], (instregex "ST2i(8|16|32|64)$")>; -def : InstRW<[A53WriteVST1], (instregex "ST2Twov(8b|4h|2s)$")>; -def : InstRW<[A53WriteVST2], (instregex "ST2Twov(16b|8h|4s|2d)$")>; -def : InstRW<[A53WriteVST1, WriteAdr], (instregex "ST2i(8|16|32|64)_POST$")>; -def : InstRW<[A53WriteVST1, WriteAdr], (instregex "ST2Twov(8b|4h|2s)_POST$")>; -def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST2Twov(16b|8h|4s|2d)_POST$")>; - -def : InstRW<[A53WriteVST2], (instregex "ST3i(8|16|32|64)$")>; -def : InstRW<[A53WriteVST3], (instregex "ST3Threev(8b|4h|2s|1d|16b|8h|4s)$")>; -def : InstRW<[A53WriteVST2], (instregex "ST3Threev(2d)$")>; -def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST3i(8|16|32|64)_POST$")>; -def : InstRW<[A53WriteVST3, WriteAdr], (instregex "ST3Threev(8b|4h|2s|1d|16b|8h|4s)_POST$")>; -def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST3Threev(2d)_POST$")>; - -def : InstRW<[A53WriteVST2], (instregex "ST4i(8|16|32|64)$")>; -def : InstRW<[A53WriteVST3], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s)$")>; -def : InstRW<[A53WriteVST2], (instregex "ST4Fourv(2d)$")>; -def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST4i(8|16|32|64)_POST$")>; -def : InstRW<[A53WriteVST3, WriteAdr], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s)_POST$")>; -def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST4Fourv(2d)_POST$")>; - -//--- -// Floating Point MAC, DIV, SQRT -//--- -def : InstRW<[A53WriteFMAC], (instregex "^FN?M(ADD|SUB).*")>; -def : InstRW<[A53WriteFMAC], (instregex "^FML(A|S).*")>; -def : InstRW<[A53WriteFDivSP], (instrs FDIVSrr)>; -def : InstRW<[A53WriteFDivDP], (instrs FDIVDrr)>; -def : InstRW<[A53WriteFDivSP], (instregex "^FDIVv.*32$")>; -def : InstRW<[A53WriteFDivDP], (instregex "^FDIVv.*64$")>; -def : InstRW<[A53WriteFSqrtSP], (instregex "^.*SQRT.*32$")>; -def : InstRW<[A53WriteFSqrtDP], (instregex "^.*SQRT.*64$")>; - -} diff --git a/lib/Target/ARM64/ARM64SchedCyclone.td b/lib/Target/ARM64/ARM64SchedCyclone.td deleted file mode 100644 index c04a7bb8baf..00000000000 --- a/lib/Target/ARM64/ARM64SchedCyclone.td +++ /dev/null @@ -1,865 +0,0 @@ -//=- ARMSchedCyclone.td - ARM64 Cyclone Scheduling Defs ------*- tablegen -*-=// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines the machine model for ARM64 Cyclone to support -// instruction scheduling and other instruction cost heuristics. -// -//===----------------------------------------------------------------------===// - -def CycloneModel : SchedMachineModel { - let IssueWidth = 6; // 6 micro-ops are dispatched per cycle. - let MicroOpBufferSize = 192; // Based on the reorder buffer. - let LoadLatency = 4; // Optimistic load latency. - let MispredictPenalty = 16; // 14-19 cycles are typical. 
-} - -//===----------------------------------------------------------------------===// -// Define each kind of processor resource and number available on Cyclone. - -// 4 integer pipes -def CyUnitI : ProcResource<4> { - let BufferSize = 48; -} - -// 2 branch units: I[0..1] -def CyUnitB : ProcResource<2> { - let Super = CyUnitI; - let BufferSize = 24; -} - -// 1 indirect-branch unit: I[0] -def CyUnitBR : ProcResource<1> { - let Super = CyUnitB; -} - -// 2 shifter pipes: I[2..3] -// When an instruction consumes a CyUnitIS, it also consumes a CyUnitI -def CyUnitIS : ProcResource<2> { - let Super = CyUnitI; - let BufferSize = 24; -} - -// 1 mul pipe: I[0] -def CyUnitIM : ProcResource<1> { - let Super = CyUnitBR; - let BufferSize = 32; -} - -// 1 div pipe: I[1] -def CyUnitID : ProcResource<1> { - let Super = CyUnitB; - let BufferSize = 16; -} - -// 1 integer division unit. This is driven by the ID pipe, but only -// consumes the pipe for one cycle at issue and another cycle at writeback. -def CyUnitIntDiv : ProcResource<1>; - -// 2 ld/st pipes. -def CyUnitLS : ProcResource<2> { - let BufferSize = 28; -} - -// 3 fp/vector pipes. -def CyUnitV : ProcResource<3> { - let BufferSize = 48; -} -// 2 fp/vector arithmetic and multiply pipes: V[0-1] -def CyUnitVM : ProcResource<2> { - let Super = CyUnitV; - let BufferSize = 32; -} -// 1 fp/vector division/sqrt pipe: V[2] -def CyUnitVD : ProcResource<1> { - let Super = CyUnitV; - let BufferSize = 16; -} -// 1 fp compare pipe: V[0] -def CyUnitVC : ProcResource<1> { - let Super = CyUnitVM; - let BufferSize = 16; -} - -// 2 fp division/square-root units. These are driven by the VD pipe, -// but only consume the pipe for one cycle at issue and a cycle at writeback. -def CyUnitFloatDiv : ProcResource<2>; - -//===----------------------------------------------------------------------===// -// Define scheduler read/write resources and latency on Cyclone. -// This mirrors sections 7.7-7.9 of the Tuning Guide v1.0.1. - -let SchedModel = CycloneModel in { - -//--- -// 7.8.1. Moves -//--- - -// A single nop micro-op (uX). -def WriteX : SchedWriteRes<[]> { let Latency = 0; } - -// Move zero is a register rename (to machine register zero). -// The move is replaced by a single nop micro-op. -// MOVZ Rd, #0 -// AND Rd, Rzr, #imm -def WriteZPred : SchedPredicate<[{TII->isGPRZero(MI)}]>; -def WriteImmZ : SchedWriteVariant<[ - SchedVar, - SchedVar]>; -def : InstRW<[WriteImmZ], (instrs MOVZWi,MOVZXi,ANDWri,ANDXri)>; - -// Move GPR is a register rename and single nop micro-op. -// ORR Xd, XZR, Xm -// ADD Xd, Xn, #0 -def WriteIMovPred : SchedPredicate<[{TII->isGPRCopy(MI)}]>; -def WriteVMovPred : SchedPredicate<[{TII->isFPRCopy(MI)}]>; -def WriteMov : SchedWriteVariant<[ - SchedVar, - SchedVar, - SchedVar]>; -def : InstRW<[WriteMov], (instrs COPY,ORRXrr,ADDXrr)>; - -// Move non-zero immediate is an integer ALU op. -// MOVN,MOVZ,MOVK -def : WriteRes; - -//--- -// 7.8.2-7.8.5. Arithmetic and Logical, Comparison, Conditional, -// Shifts and Bitfield Operations -//--- - -// ADR,ADRP -// ADD(S)ri,SUB(S)ri,AND(S)ri,EORri,ORRri -// ADD(S)rr,SUB(S)rr,AND(S)rr,BIC(S)rr,EONrr,EORrr,ORNrr,ORRrr -// ADC(S),SBC(S) -// Aliases: CMN, CMP, TST -// -// Conditional operations. -// CCMNi,CCMPi,CCMNr,CCMPr, -// CSEL,CSINC,CSINV,CSNEG -// -// Bit counting and reversal operations. -// CLS,CLZ,RBIT,REV,REV16,REV32 -def : WriteRes; - -// ADD with shifted register operand is a single micro-op that -// consumes a shift pipeline for two cycles. 
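Costed out, that is a WriteRes on one of the shifter pipes with a two-cycle occupancy; the unit binding here is an assumption based on the pipe descriptions above rather than a quotation:

    // ADD/SUB/logical ops with a shifted-register operand: one micro-op that
    // occupies a CyUnitIS shifter pipe for two cycles.
    def : WriteRes<WriteISReg, [CyUnitIS]> {
      let Latency = 2;
      let ResourceCycles = [2];
    }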
-// ADD(S)rs,SUB(S)rs,AND(S)rs,BIC(S)rs,EONrs,EORrs,ORNrs,ORRrs -// EXAMPLE: ADDrs Xn, Xm LSL #imm -def : WriteRes { - let Latency = 2; - let ResourceCycles = [2]; -} - -// ADD with extended register operand is the same as shifted reg operand. -// ADD(S)re,SUB(S)re -// EXAMPLE: ADDXre Xn, Xm, UXTB #1 -def : WriteRes { - let Latency = 2; - let ResourceCycles = [2]; -} - -// Variable shift and bitfield operations. -// ASRV,LSLV,LSRV,RORV,BFM,SBFM,UBFM -def : WriteRes; - -// EXTR Shifts a pair of registers and requires two micro-ops. -// The second micro-op is delayed, as modeled by ReadExtrHi. -// EXTR Xn, Xm, #imm -def : WriteRes { - let Latency = 2; - let NumMicroOps = 2; -} - -// EXTR's first register read is delayed by one cycle, effectively -// shortening its writer's latency. -// EXTR Xn, Xm, #imm -def : ReadAdvance; - -//--- -// 7.8.6. Multiplies -//--- - -// MUL/MNEG are aliases for MADD/MSUB. -// MADDW,MSUBW,SMADDL,SMSUBL,UMADDL,UMSUBL -def : WriteRes { - let Latency = 4; -} -// MADDX,MSUBX,SMULH,UMULH -def : WriteRes { - let Latency = 5; -} - -//--- -// 7.8.7. Divide -//--- - -// 32-bit divide takes 7-13 cycles. 10 cycles covers a 20-bit quotient. -// The ID pipe is consumed for 2 cycles: issue and writeback. -// SDIVW,UDIVW -def : WriteRes { - let Latency = 10; - let ResourceCycles = [2, 10]; -} -// 64-bit divide takes 7-21 cycles. 13 cycles covers a 32-bit quotient. -// The ID pipe is consumed for 2 cycles: issue and writeback. -// SDIVX,UDIVX -def : WriteRes { - let Latency = 13; - let ResourceCycles = [2, 13]; -} - -//--- -// 7.8.8,7.8.10. Load/Store, single element -//--- - -// Integer loads take 4 cycles and use one LS unit for one cycle. -def : WriteRes { - let Latency = 4; -} - -// Store-load forwarding is 4 cycles. -// -// Note: The store-exclusive sequence incorporates this -// latency. However, general heuristics should not model the -// dependence between a store and subsequent may-alias load because -// hardware speculation works. -def : WriteRes { - let Latency = 4; -} - -// Load from base address plus an optionally scaled register offset. -// Rt latency is latency WriteIS + WriteLD. -// EXAMPLE: LDR Xn, Xm [, lsl 3] -def CyWriteLDIdx : SchedWriteVariant<[ - SchedVar, // Load from scaled register. - SchedVar]>; // Load from register offset. -def : SchedAlias; // Map ARM64->Cyclone type. - -// EXAMPLE: STR Xn, Xm [, lsl 3] -def CyWriteSTIdx : SchedWriteVariant<[ - SchedVar, // Store to scaled register. - SchedVar]>; // Store to register offset. -def : SchedAlias; // Map ARM64->Cyclone type. - -// Read the (unshifted) base register Xn in the second micro-op one cycle later. -// EXAMPLE: LDR Xn, Xm [, lsl 3] -def ReadBaseRS : SchedReadAdvance<1>; -def CyReadAdrBase : SchedReadVariant<[ - SchedVar, // Read base reg after shifting offset. - SchedVar]>; // Read base reg with no shift. -def : SchedAlias; // Map ARM64->Cyclone type. - -//--- -// 7.8.9,7.8.11. Load/Store, paired -//--- - -// Address pre/post increment is a simple ALU op with one cycle latency. -def : WriteRes; - -// LDP high register write is fused with the load, but a nop micro-op remains. -def : WriteRes { - let Latency = 4; -} - -// STP is a vector op and store, except for QQ, which is just two stores. -def : SchedAlias; -def : InstRW<[WriteST, WriteST], (instrs STPQi)>; - -//--- -// 7.8.13. Branches -//--- - -// Branches take a single micro-op. -// The misprediction penalty is defined as a SchedMachineModel property. -def : WriteRes {let Latency = 0;} -def : WriteRes {let Latency = 0;} - -//--- -// 7.8.14. 
Never-issued Instructions, Barrier and Hint Operations -//--- - -// NOP,SEV,SEVL,WFE,WFI,YIELD -def : WriteRes {let Latency = 0;} -// ISB -def : InstRW<[WriteI], (instrs ISB)>; -// SLREX,DMB,DSB -def : WriteRes; - -// System instructions get an invalid latency because the latency of -// other operations across them is meaningless. -def : WriteRes {let Latency = -1;} - -//===----------------------------------------------------------------------===// -// 7.9 Vector Unit Instructions - -// Simple vector operations take 2 cycles. -def : WriteRes {let Latency = 2;} - -// Define some longer latency vector op types for Cyclone. -def CyWriteV3 : SchedWriteRes<[CyUnitV]> {let Latency = 3;} -def CyWriteV4 : SchedWriteRes<[CyUnitV]> {let Latency = 4;} -def CyWriteV5 : SchedWriteRes<[CyUnitV]> {let Latency = 5;} -def CyWriteV6 : SchedWriteRes<[CyUnitV]> {let Latency = 6;} - -// Simple floating-point operations take 2 cycles. -def : WriteRes {let Latency = 2;} - -//--- -// 7.9.1 Vector Moves -//--- - -// TODO: Add Cyclone-specific zero-cycle zeros. LLVM currently -// generates expensive int-float conversion instead: -// FMOVDi Dd, #0.0 -// FMOVv2f64ns Vd.2d, #0.0 - -// FMOVSi,FMOVDi -def : WriteRes {let Latency = 2;} - -// MOVI,MVNI are WriteV -// FMOVv2f32ns,FMOVv2f64ns,FMOVv4f32ns are WriteV - -// Move FPR is a register rename and single nop micro-op. -// ORR.16b Vd,Vn,Vn -// COPY is handled above in the WriteMov Variant. -def WriteVMov : SchedWriteVariant<[ - SchedVar, - SchedVar]>; -def : InstRW<[WriteVMov], (instrs ORRv16i8)>; - -// FMOVSr,FMOVDr are WriteF. - -// MOV V,V is a WriteV. - -// CPY D,V[x] is a WriteV - -// INS V[x],V[y] is a WriteV. - -// FMOVWSr,FMOVXDr,FMOVXDHighr -def : WriteRes { - let Latency = 5; -} - -// FMOVSWr,FMOVDXr -def : InstRW<[WriteLD], (instrs FMOVSWr,FMOVDXr,FMOVDXHighr)>; - -// INS V[x],R -def CyWriteCopyToFPR : WriteSequence<[WriteVLD, WriteV]>; -def : InstRW<[CyWriteCopyToFPR], (instregex "INSv")>; - -// SMOV,UMOV R,V[x] -def CyWriteCopyToGPR : WriteSequence<[WriteLD, WriteI]>; -def : InstRW<[CyWriteCopyToGPR], (instregex "SMOVv","UMOVv")>; - -// DUP V,R -def : InstRW<[CyWriteCopyToFPR], (instregex "DUPv")>; - -// DUP V,V[x] is a WriteV. - -//--- -// 7.9.2 Integer Arithmetic, Logical, and Comparisons -//--- - -// BIC,ORR V,#imm are WriteV - -def : InstRW<[CyWriteV3], (instregex "ABSv")>; - -// MVN,NEG,NOT are WriteV - -def : InstRW<[CyWriteV3], (instregex "SQABSv","SQNEGv")>; - -// ADDP is a WriteV. -def CyWriteVADDLP : SchedWriteRes<[CyUnitV]> {let Latency = 2;} -def : InstRW<[CyWriteVADDLP], (instregex "SADDLPv","UADDLPv")>; - -def : InstRW<[CyWriteV3], - (instregex "ADDVv","SMAXVv","UMAXVv","SMINVv","UMINVv")>; - -def : InstRW<[CyWriteV3], (instregex "SADDLV","UADDLV")>; - -// ADD,SUB are WriteV - -// Forward declare. -def CyWriteVABD : SchedWriteRes<[CyUnitV]> {let Latency = 3;} - -// Add/Diff and accumulate uses the vector multiply unit. 
-def CyWriteVAccum : SchedWriteRes<[CyUnitVM]> {let Latency = 3;} -def CyReadVAccum : SchedReadAdvance<1, - [CyWriteVAccum, CyWriteVADDLP, CyWriteVABD]>; - -def : InstRW<[CyWriteVAccum, CyReadVAccum], - (instregex "SADALP","UADALP")>; - -def : InstRW<[CyWriteVAccum, CyReadVAccum], - (instregex "SABAv","UABAv","SABALv","UABALv")>; - -def : InstRW<[CyWriteV3], (instregex "SQADDv","SQSUBv","UQADDv","UQSUBv")>; - -def : InstRW<[CyWriteV3], (instregex "SUQADDv","USQADDv")>; - -def : InstRW<[CyWriteV4], (instregex "ADDHNv","RADDHNv", "RSUBHNv", "SUBHNv")>; - -// WriteV includes: -// AND,BIC,CMTST,EOR,ORN,ORR -// ADDP -// SHADD,SHSUB,SRHADD,UHADD,UHSUB,URHADD -// SADDL,SSUBL,UADDL,USUBL -// SADDW,SSUBW,UADDW,USUBW - -def : InstRW<[CyWriteV3], (instregex "CMEQv","CMGEv","CMGTv", - "CMLEv","CMLTv", - "CMHIv","CMHSv")>; - -def : InstRW<[CyWriteV3], (instregex "SMAXv","SMINv","UMAXv","UMINv", - "SMAXPv","SMINPv","UMAXPv","UMINPv")>; - -def : InstRW<[CyWriteVABD], (instregex "SABDv","UABDv", - "SABDLv","UABDLv")>; - -//--- -// 7.9.3 Floating Point Arithmetic and Comparisons -//--- - -// FABS,FNEG are WriteF - -def : InstRW<[CyWriteV4], (instrs FADDPv2i32p)>; -def : InstRW<[CyWriteV5], (instrs FADDPv2i64p)>; - -def : InstRW<[CyWriteV3], (instregex "FMAXPv2i","FMAXNMPv2i", - "FMINPv2i","FMINNMPv2i")>; - -def : InstRW<[CyWriteV4], (instregex "FMAXVv","FMAXNMVv","FMINVv","FMINNMVv")>; - -def : InstRW<[CyWriteV4], (instrs FADDSrr,FADDv2f32,FADDv4f32, - FSUBSrr,FSUBv2f32,FSUBv4f32, - FADDPv2f32,FADDPv4f32, - FABD32,FABDv2f32,FABDv4f32)>; -def : InstRW<[CyWriteV5], (instrs FADDDrr,FADDv2f64, - FSUBDrr,FSUBv2f64, - FADDPv2f64, - FABD64,FABDv2f64)>; - -def : InstRW<[CyWriteV3], (instregex "FCMEQ","FCMGT","FCMLE","FCMLT")>; - -def : InstRW<[CyWriteV3], (instregex "FACGE","FACGT", - "FMAXS","FMAXD","FMAXv", - "FMINS","FMIND","FMINv", - "FMAXNMS","FMAXNMD","FMAXNMv", - "FMINNMS","FMINNMD","FMINNMv", - "FMAXPv2f","FMAXPv4f", - "FMINPv2f","FMINPv4f", - "FMAXNMPv2f","FMAXNMPv4f", - "FMINNMPv2f","FMINNMPv4f")>; - -// FCMP,FCMPE,FCCMP,FCCMPE -def : WriteRes {let Latency = 4;} - -// FCSEL is a WriteF. - -//--- -// 7.9.4 Shifts and Bitfield Operations -//--- - -// SHL is a WriteV - -def CyWriteVSHR : SchedWriteRes<[CyUnitV]> {let Latency = 2;} -def : InstRW<[CyWriteVSHR], (instregex "SSHRv","USHRv")>; - -def CyWriteVSRSHR : SchedWriteRes<[CyUnitV]> {let Latency = 3;} -def : InstRW<[CyWriteVSRSHR], (instregex "SRSHRv","URSHRv")>; - -// Shift and accumulate uses the vector multiply unit. -def CyWriteVShiftAcc : SchedWriteRes<[CyUnitVM]> {let Latency = 3;} -def CyReadVShiftAcc : SchedReadAdvance<1, - [CyWriteVShiftAcc, CyWriteVSHR, CyWriteVSRSHR]>; -def : InstRW<[CyWriteVShiftAcc, CyReadVShiftAcc], - (instregex "SRSRAv","SSRAv","URSRAv","USRAv")>; - -// SSHL,USHL are WriteV. - -def : InstRW<[CyWriteV3], (instregex "SRSHLv","URSHLv")>; - -// SQSHL,SQSHLU,UQSHL are WriteV. - -def : InstRW<[CyWriteV3], (instregex "SQRSHLv","UQRSHLv")>; - -// WriteV includes: -// SHLL,SSHLL,USHLL -// SLI,SRI -// BIF,BIT,BSL -// EXT -// CLS,CLZ,CNT,RBIT,REV16,REV32,REV64,XTN -// XTN2 - -def : InstRW<[CyWriteV4], - (instregex "RSHRNv","SHRNv", - "SQRSHRNv","SQRSHRUNv","SQSHRNv","SQSHRUNv", - "UQRSHRNv","UQSHRNv","SQXTNv","SQXTUNv","UQXTNv")>; - -//--- -// 7.9.5 Multiplication -//--- - -def CyWriteVMul : SchedWriteRes<[CyUnitVM]> { let Latency = 4;} -def : InstRW<[CyWriteVMul], (instregex "MULv","SMULLv","UMULLv", - "SQDMULLv","SQDMULHv","SQRDMULHv")>; - -// FMUL,FMULX,FNMUL default to WriteFMul. 
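Those multiplies land on a VM pipe with a 4-cycle latency, and the accumulator operand of the multiply-accumulates is read one cycle later than the other operands, so a dependent accumulate chain sees one cycle less of that latency. A sketch of the combination (the WriteFMul unit binding is inferred, and the instregex list is shortened):

    def : WriteRes<WriteFMul, [CyUnitVM]> { let Latency = 4; }
    // Accumulator forwarding: MLA/MLS need their addend one cycle later.
    def CyReadVMulAcc : SchedReadAdvance<1, [CyWriteVMul, CyWriteV64Mul]>;
    def : InstRW<[CyWriteVMul, CyReadVMulAcc], (instregex "MLA", "MLS")>;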
-def : WriteRes { let Latency = 4;} - -def CyWriteV64Mul : SchedWriteRes<[CyUnitVM]> { let Latency = 5;} -def : InstRW<[CyWriteV64Mul], (instrs FMULDrr,FMULv2f64,FMULv2i64_indexed, - FNMULDrr,FMULX64,FMULXv2f64,FMULXv2i64_indexed)>; - -def CyReadVMulAcc : SchedReadAdvance<1, [CyWriteVMul, CyWriteV64Mul]>; -def : InstRW<[CyWriteVMul, CyReadVMulAcc], - (instregex "MLA","MLS","SMLAL","SMLSL","UMLAL","UMLSL", - "SQDMLAL","SQDMLSL")>; - -def CyWriteSMul : SchedWriteRes<[CyUnitVM]> { let Latency = 8;} -def CyWriteDMul : SchedWriteRes<[CyUnitVM]> { let Latency = 10;} -def CyReadSMul : SchedReadAdvance<4, [CyWriteSMul]>; -def CyReadDMul : SchedReadAdvance<5, [CyWriteDMul]>; - -def : InstRW<[CyWriteSMul, CyReadSMul], - (instrs FMADDSrrr,FMSUBSrrr,FNMADDSrrr,FNMSUBSrrr, - FMLAv2f32,FMLAv4f32, - FMLAv1i32_indexed,FMLAv1i64_indexed,FMLAv2i32_indexed)>; -def : InstRW<[CyWriteDMul, CyReadDMul], - (instrs FMADDDrrr,FMSUBDrrr,FNMADDDrrr,FNMSUBDrrr, - FMLAv2f64,FMLAv2i64_indexed, - FMLSv2f64,FMLSv2i64_indexed)>; - -def CyWritePMUL : SchedWriteRes<[CyUnitVD]> { let Latency = 3; } -def : InstRW<[CyWritePMUL], (instregex "PMULv", "PMULLv")>; - -//--- -// 7.9.6 Divide and Square Root -//--- - -// FDIV,FSQRT -// TODO: Add 64-bit variant with 19 cycle latency. -// TODO: Specialize FSQRT for longer latency. -def : WriteRes { - let Latency = 17; - let ResourceCycles = [2, 17]; -} - -def : InstRW<[CyWriteV4], (instregex "FRECPEv","FRECPXv","URECPEv","URSQRTEv")>; - -def WriteFRSQRTE : SchedWriteRes<[CyUnitVM]> { let Latency = 4; } -def : InstRW<[WriteFRSQRTE], (instregex "FRSQRTEv")>; - -def WriteFRECPS : SchedWriteRes<[CyUnitVM]> { let Latency = 8; } -def WriteFRSQRTS : SchedWriteRes<[CyUnitVM]> { let Latency = 10; } -def : InstRW<[WriteFRECPS], (instregex "FRECPSv")>; -def : InstRW<[WriteFRSQRTS], (instregex "FRSQRTSv")>; - -//--- -// 7.9.7 Integer-FP Conversions -//--- - -// FCVT lengthen f16/s32 -def : InstRW<[WriteV], (instrs FCVTSHr,FCVTDHr,FCVTDSr)>; - -// FCVT,FCVTN,FCVTXN -// SCVTF,UCVTF V,V -// FRINT(AIMNPXZ) V,V -def : WriteRes {let Latency = 4;} - -// SCVT/UCVT S/D, Rd = VLD5+V4: 9 cycles. -def CyWriteCvtToFPR : WriteSequence<[WriteVLD, CyWriteV4]>; -def : InstRW<[CyWriteCopyToFPR], (instregex "FCVT[AMNPZ][SU][SU][WX][SD]r")>; - -// FCVT Rd, S/D = V6+LD4: 10 cycles -def CyWriteCvtToGPR : WriteSequence<[CyWriteV6, WriteLD]>; -def : InstRW<[CyWriteCvtToGPR], (instregex "[SU]CVTF[SU][WX][SD]r")>; - -// FCVTL is a WriteV - -//--- -// 7.9.8-7.9.10 Cryptography, Data Transposition, Table Lookup -//--- - -def CyWriteCrypto2 : SchedWriteRes<[CyUnitVD]> {let Latency = 2;} -def : InstRW<[CyWriteCrypto2], (instrs AESIMCrr, AESMCrr, SHA1Hrr, - AESDrr, AESErr, SHA1SU1rr, SHA256SU0rr, - SHA1SU0rrr)>; - -def CyWriteCrypto3 : SchedWriteRes<[CyUnitVD]> {let Latency = 3;} -def : InstRW<[CyWriteCrypto3], (instrs SHA256SU1rrr)>; - -def CyWriteCrypto6 : SchedWriteRes<[CyUnitVD]> {let Latency = 6;} -def : InstRW<[CyWriteCrypto6], (instrs SHA1Crrr, SHA1Mrrr, SHA1Prrr, - SHA256Hrrr,SHA256H2rrr)>; - -// TRN,UZP,ZUP are WriteV. - -// TBL,TBX are WriteV. - -//--- -// 7.9.11-7.9.14 Load/Store, single element and paired -//--- - -// Loading into the vector unit takes 5 cycles vs 4 for integer loads. -def : WriteRes { - let Latency = 5; -} - -// Store-load forwarding is 4 cycles. -def : WriteRes { - let Latency = 4; -} - -// WriteVLDPair/VSTPair sequences are expanded by the target description. - -//--- -// 7.9.15 Load, element operations -//--- - -// Only the first WriteVLD and WriteAdr for writeback matches def operands. 
-// Subsequent WriteVLDs consume resources. Since all loaded values have the -// same latency, this is acceptable. - -// Vd is read 5 cycles after issuing the vector load. -def : ReadAdvance; - -def : InstRW<[WriteVLD], - (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[WriteVLD, WriteAdr], - (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>; - -// Register writes from the load's high half are fused micro-ops. -def : InstRW<[WriteVLD], - (instregex "LD1Twov(8b|4h|2s|1d)$")>; -def : InstRW<[WriteVLD, WriteAdr], - (instregex "LD1Twov(8b|4h|2s|1d)_POST")>; -def : InstRW<[WriteVLD, WriteVLD], - (instregex "LD1Twov(16b|8h|4s|2d)$")>; -def : InstRW<[WriteVLD, WriteAdr, WriteVLD], - (instregex "LD1Twov(16b|8h|4s|2d)_POST")>; - -def : InstRW<[WriteVLD, WriteVLD], - (instregex "LD1Threev(8b|4h|2s|1d)$")>; -def : InstRW<[WriteVLD, WriteAdr, WriteVLD], - (instregex "LD1Threev(8b|4h|2s|1d)_POST")>; -def : InstRW<[WriteVLD, WriteVLD, WriteVLD], - (instregex "LD1Threev(16b|8h|4s|2d)$")>; -def : InstRW<[WriteVLD, WriteAdr, WriteVLD, WriteVLD], - (instregex "LD1Threev(16b|8h|4s|2d)_POST")>; - -def : InstRW<[WriteVLD, WriteVLD], - (instregex "LD1Fourv(8b|4h|2s|1d)$")>; -def : InstRW<[WriteVLD, WriteAdr, WriteVLD], - (instregex "LD1Fourv(8b|4h|2s|1d)_POST")>; -def : InstRW<[WriteVLD, WriteVLD, WriteVLD, WriteVLD], - (instregex "LD1Fourv(16b|8h|4s|2d)$")>; -def : InstRW<[WriteVLD, WriteAdr, WriteVLD, WriteVLD, WriteVLD], - (instregex "LD1Fourv(16b|8h|4s|2d)_POST")>; - -def : InstRW<[WriteVLDShuffle, ReadVLD], - (instregex "LD1i(8|16|32)$")>; -def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr], - (instregex "LD1i(8|16|32)_POST")>; - -def : InstRW<[WriteVLDShuffle, ReadVLD], (instrs LD1i64)>; -def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr],(instrs LD1i64_POST)>; - -def : InstRW<[WriteVLDShuffle], - (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[WriteVLDShuffle, WriteAdr], - (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; - -def : InstRW<[WriteVLDShuffle, WriteV], - (instregex "LD2Twov(8b|4h|2s)$")>; -def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV], - (instregex "LD2Twov(8b|4h|2s)_POST$")>; -def : InstRW<[WriteVLDShuffle, WriteVLDShuffle], - (instregex "LD2Twov(16b|8h|4s|2d)$")>; -def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle], - (instregex "LD2Twov(16b|8h|4s|2d)_POST")>; - -def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV], - (instregex "LD2i(8|16|32)$")>; -def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV], - (instregex "LD2i(8|16|32)_POST")>; -def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV], - (instregex "LD2i64$")>; -def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV], - (instregex "LD2i64_POST")>; - -def : InstRW<[WriteVLDShuffle, WriteV], - (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV], - (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>; - -def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV], - (instregex "LD3Threev(8b|4h|2s)$")>; -def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV], - (instregex "LD3Threev(8b|4h|2s)_POST")>; -def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteVLDShuffle], - (instregex "LD3Threev(16b|8h|4s|2d)$")>; -def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteVLDShuffle], - (instregex "LD3Threev(16b|8h|4s|2d)_POST")>; - -def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV, WriteV], - (instregex "LD3i(8|16|32)$")>; -def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV, WriteV], - (instregex "LD3i(8|16|32)_POST")>; - -def : 
InstRW<[WriteVLDShuffle, ReadVLD, WriteVLDShuffle, WriteV], - (instregex "LD3i64$")>; -def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteVLDShuffle, WriteV], - (instregex "LD3i64_POST")>; - -def : InstRW<[WriteVLDShuffle, WriteV, WriteV], - (instregex "LD3Rv(8b|4h|2s|16b|8h|4s)$")>; -def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV, WriteV], - (instregex "LD3Rv(8b|4h|2s|16b|8h|4s)_POST")>; - -def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV], - (instrs LD3Rv1d,LD3Rv2d)>; -def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV], - (instrs LD3Rv2d_POST,LD3Rv2d_POST)>; - -def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV, WriteV], - (instregex "LD4Fourv(8b|4h|2s)$")>; -def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV, WriteV], - (instregex "LD4Fourv(8b|4h|2s)_POST")>; -def : InstRW<[WriteVLDPairShuffle, WriteVLDPairShuffle, - WriteVLDPairShuffle, WriteVLDPairShuffle], - (instregex "LD4Fourv(16b|8h|4s|2d)$")>; -def : InstRW<[WriteVLDPairShuffle, WriteAdr, WriteVLDPairShuffle, - WriteVLDPairShuffle, WriteVLDPairShuffle], - (instregex "LD4Fourv(16b|8h|4s|2d)_POST")>; - -def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV, WriteV, WriteV], - (instregex "LD4i(8|16|32)$")>; -def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV, WriteV, WriteV], - (instregex "LD4i(8|16|32)_POST")>; - - -def : InstRW<[WriteVLDShuffle, ReadVLD, WriteVLDShuffle, WriteV, WriteV], - (instrs LD4i64)>; -def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteVLDShuffle, WriteV], - (instrs LD4i64_POST)>; - -def : InstRW<[WriteVLDShuffle, WriteV, WriteV, WriteV], - (instregex "LD4Rv(8b|4h|2s|16b|8h|4s)$")>; -def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV, WriteV, WriteV], - (instregex "LD4Rv(8b|4h|2s|16b|8h|4s)_POST")>; - -def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV, WriteV], - (instrs LD4Rv1d,LD4Rv2d)>; -def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV, WriteV], - (instrs LD4Rv1d_POST,LD4Rv2d_POST)>; - -//--- -// 7.9.16 Store, element operations -//--- - -// Only the WriteAdr for writeback matches a def operands. -// Subsequent WriteVLDs only consume resources. 
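Condensed, the writeback convention in this model is: a post-indexed store defines only the updated base register, so WriteAdr is listed first and the remaining writes just consume resources, while a post-indexed load lists its loaded-value writes first and WriteAdr after them. For example (regex lists shortened for illustration):

    def : InstRW<[WriteAdr, WriteVST], (instregex "ST1Onev(8b|4h|2s|1d)_POST")>;
    def : InstRW<[WriteVLD, WriteAdr], (instregex "LD1Onev(8b|4h|2s|1d)_POST")>;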
- -def : InstRW<[WriteVST], - (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[WriteAdr, WriteVST], - (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>; - -def : InstRW<[WriteVSTShuffle], - (instregex "ST1Twov(8b|4h|2s|1d)$")>; -def : InstRW<[WriteAdr, WriteVSTShuffle], - (instregex "ST1Twov(8b|4h|2s|1d)_POST")>; -def : InstRW<[WriteVST, WriteVST], - (instregex "ST1Twov(16b|8h|4s|2d)$")>; -def : InstRW<[WriteAdr, WriteVST, WriteVST], - (instregex "ST1Twov(16b|8h|4s|2d)_POST")>; - -def : InstRW<[WriteVSTShuffle, WriteVST], - (instregex "ST1Threev(8b|4h|2s|1d)$")>; -def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVST], - (instregex "ST1Threev(8b|4h|2s|1d)_POST")>; -def : InstRW<[WriteVST, WriteVST, WriteVST], - (instregex "ST1Threev(16b|8h|4s|2d)$")>; -def : InstRW<[WriteAdr, WriteVST, WriteVST, WriteVST], - (instregex "ST1Threev(16b|8h|4s|2d)_POST")>; - -def : InstRW<[WriteVSTShuffle, WriteVSTShuffle], - (instregex "ST1Fourv(8b|4h|2s|1d)$")>; -def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle], - (instregex "ST1Fourv(8b|4h|2s|1d)_POST")>; -def : InstRW<[WriteVST, WriteVST, WriteVST, WriteVST], - (instregex "ST1Fourv(16b|8h|4s|2d)$")>; -def : InstRW<[WriteAdr, WriteVST, WriteVST, WriteVST, WriteVST], - (instregex "ST1Fourv(16b|8h|4s|2d)_POST")>; - -def : InstRW<[WriteVSTShuffle], (instregex "ST1i(8|16|32)$")>; -def : InstRW<[WriteAdr, WriteVSTShuffle], (instregex "ST1i(8|16|32)_POST")>; - -def : InstRW<[WriteVSTShuffle], (instrs ST1i64)>; -def : InstRW<[WriteAdr, WriteVSTShuffle], (instrs ST1i64_POST)>; - -def : InstRW<[WriteVSTShuffle], - (instregex "ST2Twov(8b|4h|2s)$")>; -def : InstRW<[WriteAdr, WriteVSTShuffle], - (instregex "ST2Twov(8b|4h|2s)_POST")>; -def : InstRW<[WriteVSTShuffle, WriteVSTShuffle], - (instregex "ST2Twov(16b|8h|4s|2d)$")>; -def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle], - (instregex "ST2Twov(16b|8h|4s|2d)_POST")>; - -def : InstRW<[WriteVSTShuffle], (instregex "ST2i(8|16|32)$")>; -def : InstRW<[WriteAdr, WriteVSTShuffle], (instregex "ST2i(8|16|32)_POST")>; -def : InstRW<[WriteVSTShuffle], (instrs ST2i64)>; -def : InstRW<[WriteAdr, WriteVSTShuffle], (instrs ST2i64_POST)>; - -def : InstRW<[WriteVSTShuffle, WriteVSTShuffle], - (instregex "ST3Threev(8b|4h|2s)$")>; -def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle], - (instregex "ST3Threev(8b|4h|2s)_POST")>; -def : InstRW<[WriteVSTShuffle, WriteVSTShuffle, WriteVSTShuffle], - (instregex "ST3Threev(16b|8h|4s|2d)$")>; -def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle, WriteVSTShuffle], - (instregex "ST3Threev(16b|8h|4s|2d)_POST")>; - -def : InstRW<[WriteVSTShuffle], (instregex "ST3i(8|16|32)$")>; -def : InstRW<[WriteAdr, WriteVSTShuffle], (instregex "ST3i(8|16|32)_POST")>; - -def :InstRW<[WriteVSTShuffle, WriteVSTShuffle], (instrs ST3i64)>; -def :InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle], (instrs ST3i64_POST)>; - -def : InstRW<[WriteVSTPairShuffle, WriteVSTPairShuffle], - (instregex "ST4Fourv(8b|4h|2s|1d)$")>; -def : InstRW<[WriteAdr, WriteVSTPairShuffle, WriteVSTPairShuffle], - (instregex "ST4Fourv(8b|4h|2s|1d)_POST")>; -def : InstRW<[WriteVSTPairShuffle, WriteVSTPairShuffle, - WriteVSTPairShuffle, WriteVSTPairShuffle], - (instregex "ST4Fourv(16b|8h|4s|2d)$")>; -def : InstRW<[WriteAdr, WriteVSTPairShuffle, WriteVSTPairShuffle, - WriteVSTPairShuffle, WriteVSTPairShuffle], - (instregex "ST4Fourv(16b|8h|4s|2d)_POST")>; - -def : InstRW<[WriteVSTPairShuffle], (instregex "ST4i(8|16|32)$")>; -def : InstRW<[WriteAdr, WriteVSTPairShuffle], (instregex 
"ST4i(8|16|32)_POST")>; - -def : InstRW<[WriteVSTShuffle, WriteVSTShuffle], (instrs ST4i64)>; -def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],(instrs ST4i64_POST)>; - -//--- -// Unused SchedRead types -//--- - -def : ReadAdvance; -def : ReadAdvance; -def : ReadAdvance; -def : ReadAdvance; -def : ReadAdvance; -def : ReadAdvance; - -} // SchedModel = CycloneModel diff --git a/lib/Target/ARM64/ARM64Schedule.td b/lib/Target/ARM64/ARM64Schedule.td deleted file mode 100644 index 3a4194173a8..00000000000 --- a/lib/Target/ARM64/ARM64Schedule.td +++ /dev/null @@ -1,104 +0,0 @@ -//===-- ARMSchedule.td - ARM Scheduling Definitions --------*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -// Define TII for use in SchedVariant Predicates. -// const MachineInstr *MI and const TargetSchedModel *SchedModel -// are defined by default. -def : PredicateProlog<[{ - const ARM64InstrInfo *TII = - static_cast(SchedModel->getInstrInfo()); - (void)TII; -}]>; - -// ARM64 Scheduler Definitions - -def WriteImm : SchedWrite; // MOVN, MOVZ -// TODO: Provide variants for MOV32/64imm Pseudos that dynamically -// select the correct sequence of WriteImms. - -def WriteI : SchedWrite; // ALU -def WriteISReg : SchedWrite; // ALU of Shifted-Reg -def WriteIEReg : SchedWrite; // ALU of Extended-Reg -def ReadI : SchedRead; // ALU -def ReadISReg : SchedRead; // ALU of Shifted-Reg -def ReadIEReg : SchedRead; // ALU of Extended-Reg -def WriteExtr : SchedWrite; // EXTR shifts a reg pair -def ReadExtrHi : SchedRead; // Read the high reg of the EXTR pair -def WriteIS : SchedWrite; // Shift/Scale -def WriteID32 : SchedWrite; // 32-bit Divide -def WriteID64 : SchedWrite; // 64-bit Divide -def ReadID : SchedRead; // 32/64-bit Divide -def WriteIM32 : SchedWrite; // 32-bit Multiply -def WriteIM64 : SchedWrite; // 64-bit Multiply -def ReadIM : SchedRead; // 32/64-bit Multiply -def ReadIMA : SchedRead; // 32/64-bit Multiply Accumulate -def WriteBr : SchedWrite; // Branch -def WriteBrReg : SchedWrite; // Indirect Branch - -def WriteLD : SchedWrite; // Load from base addr plus immediate offset -def WriteST : SchedWrite; // Store to base addr plus immediate offset -def WriteSTP : SchedWrite; // Store a register pair. -def WriteAdr : SchedWrite; // Address pre/post increment. - -def WriteLDIdx : SchedWrite; // Load from a register index (maybe scaled). -def WriteSTIdx : SchedWrite; // Store to a register index (maybe scaled). -def ReadAdrBase : SchedRead; // Read the base resister of a reg-offset LD/ST. - -// Predicate for determining when a shiftable register is shifted. -def RegShiftedPred : SchedPredicate<[{TII->hasShiftedReg(MI)}]>; - -// Predicate for determining when a extendedable register is extended. -def RegExtendedPred : SchedPredicate<[{TII->hasExtendedReg(MI)}]>; - -// ScaledIdxPred is true if a WriteLDIdx operand will be -// scaled. Subtargets can use this to dynamically select resources and -// latency for WriteLDIdx and ReadAdrBase. -def ScaledIdxPred : SchedPredicate<[{TII->isScaledAddr(MI)}]>; - -// Serialized two-level address load. -// EXAMPLE: LOADGot -def WriteLDAdr : WriteSequence<[WriteAdr, WriteLD]>; - -// Serialized two-level address lookup. -// EXAMPLE: MOVaddr... -def WriteAdrAdr : WriteSequence<[WriteAdr, WriteAdr]>; - -// The second register of a load-pair. 
-// LDP,LDPSW,LDNP,LDXP,LDAXP -def WriteLDHi : SchedWrite; - -// Store-exclusive is a store followed by a dependent load. -def WriteSTX : WriteSequence<[WriteST, WriteLD]>; - -def WriteSys : SchedWrite; // Long, variable latency system ops. -def WriteBarrier : SchedWrite; // Memory barrier. -def WriteHint : SchedWrite; // Hint instruction. - -def WriteF : SchedWrite; // General floating-point ops. -def WriteFCmp : SchedWrite; // Floating-point compare. -def WriteFCvt : SchedWrite; // Float conversion. -def WriteFCopy : SchedWrite; // Float-int register copy. -def WriteFImm : SchedWrite; // Floating-point immediate. -def WriteFMul : SchedWrite; // Floating-point multiply. -def WriteFDiv : SchedWrite; // Floating-point division. - -def WriteV : SchedWrite; // Vector ops. -def WriteVLD : SchedWrite; // Vector loads. -def WriteVST : SchedWrite; // Vector stores. - -// Read the unwritten lanes of the VLD's destination registers. -def ReadVLD : SchedRead; - -// Sequential vector load and shuffle. -def WriteVLDShuffle : WriteSequence<[WriteVLD, WriteV]>; -def WriteVLDPairShuffle : WriteSequence<[WriteVLD, WriteV, WriteV]>; - -// Store a shuffled vector. -def WriteVSTShuffle : WriteSequence<[WriteV, WriteVST]>; -def WriteVSTPairShuffle : WriteSequence<[WriteV, WriteV, WriteVST]>; diff --git a/lib/Target/ARM64/ARM64SelectionDAGInfo.cpp b/lib/Target/ARM64/ARM64SelectionDAGInfo.cpp deleted file mode 100644 index f8a2527616c..00000000000 --- a/lib/Target/ARM64/ARM64SelectionDAGInfo.cpp +++ /dev/null @@ -1,58 +0,0 @@ -//===-- ARM64SelectionDAGInfo.cpp - ARM64 SelectionDAG Info ---------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements the ARM64SelectionDAGInfo class. -// -//===----------------------------------------------------------------------===// - -#include "ARM64TargetMachine.h" -using namespace llvm; - -#define DEBUG_TYPE "arm64-selectiondag-info" - -ARM64SelectionDAGInfo::ARM64SelectionDAGInfo(const TargetMachine &TM) - : TargetSelectionDAGInfo(TM), - Subtarget(&TM.getSubtarget()) {} - -ARM64SelectionDAGInfo::~ARM64SelectionDAGInfo() {} - -SDValue ARM64SelectionDAGInfo::EmitTargetCodeForMemset( - SelectionDAG &DAG, SDLoc dl, SDValue Chain, SDValue Dst, SDValue Src, - SDValue Size, unsigned Align, bool isVolatile, - MachinePointerInfo DstPtrInfo) const { - // Check to see if there is a specialized entry-point for memory zeroing. - ConstantSDNode *V = dyn_cast(Src); - ConstantSDNode *SizeValue = dyn_cast(Size); - const char *bzeroEntry = - (V && V->isNullValue()) ? Subtarget->getBZeroEntry() : nullptr; - // For small size (< 256), it is not beneficial to use bzero - // instead of memset. 
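// A minimal standalone sketch, assuming the size threshold is the only
// decision being made, of the check used just below: memset-to-zero is
// routed to the platform bzero entry point only when the length is not a
// compile-time constant or exceeds 256 bytes; small constant sizes stay
// with the ordinary memset expansion.
enum class ZeroFillLowering { CallBZero, InlineMemset };

// KnownSize < 0 encodes "size not known at compile time".
ZeroFillLowering chooseZeroFill(bool HaveBZeroEntry, long long KnownSize) {
  if (HaveBZeroEntry && (KnownSize < 0 || KnownSize > 256))
    return ZeroFillLowering::CallBZero;
  return ZeroFillLowering::InlineMemset;
}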
- if (bzeroEntry && (!SizeValue || SizeValue->getZExtValue() > 256)) { - const ARM64TargetLowering &TLI = *static_cast( - DAG.getTarget().getTargetLowering()); - - EVT IntPtr = TLI.getPointerTy(); - Type *IntPtrTy = getDataLayout()->getIntPtrType(*DAG.getContext()); - TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; - Entry.Node = Dst; - Entry.Ty = IntPtrTy; - Args.push_back(Entry); - Entry.Node = Size; - Args.push_back(Entry); - TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl).setChain(Chain) - .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), - DAG.getExternalSymbol(bzeroEntry, IntPtr), &Args, 0) - .setDiscardResult(); - std::pair CallResult = TLI.LowerCallTo(CLI); - return CallResult.second; - } - return SDValue(); -} diff --git a/lib/Target/ARM64/ARM64SelectionDAGInfo.h b/lib/Target/ARM64/ARM64SelectionDAGInfo.h deleted file mode 100644 index 770775fc02d..00000000000 --- a/lib/Target/ARM64/ARM64SelectionDAGInfo.h +++ /dev/null @@ -1,37 +0,0 @@ -//===-- ARM64SelectionDAGInfo.h - ARM64 SelectionDAG Info -------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines the ARM64 subclass for TargetSelectionDAGInfo. -// -//===----------------------------------------------------------------------===// - -#ifndef ARM64SELECTIONDAGINFO_H -#define ARM64SELECTIONDAGINFO_H - -#include "llvm/Target/TargetSelectionDAGInfo.h" - -namespace llvm { - -class ARM64SelectionDAGInfo : public TargetSelectionDAGInfo { - /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can - /// make the right decision when generating code for different targets. - const ARM64Subtarget *Subtarget; - -public: - explicit ARM64SelectionDAGInfo(const TargetMachine &TM); - ~ARM64SelectionDAGInfo(); - - SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, SDValue Chain, - SDValue Dst, SDValue Src, SDValue Size, - unsigned Align, bool isVolatile, - MachinePointerInfo DstPtrInfo) const override; -}; -} - -#endif diff --git a/lib/Target/ARM64/ARM64StorePairSuppress.cpp b/lib/Target/ARM64/ARM64StorePairSuppress.cpp deleted file mode 100644 index a9501ed9217..00000000000 --- a/lib/Target/ARM64/ARM64StorePairSuppress.cpp +++ /dev/null @@ -1,168 +0,0 @@ -//===---- ARM64StorePairSuppress.cpp --- Suppress store pair formation ----===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This pass identifies floating point stores that should not be combined into -// store pairs. Later we may do the same for floating point loads. 
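// A standalone restatement (an assumed simplification, not the pass itself)
// of the heuristic shouldAddSTPToBlock applies below: a store pair is only
// worth forming when the extra paired-store micro-ops do not raise the
// block's critical resource height, i.e. the load/store units must not
// become the new bottleneck.
bool worthFormingSTP(unsigned ResLenWithoutSTP, unsigned ResLenWithSTP) {
  // Suppress pairing when adding the STP lengthens the resource-limited
  // schedule of the block; otherwise pairing is assumed profitable.
  return ResLenWithSTP <= ResLenWithoutSTP;
}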
-// ===---------------------------------------------------------------------===// - -#include "ARM64InstrInfo.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/MachineTraceMetrics.h" -#include "llvm/CodeGen/TargetSchedule.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetInstrInfo.h" - -using namespace llvm; - -#define DEBUG_TYPE "arm64-stp-suppress" - -namespace { -class ARM64StorePairSuppress : public MachineFunctionPass { - const ARM64InstrInfo *TII; - const TargetRegisterInfo *TRI; - const MachineRegisterInfo *MRI; - MachineFunction *MF; - TargetSchedModel SchedModel; - MachineTraceMetrics *Traces; - MachineTraceMetrics::Ensemble *MinInstr; - -public: - static char ID; - ARM64StorePairSuppress() : MachineFunctionPass(ID) {} - - virtual const char *getPassName() const override { - return "ARM64 Store Pair Suppression"; - } - - bool runOnMachineFunction(MachineFunction &F) override; - -private: - bool shouldAddSTPToBlock(const MachineBasicBlock *BB); - - bool isNarrowFPStore(const MachineInstr &MI); - - virtual void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - AU.addRequired(); - AU.addPreserved(); - MachineFunctionPass::getAnalysisUsage(AU); - } -}; -char ARM64StorePairSuppress::ID = 0; -} // anonymous - -FunctionPass *llvm::createARM64StorePairSuppressPass() { - return new ARM64StorePairSuppress(); -} - -/// Return true if an STP can be added to this block without increasing the -/// critical resource height. STP is good to form in Ld/St limited blocks and -/// bad to form in float-point limited blocks. This is true independent of the -/// critical path. If the critical path is longer than the resource height, the -/// extra vector ops can limit physreg renaming. Otherwise, it could simply -/// oversaturate the vector units. -bool ARM64StorePairSuppress::shouldAddSTPToBlock(const MachineBasicBlock *BB) { - if (!MinInstr) - MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount); - - MachineTraceMetrics::Trace BBTrace = MinInstr->getTrace(BB); - unsigned ResLength = BBTrace.getResourceLength(); - - // Get the machine model's scheduling class for STPQi. - // Bypass TargetSchedule's SchedClass resolution since we only have an opcode. - unsigned SCIdx = TII->get(ARM64::STPDi).getSchedClass(); - const MCSchedClassDesc *SCDesc = - SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdx); - - // If a subtarget does not define resources for STPQi, bail here. - if (SCDesc->isValid() && !SCDesc->isVariant()) { - unsigned ResLenWithSTP = BBTrace.getResourceLength( - ArrayRef(), SCDesc); - if (ResLenWithSTP > ResLength) { - DEBUG(dbgs() << " Suppress STP in BB: " << BB->getNumber() - << " resources " << ResLength << " -> " << ResLenWithSTP - << "\n"); - return false; - } - } - return true; -} - -/// Return true if this is a floating-point store smaller than the V reg. On -/// cyclone, these require a vector shuffle before storing a pair. -/// Ideally we would call getMatchingPairOpcode() and have the machine model -/// tell us if it's profitable with no cpu knowledge here. -/// -/// FIXME: We plan to develop a decent Target abstraction for simple loads and -/// stores. Until then use a nasty switch similar to ARM64LoadStoreOptimizer. 
-bool ARM64StorePairSuppress::isNarrowFPStore(const MachineInstr &MI) { - switch (MI.getOpcode()) { - default: - return false; - case ARM64::STRSui: - case ARM64::STRDui: - case ARM64::STURSi: - case ARM64::STURDi: - return true; - } -} - -bool ARM64StorePairSuppress::runOnMachineFunction(MachineFunction &mf) { - MF = &mf; - TII = static_cast(MF->getTarget().getInstrInfo()); - TRI = MF->getTarget().getRegisterInfo(); - MRI = &MF->getRegInfo(); - const TargetSubtargetInfo &ST = - MF->getTarget().getSubtarget(); - SchedModel.init(*ST.getSchedModel(), &ST, TII); - - Traces = &getAnalysis(); - MinInstr = nullptr; - - DEBUG(dbgs() << "*** " << getPassName() << ": " << MF->getName() << '\n'); - - if (!SchedModel.hasInstrSchedModel()) { - DEBUG(dbgs() << " Skipping pass: no machine model present.\n"); - return false; - } - - // Check for a sequence of stores to the same base address. We don't need to - // precisely determine whether a store pair can be formed. But we do want to - // filter out most situations where we can't form store pairs to avoid - // computing trace metrics in those cases. - for (auto &MBB : *MF) { - bool SuppressSTP = false; - unsigned PrevBaseReg = 0; - for (auto &MI : MBB) { - if (!isNarrowFPStore(MI)) - continue; - unsigned BaseReg; - unsigned Offset; - if (TII->getLdStBaseRegImmOfs(&MI, BaseReg, Offset, TRI)) { - if (PrevBaseReg == BaseReg) { - // If this block can take STPs, skip ahead to the next block. - if (!SuppressSTP && shouldAddSTPToBlock(MI.getParent())) - break; - // Otherwise, continue unpairing the stores in this block. - DEBUG(dbgs() << "Unpairing store " << MI << "\n"); - SuppressSTP = true; - TII->suppressLdStPair(&MI); - } - PrevBaseReg = BaseReg; - } else - PrevBaseReg = 0; - } - } - // This pass just sets some internal MachineMemOperand flags. It can't really - // invalidate anything. - return false; -} diff --git a/lib/Target/ARM64/ARM64Subtarget.cpp b/lib/Target/ARM64/ARM64Subtarget.cpp deleted file mode 100644 index 624e47483ff..00000000000 --- a/lib/Target/ARM64/ARM64Subtarget.cpp +++ /dev/null @@ -1,115 +0,0 @@ -//===-- ARM64Subtarget.cpp - ARM64 Subtarget Information --------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements the ARM64 specific subclass of TargetSubtarget. 
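// An assumed, much-condensed summary of the decision ClassifyGlobalReference
// makes further down in this file; the flag names are illustrative inputs,
// not the real API. Whether a global is reached through the GOT depends on
// the code model, the object format, and whether the definition is known to
// stay local to this translation unit.
bool needsGOTAccess(bool LargeCodeModelMachO, bool SmallModelWeakDeclaration,
                    bool PICDefaultVisibility, bool DefinitelyDefinedLocally) {
  if (LargeCodeModelMachO)          // large MachO model: one absolute reloc via GOT
    return true;
  if (SmallModelWeakDeclaration)    // ADRP cannot materialize address 0 for weak decls
    return true;
  if (PICDefaultVisibility)         // default-visibility symbols may be preempted
    return !DefinitelyDefinedLocally;
  return false;
}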
-// -//===----------------------------------------------------------------------===// - -#include "ARM64InstrInfo.h" -#include "ARM64Subtarget.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/CodeGen/MachineScheduler.h" -#include "llvm/IR/GlobalValue.h" -#include "llvm/Support/TargetRegistry.h" - -using namespace llvm; - -#define DEBUG_TYPE "arm64-subtarget" - -#define GET_SUBTARGETINFO_CTOR -#define GET_SUBTARGETINFO_TARGET_DESC -#include "ARM64GenSubtargetInfo.inc" - -static cl::opt -EnableEarlyIfConvert("arm64-early-ifcvt", cl::desc("Enable the early if " - "converter pass"), cl::init(true), cl::Hidden); - -ARM64Subtarget::ARM64Subtarget(const std::string &TT, const std::string &CPU, - const std::string &FS, bool LittleEndian) - : ARM64GenSubtargetInfo(TT, CPU, FS), ARMProcFamily(Others), - HasFPARMv8(false), HasNEON(false), HasCrypto(false), HasCRC(false), - HasZeroCycleRegMove(false), HasZeroCycleZeroing(false), - CPUString(CPU), TargetTriple(TT), IsLittleEndian(LittleEndian) { - // Determine default and user-specified characteristics - - if (CPUString.empty()) - CPUString = "generic"; - - ParseSubtargetFeatures(CPUString, FS); -} - -/// ClassifyGlobalReference - Find the target operand flags that describe -/// how a global value should be referenced for the current subtarget. -unsigned char -ARM64Subtarget::ClassifyGlobalReference(const GlobalValue *GV, - const TargetMachine &TM) const { - - // Determine whether this is a reference to a definition or a declaration. - // Materializable GVs (in JIT lazy compilation mode) do not require an extra - // load from stub. - bool isDecl = GV->hasAvailableExternallyLinkage(); - if (GV->isDeclaration() && !GV->isMaterializable()) - isDecl = true; - - // MachO large model always goes via a GOT, simply to get a single 8-byte - // absolute relocation on all global addresses. - if (TM.getCodeModel() == CodeModel::Large && isTargetMachO()) - return ARM64II::MO_GOT; - - // The small code mode's direct accesses use ADRP, which cannot necessarily - // produce the value 0 (if the code is above 4GB). Therefore they must use the - // GOT. - if (TM.getCodeModel() == CodeModel::Small && GV->isWeakForLinker() && isDecl) - return ARM64II::MO_GOT; - - // If symbol visibility is hidden, the extra load is not needed if - // the symbol is definitely defined in the current translation unit. - - // The handling of non-hidden symbols in PIC mode is rather target-dependent: - // + On MachO, if the symbol is defined in this module the GOT can be - // skipped. - // + On ELF, the R_AARCH64_COPY relocation means that even symbols actually - // defined could end up in unexpected places. Use a GOT. - if (TM.getRelocationModel() != Reloc::Static && GV->hasDefaultVisibility()) { - if (isTargetMachO()) - return (isDecl || GV->isWeakForLinker()) ? ARM64II::MO_GOT - : ARM64II::MO_NO_FLAG; - else - // No need to go through the GOT for local symbols on ELF. - return GV->hasLocalLinkage() ? ARM64II::MO_NO_FLAG : ARM64II::MO_GOT; - } - - return ARM64II::MO_NO_FLAG; -} - -/// This function returns the name of a function which has an interface -/// like the non-standard bzero function, if such a function exists on -/// the current subtarget and it is considered prefereable over -/// memset with zero passed as the second argument. Otherwise it -/// returns null. -const char *ARM64Subtarget::getBZeroEntry() const { - // Prefer bzero on Darwin only. 
- if(isTargetDarwin()) - return "bzero"; - - return nullptr; -} - -void ARM64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, - MachineInstr *begin, MachineInstr *end, - unsigned NumRegionInstrs) const { - // LNT run (at least on Cyclone) showed reasonably significant gains for - // bi-directional scheduling. 253.perlbmk. - Policy.OnlyTopDown = false; - Policy.OnlyBottomUp = false; -} - -bool ARM64Subtarget::enableEarlyIfConversion() const { - return EnableEarlyIfConvert; -} diff --git a/lib/Target/ARM64/ARM64Subtarget.h b/lib/Target/ARM64/ARM64Subtarget.h deleted file mode 100644 index 9cea3c387d6..00000000000 --- a/lib/Target/ARM64/ARM64Subtarget.h +++ /dev/null @@ -1,110 +0,0 @@ -//=====---- ARM64Subtarget.h - Define Subtarget for the ARM64 -*- C++ -*--====// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file declares the ARM64 specific subclass of TargetSubtarget. -// -//===----------------------------------------------------------------------===// - -#ifndef ARM64SUBTARGET_H -#define ARM64SUBTARGET_H - -#include "llvm/Target/TargetSubtargetInfo.h" -#include "ARM64RegisterInfo.h" -#include - -#define GET_SUBTARGETINFO_HEADER -#include "ARM64GenSubtargetInfo.inc" - -namespace llvm { -class GlobalValue; -class StringRef; - -class ARM64Subtarget : public ARM64GenSubtargetInfo { -protected: - enum ARMProcFamilyEnum {Others, CortexA53, CortexA57, Cyclone}; - - /// ARMProcFamily - ARM processor family: Cortex-A53, Cortex-A57, and others. - ARMProcFamilyEnum ARMProcFamily; - - bool HasFPARMv8; - bool HasNEON; - bool HasCrypto; - bool HasCRC; - - // HasZeroCycleRegMove - Has zero-cycle register mov instructions. - bool HasZeroCycleRegMove; - - // HasZeroCycleZeroing - Has zero-cycle zeroing instructions. - bool HasZeroCycleZeroing; - - /// CPUString - String name of used CPU. - std::string CPUString; - - /// TargetTriple - What processor and OS we're targeting. - Triple TargetTriple; - - /// IsLittleEndian - Is the target little endian? - bool IsLittleEndian; - -public: - /// This constructor initializes the data members to match that - /// of the specified triple. - ARM64Subtarget(const std::string &TT, const std::string &CPU, - const std::string &FS, bool LittleEndian); - - bool enableMachineScheduler() const override { return true; } - - bool hasZeroCycleRegMove() const { return HasZeroCycleRegMove; } - - bool hasZeroCycleZeroing() const { return HasZeroCycleZeroing; } - - bool hasFPARMv8() const { return HasFPARMv8; } - bool hasNEON() const { return HasNEON; } - bool hasCrypto() const { return HasCrypto; } - bool hasCRC() const { return HasCRC; } - - bool isLittleEndian() const { return IsLittleEndian; } - - bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); } - - bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); } - - bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); } - - bool isCyclone() const { return CPUString == "cyclone"; } - - /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size - /// that still makes it profitable to inline the call. - unsigned getMaxInlineSizeThreshold() const { return 64; } - - /// ParseSubtargetFeatures - Parses features string setting specified - /// subtarget options. Definition of function is auto generated by tblgen. 
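// A standalone sketch, far simpler than the tblgen-generated
// ParseSubtargetFeatures, of what "parsing a feature string" means for the
// boolean flags declared above (the "+neon,+crc,+crypto" spelling is the
// usual LLVM convention).
#include <sstream>
#include <string>

struct FeatureFlags {
  bool HasNEON = false, HasCRC = false, HasCrypto = false;
};

FeatureFlags parseFeatureString(const std::string &FS) {
  FeatureFlags F;
  std::stringstream SS(FS);
  std::string Tok;
  while (std::getline(SS, Tok, ',')) {
    if (Tok == "+neon")   F.HasNEON = true;
    if (Tok == "+crc")    F.HasCRC = true;
    if (Tok == "+crypto") F.HasCrypto = true;
  }
  return F;
}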
- void ParseSubtargetFeatures(StringRef CPU, StringRef FS); - - /// ClassifyGlobalReference - Find the target operand flags that describe - /// how a global value should be referenced for the current subtarget. - unsigned char ClassifyGlobalReference(const GlobalValue *GV, - const TargetMachine &TM) const; - - /// This function returns the name of a function which has an interface - /// like the non-standard bzero function, if such a function exists on - /// the current subtarget and it is considered prefereable over - /// memset with zero passed as the second argument. Otherwise it - /// returns null. - const char *getBZeroEntry() const; - - void overrideSchedPolicy(MachineSchedPolicy &Policy, MachineInstr *begin, - MachineInstr *end, - unsigned NumRegionInstrs) const override; - - bool enableEarlyIfConversion() const override; -}; -} // End llvm namespace - -#endif // ARM64SUBTARGET_H diff --git a/lib/Target/ARM64/ARM64TargetMachine.cpp b/lib/Target/ARM64/ARM64TargetMachine.cpp deleted file mode 100644 index fc73145be3f..00000000000 --- a/lib/Target/ARM64/ARM64TargetMachine.cpp +++ /dev/null @@ -1,208 +0,0 @@ -//===-- ARM64TargetMachine.cpp - Define TargetMachine for ARM64 -----------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - -#include "ARM64.h" -#include "ARM64TargetMachine.h" -#include "llvm/PassManager.h" -#include "llvm/CodeGen/Passes.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/TargetRegistry.h" -#include "llvm/Target/TargetOptions.h" -#include "llvm/Transforms/Scalar.h" -using namespace llvm; - -static cl::opt -EnableCCMP("arm64-ccmp", cl::desc("Enable the CCMP formation pass"), - cl::init(true), cl::Hidden); - -static cl::opt -EnableStPairSuppress("arm64-stp-suppress", cl::desc("Suppress STP for ARM64"), - cl::init(true), cl::Hidden); - -static cl::opt -EnableAdvSIMDScalar("arm64-simd-scalar", cl::desc("Enable use of AdvSIMD scalar" - " integer instructions"), cl::init(false), cl::Hidden); - -static cl::opt -EnablePromoteConstant("arm64-promote-const", cl::desc("Enable the promote " - "constant pass"), cl::init(true), cl::Hidden); - -static cl::opt -EnableCollectLOH("arm64-collect-loh", cl::desc("Enable the pass that emits the" - " linker optimization hints (LOH)"), cl::init(true), - cl::Hidden); - -static cl::opt -EnableDeadRegisterElimination("arm64-dead-def-elimination", cl::Hidden, - cl::desc("Enable the pass that removes dead" - " definitons and replaces stores to" - " them with stores to the zero" - " register"), - cl::init(true)); - -static cl::opt -EnableLoadStoreOpt("arm64-load-store-opt", cl::desc("Enable the load/store pair" - " optimization pass"), cl::init(true), cl::Hidden); - -extern "C" void LLVMInitializeARM64Target() { - // Register the target. - RegisterTargetMachine X(TheARM64leTarget); - RegisterTargetMachine Y(TheARM64beTarget); - - RegisterTargetMachine Z(TheAArch64leTarget); - RegisterTargetMachine W(TheAArch64beTarget); -} - -/// TargetMachine ctor - Create an ARM64 architecture model. 
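// The constructor below selects one of three data-layout strings with a
// nested ternary; the same choice written out as a small helper (a sketch,
// not the code that was removed) makes the cases easier to read: MachO
// mangling, ELF little-endian, and ELF big-endian.
#include <string>

std::string pickDataLayout(bool IsMachO, bool LittleEndian) {
  if (IsMachO)
    return "e-m:o-i64:64-i128:128-n32:64-S128";
  if (LittleEndian)
    return "e-m:e-i64:64-i128:128-n32:64-S128";
  return "E-m:e-i64:64-i128:128-n32:64-S128";
}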
-/// -ARM64TargetMachine::ARM64TargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, - const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL, - bool LittleEndian) - : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), - Subtarget(TT, CPU, FS, LittleEndian), - // This nested ternary is horrible, but DL needs to be properly initialized - // before TLInfo is constructed. - DL(Subtarget.isTargetMachO() ? - "e-m:o-i64:64-i128:128-n32:64-S128" : - (LittleEndian ? - "e-m:e-i64:64-i128:128-n32:64-S128" : - "E-m:e-i64:64-i128:128-n32:64-S128")), - InstrInfo(Subtarget), TLInfo(*this), FrameLowering(*this, Subtarget), - TSInfo(*this) { - initAsmInfo(); -} - -void ARM64leTargetMachine::anchor() { } - -ARM64leTargetMachine:: -ARM64leTargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL) - : ARM64TargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {} - -void ARM64beTargetMachine::anchor() { } - -ARM64beTargetMachine:: -ARM64beTargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL) - : ARM64TargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {} - -namespace { -/// ARM64 Code Generator Pass Configuration Options. -class ARM64PassConfig : public TargetPassConfig { -public: - ARM64PassConfig(ARM64TargetMachine *TM, PassManagerBase &PM) - : TargetPassConfig(TM, PM) {} - - ARM64TargetMachine &getARM64TargetMachine() const { - return getTM(); - } - - bool addPreISel() override; - bool addInstSelector() override; - bool addILPOpts() override; - bool addPreRegAlloc() override; - bool addPostRegAlloc() override; - bool addPreSched2() override; - bool addPreEmitPass() override; -}; -} // namespace - -void ARM64TargetMachine::addAnalysisPasses(PassManagerBase &PM) { - // Add first the target-independent BasicTTI pass, then our ARM64 pass. This - // allows the ARM64 pass to delegate to the target independent layer when - // appropriate. - PM.add(createBasicTargetTransformInfoPass(this)); - PM.add(createARM64TargetTransformInfoPass(this)); -} - -TargetPassConfig *ARM64TargetMachine::createPassConfig(PassManagerBase &PM) { - return new ARM64PassConfig(this, PM); -} - -// Pass Pipeline Configuration -bool ARM64PassConfig::addPreISel() { - // Run promote constant before global merge, so that the promoted constants - // get a chance to be merged - if (TM->getOptLevel() != CodeGenOpt::None && EnablePromoteConstant) - addPass(createARM64PromoteConstantPass()); - if (TM->getOptLevel() != CodeGenOpt::None) - addPass(createGlobalMergePass(TM)); - if (TM->getOptLevel() != CodeGenOpt::None) - addPass(createARM64AddressTypePromotionPass()); - - // Always expand atomic operations, we don't deal with atomicrmw or cmpxchg - // ourselves. - addPass(createAtomicExpandLoadLinkedPass(TM)); - - return false; -} - -bool ARM64PassConfig::addInstSelector() { - addPass(createARM64ISelDag(getARM64TargetMachine(), getOptLevel())); - - // For ELF, cleanup any local-dynamic TLS accesses (i.e. combine as many - // references to _TLS_MODULE_BASE_ as possible. 
- if (TM->getSubtarget().isTargetELF() && - getOptLevel() != CodeGenOpt::None) - addPass(createARM64CleanupLocalDynamicTLSPass()); - - return false; -} - -bool ARM64PassConfig::addILPOpts() { - if (EnableCCMP) - addPass(createARM64ConditionalCompares()); - addPass(&EarlyIfConverterID); - if (EnableStPairSuppress) - addPass(createARM64StorePairSuppressPass()); - return true; -} - -bool ARM64PassConfig::addPreRegAlloc() { - // Use AdvSIMD scalar instructions whenever profitable. - if (TM->getOptLevel() != CodeGenOpt::None && EnableAdvSIMDScalar) - addPass(createARM64AdvSIMDScalar()); - return true; -} - -bool ARM64PassConfig::addPostRegAlloc() { - // Change dead register definitions to refer to the zero register. - if (TM->getOptLevel() != CodeGenOpt::None && EnableDeadRegisterElimination) - addPass(createARM64DeadRegisterDefinitions()); - return true; -} - -bool ARM64PassConfig::addPreSched2() { - // Expand some pseudo instructions to allow proper scheduling. - addPass(createARM64ExpandPseudoPass()); - // Use load/store pair instructions when possible. - if (TM->getOptLevel() != CodeGenOpt::None && EnableLoadStoreOpt) - addPass(createARM64LoadStoreOptimizationPass()); - return true; -} - -bool ARM64PassConfig::addPreEmitPass() { - // Relax conditional branch instructions if they're otherwise out of - // range of their destination. - addPass(createARM64BranchRelaxation()); - if (TM->getOptLevel() != CodeGenOpt::None && EnableCollectLOH && - TM->getSubtarget().isTargetMachO()) - addPass(createARM64CollectLOHPass()); - return true; -} diff --git a/lib/Target/ARM64/ARM64TargetMachine.h b/lib/Target/ARM64/ARM64TargetMachine.h deleted file mode 100644 index 730ffcaaf6d..00000000000 --- a/lib/Target/ARM64/ARM64TargetMachine.h +++ /dev/null @@ -1,92 +0,0 @@ -//===-- ARM64TargetMachine.h - Define TargetMachine for ARM64 ---*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file declares the ARM64 specific subclass of TargetMachine. 
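// An assumed, self-contained restatement of how the ARM64PassConfig hooks
// above assemble the ILP portion of the pipeline: optional passes are
// appended only while their cl::opt switch stays enabled (the strings here
// are illustrative labels, not registered pass names).
#include <string>
#include <vector>

std::vector<std::string> buildILPOptPasses(bool EnableCCMP,
                                           bool EnableStPairSuppress) {
  std::vector<std::string> Passes;
  if (EnableCCMP)
    Passes.push_back("conditional-compares");
  Passes.push_back("early-if-conversion");
  if (EnableStPairSuppress)
    Passes.push_back("store-pair-suppress");
  return Passes;
}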
-// -//===----------------------------------------------------------------------===// - -#ifndef ARM64TARGETMACHINE_H -#define ARM64TARGETMACHINE_H - -#include "ARM64InstrInfo.h" -#include "ARM64ISelLowering.h" -#include "ARM64Subtarget.h" -#include "ARM64FrameLowering.h" -#include "ARM64SelectionDAGInfo.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/MC/MCStreamer.h" - -namespace llvm { - -class ARM64TargetMachine : public LLVMTargetMachine { -protected: - ARM64Subtarget Subtarget; - -private: - const DataLayout DL; - ARM64InstrInfo InstrInfo; - ARM64TargetLowering TLInfo; - ARM64FrameLowering FrameLowering; - ARM64SelectionDAGInfo TSInfo; - -public: - ARM64TargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, - const TargetOptions &Options, Reloc::Model RM, - CodeModel::Model CM, CodeGenOpt::Level OL, - bool IsLittleEndian); - - const ARM64Subtarget *getSubtargetImpl() const override { return &Subtarget; } - const ARM64TargetLowering *getTargetLowering() const override { - return &TLInfo; - } - const DataLayout *getDataLayout() const override { return &DL; } - const ARM64FrameLowering *getFrameLowering() const override { - return &FrameLowering; - } - const ARM64InstrInfo *getInstrInfo() const override { return &InstrInfo; } - const ARM64RegisterInfo *getRegisterInfo() const override { - return &InstrInfo.getRegisterInfo(); - } - const ARM64SelectionDAGInfo *getSelectionDAGInfo() const override { - return &TSInfo; - } - - // Pass Pipeline Configuration - TargetPassConfig *createPassConfig(PassManagerBase &PM) override; - - /// \brief Register ARM64 analysis passes with a pass manager. - void addAnalysisPasses(PassManagerBase &PM) override; -}; - -// ARM64leTargetMachine - ARM64 little endian target machine. -// -class ARM64leTargetMachine : public ARM64TargetMachine { - virtual void anchor(); -public: - ARM64leTargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL); -}; - -// ARM64beTargetMachine - ARM64 big endian target machine. -// -class ARM64beTargetMachine : public ARM64TargetMachine { - virtual void anchor(); -public: - ARM64beTargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL); -}; - -} // end namespace llvm - -#endif diff --git a/lib/Target/ARM64/ARM64TargetObjectFile.cpp b/lib/Target/ARM64/ARM64TargetObjectFile.cpp deleted file mode 100644 index cde01e515dc..00000000000 --- a/lib/Target/ARM64/ARM64TargetObjectFile.cpp +++ /dev/null @@ -1,52 +0,0 @@ -//===-- ARM64TargetObjectFile.cpp - ARM64 Object Info ---------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// - -#include "ARM64TargetObjectFile.h" -#include "ARM64TargetMachine.h" -#include "llvm/IR/Mangler.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCStreamer.h" -#include "llvm/Support/Dwarf.h" -using namespace llvm; -using namespace dwarf; - -void ARM64_ELFTargetObjectFile::Initialize(MCContext &Ctx, - const TargetMachine &TM) { - TargetLoweringObjectFileELF::Initialize(Ctx, TM); - InitializeELF(TM.Options.UseInitArray); -} - -const MCExpr *ARM64_MachoTargetObjectFile::getTTypeGlobalReference( - const GlobalValue *GV, unsigned Encoding, Mangler &Mang, - const TargetMachine &TM, MachineModuleInfo *MMI, - MCStreamer &Streamer) const { - // On Darwin, we can reference dwarf symbols with foo@GOT-., which - // is an indirect pc-relative reference. The default implementation - // won't reference using the GOT, so we need this target-specific - // version. - if (Encoding & (DW_EH_PE_indirect | DW_EH_PE_pcrel)) { - const MCSymbol *Sym = TM.getSymbol(GV, Mang); - const MCExpr *Res = - MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOT, getContext()); - MCSymbol *PCSym = getContext().CreateTempSymbol(); - Streamer.EmitLabel(PCSym); - const MCExpr *PC = MCSymbolRefExpr::Create(PCSym, getContext()); - return MCBinaryExpr::CreateSub(Res, PC, getContext()); - } - - return TargetLoweringObjectFileMachO::getTTypeGlobalReference( - GV, Encoding, Mang, TM, MMI, Streamer); -} - -MCSymbol *ARM64_MachoTargetObjectFile::getCFIPersonalitySymbol( - const GlobalValue *GV, Mangler &Mang, const TargetMachine &TM, - MachineModuleInfo *MMI) const { - return TM.getSymbol(GV, Mang); -} diff --git a/lib/Target/ARM64/ARM64TargetObjectFile.h b/lib/Target/ARM64/ARM64TargetObjectFile.h deleted file mode 100644 index 62446f94f17..00000000000 --- a/lib/Target/ARM64/ARM64TargetObjectFile.h +++ /dev/null @@ -1,40 +0,0 @@ -//===-- ARM64TargetObjectFile.h - ARM64 Object Info -*- C++ -------------*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_TARGET_ARM64_TARGETOBJECTFILE_H -#define LLVM_TARGET_ARM64_TARGETOBJECTFILE_H - -#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" -#include "llvm/Target/TargetLoweringObjectFile.h" - -namespace llvm { -class ARM64TargetMachine; - -/// This implementation is used for AArch64 ELF targets (Linux in particular). -class ARM64_ELFTargetObjectFile : public TargetLoweringObjectFileELF { - void Initialize(MCContext &Ctx, const TargetMachine &TM) override; -}; - -/// ARM64_MachoTargetObjectFile - This TLOF implementation is used for Darwin. 
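// An assumed illustration of the indirect pc-relative encoding produced by
// getTTypeGlobalReference above: the EH table stores (GOT slot of the
// personality symbol) minus (address of the table entry), so the runtime
// recovers the slot with a single add.
#include <cstdint>

uint64_t recoverGOTSlot(uint64_t EntryAddress, int64_t StoredDelta) {
  return EntryAddress + StoredDelta; // StoredDelta was emitted as Sym@GOT - Ltmp
}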
-class ARM64_MachoTargetObjectFile : public TargetLoweringObjectFileMachO { -public: - const MCExpr *getTTypeGlobalReference(const GlobalValue *GV, - unsigned Encoding, Mangler &Mang, - const TargetMachine &TM, - MachineModuleInfo *MMI, - MCStreamer &Streamer) const override; - - MCSymbol *getCFIPersonalitySymbol(const GlobalValue *GV, Mangler &Mang, - const TargetMachine &TM, - MachineModuleInfo *MMI) const override; -}; - -} // end namespace llvm - -#endif diff --git a/lib/Target/ARM64/ARM64TargetTransformInfo.cpp b/lib/Target/ARM64/ARM64TargetTransformInfo.cpp deleted file mode 100644 index cc4cdff62b5..00000000000 --- a/lib/Target/ARM64/ARM64TargetTransformInfo.cpp +++ /dev/null @@ -1,463 +0,0 @@ -//===-- ARM64TargetTransformInfo.cpp - ARM64 specific TTI pass ------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -/// \file -/// This file implements a TargetTransformInfo analysis pass specific to the -/// ARM64 target machine. It uses the target's detailed information to provide -/// more precise answers to certain TTI queries, while letting the target -/// independent and default TTI implementations handle the rest. -/// -//===----------------------------------------------------------------------===// - -#include "ARM64.h" -#include "ARM64TargetMachine.h" -#include "MCTargetDesc/ARM64AddressingModes.h" -#include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Support/Debug.h" -#include "llvm/Target/CostTable.h" -#include "llvm/Target/TargetLowering.h" -#include -using namespace llvm; - -#define DEBUG_TYPE "arm64tti" - -// Declare the pass initialization routine locally as target-specific passes -// don't have a target-wide initialization entry point, and so we rely on the -// pass constructor initialization. -namespace llvm { -void initializeARM64TTIPass(PassRegistry &); -} - -namespace { - -class ARM64TTI final : public ImmutablePass, public TargetTransformInfo { - const ARM64TargetMachine *TM; - const ARM64Subtarget *ST; - const ARM64TargetLowering *TLI; - - /// Estimate the overhead of scalarizing an instruction. Insert and Extract - /// are set if the result needs to be inserted and/or extracted from vectors. - unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const; - -public: - ARM64TTI() : ImmutablePass(ID), TM(nullptr), ST(nullptr), TLI(nullptr) { - llvm_unreachable("This pass cannot be directly constructed"); - } - - ARM64TTI(const ARM64TargetMachine *TM) - : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()), - TLI(TM->getTargetLowering()) { - initializeARM64TTIPass(*PassRegistry::getPassRegistry()); - } - - void initializePass() override { pushTTIStack(this); } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - TargetTransformInfo::getAnalysisUsage(AU); - } - - /// Pass identification. - static char ID; - - /// Provide necessary pointer adjustments for the two base classes. 
- void *getAdjustedAnalysisPointer(const void *ID) override { - if (ID == &TargetTransformInfo::ID) - return (TargetTransformInfo *)this; - return this; - } - - /// \name Scalar TTI Implementations - /// @{ - unsigned getIntImmCost(int64_t Val) const; - unsigned getIntImmCost(const APInt &Imm, Type *Ty) const override; - unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, - Type *Ty) const override; - unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, - Type *Ty) const override; - PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override; - - /// @} - - /// \name Vector TTI Implementations - /// @{ - - unsigned getNumberOfRegisters(bool Vector) const override { - if (Vector) { - if (ST->hasNEON()) - return 32; - return 0; - } - return 31; - } - - unsigned getRegisterBitWidth(bool Vector) const override { - if (Vector) { - if (ST->hasNEON()) - return 128; - return 0; - } - return 64; - } - - unsigned getMaximumUnrollFactor() const override { return 2; } - - unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const - override; - - unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) const - override; - - unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, - OperandValueKind Opd1Info = OK_AnyValue, - OperandValueKind Opd2Info = OK_AnyValue) const - override; - - unsigned getAddressComputationCost(Type *Ty, bool IsComplex) const override; - - unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) const - override; - - unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, - unsigned AddressSpace) const override; - /// @} -}; - -} // end anonymous namespace - -INITIALIZE_AG_PASS(ARM64TTI, TargetTransformInfo, "arm64tti", - "ARM64 Target Transform Info", true, true, false) -char ARM64TTI::ID = 0; - -ImmutablePass * -llvm::createARM64TargetTransformInfoPass(const ARM64TargetMachine *TM) { - return new ARM64TTI(TM); -} - -/// \brief Calculate the cost of materializing a 64-bit value. This helper -/// method might only calculate a fraction of a larger immediate. Therefore it -/// is valid to return a cost of ZERO. -unsigned ARM64TTI::getIntImmCost(int64_t Val) const { - // Check if the immediate can be encoded within an instruction. - if (Val == 0 || ARM64_AM::isLogicalImmediate(Val, 64)) - return 0; - - if (Val < 0) - Val = ~Val; - - // Calculate how many moves we will need to materialize this constant. - unsigned LZ = countLeadingZeros((uint64_t)Val); - return (64 - LZ + 15) / 16; -} - -/// \brief Calculate the cost of materializing the given constant. -unsigned ARM64TTI::getIntImmCost(const APInt &Imm, Type *Ty) const { - assert(Ty->isIntegerTy()); - - unsigned BitSize = Ty->getPrimitiveSizeInBits(); - if (BitSize == 0) - return ~0U; - - // Sign-extend all constants to a multiple of 64-bit. - APInt ImmVal = Imm; - if (BitSize & 0x3f) - ImmVal = Imm.sext((BitSize + 63) & ~0x3fU); - - // Split the constant into 64-bit chunks and calculate the cost for each - // chunk. - unsigned Cost = 0; - for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) { - APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64); - int64_t Val = Tmp.getSExtValue(); - Cost += getIntImmCost(Val); - } - // We need at least one instruction to materialze the constant. 
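// A worked, standalone version (an assumed simplification that omits the
// logical-immediate check) of the per-chunk cost above: one MOVZ/MOVN plus
// MOVK per non-trivial 16-bit chunk of the, possibly inverted, value.
#include <cstdint>

unsigned movMaterializeCost(int64_t Val) {
  if (Val == 0)
    return 0;
  uint64_t V = Val < 0 ? ~static_cast<uint64_t>(Val)
                       : static_cast<uint64_t>(Val);
  unsigned LZ = 0; // count leading zeros without relying on compiler builtins
  for (uint64_t Bit = 1ULL << 63; Bit && !(V & Bit); Bit >>= 1)
    ++LZ;
  return (64 - LZ + 15) / 16; // e.g. 0x1234 -> 1 instruction, 0x12345678 -> 2
}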
- return std::max(1U, Cost); -} - -unsigned ARM64TTI::getIntImmCost(unsigned Opcode, unsigned Idx, - const APInt &Imm, Type *Ty) const { - assert(Ty->isIntegerTy()); - - unsigned BitSize = Ty->getPrimitiveSizeInBits(); - // There is no cost model for constants with a bit size of 0. Return TCC_Free - // here, so that constant hoisting will ignore this constant. - if (BitSize == 0) - return TCC_Free; - - unsigned ImmIdx = ~0U; - switch (Opcode) { - default: - return TCC_Free; - case Instruction::GetElementPtr: - // Always hoist the base address of a GetElementPtr. - if (Idx == 0) - return 2 * TCC_Basic; - return TCC_Free; - case Instruction::Store: - ImmIdx = 0; - break; - case Instruction::Add: - case Instruction::Sub: - case Instruction::Mul: - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::URem: - case Instruction::SRem: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: - case Instruction::ICmp: - ImmIdx = 1; - break; - // Always return TCC_Free for the shift value of a shift instruction. - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: - if (Idx == 1) - return TCC_Free; - break; - case Instruction::Trunc: - case Instruction::ZExt: - case Instruction::SExt: - case Instruction::IntToPtr: - case Instruction::PtrToInt: - case Instruction::BitCast: - case Instruction::PHI: - case Instruction::Call: - case Instruction::Select: - case Instruction::Ret: - case Instruction::Load: - break; - } - - if (Idx == ImmIdx) { - unsigned NumConstants = (BitSize + 63) / 64; - unsigned Cost = ARM64TTI::getIntImmCost(Imm, Ty); - return (Cost <= NumConstants * TCC_Basic) - ? static_cast(TCC_Free) : Cost; - } - return ARM64TTI::getIntImmCost(Imm, Ty); -} - -unsigned ARM64TTI::getIntImmCost(Intrinsic::ID IID, unsigned Idx, - const APInt &Imm, Type *Ty) const { - assert(Ty->isIntegerTy()); - - unsigned BitSize = Ty->getPrimitiveSizeInBits(); - // There is no cost model for constants with a bit size of 0. Return TCC_Free - // here, so that constant hoisting will ignore this constant. - if (BitSize == 0) - return TCC_Free; - - switch (IID) { - default: - return TCC_Free; - case Intrinsic::sadd_with_overflow: - case Intrinsic::uadd_with_overflow: - case Intrinsic::ssub_with_overflow: - case Intrinsic::usub_with_overflow: - case Intrinsic::smul_with_overflow: - case Intrinsic::umul_with_overflow: - if (Idx == 1) { - unsigned NumConstants = (BitSize + 63) / 64; - unsigned Cost = ARM64TTI::getIntImmCost(Imm, Ty); - return (Cost <= NumConstants * TCC_Basic) - ? static_cast(TCC_Free) : Cost; - } - break; - case Intrinsic::experimental_stackmap: - if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) - return TCC_Free; - break; - case Intrinsic::experimental_patchpoint_void: - case Intrinsic::experimental_patchpoint_i64: - if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) - return TCC_Free; - break; - } - return ARM64TTI::getIntImmCost(Imm, Ty); -} - -ARM64TTI::PopcntSupportKind ARM64TTI::getPopcntSupport(unsigned TyWidth) const { - assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); - if (TyWidth == 32 || TyWidth == 64) - return PSK_FastHardware; - // TODO: ARM64TargetLowering::LowerCTPOP() supports 128bit popcount. 
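// A worked restatement (assumed; TCC_Basic and TCC_Free carry their usual
// LLVM values of 1 and 0) of the hoisting decision in getIntImmCost above:
// an immediate operand is reported as free while its materialization cost
// stays within one basic instruction per 64-bit chunk, otherwise the real
// cost is returned so constant hoisting will pull it out of the hot code.
unsigned immCostForHoisting(unsigned BitSize, unsigned MaterializeCost) {
  const unsigned TCC_Free = 0, TCC_Basic = 1;
  unsigned NumChunks = (BitSize + 63) / 64;
  return MaterializeCost <= NumChunks * TCC_Basic ? TCC_Free : MaterializeCost;
}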
- return PSK_Software; -} - -unsigned ARM64TTI::getCastInstrCost(unsigned Opcode, Type *Dst, - Type *Src) const { - int ISD = TLI->InstructionOpcodeToISD(Opcode); - assert(ISD && "Invalid opcode"); - - EVT SrcTy = TLI->getValueType(Src); - EVT DstTy = TLI->getValueType(Dst); - - if (!SrcTy.isSimple() || !DstTy.isSimple()) - return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src); - - static const TypeConversionCostTblEntry ConversionTbl[] = { - // LowerVectorINT_TO_FP: - { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, - { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 1 }, - { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 1 }, - { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 }, - { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, - { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, - { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 1 }, - { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 1 }, - { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 }, - { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, - // LowerVectorFP_TO_INT - { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 }, - { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 }, - { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 }, - { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 }, - { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 1 }, - { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 1 }, - { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 4 }, - { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 4 }, - { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 4 }, - { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 4 }, - { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 4 }, - { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 4 }, - }; - - int Idx = ConvertCostTableLookup( - ConversionTbl, array_lengthof(ConversionTbl), ISD, DstTy.getSimpleVT(), - SrcTy.getSimpleVT()); - if (Idx != -1) - return ConversionTbl[Idx].Cost; - - return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src); -} - -unsigned ARM64TTI::getVectorInstrCost(unsigned Opcode, Type *Val, - unsigned Index) const { - assert(Val->isVectorTy() && "This must be a vector type"); - - if (Index != -1U) { - // Legalize the type. - std::pair LT = TLI->getTypeLegalizationCost(Val); - - // This type is legalized to a scalar type. - if (!LT.second.isVector()) - return 0; - - // The type may be split. Normalize the index to the new type. - unsigned Width = LT.second.getVectorNumElements(); - Index = Index % Width; - - // The element at index zero is already inside the vector. - if (Index == 0) - return 0; - } - - // All other insert/extracts cost this much. - return 2; -} - -unsigned ARM64TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty, - OperandValueKind Opd1Info, - OperandValueKind Opd2Info) const { - // Legalize the type. - std::pair LT = TLI->getTypeLegalizationCost(Ty); - - int ISD = TLI->InstructionOpcodeToISD(Opcode); - - switch (ISD) { - default: - return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Opd1Info, - Opd2Info); - case ISD::ADD: - case ISD::MUL: - case ISD::XOR: - case ISD::OR: - case ISD::AND: - // These nodes are marked as 'custom' for combining purposes only. - // We know that they are legal. See LowerAdd in ISelLowering. - return 1 * LT.first; - } -} - -unsigned ARM64TTI::getAddressComputationCost(Type *Ty, bool IsComplex) const { - // Address computations in vectorized code with non-consecutive addresses will - // likely result in more instructions compared to scalar code where the - // computation can more often be merged into the index mode. The resulting - // extra micro-ops can significantly decrease throughput. 
- unsigned NumVectorInstToHideOverhead = 10; - - if (Ty->isVectorTy() && IsComplex) - return NumVectorInstToHideOverhead; - - // In many cases the address computation is not merged into the instruction - // addressing mode. - return 1; -} - -unsigned ARM64TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, - Type *CondTy) const { - - int ISD = TLI->InstructionOpcodeToISD(Opcode); - // We don't lower vector selects well that are wider than the register width. - if (ValTy->isVectorTy() && ISD == ISD::SELECT) { - // We would need this many instructions to hide the scalarization happening. - unsigned AmortizationCost = 20; - static const TypeConversionCostTblEntry - VectorSelectTbl[] = { - { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 * AmortizationCost }, - { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 * AmortizationCost }, - { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 * AmortizationCost }, - { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost }, - { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost }, - { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost } - }; - - EVT SelCondTy = TLI->getValueType(CondTy); - EVT SelValTy = TLI->getValueType(ValTy); - if (SelCondTy.isSimple() && SelValTy.isSimple()) { - int Idx = - ConvertCostTableLookup(VectorSelectTbl, ISD, SelCondTy.getSimpleVT(), - SelValTy.getSimpleVT()); - if (Idx != -1) - return VectorSelectTbl[Idx].Cost; - } - } - return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy); -} - -unsigned ARM64TTI::getMemoryOpCost(unsigned Opcode, Type *Src, - unsigned Alignment, - unsigned AddressSpace) const { - std::pair LT = TLI->getTypeLegalizationCost(Src); - - if (Opcode == Instruction::Store && Src->isVectorTy() && Alignment != 16 && - Src->getVectorElementType()->isIntegerTy(64)) { - // Unaligned stores are extremely inefficient. We don't split - // unaligned v2i64 stores because the negative impact that has shown in - // practice on inlined memcpy code. - // We make v2i64 stores expensive so that we will only vectorize if there - // are 6 other instructions getting vectorized. - unsigned AmortizationCost = 6; - - return LT.first * 2 * AmortizationCost; - } - - if (Src->isVectorTy() && Src->getVectorElementType()->isIntegerTy(8) && - Src->getVectorNumElements() < 8) { - // We scalarize the loads/stores because there is not v.4b register and we - // have to promote the elements to v.4h. - unsigned NumVecElts = Src->getVectorNumElements(); - unsigned NumVectorizableInstsToAmortize = NumVecElts * 2; - // We generate 2 instructions per vector element. - return NumVectorizableInstsToAmortize * NumVecElts * 2; - } - - return LT.first; -} diff --git a/lib/Target/ARM64/AsmParser/ARM64AsmParser.cpp b/lib/Target/ARM64/AsmParser/ARM64AsmParser.cpp deleted file mode 100644 index 4d710db1d93..00000000000 --- a/lib/Target/ARM64/AsmParser/ARM64AsmParser.cpp +++ /dev/null @@ -1,4030 +0,0 @@ -//===-- ARM64AsmParser.cpp - Parse ARM64 assembly to MCInst instructions --===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
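// A standalone restatement (assumed) of the store-cost special cases in
// getMemoryOpCost above: unaligned v2i64 stores and sub-8-element i8 vectors
// are priced far above a normal legalized store so the vectorizers only use
// them when the surrounding code amortizes the penalty.
unsigned storeCost(unsigned LegalizedParts, bool UnalignedV2i64Store,
                   unsigned NumI8Elts) {
  if (UnalignedV2i64Store)
    return LegalizedParts * 2 * 6;            // 6-instruction amortization factor
  if (NumI8Elts > 0 && NumI8Elts < 8)
    return (NumI8Elts * 2) * NumI8Elts * 2;   // scalarized via promotion to v4h
  return LegalizedParts;                      // the common, already-legal case
}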
-// -//===----------------------------------------------------------------------===// - -#include "MCTargetDesc/ARM64AddressingModes.h" -#include "MCTargetDesc/ARM64MCExpr.h" -#include "Utils/ARM64BaseInfo.h" -#include "llvm/MC/MCParser/MCAsmLexer.h" -#include "llvm/MC/MCParser/MCAsmParser.h" -#include "llvm/MC/MCParser/MCParsedAsmOperand.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCInst.h" -#include "llvm/MC/MCRegisterInfo.h" -#include "llvm/MC/MCStreamer.h" -#include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/MC/MCSymbol.h" -#include "llvm/MC/MCTargetAsmParser.h" -#include "llvm/Support/SourceMgr.h" -#include "llvm/Support/TargetRegistry.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/ADT/SmallString.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/StringSwitch.h" -#include "llvm/ADT/Twine.h" -#include -using namespace llvm; - -namespace { - -class ARM64Operand; - -class ARM64AsmParser : public MCTargetAsmParser { -public: - typedef SmallVectorImpl OperandVector; - -private: - StringRef Mnemonic; ///< Instruction mnemonic. - MCSubtargetInfo &STI; - MCAsmParser &Parser; - - MCAsmParser &getParser() const { return Parser; } - MCAsmLexer &getLexer() const { return Parser.getLexer(); } - - SMLoc getLoc() const { return Parser.getTok().getLoc(); } - - bool parseSysAlias(StringRef Name, SMLoc NameLoc, OperandVector &Operands); - ARM64CC::CondCode parseCondCodeString(StringRef Cond); - bool parseCondCode(OperandVector &Operands, bool invertCondCode); - int tryParseRegister(); - int tryMatchVectorRegister(StringRef &Kind, bool expected); - bool parseRegister(OperandVector &Operands); - bool parseSymbolicImmVal(const MCExpr *&ImmVal); - bool parseVectorList(OperandVector &Operands); - bool parseOperand(OperandVector &Operands, bool isCondCode, - bool invertCondCode); - - void Warning(SMLoc L, const Twine &Msg) { Parser.Warning(L, Msg); } - bool Error(SMLoc L, const Twine &Msg) { return Parser.Error(L, Msg); } - bool showMatchError(SMLoc Loc, unsigned ErrCode); - - bool parseDirectiveWord(unsigned Size, SMLoc L); - bool parseDirectiveTLSDescCall(SMLoc L); - - bool parseDirectiveLOH(StringRef LOH, SMLoc L); - - bool validateInstruction(MCInst &Inst, SmallVectorImpl &Loc); - bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, - OperandVector &Operands, MCStreamer &Out, - unsigned &ErrorInfo, - bool MatchingInlineAsm) override; -/// @name Auto-generated Match Functions -/// { - -#define GET_ASSEMBLER_HEADER -#include "ARM64GenAsmMatcher.inc" - - /// } - - OperandMatchResultTy tryParseOptionalShiftExtend(OperandVector &Operands); - OperandMatchResultTy tryParseBarrierOperand(OperandVector &Operands); - OperandMatchResultTy tryParseMRSSystemRegister(OperandVector &Operands); - OperandMatchResultTy tryParseSysReg(OperandVector &Operands); - OperandMatchResultTy tryParseSysCROperand(OperandVector &Operands); - OperandMatchResultTy tryParsePrefetch(OperandVector &Operands); - OperandMatchResultTy tryParseAdrpLabel(OperandVector &Operands); - OperandMatchResultTy tryParseAdrLabel(OperandVector &Operands); - OperandMatchResultTy tryParseFPImm(OperandVector &Operands); - OperandMatchResultTy tryParseAddSubImm(OperandVector &Operands); - OperandMatchResultTy tryParseGPR64sp0Operand(OperandVector &Operands); - bool tryParseVectorRegister(OperandVector &Operands); - -public: - enum ARM64MatchResultTy { - Match_InvalidSuffix = FIRST_TARGET_MATCH_RESULT_TY, -#define 
GET_OPERAND_DIAGNOSTIC_TYPES -#include "ARM64GenAsmMatcher.inc" - }; - ARM64AsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser, - const MCInstrInfo &MII, - const MCTargetOptions &Options) - : MCTargetAsmParser(), STI(_STI), Parser(_Parser) { - MCAsmParserExtension::Initialize(_Parser); - - // Initialize the set of available features. - setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); - } - - bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, - SMLoc NameLoc, OperandVector &Operands) override; - bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override; - bool ParseDirective(AsmToken DirectiveID) override; - unsigned validateTargetOperandClass(MCParsedAsmOperand *Op, - unsigned Kind) override; - - static bool classifySymbolRef(const MCExpr *Expr, - ARM64MCExpr::VariantKind &ELFRefKind, - MCSymbolRefExpr::VariantKind &DarwinRefKind, - int64_t &Addend); -}; -} // end anonymous namespace - -namespace { - -/// ARM64Operand - Instances of this class represent a parsed ARM64 machine -/// instruction. -class ARM64Operand : public MCParsedAsmOperand { -private: - enum KindTy { - k_Immediate, - k_ShiftedImm, - k_CondCode, - k_Register, - k_VectorList, - k_VectorIndex, - k_Token, - k_SysReg, - k_SysCR, - k_Prefetch, - k_ShiftExtend, - k_FPImm, - k_Barrier - } Kind; - - SMLoc StartLoc, EndLoc; - - struct TokOp { - const char *Data; - unsigned Length; - bool IsSuffix; // Is the operand actually a suffix on the mnemonic. - }; - - struct RegOp { - unsigned RegNum; - bool isVector; - }; - - struct VectorListOp { - unsigned RegNum; - unsigned Count; - unsigned NumElements; - unsigned ElementKind; - }; - - struct VectorIndexOp { - unsigned Val; - }; - - struct ImmOp { - const MCExpr *Val; - }; - - struct ShiftedImmOp { - const MCExpr *Val; - unsigned ShiftAmount; - }; - - struct CondCodeOp { - ARM64CC::CondCode Code; - }; - - struct FPImmOp { - unsigned Val; // Encoded 8-bit representation. - }; - - struct BarrierOp { - unsigned Val; // Not the enum since not all values have names. - }; - - struct SysRegOp { - const char *Data; - unsigned Length; - uint64_t FeatureBits; // We need to pass through information about which - // core we are compiling for so that the SysReg - // Mappers can appropriately conditionalize. - }; - - struct SysCRImmOp { - unsigned Val; - }; - - struct PrefetchOp { - unsigned Val; - }; - - struct ShiftExtendOp { - ARM64_AM::ShiftExtendType Type; - unsigned Amount; - bool HasExplicitAmount; - }; - - struct ExtendOp { - unsigned Val; - }; - - union { - struct TokOp Tok; - struct RegOp Reg; - struct VectorListOp VectorList; - struct VectorIndexOp VectorIndex; - struct ImmOp Imm; - struct ShiftedImmOp ShiftedImm; - struct CondCodeOp CondCode; - struct FPImmOp FPImm; - struct BarrierOp Barrier; - struct SysRegOp SysReg; - struct SysCRImmOp SysCRImm; - struct PrefetchOp Prefetch; - struct ShiftExtendOp ShiftExtend; - }; - - // Keep the MCContext around as the MCExprs may need manipulated during - // the add<>Operands() calls. 
- MCContext &Ctx; - - ARM64Operand(KindTy K, MCContext &_Ctx) - : MCParsedAsmOperand(), Kind(K), Ctx(_Ctx) {} - -public: - ARM64Operand(const ARM64Operand &o) : MCParsedAsmOperand(), Ctx(o.Ctx) { - Kind = o.Kind; - StartLoc = o.StartLoc; - EndLoc = o.EndLoc; - switch (Kind) { - case k_Token: - Tok = o.Tok; - break; - case k_Immediate: - Imm = o.Imm; - break; - case k_ShiftedImm: - ShiftedImm = o.ShiftedImm; - break; - case k_CondCode: - CondCode = o.CondCode; - break; - case k_FPImm: - FPImm = o.FPImm; - break; - case k_Barrier: - Barrier = o.Barrier; - break; - case k_Register: - Reg = o.Reg; - break; - case k_VectorList: - VectorList = o.VectorList; - break; - case k_VectorIndex: - VectorIndex = o.VectorIndex; - break; - case k_SysReg: - SysReg = o.SysReg; - break; - case k_SysCR: - SysCRImm = o.SysCRImm; - break; - case k_Prefetch: - Prefetch = o.Prefetch; - break; - case k_ShiftExtend: - ShiftExtend = o.ShiftExtend; - break; - } - } - - /// getStartLoc - Get the location of the first token of this operand. - SMLoc getStartLoc() const override { return StartLoc; } - /// getEndLoc - Get the location of the last token of this operand. - SMLoc getEndLoc() const override { return EndLoc; } - - StringRef getToken() const { - assert(Kind == k_Token && "Invalid access!"); - return StringRef(Tok.Data, Tok.Length); - } - - bool isTokenSuffix() const { - assert(Kind == k_Token && "Invalid access!"); - return Tok.IsSuffix; - } - - const MCExpr *getImm() const { - assert(Kind == k_Immediate && "Invalid access!"); - return Imm.Val; - } - - const MCExpr *getShiftedImmVal() const { - assert(Kind == k_ShiftedImm && "Invalid access!"); - return ShiftedImm.Val; - } - - unsigned getShiftedImmShift() const { - assert(Kind == k_ShiftedImm && "Invalid access!"); - return ShiftedImm.ShiftAmount; - } - - ARM64CC::CondCode getCondCode() const { - assert(Kind == k_CondCode && "Invalid access!"); - return CondCode.Code; - } - - unsigned getFPImm() const { - assert(Kind == k_FPImm && "Invalid access!"); - return FPImm.Val; - } - - unsigned getBarrier() const { - assert(Kind == k_Barrier && "Invalid access!"); - return Barrier.Val; - } - - unsigned getReg() const override { - assert(Kind == k_Register && "Invalid access!"); - return Reg.RegNum; - } - - unsigned getVectorListStart() const { - assert(Kind == k_VectorList && "Invalid access!"); - return VectorList.RegNum; - } - - unsigned getVectorListCount() const { - assert(Kind == k_VectorList && "Invalid access!"); - return VectorList.Count; - } - - unsigned getVectorIndex() const { - assert(Kind == k_VectorIndex && "Invalid access!"); - return VectorIndex.Val; - } - - StringRef getSysReg() const { - assert(Kind == k_SysReg && "Invalid access!"); - return StringRef(SysReg.Data, SysReg.Length); - } - - uint64_t getSysRegFeatureBits() const { - assert(Kind == k_SysReg && "Invalid access!"); - return SysReg.FeatureBits; - } - - unsigned getSysCR() const { - assert(Kind == k_SysCR && "Invalid access!"); - return SysCRImm.Val; - } - - unsigned getPrefetch() const { - assert(Kind == k_Prefetch && "Invalid access!"); - return Prefetch.Val; - } - - ARM64_AM::ShiftExtendType getShiftExtendType() const { - assert(Kind == k_ShiftExtend && "Invalid access!"); - return ShiftExtend.Type; - } - - unsigned getShiftExtendAmount() const { - assert(Kind == k_ShiftExtend && "Invalid access!"); - return ShiftExtend.Amount; - } - - bool hasShiftExtendAmount() const { - assert(Kind == k_ShiftExtend && "Invalid access!"); - return ShiftExtend.HasExplicitAmount; - } - - bool isImm() const 
override { return Kind == k_Immediate; } - bool isMem() const override { return false; } - bool isSImm9() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= -256 && Val < 256); - } - bool isSImm7s4() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= -256 && Val <= 252 && (Val & 3) == 0); - } - bool isSImm7s8() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= -512 && Val <= 504 && (Val & 7) == 0); - } - bool isSImm7s16() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= -1024 && Val <= 1008 && (Val & 15) == 0); - } - - bool isSymbolicUImm12Offset(const MCExpr *Expr, unsigned Scale) const { - ARM64MCExpr::VariantKind ELFRefKind; - MCSymbolRefExpr::VariantKind DarwinRefKind; - int64_t Addend; - if (!ARM64AsmParser::classifySymbolRef(Expr, ELFRefKind, DarwinRefKind, - Addend)) { - // If we don't understand the expression, assume the best and - // let the fixup and relocation code deal with it. - return true; - } - - if (DarwinRefKind == MCSymbolRefExpr::VK_PAGEOFF || - ELFRefKind == ARM64MCExpr::VK_LO12 || - ELFRefKind == ARM64MCExpr::VK_GOT_LO12 || - ELFRefKind == ARM64MCExpr::VK_DTPREL_LO12 || - ELFRefKind == ARM64MCExpr::VK_DTPREL_LO12_NC || - ELFRefKind == ARM64MCExpr::VK_TPREL_LO12 || - ELFRefKind == ARM64MCExpr::VK_TPREL_LO12_NC || - ELFRefKind == ARM64MCExpr::VK_GOTTPREL_LO12_NC || - ELFRefKind == ARM64MCExpr::VK_TLSDESC_LO12) { - // Note that we don't range-check the addend. It's adjusted modulo page - // size when converted, so there is no "out of range" condition when using - // @pageoff. - return Addend >= 0 && (Addend % Scale) == 0; - } else if (DarwinRefKind == MCSymbolRefExpr::VK_GOTPAGEOFF || - DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGEOFF) { - // @gotpageoff/@tlvppageoff can only be used directly, not with an addend. 
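The immediate predicates above all follow the same pattern: a signed, scaled 7-bit offset for the load/store-pair forms (isSImm7s4/s8/s16) and an unsigned, scaled 12-bit offset for plain loads and stores (isUImm12Offset). A standalone sketch of those two range checks follows; the helper names are invented for illustration and are not taken from the patch.

#include <cstdint>
#include <cstdio>

// LDP/STP-style offset: a multiple of Scale in [-64*Scale, 63*Scale],
// e.g. [-512, 504] in steps of 8 for the 8-byte form, as in isSImm7s8 above.
static bool fitsSImm7Scaled(int64_t Val, int64_t Scale) {
  return Val % Scale == 0 && Val >= -64 * Scale && Val <= 63 * Scale;
}

// LDR/STR unsigned offset: a multiple of Scale with Val / Scale in [0, 4095],
// as in the isUImm12Offset template above.
static bool fitsUImm12Scaled(int64_t Val, int64_t Scale) {
  return Val % Scale == 0 && Val >= 0 && Val / Scale < 0x1000;
}

int main() {
  std::printf("%d %d\n", fitsSImm7Scaled(504, 8), fitsSImm7Scaled(512, 8));   // 1 0
  std::printf("%d %d\n", fitsUImm12Scaled(32760, 8), fitsUImm12Scaled(4, 8)); // 1 0
}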
- return Addend == 0; - } - - return false; - } - - template bool isUImm12Offset() const { - if (!isImm()) - return false; - - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return isSymbolicUImm12Offset(getImm(), Scale); - - int64_t Val = MCE->getValue(); - return (Val % Scale) == 0 && Val >= 0 && (Val / Scale) < 0x1000; - } - - bool isImm0_7() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= 0 && Val < 8); - } - bool isImm1_8() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val > 0 && Val < 9); - } - bool isImm0_15() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= 0 && Val < 16); - } - bool isImm1_16() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val > 0 && Val < 17); - } - bool isImm0_31() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= 0 && Val < 32); - } - bool isImm1_31() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= 1 && Val < 32); - } - bool isImm1_32() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= 1 && Val < 33); - } - bool isImm0_63() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= 0 && Val < 64); - } - bool isImm1_63() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= 1 && Val < 64); - } - bool isImm1_64() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= 1 && Val < 65); - } - bool isImm0_127() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= 0 && Val < 128); - } - bool isImm0_255() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= 0 && Val < 256); - } - bool isImm0_65535() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= 0 && Val < 65536); - } - bool isImm32_63() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= 32 && Val < 64); - } - bool isLogicalImm32() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return false; - return ARM64_AM::isLogicalImmediate(MCE->getValue(), 32); - } - bool isLogicalImm64() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = 
dyn_cast(getImm()); - if (!MCE) - return false; - return ARM64_AM::isLogicalImmediate(MCE->getValue(), 64); - } - bool isShiftedImm() const { return Kind == k_ShiftedImm; } - bool isAddSubImm() const { - if (!isShiftedImm() && !isImm()) - return false; - - const MCExpr *Expr; - - // An ADD/SUB shifter is either 'lsl #0' or 'lsl #12'. - if (isShiftedImm()) { - unsigned Shift = ShiftedImm.ShiftAmount; - Expr = ShiftedImm.Val; - if (Shift != 0 && Shift != 12) - return false; - } else { - Expr = getImm(); - } - - ARM64MCExpr::VariantKind ELFRefKind; - MCSymbolRefExpr::VariantKind DarwinRefKind; - int64_t Addend; - if (ARM64AsmParser::classifySymbolRef(Expr, ELFRefKind, - DarwinRefKind, Addend)) { - return DarwinRefKind == MCSymbolRefExpr::VK_PAGEOFF - || DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGEOFF - || (DarwinRefKind == MCSymbolRefExpr::VK_GOTPAGEOFF && Addend == 0) - || ELFRefKind == ARM64MCExpr::VK_LO12 - || ELFRefKind == ARM64MCExpr::VK_DTPREL_HI12 - || ELFRefKind == ARM64MCExpr::VK_DTPREL_LO12 - || ELFRefKind == ARM64MCExpr::VK_DTPREL_LO12_NC - || ELFRefKind == ARM64MCExpr::VK_TPREL_HI12 - || ELFRefKind == ARM64MCExpr::VK_TPREL_LO12 - || ELFRefKind == ARM64MCExpr::VK_TPREL_LO12_NC - || ELFRefKind == ARM64MCExpr::VK_TLSDESC_LO12; - } - - // Otherwise it should be a real immediate in range: - const MCConstantExpr *CE = cast(Expr); - return CE->getValue() >= 0 && CE->getValue() <= 0xfff; - } - bool isCondCode() const { return Kind == k_CondCode; } - bool isSIMDImmType10() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return false; - return ARM64_AM::isAdvSIMDModImmType10(MCE->getValue()); - } - bool isBranchTarget26() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return true; - int64_t Val = MCE->getValue(); - if (Val & 0x3) - return false; - return (Val >= -(0x2000000 << 2) && Val <= (0x1ffffff << 2)); - } - bool isPCRelLabel19() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return true; - int64_t Val = MCE->getValue(); - if (Val & 0x3) - return false; - return (Val >= -(0x40000 << 2) && Val <= (0x3ffff << 2)); - } - bool isBranchTarget14() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return true; - int64_t Val = MCE->getValue(); - if (Val & 0x3) - return false; - return (Val >= -(0x2000 << 2) && Val <= (0x1fff << 2)); - } - - bool isMovWSymbol(ArrayRef AllowedModifiers) const { - if (!isImm()) - return false; - - ARM64MCExpr::VariantKind ELFRefKind; - MCSymbolRefExpr::VariantKind DarwinRefKind; - int64_t Addend; - if (!ARM64AsmParser::classifySymbolRef(getImm(), ELFRefKind, DarwinRefKind, - Addend)) { - return false; - } - if (DarwinRefKind != MCSymbolRefExpr::VK_None) - return false; - - for (unsigned i = 0; i != AllowedModifiers.size(); ++i) { - if (ELFRefKind == AllowedModifiers[i]) - return Addend == 0; - } - - return false; - } - - bool isMovZSymbolG3() const { - static ARM64MCExpr::VariantKind Variants[] = { ARM64MCExpr::VK_ABS_G3 }; - return isMovWSymbol(Variants); - } - - bool isMovZSymbolG2() const { - static ARM64MCExpr::VariantKind Variants[] = { ARM64MCExpr::VK_ABS_G2, - ARM64MCExpr::VK_ABS_G2_S, - ARM64MCExpr::VK_TPREL_G2, - ARM64MCExpr::VK_DTPREL_G2 }; - return isMovWSymbol(Variants); - } - - bool isMovZSymbolG1() const { - static ARM64MCExpr::VariantKind Variants[] = { ARM64MCExpr::VK_ABS_G1, - ARM64MCExpr::VK_ABS_G1_S, - 
ARM64MCExpr::VK_GOTTPREL_G1, - ARM64MCExpr::VK_TPREL_G1, - ARM64MCExpr::VK_DTPREL_G1, }; - return isMovWSymbol(Variants); - } - - bool isMovZSymbolG0() const { - static ARM64MCExpr::VariantKind Variants[] = { ARM64MCExpr::VK_ABS_G0, - ARM64MCExpr::VK_ABS_G0_S, - ARM64MCExpr::VK_TPREL_G0, - ARM64MCExpr::VK_DTPREL_G0 }; - return isMovWSymbol(Variants); - } - - bool isMovKSymbolG3() const { - static ARM64MCExpr::VariantKind Variants[] = { ARM64MCExpr::VK_ABS_G3 }; - return isMovWSymbol(Variants); - } - - bool isMovKSymbolG2() const { - static ARM64MCExpr::VariantKind Variants[] = { ARM64MCExpr::VK_ABS_G2_NC }; - return isMovWSymbol(Variants); - } - - bool isMovKSymbolG1() const { - static ARM64MCExpr::VariantKind Variants[] = { - ARM64MCExpr::VK_ABS_G1_NC, ARM64MCExpr::VK_TPREL_G1_NC, - ARM64MCExpr::VK_DTPREL_G1_NC - }; - return isMovWSymbol(Variants); - } - - bool isMovKSymbolG0() const { - static ARM64MCExpr::VariantKind Variants[] = { - ARM64MCExpr::VK_ABS_G0_NC, ARM64MCExpr::VK_GOTTPREL_G0_NC, - ARM64MCExpr::VK_TPREL_G0_NC, ARM64MCExpr::VK_DTPREL_G0_NC - }; - return isMovWSymbol(Variants); - } - - template - bool isMOVZMovAlias() const { - if (!isImm()) return false; - - const MCConstantExpr *CE = dyn_cast(getImm()); - if (!CE) return false; - uint64_t Value = CE->getValue(); - - if (RegWidth == 32) - Value &= 0xffffffffULL; - - // "lsl #0" takes precedence: in practice this only affects "#0, lsl #0". - if (Value == 0 && Shift != 0) - return false; - - return (Value & ~(0xffffULL << Shift)) == 0; - } - - template - bool isMOVNMovAlias() const { - if (!isImm()) return false; - - const MCConstantExpr *CE = dyn_cast(getImm()); - if (!CE) return false; - uint64_t Value = CE->getValue(); - - // MOVZ takes precedence over MOVN. - for (int MOVZShift = 0; MOVZShift <= 48; MOVZShift += 16) - if ((Value & ~(0xffffULL << MOVZShift)) == 0) - return false; - - Value = ~Value; - if (RegWidth == 32) - Value &= 0xffffffffULL; - - return (Value & ~(0xffffULL << Shift)) == 0; - } - - bool isFPImm() const { return Kind == k_FPImm; } - bool isBarrier() const { return Kind == k_Barrier; } - bool isSysReg() const { return Kind == k_SysReg; } - bool isMRSSystemRegister() const { - if (!isSysReg()) return false; - - bool IsKnownRegister; - auto Mapper = ARM64SysReg::MRSMapper(getSysRegFeatureBits()); - Mapper.fromString(getSysReg(), IsKnownRegister); - - return IsKnownRegister; - } - bool isMSRSystemRegister() const { - if (!isSysReg()) return false; - - bool IsKnownRegister; - auto Mapper = ARM64SysReg::MSRMapper(getSysRegFeatureBits()); - Mapper.fromString(getSysReg(), IsKnownRegister); - - return IsKnownRegister; - } - bool isSystemPStateField() const { - if (!isSysReg()) return false; - - bool IsKnownRegister; - ARM64PState::PStateMapper().fromString(getSysReg(), IsKnownRegister); - - return IsKnownRegister; - } - bool isReg() const override { return Kind == k_Register && !Reg.isVector; } - bool isVectorReg() const { return Kind == k_Register && Reg.isVector; } - bool isVectorRegLo() const { - return Kind == k_Register && Reg.isVector && - ARM64MCRegisterClasses[ARM64::FPR128_loRegClassID].contains(Reg.RegNum); - } - bool isGPR32as64() const { - return Kind == k_Register && !Reg.isVector && - ARM64MCRegisterClasses[ARM64::GPR64RegClassID].contains(Reg.RegNum); - } - - bool isGPR64sp0() const { - return Kind == k_Register && !Reg.isVector && - ARM64MCRegisterClasses[ARM64::GPR64spRegClassID].contains(Reg.RegNum); - } - - /// Is this a vector list with the type implicit (presumably attached to the - /// 
instruction itself)? - template bool isImplicitlyTypedVectorList() const { - return Kind == k_VectorList && VectorList.Count == NumRegs && - !VectorList.ElementKind; - } - - template - bool isTypedVectorList() const { - if (Kind != k_VectorList) - return false; - if (VectorList.Count != NumRegs) - return false; - if (VectorList.ElementKind != ElementKind) - return false; - return VectorList.NumElements == NumElements; - } - - bool isVectorIndex1() const { - return Kind == k_VectorIndex && VectorIndex.Val == 1; - } - bool isVectorIndexB() const { - return Kind == k_VectorIndex && VectorIndex.Val < 16; - } - bool isVectorIndexH() const { - return Kind == k_VectorIndex && VectorIndex.Val < 8; - } - bool isVectorIndexS() const { - return Kind == k_VectorIndex && VectorIndex.Val < 4; - } - bool isVectorIndexD() const { - return Kind == k_VectorIndex && VectorIndex.Val < 2; - } - bool isToken() const override { return Kind == k_Token; } - bool isTokenEqual(StringRef Str) const { - return Kind == k_Token && getToken() == Str; - } - bool isSysCR() const { return Kind == k_SysCR; } - bool isPrefetch() const { return Kind == k_Prefetch; } - bool isShiftExtend() const { return Kind == k_ShiftExtend; } - bool isShifter() const { - if (!isShiftExtend()) - return false; - - ARM64_AM::ShiftExtendType ST = getShiftExtendType(); - return (ST == ARM64_AM::LSL || ST == ARM64_AM::LSR || ST == ARM64_AM::ASR || - ST == ARM64_AM::ROR || ST == ARM64_AM::MSL); - } - bool isExtend() const { - if (!isShiftExtend()) - return false; - - ARM64_AM::ShiftExtendType ET = getShiftExtendType(); - return (ET == ARM64_AM::UXTB || ET == ARM64_AM::SXTB || - ET == ARM64_AM::UXTH || ET == ARM64_AM::SXTH || - ET == ARM64_AM::UXTW || ET == ARM64_AM::SXTW || - ET == ARM64_AM::UXTX || ET == ARM64_AM::SXTX || - ET == ARM64_AM::LSL) && - getShiftExtendAmount() <= 4; - } - - bool isExtend64() const { - if (!isExtend()) - return false; - // UXTX and SXTX require a 64-bit source register (the ExtendLSL64 class). - ARM64_AM::ShiftExtendType ET = getShiftExtendType(); - return ET != ARM64_AM::UXTX && ET != ARM64_AM::SXTX; - } - bool isExtendLSL64() const { - if (!isExtend()) - return false; - ARM64_AM::ShiftExtendType ET = getShiftExtendType(); - return (ET == ARM64_AM::UXTX || ET == ARM64_AM::SXTX || ET == ARM64_AM::LSL) && - getShiftExtendAmount() <= 4; - } - - template bool isMemXExtend() const { - if (!isExtend()) - return false; - ARM64_AM::ShiftExtendType ET = getShiftExtendType(); - return (ET == ARM64_AM::LSL || ET == ARM64_AM::SXTX) && - (getShiftExtendAmount() == Log2_32(Width / 8) || - getShiftExtendAmount() == 0); - } - - template bool isMemWExtend() const { - if (!isExtend()) - return false; - ARM64_AM::ShiftExtendType ET = getShiftExtendType(); - return (ET == ARM64_AM::UXTW || ET == ARM64_AM::SXTW) && - (getShiftExtendAmount() == Log2_32(Width / 8) || - getShiftExtendAmount() == 0); - } - - template - bool isArithmeticShifter() const { - if (!isShifter()) - return false; - - // An arithmetic shifter is LSL, LSR, or ASR. - ARM64_AM::ShiftExtendType ST = getShiftExtendType(); - return (ST == ARM64_AM::LSL || ST == ARM64_AM::LSR || - ST == ARM64_AM::ASR) && getShiftExtendAmount() < width; - } - - template - bool isLogicalShifter() const { - if (!isShifter()) - return false; - - // A logical shifter is LSL, LSR, ASR or ROR. 
- ARM64_AM::ShiftExtendType ST = getShiftExtendType(); - return (ST == ARM64_AM::LSL || ST == ARM64_AM::LSR || ST == ARM64_AM::ASR || - ST == ARM64_AM::ROR) && - getShiftExtendAmount() < width; - } - - bool isMovImm32Shifter() const { - if (!isShifter()) - return false; - - // A MOVi shifter is LSL of 0, 16, 32, or 48. - ARM64_AM::ShiftExtendType ST = getShiftExtendType(); - if (ST != ARM64_AM::LSL) - return false; - uint64_t Val = getShiftExtendAmount(); - return (Val == 0 || Val == 16); - } - - bool isMovImm64Shifter() const { - if (!isShifter()) - return false; - - // A MOVi shifter is LSL of 0 or 16. - ARM64_AM::ShiftExtendType ST = getShiftExtendType(); - if (ST != ARM64_AM::LSL) - return false; - uint64_t Val = getShiftExtendAmount(); - return (Val == 0 || Val == 16 || Val == 32 || Val == 48); - } - - bool isLogicalVecShifter() const { - if (!isShifter()) - return false; - - // A logical vector shifter is a left shift by 0, 8, 16, or 24. - unsigned Shift = getShiftExtendAmount(); - return getShiftExtendType() == ARM64_AM::LSL && - (Shift == 0 || Shift == 8 || Shift == 16 || Shift == 24); - } - - bool isLogicalVecHalfWordShifter() const { - if (!isLogicalVecShifter()) - return false; - - // A logical vector shifter is a left shift by 0 or 8. - unsigned Shift = getShiftExtendAmount(); - return getShiftExtendType() == ARM64_AM::LSL && (Shift == 0 || Shift == 8); - } - - bool isMoveVecShifter() const { - if (!isShiftExtend()) - return false; - - // A logical vector shifter is a left shift by 8 or 16. - unsigned Shift = getShiftExtendAmount(); - return getShiftExtendType() == ARM64_AM::MSL && (Shift == 8 || Shift == 16); - } - - // Fallback unscaled operands are for aliases of LDR/STR that fall back - // to LDUR/STUR when the offset is not legal for the former but is for - // the latter. As such, in addition to checking for being a legal unscaled - // address, also check that it is not a legal scaled address. This avoids - // ambiguity in the matcher. - template - bool isSImm9OffsetFB() const { - return isSImm9() && !isUImm12Offset(); - } - - bool isAdrpLabel() const { - // Validation was handled during parsing, so we just sanity check that - // something didn't go haywire. - if (!isImm()) - return false; - - if (const MCConstantExpr *CE = dyn_cast(Imm.Val)) { - int64_t Val = CE->getValue(); - int64_t Min = - (4096 * (1LL << (21 - 1))); - int64_t Max = 4096 * ((1LL << (21 - 1)) - 1); - return (Val % 4096) == 0 && Val >= Min && Val <= Max; - } - - return true; - } - - bool isAdrLabel() const { - // Validation was handled during parsing, so we just sanity check that - // something didn't go haywire. - if (!isImm()) - return false; - - if (const MCConstantExpr *CE = dyn_cast(Imm.Val)) { - int64_t Val = CE->getValue(); - int64_t Min = - (1LL << (21 - 1)); - int64_t Max = ((1LL << (21 - 1)) - 1); - return Val >= Min && Val <= Max; - } - - return true; - } - - void addExpr(MCInst &Inst, const MCExpr *Expr) const { - // Add as immediates when possible. Null MCExpr = 0. 
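The isAdrpLabel and isAdrLabel checks above encode the reachable ranges of the two instructions: ADRP carries a signed 21-bit count of 4 KB pages, ADR a signed 21-bit byte offset. A standalone sketch of the same arithmetic, with invented helper names that are not part of the patch:

#include <cstdint>
#include <cstdio>

// ADRP: page-aligned value within roughly +/- 4 GiB of the instruction.
static bool fitsAdrp(int64_t Val) {
  const int64_t Min = -(4096 * (1LL << 20));     // -4 GiB
  const int64_t Max = 4096 * ((1LL << 20) - 1);  // +4 GiB minus one page
  return (Val % 4096) == 0 && Val >= Min && Val <= Max;
}

// ADR: plain byte offset within +/- 1 MiB.
static bool fitsAdr(int64_t Val) {
  return Val >= -(1LL << 20) && Val <= (1LL << 20) - 1;
}

int main() {
  std::printf("%d %d\n", fitsAdrp(4096), fitsAdrp(4095));            // 1 0 (must be page-aligned)
  std::printf("%d %d\n", fitsAdr(1 << 20), fitsAdr((1 << 20) - 1));  // 0 1 (one past the limit fails)
}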
- if (!Expr) - Inst.addOperand(MCOperand::CreateImm(0)); - else if (const MCConstantExpr *CE = dyn_cast(Expr)) - Inst.addOperand(MCOperand::CreateImm(CE->getValue())); - else - Inst.addOperand(MCOperand::CreateExpr(Expr)); - } - - void addRegOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::CreateReg(getReg())); - } - - void addGPR32as64Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - assert(ARM64MCRegisterClasses[ARM64::GPR64RegClassID].contains(getReg())); - - const MCRegisterInfo *RI = Ctx.getRegisterInfo(); - uint32_t Reg = RI->getRegClass(ARM64::GPR32RegClassID).getRegister( - RI->getEncodingValue(getReg())); - - Inst.addOperand(MCOperand::CreateReg(Reg)); - } - - void addVectorReg64Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - assert(ARM64MCRegisterClasses[ARM64::FPR128RegClassID].contains(getReg())); - Inst.addOperand(MCOperand::CreateReg(ARM64::D0 + getReg() - ARM64::Q0)); - } - - void addVectorReg128Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - assert(ARM64MCRegisterClasses[ARM64::FPR128RegClassID].contains(getReg())); - Inst.addOperand(MCOperand::CreateReg(getReg())); - } - - void addVectorRegLoOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::CreateReg(getReg())); - } - - template - void addVectorList64Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - static unsigned FirstRegs[] = { ARM64::D0, ARM64::D0_D1, - ARM64::D0_D1_D2, ARM64::D0_D1_D2_D3 }; - unsigned FirstReg = FirstRegs[NumRegs - 1]; - - Inst.addOperand( - MCOperand::CreateReg(FirstReg + getVectorListStart() - ARM64::Q0)); - } - - template - void addVectorList128Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - static unsigned FirstRegs[] = { ARM64::Q0, ARM64::Q0_Q1, - ARM64::Q0_Q1_Q2, ARM64::Q0_Q1_Q2_Q3 }; - unsigned FirstReg = FirstRegs[NumRegs - 1]; - - Inst.addOperand( - MCOperand::CreateReg(FirstReg + getVectorListStart() - ARM64::Q0)); - } - - void addVectorIndex1Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::CreateImm(getVectorIndex())); - } - - void addVectorIndexBOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::CreateImm(getVectorIndex())); - } - - void addVectorIndexHOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::CreateImm(getVectorIndex())); - } - - void addVectorIndexSOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::CreateImm(getVectorIndex())); - } - - void addVectorIndexDOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::CreateImm(getVectorIndex())); - } - - void addImmOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - // If this is a pageoff symrefexpr with an addend, adjust the addend - // to be only the page-offset portion. Otherwise, just add the expr - // as-is. 
- addExpr(Inst, getImm()); - } - - void addAddSubImmOperands(MCInst &Inst, unsigned N) const { - assert(N == 2 && "Invalid number of operands!"); - if (isShiftedImm()) { - addExpr(Inst, getShiftedImmVal()); - Inst.addOperand(MCOperand::CreateImm(getShiftedImmShift())); - } else { - addExpr(Inst, getImm()); - Inst.addOperand(MCOperand::CreateImm(0)); - } - } - - void addCondCodeOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::CreateImm(getCondCode())); - } - - void addAdrpLabelOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - addExpr(Inst, getImm()); - else - Inst.addOperand(MCOperand::CreateImm(MCE->getValue() >> 12)); - } - - void addAdrLabelOperands(MCInst &Inst, unsigned N) const { - addImmOperands(Inst, N); - } - - template - void addUImm12OffsetOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - - if (!MCE) { - Inst.addOperand(MCOperand::CreateExpr(getImm())); - return; - } - Inst.addOperand(MCOperand::CreateImm(MCE->getValue() / Scale)); - } - - void addSImm9Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - assert(MCE && "Invalid constant immediate operand!"); - Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); - } - - void addSImm7s4Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - assert(MCE && "Invalid constant immediate operand!"); - Inst.addOperand(MCOperand::CreateImm(MCE->getValue() / 4)); - } - - void addSImm7s8Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - assert(MCE && "Invalid constant immediate operand!"); - Inst.addOperand(MCOperand::CreateImm(MCE->getValue() / 8)); - } - - void addSImm7s16Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - assert(MCE && "Invalid constant immediate operand!"); - Inst.addOperand(MCOperand::CreateImm(MCE->getValue() / 16)); - } - - void addImm0_7Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - assert(MCE && "Invalid constant immediate operand!"); - Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); - } - - void addImm1_8Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - assert(MCE && "Invalid constant immediate operand!"); - Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); - } - - void addImm0_15Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - assert(MCE && "Invalid constant immediate operand!"); - Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); - } - - void addImm1_16Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - assert(MCE && "Invalid constant immediate operand!"); - Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); - } - - void addImm0_31Operands(MCInst &Inst, unsigned N) 
const { - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - assert(MCE && "Invalid constant immediate operand!"); - Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); - } - - void addImm1_31Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - assert(MCE && "Invalid constant immediate operand!"); - Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); - } - - void addImm1_32Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - assert(MCE && "Invalid constant immediate operand!"); - Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); - } - - void addImm0_63Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - assert(MCE && "Invalid constant immediate operand!"); - Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); - } - - void addImm1_63Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - assert(MCE && "Invalid constant immediate operand!"); - Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); - } - - void addImm1_64Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - assert(MCE && "Invalid constant immediate operand!"); - Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); - } - - void addImm0_127Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - assert(MCE && "Invalid constant immediate operand!"); - Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); - } - - void addImm0_255Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - assert(MCE && "Invalid constant immediate operand!"); - Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); - } - - void addImm0_65535Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - assert(MCE && "Invalid constant immediate operand!"); - Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); - } - - void addImm32_63Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - assert(MCE && "Invalid constant immediate operand!"); - Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); - } - - void addLogicalImm32Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - assert(MCE && "Invalid logical immediate operand!"); - uint64_t encoding = ARM64_AM::encodeLogicalImmediate(MCE->getValue(), 32); - Inst.addOperand(MCOperand::CreateImm(encoding)); - } - - void addLogicalImm64Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - assert(MCE && "Invalid logical immediate operand!"); - uint64_t encoding = ARM64_AM::encodeLogicalImmediate(MCE->getValue(), 64); - Inst.addOperand(MCOperand::CreateImm(encoding)); - } - - void addSIMDImmType10Operands(MCInst &Inst, unsigned N) const { - assert(N == 
1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - assert(MCE && "Invalid immediate operand!"); - uint64_t encoding = ARM64_AM::encodeAdvSIMDModImmType10(MCE->getValue()); - Inst.addOperand(MCOperand::CreateImm(encoding)); - } - - void addBranchTarget26Operands(MCInst &Inst, unsigned N) const { - // Branch operands don't encode the low bits, so shift them off - // here. If it's a label, however, just put it on directly as there's - // not enough information now to do anything. - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) { - addExpr(Inst, getImm()); - return; - } - assert(MCE && "Invalid constant immediate operand!"); - Inst.addOperand(MCOperand::CreateImm(MCE->getValue() >> 2)); - } - - void addPCRelLabel19Operands(MCInst &Inst, unsigned N) const { - // Branch operands don't encode the low bits, so shift them off - // here. If it's a label, however, just put it on directly as there's - // not enough information now to do anything. - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) { - addExpr(Inst, getImm()); - return; - } - assert(MCE && "Invalid constant immediate operand!"); - Inst.addOperand(MCOperand::CreateImm(MCE->getValue() >> 2)); - } - - void addBranchTarget14Operands(MCInst &Inst, unsigned N) const { - // Branch operands don't encode the low bits, so shift them off - // here. If it's a label, however, just put it on directly as there's - // not enough information now to do anything. - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) { - addExpr(Inst, getImm()); - return; - } - assert(MCE && "Invalid constant immediate operand!"); - Inst.addOperand(MCOperand::CreateImm(MCE->getValue() >> 2)); - } - - void addFPImmOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::CreateImm(getFPImm())); - } - - void addBarrierOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::CreateImm(getBarrier())); - } - - void addMRSSystemRegisterOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - - bool Valid; - auto Mapper = ARM64SysReg::MRSMapper(getSysRegFeatureBits()); - uint32_t Bits = Mapper.fromString(getSysReg(), Valid); - - Inst.addOperand(MCOperand::CreateImm(Bits)); - } - - void addMSRSystemRegisterOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - - bool Valid; - auto Mapper = ARM64SysReg::MSRMapper(getSysRegFeatureBits()); - uint32_t Bits = Mapper.fromString(getSysReg(), Valid); - - Inst.addOperand(MCOperand::CreateImm(Bits)); - } - - void addSystemPStateFieldOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - - bool Valid; - uint32_t Bits = ARM64PState::PStateMapper().fromString(getSysReg(), Valid); - - Inst.addOperand(MCOperand::CreateImm(Bits)); - } - - void addSysCROperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::CreateImm(getSysCR())); - } - - void addPrefetchOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::CreateImm(getPrefetch())); - } - - void addShifterOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of 
operands!"); - unsigned Imm = - ARM64_AM::getShifterImm(getShiftExtendType(), getShiftExtendAmount()); - Inst.addOperand(MCOperand::CreateImm(Imm)); - } - - void addExtendOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - ARM64_AM::ShiftExtendType ET = getShiftExtendType(); - if (ET == ARM64_AM::LSL) ET = ARM64_AM::UXTW; - unsigned Imm = ARM64_AM::getArithExtendImm(ET, getShiftExtendAmount()); - Inst.addOperand(MCOperand::CreateImm(Imm)); - } - - void addExtend64Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - ARM64_AM::ShiftExtendType ET = getShiftExtendType(); - if (ET == ARM64_AM::LSL) ET = ARM64_AM::UXTX; - unsigned Imm = ARM64_AM::getArithExtendImm(ET, getShiftExtendAmount()); - Inst.addOperand(MCOperand::CreateImm(Imm)); - } - - void addMemExtendOperands(MCInst &Inst, unsigned N) const { - assert(N == 2 && "Invalid number of operands!"); - ARM64_AM::ShiftExtendType ET = getShiftExtendType(); - bool IsSigned = ET == ARM64_AM::SXTW || ET == ARM64_AM::SXTX; - Inst.addOperand(MCOperand::CreateImm(IsSigned)); - Inst.addOperand(MCOperand::CreateImm(getShiftExtendAmount() != 0)); - } - - // For 8-bit load/store instructions with a register offset, both the - // "DoShift" and "NoShift" variants have a shift of 0. Because of this, - // they're disambiguated by whether the shift was explicit or implicit rather - // than its size. - void addMemExtend8Operands(MCInst &Inst, unsigned N) const { - assert(N == 2 && "Invalid number of operands!"); - ARM64_AM::ShiftExtendType ET = getShiftExtendType(); - bool IsSigned = ET == ARM64_AM::SXTW || ET == ARM64_AM::SXTX; - Inst.addOperand(MCOperand::CreateImm(IsSigned)); - Inst.addOperand(MCOperand::CreateImm(hasShiftExtendAmount())); - } - - template - void addMOVZMovAliasOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - - const MCConstantExpr *CE = cast(getImm()); - uint64_t Value = CE->getValue(); - Inst.addOperand(MCOperand::CreateImm((Value >> Shift) & 0xffff)); - } - - template - void addMOVNMovAliasOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - - const MCConstantExpr *CE = cast(getImm()); - uint64_t Value = CE->getValue(); - Inst.addOperand(MCOperand::CreateImm((~Value >> Shift) & 0xffff)); - } - - void print(raw_ostream &OS) const override; - - static ARM64Operand *CreateToken(StringRef Str, bool IsSuffix, SMLoc S, - MCContext &Ctx) { - ARM64Operand *Op = new ARM64Operand(k_Token, Ctx); - Op->Tok.Data = Str.data(); - Op->Tok.Length = Str.size(); - Op->Tok.IsSuffix = IsSuffix; - Op->StartLoc = S; - Op->EndLoc = S; - return Op; - } - - static ARM64Operand *CreateReg(unsigned RegNum, bool isVector, SMLoc S, - SMLoc E, MCContext &Ctx) { - ARM64Operand *Op = new ARM64Operand(k_Register, Ctx); - Op->Reg.RegNum = RegNum; - Op->Reg.isVector = isVector; - Op->StartLoc = S; - Op->EndLoc = E; - return Op; - } - - static ARM64Operand *CreateVectorList(unsigned RegNum, unsigned Count, - unsigned NumElements, char ElementKind, - SMLoc S, SMLoc E, MCContext &Ctx) { - ARM64Operand *Op = new ARM64Operand(k_VectorList, Ctx); - Op->VectorList.RegNum = RegNum; - Op->VectorList.Count = Count; - Op->VectorList.NumElements = NumElements; - Op->VectorList.ElementKind = ElementKind; - Op->StartLoc = S; - Op->EndLoc = E; - return Op; - } - - static ARM64Operand *CreateVectorIndex(unsigned Idx, SMLoc S, SMLoc E, - MCContext &Ctx) { - ARM64Operand *Op = new ARM64Operand(k_VectorIndex, Ctx); - 
Op->VectorIndex.Val = Idx; - Op->StartLoc = S; - Op->EndLoc = E; - return Op; - } - - static ARM64Operand *CreateImm(const MCExpr *Val, SMLoc S, SMLoc E, - MCContext &Ctx) { - ARM64Operand *Op = new ARM64Operand(k_Immediate, Ctx); - Op->Imm.Val = Val; - Op->StartLoc = S; - Op->EndLoc = E; - return Op; - } - - static ARM64Operand *CreateShiftedImm(const MCExpr *Val, unsigned ShiftAmount, - SMLoc S, SMLoc E, MCContext &Ctx) { - ARM64Operand *Op = new ARM64Operand(k_ShiftedImm, Ctx); - Op->ShiftedImm .Val = Val; - Op->ShiftedImm.ShiftAmount = ShiftAmount; - Op->StartLoc = S; - Op->EndLoc = E; - return Op; - } - - static ARM64Operand *CreateCondCode(ARM64CC::CondCode Code, SMLoc S, SMLoc E, - MCContext &Ctx) { - ARM64Operand *Op = new ARM64Operand(k_CondCode, Ctx); - Op->CondCode.Code = Code; - Op->StartLoc = S; - Op->EndLoc = E; - return Op; - } - - static ARM64Operand *CreateFPImm(unsigned Val, SMLoc S, MCContext &Ctx) { - ARM64Operand *Op = new ARM64Operand(k_FPImm, Ctx); - Op->FPImm.Val = Val; - Op->StartLoc = S; - Op->EndLoc = S; - return Op; - } - - static ARM64Operand *CreateBarrier(unsigned Val, SMLoc S, MCContext &Ctx) { - ARM64Operand *Op = new ARM64Operand(k_Barrier, Ctx); - Op->Barrier.Val = Val; - Op->StartLoc = S; - Op->EndLoc = S; - return Op; - } - - static ARM64Operand *CreateSysReg(StringRef Str, SMLoc S, - uint64_t FeatureBits, MCContext &Ctx) { - ARM64Operand *Op = new ARM64Operand(k_SysReg, Ctx); - Op->SysReg.Data = Str.data(); - Op->SysReg.Length = Str.size(); - Op->SysReg.FeatureBits = FeatureBits; - Op->StartLoc = S; - Op->EndLoc = S; - return Op; - } - - static ARM64Operand *CreateSysCR(unsigned Val, SMLoc S, SMLoc E, - MCContext &Ctx) { - ARM64Operand *Op = new ARM64Operand(k_SysCR, Ctx); - Op->SysCRImm.Val = Val; - Op->StartLoc = S; - Op->EndLoc = E; - return Op; - } - - static ARM64Operand *CreatePrefetch(unsigned Val, SMLoc S, MCContext &Ctx) { - ARM64Operand *Op = new ARM64Operand(k_Prefetch, Ctx); - Op->Prefetch.Val = Val; - Op->StartLoc = S; - Op->EndLoc = S; - return Op; - } - - static ARM64Operand *CreateShiftExtend(ARM64_AM::ShiftExtendType ShOp, - unsigned Val, bool HasExplicitAmount, - SMLoc S, SMLoc E, MCContext &Ctx) { - ARM64Operand *Op = new ARM64Operand(k_ShiftExtend, Ctx); - Op->ShiftExtend.Type = ShOp; - Op->ShiftExtend.Amount = Val; - Op->ShiftExtend.HasExplicitAmount = HasExplicitAmount; - Op->StartLoc = S; - Op->EndLoc = E; - return Op; - } -}; - -} // end anonymous namespace. 
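The isMOVZMovAlias/isMOVNMovAlias predicates and their add*Operands counterparts above decide when a "mov reg, #imm" can be handled as a single MOVZ or MOVN carrying one 16-bit chunk at a given shift. A standalone sketch of the MOVZ-side mask test; the helper name is invented for illustration and is not part of the patch.

#include <cstdint>
#include <cstdio>

// The alias is legal when every set bit of the value lies inside the 16-bit
// field at position Shift; zero is reserved for the "lsl #0" form.
static bool movzAliasOK(uint64_t Value, unsigned Shift, unsigned RegWidth) {
  if (RegWidth == 32)
    Value &= 0xffffffffULL;
  if (Value == 0 && Shift != 0)
    return false;
  return (Value & ~(0xffffULL << Shift)) == 0;
}

int main() {
  // mov x0, #0x12340000  ->  movz x0, #0x1234, lsl #16
  std::printf("%d\n", movzAliasOK(0x12340000ULL, 16, 64)); // 1
  // 0x12340001 spans two 16-bit chunks, so no single MOVZ alias exists.
  std::printf("%d\n", movzAliasOK(0x12340001ULL, 16, 64)); // 0
}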
- -void ARM64Operand::print(raw_ostream &OS) const { - switch (Kind) { - case k_FPImm: - OS << ""; - break; - case k_Barrier: { - bool Valid; - StringRef Name = ARM64DB::DBarrierMapper().toString(getBarrier(), Valid); - if (Valid) - OS << ""; - else - OS << ""; - break; - } - case k_Immediate: - getImm()->print(OS); - break; - case k_ShiftedImm: { - unsigned Shift = getShiftedImmShift(); - OS << "print(OS); - OS << ", lsl #" << ARM64_AM::getShiftValue(Shift) << ">"; - break; - } - case k_CondCode: - OS << ""; - break; - case k_Register: - OS << ""; - break; - case k_VectorList: { - OS << ""; - break; - } - case k_VectorIndex: - OS << ""; - break; - case k_SysReg: - OS << "'; - break; - case k_Token: - OS << "'" << getToken() << "'"; - break; - case k_SysCR: - OS << "c" << getSysCR(); - break; - case k_Prefetch: { - bool Valid; - StringRef Name = ARM64PRFM::PRFMMapper().toString(getPrefetch(), Valid); - if (Valid) - OS << ""; - else - OS << ""; - break; - } - case k_ShiftExtend: { - OS << "<" << ARM64_AM::getShiftExtendName(getShiftExtendType()) << " #" - << getShiftExtendAmount(); - if (!hasShiftExtendAmount()) - OS << ""; - OS << '>'; - break; - } - } -} - -/// @name Auto-generated Match Functions -/// { - -static unsigned MatchRegisterName(StringRef Name); - -/// } - -static unsigned matchVectorRegName(StringRef Name) { - return StringSwitch(Name) - .Case("v0", ARM64::Q0) - .Case("v1", ARM64::Q1) - .Case("v2", ARM64::Q2) - .Case("v3", ARM64::Q3) - .Case("v4", ARM64::Q4) - .Case("v5", ARM64::Q5) - .Case("v6", ARM64::Q6) - .Case("v7", ARM64::Q7) - .Case("v8", ARM64::Q8) - .Case("v9", ARM64::Q9) - .Case("v10", ARM64::Q10) - .Case("v11", ARM64::Q11) - .Case("v12", ARM64::Q12) - .Case("v13", ARM64::Q13) - .Case("v14", ARM64::Q14) - .Case("v15", ARM64::Q15) - .Case("v16", ARM64::Q16) - .Case("v17", ARM64::Q17) - .Case("v18", ARM64::Q18) - .Case("v19", ARM64::Q19) - .Case("v20", ARM64::Q20) - .Case("v21", ARM64::Q21) - .Case("v22", ARM64::Q22) - .Case("v23", ARM64::Q23) - .Case("v24", ARM64::Q24) - .Case("v25", ARM64::Q25) - .Case("v26", ARM64::Q26) - .Case("v27", ARM64::Q27) - .Case("v28", ARM64::Q28) - .Case("v29", ARM64::Q29) - .Case("v30", ARM64::Q30) - .Case("v31", ARM64::Q31) - .Default(0); -} - -static bool isValidVectorKind(StringRef Name) { - return StringSwitch(Name.lower()) - .Case(".8b", true) - .Case(".16b", true) - .Case(".4h", true) - .Case(".8h", true) - .Case(".2s", true) - .Case(".4s", true) - .Case(".1d", true) - .Case(".2d", true) - .Case(".1q", true) - // Accept the width neutral ones, too, for verbose syntax. If those - // aren't used in the right places, the token operand won't match so - // all will work out. - .Case(".b", true) - .Case(".h", true) - .Case(".s", true) - .Case(".d", true) - .Default(false); -} - -static void parseValidVectorKind(StringRef Name, unsigned &NumElements, - char &ElementKind) { - assert(isValidVectorKind(Name)); - - ElementKind = Name.lower()[Name.size() - 1]; - NumElements = 0; - - if (Name.size() == 2) - return; - - // Parse the lane count - Name = Name.drop_front(); - while (isdigit(Name.front())) { - NumElements = 10 * NumElements + (Name.front() - '0'); - Name = Name.drop_front(); - } -} - -bool ARM64AsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, - SMLoc &EndLoc) { - StartLoc = getLoc(); - RegNo = tryParseRegister(); - EndLoc = SMLoc::getFromPointer(getLoc().getPointer() - 1); - return (RegNo == (unsigned)-1); -} - -/// tryParseRegister - Try to parse a register name. 
The token must be an -/// Identifier when called, and if it is a register name the token is eaten and -/// the register is added to the operand list. -int ARM64AsmParser::tryParseRegister() { - const AsmToken &Tok = Parser.getTok(); - assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier"); - - std::string lowerCase = Tok.getString().lower(); - unsigned RegNum = MatchRegisterName(lowerCase); - // Also handle a few aliases of registers. - if (RegNum == 0) - RegNum = StringSwitch(lowerCase) - .Case("fp", ARM64::FP) - .Case("lr", ARM64::LR) - .Case("x31", ARM64::XZR) - .Case("w31", ARM64::WZR) - .Default(0); - - if (RegNum == 0) - return -1; - - Parser.Lex(); // Eat identifier token. - return RegNum; -} - -/// tryMatchVectorRegister - Try to parse a vector register name with optional -/// kind specifier. If it is a register specifier, eat the token and return it. -int ARM64AsmParser::tryMatchVectorRegister(StringRef &Kind, bool expected) { - if (Parser.getTok().isNot(AsmToken::Identifier)) { - TokError("vector register expected"); - return -1; - } - - StringRef Name = Parser.getTok().getString(); - // If there is a kind specifier, it's separated from the register name by - // a '.'. - size_t Start = 0, Next = Name.find('.'); - StringRef Head = Name.slice(Start, Next); - unsigned RegNum = matchVectorRegName(Head); - if (RegNum) { - if (Next != StringRef::npos) { - Kind = Name.slice(Next, StringRef::npos); - if (!isValidVectorKind(Kind)) { - TokError("invalid vector kind qualifier"); - return -1; - } - } - Parser.Lex(); // Eat the register token. - return RegNum; - } - - if (expected) - TokError("vector register expected"); - return -1; -} - -/// tryParseSysCROperand - Try to parse a system instruction CR operand name. -ARM64AsmParser::OperandMatchResultTy -ARM64AsmParser::tryParseSysCROperand(OperandVector &Operands) { - SMLoc S = getLoc(); - - if (Parser.getTok().isNot(AsmToken::Identifier)) { - Error(S, "Expected cN operand where 0 <= N <= 15"); - return MatchOperand_ParseFail; - } - - StringRef Tok = Parser.getTok().getIdentifier(); - if (Tok[0] != 'c' && Tok[0] != 'C') { - Error(S, "Expected cN operand where 0 <= N <= 15"); - return MatchOperand_ParseFail; - } - - uint32_t CRNum; - bool BadNum = Tok.drop_front().getAsInteger(10, CRNum); - if (BadNum || CRNum > 15) { - Error(S, "Expected cN operand where 0 <= N <= 15"); - return MatchOperand_ParseFail; - } - - Parser.Lex(); // Eat identifier token. - Operands.push_back(ARM64Operand::CreateSysCR(CRNum, S, getLoc(), getContext())); - return MatchOperand_Success; -} - -/// tryParsePrefetch - Try to parse a prefetch operand. -ARM64AsmParser::OperandMatchResultTy -ARM64AsmParser::tryParsePrefetch(OperandVector &Operands) { - SMLoc S = getLoc(); - const AsmToken &Tok = Parser.getTok(); - // Either an identifier for named values or a 5-bit immediate. - bool Hash = Tok.is(AsmToken::Hash); - if (Hash || Tok.is(AsmToken::Integer)) { - if (Hash) - Parser.Lex(); // Eat hash token. 
- const MCExpr *ImmVal; - if (getParser().parseExpression(ImmVal)) - return MatchOperand_ParseFail; - - const MCConstantExpr *MCE = dyn_cast(ImmVal); - if (!MCE) { - TokError("immediate value expected for prefetch operand"); - return MatchOperand_ParseFail; - } - unsigned prfop = MCE->getValue(); - if (prfop > 31) { - TokError("prefetch operand out of range, [0,31] expected"); - return MatchOperand_ParseFail; - } - - Operands.push_back(ARM64Operand::CreatePrefetch(prfop, S, getContext())); - return MatchOperand_Success; - } - - if (Tok.isNot(AsmToken::Identifier)) { - TokError("pre-fetch hint expected"); - return MatchOperand_ParseFail; - } - - bool Valid; - unsigned prfop = ARM64PRFM::PRFMMapper().fromString(Tok.getString(), Valid); - if (!Valid) { - TokError("pre-fetch hint expected"); - return MatchOperand_ParseFail; - } - - Parser.Lex(); // Eat identifier token. - Operands.push_back(ARM64Operand::CreatePrefetch(prfop, S, getContext())); - return MatchOperand_Success; -} - -/// tryParseAdrpLabel - Parse and validate a source label for the ADRP -/// instruction. -ARM64AsmParser::OperandMatchResultTy -ARM64AsmParser::tryParseAdrpLabel(OperandVector &Operands) { - SMLoc S = getLoc(); - const MCExpr *Expr; - - if (Parser.getTok().is(AsmToken::Hash)) { - Parser.Lex(); // Eat hash token. - } - - if (parseSymbolicImmVal(Expr)) - return MatchOperand_ParseFail; - - ARM64MCExpr::VariantKind ELFRefKind; - MCSymbolRefExpr::VariantKind DarwinRefKind; - int64_t Addend; - if (classifySymbolRef(Expr, ELFRefKind, DarwinRefKind, Addend)) { - if (DarwinRefKind == MCSymbolRefExpr::VK_None && - ELFRefKind == ARM64MCExpr::VK_INVALID) { - // No modifier was specified at all; this is the syntax for an ELF basic - // ADRP relocation (unfortunately). - Expr = ARM64MCExpr::Create(Expr, ARM64MCExpr::VK_ABS_PAGE, getContext()); - } else if ((DarwinRefKind == MCSymbolRefExpr::VK_GOTPAGE || - DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGE) && - Addend != 0) { - Error(S, "gotpage label reference not allowed an addend"); - return MatchOperand_ParseFail; - } else if (DarwinRefKind != MCSymbolRefExpr::VK_PAGE && - DarwinRefKind != MCSymbolRefExpr::VK_GOTPAGE && - DarwinRefKind != MCSymbolRefExpr::VK_TLVPPAGE && - ELFRefKind != ARM64MCExpr::VK_GOT_PAGE && - ELFRefKind != ARM64MCExpr::VK_GOTTPREL_PAGE && - ELFRefKind != ARM64MCExpr::VK_TLSDESC_PAGE) { - // The operand must be an @page or @gotpage qualified symbolref. - Error(S, "page or gotpage label reference expected"); - return MatchOperand_ParseFail; - } - } - - // We have either a label reference possibly with addend or an immediate. The - // addend is a raw value here. The linker will adjust it to only reference the - // page. - SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1); - Operands.push_back(ARM64Operand::CreateImm(Expr, S, E, getContext())); - - return MatchOperand_Success; -} - -/// tryParseAdrLabel - Parse and validate a source label for the ADR -/// instruction. -ARM64AsmParser::OperandMatchResultTy -ARM64AsmParser::tryParseAdrLabel(OperandVector &Operands) { - SMLoc S = getLoc(); - const MCExpr *Expr; - - if (Parser.getTok().is(AsmToken::Hash)) { - Parser.Lex(); // Eat hash token. - } - - if (getParser().parseExpression(Expr)) - return MatchOperand_ParseFail; - - SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1); - Operands.push_back(ARM64Operand::CreateImm(Expr, S, E, getContext())); - - return MatchOperand_Success; -} - -/// tryParseFPImm - A floating point immediate expression operand. 
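tryParseAdrpLabel above deliberately leaves the addend unchecked because, as the operand predicates note, it is reduced modulo the page size later: ADRP only materialises the 4 KB page of its target, and the low 12 bits travel in a separate :lo12:/@PAGEOFF operand. A standalone sketch of that page arithmetic, not part of the patch:

#include <cstdint>
#include <cstdio>

// Difference between the 4 KB page of the target and the page holding the
// ADRP itself; this is what the relocation ultimately encodes.
static int64_t adrpPageDelta(uint64_t PC, uint64_t Target) {
  return (int64_t)((Target & ~0xfffULL) - (PC & ~0xfffULL));
}

int main() {
  // Symbol at 0x401234 referenced from 0x400080: ADRP supplies one page
  // (0x1000); the remaining 0x234 comes from the :lo12:/@PAGEOFF half.
  std::printf("%#llx %#llx\n",
              (unsigned long long)adrpPageDelta(0x400080, 0x401234),
              0x401234ULL & 0xfffULL);
}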
-ARM64AsmParser::OperandMatchResultTy -ARM64AsmParser::tryParseFPImm(OperandVector &Operands) { - SMLoc S = getLoc(); - - bool Hash = false; - if (Parser.getTok().is(AsmToken::Hash)) { - Parser.Lex(); // Eat '#' - Hash = true; - } - - // Handle negation, as that still comes through as a separate token. - bool isNegative = false; - if (Parser.getTok().is(AsmToken::Minus)) { - isNegative = true; - Parser.Lex(); - } - const AsmToken &Tok = Parser.getTok(); - if (Tok.is(AsmToken::Real)) { - APFloat RealVal(APFloat::IEEEdouble, Tok.getString()); - uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue(); - // If we had a '-' in front, toggle the sign bit. - IntVal ^= (uint64_t)isNegative << 63; - int Val = ARM64_AM::getFP64Imm(APInt(64, IntVal)); - Parser.Lex(); // Eat the token. - // Check for out of range values. As an exception, we let Zero through, - // as we handle that special case in post-processing before matching in - // order to use the zero register for it. - if (Val == -1 && !RealVal.isZero()) { - TokError("expected compatible register or floating-point constant"); - return MatchOperand_ParseFail; - } - Operands.push_back(ARM64Operand::CreateFPImm(Val, S, getContext())); - return MatchOperand_Success; - } - if (Tok.is(AsmToken::Integer)) { - int64_t Val; - if (!isNegative && Tok.getString().startswith("0x")) { - Val = Tok.getIntVal(); - if (Val > 255 || Val < 0) { - TokError("encoded floating point value out of range"); - return MatchOperand_ParseFail; - } - } else { - APFloat RealVal(APFloat::IEEEdouble, Tok.getString()); - uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue(); - // If we had a '-' in front, toggle the sign bit. - IntVal ^= (uint64_t)isNegative << 63; - Val = ARM64_AM::getFP64Imm(APInt(64, IntVal)); - } - Parser.Lex(); // Eat the token. - Operands.push_back(ARM64Operand::CreateFPImm(Val, S, getContext())); - return MatchOperand_Success; - } - - if (!Hash) - return MatchOperand_NoMatch; - - TokError("invalid floating point immediate"); - return MatchOperand_ParseFail; -} - -/// tryParseAddSubImm - Parse ADD/SUB shifted immediate operand -ARM64AsmParser::OperandMatchResultTy -ARM64AsmParser::tryParseAddSubImm(OperandVector &Operands) { - SMLoc S = getLoc(); - - if (Parser.getTok().is(AsmToken::Hash)) - Parser.Lex(); // Eat '#' - else if (Parser.getTok().isNot(AsmToken::Integer)) - // Operand should start from # or should be integer, emit error otherwise. - return MatchOperand_NoMatch; - - const MCExpr *Imm; - if (parseSymbolicImmVal(Imm)) - return MatchOperand_ParseFail; - else if (Parser.getTok().isNot(AsmToken::Comma)) { - uint64_t ShiftAmount = 0; - const MCConstantExpr *MCE = dyn_cast(Imm); - if (MCE) { - int64_t Val = MCE->getValue(); - if (Val > 0xfff && (Val & 0xfff) == 0) { - Imm = MCConstantExpr::Create(Val >> 12, getContext()); - ShiftAmount = 12; - } - } - SMLoc E = Parser.getTok().getLoc(); - Operands.push_back(ARM64Operand::CreateShiftedImm(Imm, ShiftAmount, S, E, - getContext())); - return MatchOperand_Success; - } - - // Eat ',' - Parser.Lex(); - - // The optional operand must be "lsl #N" where N is non-negative. 
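tryParseAddSubImm above silently converts a plain immediate that is a multiple of 0x1000 and too large for 12 bits into the shifted form, so an input like "add x0, x1, #0x456000" is accepted as "#0x456, lsl #12". A standalone sketch of that rewrite; the struct and helper names are illustrative only.

#include <cstdint>
#include <cstdio>

struct AddSubImm { uint64_t Imm; unsigned Shift; };

// Mirrors the check above: values over 0xfff with a clear low 12 bits are
// re-encoded as (value >> 12) with an implicit "lsl #12".
static AddSubImm splitAddSubImm(uint64_t Val) {
  if (Val > 0xfff && (Val & 0xfff) == 0)
    return {Val >> 12, 12};
  return {Val, 0};
}

int main() {
  AddSubImm A = splitAddSubImm(0x456000);
  std::printf("#%#llx, lsl #%u\n", (unsigned long long)A.Imm, A.Shift); // #0x456, lsl #12
}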
- if (!Parser.getTok().is(AsmToken::Identifier) || - !Parser.getTok().getIdentifier().equals_lower("lsl")) { - Error(Parser.getTok().getLoc(), "only 'lsl #+N' valid after immediate"); - return MatchOperand_ParseFail; - } - - // Eat 'lsl' - Parser.Lex(); - - if (Parser.getTok().is(AsmToken::Hash)) { - Parser.Lex(); - } - - if (Parser.getTok().isNot(AsmToken::Integer)) { - Error(Parser.getTok().getLoc(), "only 'lsl #+N' valid after immediate"); - return MatchOperand_ParseFail; - } - - int64_t ShiftAmount = Parser.getTok().getIntVal(); - - if (ShiftAmount < 0) { - Error(Parser.getTok().getLoc(), "positive shift amount required"); - return MatchOperand_ParseFail; - } - Parser.Lex(); // Eat the number - - SMLoc E = Parser.getTok().getLoc(); - Operands.push_back(ARM64Operand::CreateShiftedImm(Imm, ShiftAmount, - S, E, getContext())); - return MatchOperand_Success; -} - -/// parseCondCodeString - Parse a Condition Code string. -ARM64CC::CondCode ARM64AsmParser::parseCondCodeString(StringRef Cond) { - ARM64CC::CondCode CC = StringSwitch(Cond.lower()) - .Case("eq", ARM64CC::EQ) - .Case("ne", ARM64CC::NE) - .Case("cs", ARM64CC::HS) - .Case("hs", ARM64CC::HS) - .Case("cc", ARM64CC::LO) - .Case("lo", ARM64CC::LO) - .Case("mi", ARM64CC::MI) - .Case("pl", ARM64CC::PL) - .Case("vs", ARM64CC::VS) - .Case("vc", ARM64CC::VC) - .Case("hi", ARM64CC::HI) - .Case("ls", ARM64CC::LS) - .Case("ge", ARM64CC::GE) - .Case("lt", ARM64CC::LT) - .Case("gt", ARM64CC::GT) - .Case("le", ARM64CC::LE) - .Case("al", ARM64CC::AL) - .Case("nv", ARM64CC::NV) - .Default(ARM64CC::Invalid); - return CC; -} - -/// parseCondCode - Parse a Condition Code operand. -bool ARM64AsmParser::parseCondCode(OperandVector &Operands, - bool invertCondCode) { - SMLoc S = getLoc(); - const AsmToken &Tok = Parser.getTok(); - assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier"); - - StringRef Cond = Tok.getString(); - ARM64CC::CondCode CC = parseCondCodeString(Cond); - if (CC == ARM64CC::Invalid) - return TokError("invalid condition code"); - Parser.Lex(); // Eat identifier token. - - if (invertCondCode) - CC = ARM64CC::getInvertedCondCode(ARM64CC::CondCode(CC)); - - Operands.push_back( - ARM64Operand::CreateCondCode(CC, S, getLoc(), getContext())); - return false; -} - -/// tryParseOptionalShift - Some operands take an optional shift argument. Parse -/// them if present. -ARM64AsmParser::OperandMatchResultTy -ARM64AsmParser::tryParseOptionalShiftExtend(OperandVector &Operands) { - const AsmToken &Tok = Parser.getTok(); - std::string LowerID = Tok.getString().lower(); - ARM64_AM::ShiftExtendType ShOp = - StringSwitch(LowerID) - .Case("lsl", ARM64_AM::LSL) - .Case("lsr", ARM64_AM::LSR) - .Case("asr", ARM64_AM::ASR) - .Case("ror", ARM64_AM::ROR) - .Case("msl", ARM64_AM::MSL) - .Case("uxtb", ARM64_AM::UXTB) - .Case("uxth", ARM64_AM::UXTH) - .Case("uxtw", ARM64_AM::UXTW) - .Case("uxtx", ARM64_AM::UXTX) - .Case("sxtb", ARM64_AM::SXTB) - .Case("sxth", ARM64_AM::SXTH) - .Case("sxtw", ARM64_AM::SXTW) - .Case("sxtx", ARM64_AM::SXTX) - .Default(ARM64_AM::InvalidShiftExtend); - - if (ShOp == ARM64_AM::InvalidShiftExtend) - return MatchOperand_NoMatch; - - SMLoc S = Tok.getLoc(); - Parser.Lex(); - - bool Hash = getLexer().is(AsmToken::Hash); - if (!Hash && getLexer().isNot(AsmToken::Integer)) { - if (ShOp == ARM64_AM::LSL || ShOp == ARM64_AM::LSR || - ShOp == ARM64_AM::ASR || ShOp == ARM64_AM::ROR || - ShOp == ARM64_AM::MSL) { - // We expect a number here. 
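parseCondCode above relies on AArch64 condition codes being arranged in complementary pairs, so inverting a condition (needed for the cset/cinc-style aliases later on) is a single bit flip. A sketch under that assumption, with AL/NV excluded as in the real helper:

  // EQ (0b0000) <-> NE (0b0001), HS <-> LO, GE <-> LT, and so on.
  enum SketchCondCode { EQ = 0x0, NE = 0x1, HS = 0x2, LO = 0x3, GE = 0xa, LT = 0xb };
  static SketchCondCode invertCond(SketchCondCode CC) {
    return SketchCondCode(unsigned(CC) ^ 0x1);  // flip the low bit
  }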
- TokError("expected #imm after shift specifier"); - return MatchOperand_ParseFail; - } - - // "extend" type operatoins don't need an immediate, #0 is implicit. - SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1); - Operands.push_back( - ARM64Operand::CreateShiftExtend(ShOp, 0, false, S, E, getContext())); - return MatchOperand_Success; - } - - if (Hash) - Parser.Lex(); // Eat the '#'. - - // Make sure we do actually have a number - if (!Parser.getTok().is(AsmToken::Integer)) { - Error(Parser.getTok().getLoc(), - "expected integer shift amount"); - return MatchOperand_ParseFail; - } - - const MCExpr *ImmVal; - if (getParser().parseExpression(ImmVal)) - return MatchOperand_ParseFail; - - const MCConstantExpr *MCE = dyn_cast(ImmVal); - if (!MCE) { - TokError("expected #imm after shift specifier"); - return MatchOperand_ParseFail; - } - - SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1); - Operands.push_back(ARM64Operand::CreateShiftExtend(ShOp, MCE->getValue(), - true, S, E, getContext())); - return MatchOperand_Success; -} - -/// parseSysAlias - The IC, DC, AT, and TLBI instructions are simple aliases for -/// the SYS instruction. Parse them specially so that we create a SYS MCInst. -bool ARM64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc, - OperandVector &Operands) { - if (Name.find('.') != StringRef::npos) - return TokError("invalid operand"); - - Mnemonic = Name; - Operands.push_back( - ARM64Operand::CreateToken("sys", false, NameLoc, getContext())); - - const AsmToken &Tok = Parser.getTok(); - StringRef Op = Tok.getString(); - SMLoc S = Tok.getLoc(); - - const MCExpr *Expr = nullptr; - -#define SYS_ALIAS(op1, Cn, Cm, op2) \ - do { \ - Expr = MCConstantExpr::Create(op1, getContext()); \ - Operands.push_back( \ - ARM64Operand::CreateImm(Expr, S, getLoc(), getContext())); \ - Operands.push_back( \ - ARM64Operand::CreateSysCR(Cn, S, getLoc(), getContext())); \ - Operands.push_back( \ - ARM64Operand::CreateSysCR(Cm, S, getLoc(), getContext())); \ - Expr = MCConstantExpr::Create(op2, getContext()); \ - Operands.push_back( \ - ARM64Operand::CreateImm(Expr, S, getLoc(), getContext())); \ - } while (0) - - if (Mnemonic == "ic") { - if (!Op.compare_lower("ialluis")) { - // SYS #0, C7, C1, #0 - SYS_ALIAS(0, 7, 1, 0); - } else if (!Op.compare_lower("iallu")) { - // SYS #0, C7, C5, #0 - SYS_ALIAS(0, 7, 5, 0); - } else if (!Op.compare_lower("ivau")) { - // SYS #3, C7, C5, #1 - SYS_ALIAS(3, 7, 5, 1); - } else { - return TokError("invalid operand for IC instruction"); - } - } else if (Mnemonic == "dc") { - if (!Op.compare_lower("zva")) { - // SYS #3, C7, C4, #1 - SYS_ALIAS(3, 7, 4, 1); - } else if (!Op.compare_lower("ivac")) { - // SYS #3, C7, C6, #1 - SYS_ALIAS(0, 7, 6, 1); - } else if (!Op.compare_lower("isw")) { - // SYS #0, C7, C6, #2 - SYS_ALIAS(0, 7, 6, 2); - } else if (!Op.compare_lower("cvac")) { - // SYS #3, C7, C10, #1 - SYS_ALIAS(3, 7, 10, 1); - } else if (!Op.compare_lower("csw")) { - // SYS #0, C7, C10, #2 - SYS_ALIAS(0, 7, 10, 2); - } else if (!Op.compare_lower("cvau")) { - // SYS #3, C7, C11, #1 - SYS_ALIAS(3, 7, 11, 1); - } else if (!Op.compare_lower("civac")) { - // SYS #3, C7, C14, #1 - SYS_ALIAS(3, 7, 14, 1); - } else if (!Op.compare_lower("cisw")) { - // SYS #0, C7, C14, #2 - SYS_ALIAS(0, 7, 14, 2); - } else { - return TokError("invalid operand for DC instruction"); - } - } else if (Mnemonic == "at") { - if (!Op.compare_lower("s1e1r")) { - // SYS #0, C7, C8, #0 - SYS_ALIAS(0, 7, 8, 0); - } else if (!Op.compare_lower("s1e2r")) { - // SYS #4, C7, C8, #0 - 
SYS_ALIAS(4, 7, 8, 0); - } else if (!Op.compare_lower("s1e3r")) { - // SYS #6, C7, C8, #0 - SYS_ALIAS(6, 7, 8, 0); - } else if (!Op.compare_lower("s1e1w")) { - // SYS #0, C7, C8, #1 - SYS_ALIAS(0, 7, 8, 1); - } else if (!Op.compare_lower("s1e2w")) { - // SYS #4, C7, C8, #1 - SYS_ALIAS(4, 7, 8, 1); - } else if (!Op.compare_lower("s1e3w")) { - // SYS #6, C7, C8, #1 - SYS_ALIAS(6, 7, 8, 1); - } else if (!Op.compare_lower("s1e0r")) { - // SYS #0, C7, C8, #3 - SYS_ALIAS(0, 7, 8, 2); - } else if (!Op.compare_lower("s1e0w")) { - // SYS #0, C7, C8, #3 - SYS_ALIAS(0, 7, 8, 3); - } else if (!Op.compare_lower("s12e1r")) { - // SYS #4, C7, C8, #4 - SYS_ALIAS(4, 7, 8, 4); - } else if (!Op.compare_lower("s12e1w")) { - // SYS #4, C7, C8, #5 - SYS_ALIAS(4, 7, 8, 5); - } else if (!Op.compare_lower("s12e0r")) { - // SYS #4, C7, C8, #6 - SYS_ALIAS(4, 7, 8, 6); - } else if (!Op.compare_lower("s12e0w")) { - // SYS #4, C7, C8, #7 - SYS_ALIAS(4, 7, 8, 7); - } else { - return TokError("invalid operand for AT instruction"); - } - } else if (Mnemonic == "tlbi") { - if (!Op.compare_lower("vmalle1is")) { - // SYS #0, C8, C3, #0 - SYS_ALIAS(0, 8, 3, 0); - } else if (!Op.compare_lower("alle2is")) { - // SYS #4, C8, C3, #0 - SYS_ALIAS(4, 8, 3, 0); - } else if (!Op.compare_lower("alle3is")) { - // SYS #6, C8, C3, #0 - SYS_ALIAS(6, 8, 3, 0); - } else if (!Op.compare_lower("vae1is")) { - // SYS #0, C8, C3, #1 - SYS_ALIAS(0, 8, 3, 1); - } else if (!Op.compare_lower("vae2is")) { - // SYS #4, C8, C3, #1 - SYS_ALIAS(4, 8, 3, 1); - } else if (!Op.compare_lower("vae3is")) { - // SYS #6, C8, C3, #1 - SYS_ALIAS(6, 8, 3, 1); - } else if (!Op.compare_lower("aside1is")) { - // SYS #0, C8, C3, #2 - SYS_ALIAS(0, 8, 3, 2); - } else if (!Op.compare_lower("vaae1is")) { - // SYS #0, C8, C3, #3 - SYS_ALIAS(0, 8, 3, 3); - } else if (!Op.compare_lower("alle1is")) { - // SYS #4, C8, C3, #4 - SYS_ALIAS(4, 8, 3, 4); - } else if (!Op.compare_lower("vale1is")) { - // SYS #0, C8, C3, #5 - SYS_ALIAS(0, 8, 3, 5); - } else if (!Op.compare_lower("vaale1is")) { - // SYS #0, C8, C3, #7 - SYS_ALIAS(0, 8, 3, 7); - } else if (!Op.compare_lower("vmalle1")) { - // SYS #0, C8, C7, #0 - SYS_ALIAS(0, 8, 7, 0); - } else if (!Op.compare_lower("alle2")) { - // SYS #4, C8, C7, #0 - SYS_ALIAS(4, 8, 7, 0); - } else if (!Op.compare_lower("vale2is")) { - // SYS #4, C8, C3, #5 - SYS_ALIAS(4, 8, 3, 5); - } else if (!Op.compare_lower("vale3is")) { - // SYS #6, C8, C3, #5 - SYS_ALIAS(6, 8, 3, 5); - } else if (!Op.compare_lower("alle3")) { - // SYS #6, C8, C7, #0 - SYS_ALIAS(6, 8, 7, 0); - } else if (!Op.compare_lower("vae1")) { - // SYS #0, C8, C7, #1 - SYS_ALIAS(0, 8, 7, 1); - } else if (!Op.compare_lower("vae2")) { - // SYS #4, C8, C7, #1 - SYS_ALIAS(4, 8, 7, 1); - } else if (!Op.compare_lower("vae3")) { - // SYS #6, C8, C7, #1 - SYS_ALIAS(6, 8, 7, 1); - } else if (!Op.compare_lower("aside1")) { - // SYS #0, C8, C7, #2 - SYS_ALIAS(0, 8, 7, 2); - } else if (!Op.compare_lower("vaae1")) { - // SYS #0, C8, C7, #3 - SYS_ALIAS(0, 8, 7, 3); - } else if (!Op.compare_lower("alle1")) { - // SYS #4, C8, C7, #4 - SYS_ALIAS(4, 8, 7, 4); - } else if (!Op.compare_lower("vale1")) { - // SYS #0, C8, C7, #5 - SYS_ALIAS(0, 8, 7, 5); - } else if (!Op.compare_lower("vale2")) { - // SYS #4, C8, C7, #5 - SYS_ALIAS(4, 8, 7, 5); - } else if (!Op.compare_lower("vale3")) { - // SYS #6, C8, C7, #5 - SYS_ALIAS(6, 8, 7, 5); - } else if (!Op.compare_lower("vaale1")) { - // SYS #0, C8, C7, #7 - SYS_ALIAS(0, 8, 7, 7); - } else if (!Op.compare_lower("ipas2e1")) { - // SYS #4, C8, C4, #1 - SYS_ALIAS(4, 8, 
4, 1); - } else if (!Op.compare_lower("ipas2le1")) { - // SYS #4, C8, C4, #5 - SYS_ALIAS(4, 8, 4, 5); - } else if (!Op.compare_lower("ipas2e1is")) { - // SYS #4, C8, C4, #1 - SYS_ALIAS(4, 8, 0, 1); - } else if (!Op.compare_lower("ipas2le1is")) { - // SYS #4, C8, C4, #5 - SYS_ALIAS(4, 8, 0, 5); - } else if (!Op.compare_lower("vmalls12e1")) { - // SYS #4, C8, C7, #6 - SYS_ALIAS(4, 8, 7, 6); - } else if (!Op.compare_lower("vmalls12e1is")) { - // SYS #4, C8, C3, #6 - SYS_ALIAS(4, 8, 3, 6); - } else { - return TokError("invalid operand for TLBI instruction"); - } - } - -#undef SYS_ALIAS - - Parser.Lex(); // Eat operand. - - bool ExpectRegister = (Op.lower().find("all") == StringRef::npos); - bool HasRegister = false; - - // Check for the optional register operand. - if (getLexer().is(AsmToken::Comma)) { - Parser.Lex(); // Eat comma. - - if (Tok.isNot(AsmToken::Identifier) || parseRegister(Operands)) - return TokError("expected register operand"); - - HasRegister = true; - } - - if (getLexer().isNot(AsmToken::EndOfStatement)) { - Parser.eatToEndOfStatement(); - return TokError("unexpected token in argument list"); - } - - if (ExpectRegister && !HasRegister) { - return TokError("specified " + Mnemonic + " op requires a register"); - } - else if (!ExpectRegister && HasRegister) { - return TokError("specified " + Mnemonic + " op does not use a register"); - } - - Parser.Lex(); // Consume the EndOfStatement - return false; -} - -ARM64AsmParser::OperandMatchResultTy -ARM64AsmParser::tryParseBarrierOperand(OperandVector &Operands) { - const AsmToken &Tok = Parser.getTok(); - - // Can be either a #imm style literal or an option name - bool Hash = Tok.is(AsmToken::Hash); - if (Hash || Tok.is(AsmToken::Integer)) { - // Immediate operand. - if (Hash) - Parser.Lex(); // Eat the '#' - const MCExpr *ImmVal; - SMLoc ExprLoc = getLoc(); - if (getParser().parseExpression(ImmVal)) - return MatchOperand_ParseFail; - const MCConstantExpr *MCE = dyn_cast(ImmVal); - if (!MCE) { - Error(ExprLoc, "immediate value expected for barrier operand"); - return MatchOperand_ParseFail; - } - if (MCE->getValue() < 0 || MCE->getValue() > 15) { - Error(ExprLoc, "barrier operand out of range"); - return MatchOperand_ParseFail; - } - Operands.push_back( - ARM64Operand::CreateBarrier(MCE->getValue(), ExprLoc, getContext())); - return MatchOperand_Success; - } - - if (Tok.isNot(AsmToken::Identifier)) { - TokError("invalid operand for instruction"); - return MatchOperand_ParseFail; - } - - bool Valid; - unsigned Opt = ARM64DB::DBarrierMapper().fromString(Tok.getString(), Valid); - if (!Valid) { - TokError("invalid barrier option name"); - return MatchOperand_ParseFail; - } - - // The only valid named option for ISB is 'sy' - if (Mnemonic == "isb" && Opt != ARM64DB::SY) { - TokError("'sy' or #imm operand expected"); - return MatchOperand_ParseFail; - } - - Operands.push_back(ARM64Operand::CreateBarrier(Opt, getLoc(), getContext())); - Parser.Lex(); // Consume the option - - return MatchOperand_Success; -} - -ARM64AsmParser::OperandMatchResultTy -ARM64AsmParser::tryParseSysReg(OperandVector &Operands) { - const AsmToken &Tok = Parser.getTok(); - - if (Tok.isNot(AsmToken::Identifier)) - return MatchOperand_NoMatch; - - Operands.push_back(ARM64Operand::CreateSysReg(Tok.getString(), getLoc(), - STI.getFeatureBits(), getContext())); - Parser.Lex(); // Eat identifier - - return MatchOperand_Success; -} - -/// tryParseVectorRegister - Parse a vector register operand. 
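The SYS_ALIAS table above spells out how IC, DC, AT and TLBI are fixed operand bundles for the generic "SYS #op1, Cn, Cm, #op2 [, Xt]" instruction; for example "ic ivau, x0" is "sys #3, c7, c5, #1, x0". A hypothetical lookup mirroring three of the IC entries (illustrative only, not the in-tree code):

  #include <cstring>
  struct SysOp { unsigned Op1, CRn, CRm, Op2; bool NeedsXt; };
  static bool lookupICAlias(const char *Name, SysOp &Out) {
    if (!std::strcmp(Name, "ialluis")) { Out = {0, 7, 1, 0, false}; return true; }
    if (!std::strcmp(Name, "iallu"))   { Out = {0, 7, 5, 0, false}; return true; }
    if (!std::strcmp(Name, "ivau"))    { Out = {3, 7, 5, 1, true};  return true; }
    return false;  // unknown IC operand
  }

As in the parser, only the operands whose name contains "all" omit the trailing register.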
-bool ARM64AsmParser::tryParseVectorRegister(OperandVector &Operands) { - if (Parser.getTok().isNot(AsmToken::Identifier)) - return true; - - SMLoc S = getLoc(); - // Check for a vector register specifier first. - StringRef Kind; - int64_t Reg = tryMatchVectorRegister(Kind, false); - if (Reg == -1) - return true; - Operands.push_back( - ARM64Operand::CreateReg(Reg, true, S, getLoc(), getContext())); - // If there was an explicit qualifier, that goes on as a literal text - // operand. - if (!Kind.empty()) - Operands.push_back(ARM64Operand::CreateToken(Kind, false, S, getContext())); - - // If there is an index specifier following the register, parse that too. - if (Parser.getTok().is(AsmToken::LBrac)) { - SMLoc SIdx = getLoc(); - Parser.Lex(); // Eat left bracket token. - - const MCExpr *ImmVal; - if (getParser().parseExpression(ImmVal)) - return false; - const MCConstantExpr *MCE = dyn_cast(ImmVal); - if (!MCE) { - TokError("immediate value expected for vector index"); - return false; - } - - SMLoc E = getLoc(); - if (Parser.getTok().isNot(AsmToken::RBrac)) { - Error(E, "']' expected"); - return false; - } - - Parser.Lex(); // Eat right bracket token. - - Operands.push_back(ARM64Operand::CreateVectorIndex(MCE->getValue(), SIdx, E, - getContext())); - } - - return false; -} - -/// parseRegister - Parse a non-vector register operand. -bool ARM64AsmParser::parseRegister(OperandVector &Operands) { - SMLoc S = getLoc(); - // Try for a vector register. - if (!tryParseVectorRegister(Operands)) - return false; - - // Try for a scalar register. - int64_t Reg = tryParseRegister(); - if (Reg == -1) - return true; - Operands.push_back( - ARM64Operand::CreateReg(Reg, false, S, getLoc(), getContext())); - - // A small number of instructions (FMOVXDhighr, for example) have "[1]" - // as a string token in the instruction itself. 
- if (getLexer().getKind() == AsmToken::LBrac) { - SMLoc LBracS = getLoc(); - Parser.Lex(); - const AsmToken &Tok = Parser.getTok(); - if (Tok.is(AsmToken::Integer)) { - SMLoc IntS = getLoc(); - int64_t Val = Tok.getIntVal(); - if (Val == 1) { - Parser.Lex(); - if (getLexer().getKind() == AsmToken::RBrac) { - SMLoc RBracS = getLoc(); - Parser.Lex(); - Operands.push_back( - ARM64Operand::CreateToken("[", false, LBracS, getContext())); - Operands.push_back( - ARM64Operand::CreateToken("1", false, IntS, getContext())); - Operands.push_back( - ARM64Operand::CreateToken("]", false, RBracS, getContext())); - return false; - } - } - } - } - - return false; -} - -bool ARM64AsmParser::parseSymbolicImmVal(const MCExpr *&ImmVal) { - bool HasELFModifier = false; - ARM64MCExpr::VariantKind RefKind; - - if (Parser.getTok().is(AsmToken::Colon)) { - Parser.Lex(); // Eat ':" - HasELFModifier = true; - - if (Parser.getTok().isNot(AsmToken::Identifier)) { - Error(Parser.getTok().getLoc(), - "expect relocation specifier in operand after ':'"); - return true; - } - - std::string LowerCase = Parser.getTok().getIdentifier().lower(); - RefKind = StringSwitch(LowerCase) - .Case("lo12", ARM64MCExpr::VK_LO12) - .Case("abs_g3", ARM64MCExpr::VK_ABS_G3) - .Case("abs_g2", ARM64MCExpr::VK_ABS_G2) - .Case("abs_g2_s", ARM64MCExpr::VK_ABS_G2_S) - .Case("abs_g2_nc", ARM64MCExpr::VK_ABS_G2_NC) - .Case("abs_g1", ARM64MCExpr::VK_ABS_G1) - .Case("abs_g1_s", ARM64MCExpr::VK_ABS_G1_S) - .Case("abs_g1_nc", ARM64MCExpr::VK_ABS_G1_NC) - .Case("abs_g0", ARM64MCExpr::VK_ABS_G0) - .Case("abs_g0_s", ARM64MCExpr::VK_ABS_G0_S) - .Case("abs_g0_nc", ARM64MCExpr::VK_ABS_G0_NC) - .Case("dtprel_g2", ARM64MCExpr::VK_DTPREL_G2) - .Case("dtprel_g1", ARM64MCExpr::VK_DTPREL_G1) - .Case("dtprel_g1_nc", ARM64MCExpr::VK_DTPREL_G1_NC) - .Case("dtprel_g0", ARM64MCExpr::VK_DTPREL_G0) - .Case("dtprel_g0_nc", ARM64MCExpr::VK_DTPREL_G0_NC) - .Case("dtprel_hi12", ARM64MCExpr::VK_DTPREL_HI12) - .Case("dtprel_lo12", ARM64MCExpr::VK_DTPREL_LO12) - .Case("dtprel_lo12_nc", ARM64MCExpr::VK_DTPREL_LO12_NC) - .Case("tprel_g2", ARM64MCExpr::VK_TPREL_G2) - .Case("tprel_g1", ARM64MCExpr::VK_TPREL_G1) - .Case("tprel_g1_nc", ARM64MCExpr::VK_TPREL_G1_NC) - .Case("tprel_g0", ARM64MCExpr::VK_TPREL_G0) - .Case("tprel_g0_nc", ARM64MCExpr::VK_TPREL_G0_NC) - .Case("tprel_hi12", ARM64MCExpr::VK_TPREL_HI12) - .Case("tprel_lo12", ARM64MCExpr::VK_TPREL_LO12) - .Case("tprel_lo12_nc", ARM64MCExpr::VK_TPREL_LO12_NC) - .Case("tlsdesc_lo12", ARM64MCExpr::VK_TLSDESC_LO12) - .Case("got", ARM64MCExpr::VK_GOT_PAGE) - .Case("got_lo12", ARM64MCExpr::VK_GOT_LO12) - .Case("gottprel", ARM64MCExpr::VK_GOTTPREL_PAGE) - .Case("gottprel_lo12", ARM64MCExpr::VK_GOTTPREL_LO12_NC) - .Case("gottprel_g1", ARM64MCExpr::VK_GOTTPREL_G1) - .Case("gottprel_g0_nc", ARM64MCExpr::VK_GOTTPREL_G0_NC) - .Case("tlsdesc", ARM64MCExpr::VK_TLSDESC_PAGE) - .Default(ARM64MCExpr::VK_INVALID); - - if (RefKind == ARM64MCExpr::VK_INVALID) { - Error(Parser.getTok().getLoc(), - "expect relocation specifier in operand after ':'"); - return true; - } - - Parser.Lex(); // Eat identifier - - if (Parser.getTok().isNot(AsmToken::Colon)) { - Error(Parser.getTok().getLoc(), "expect ':' after relocation specifier"); - return true; - } - Parser.Lex(); // Eat ':' - } - - if (getParser().parseExpression(ImmVal)) - return true; - - if (HasELFModifier) - ImmVal = ARM64MCExpr::Create(ImmVal, RefKind, getContext()); - - return false; -} - -/// parseVectorList - Parse a vector list operand for AdvSIMD instructions. 
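The :abs_g0: .. :abs_g3: modifiers recognised above select 16-bit slices of a 64-bit value, which is what a movz/movk sequence consumes. A one-line sketch of the slice selection (illustrative only):

  #include <cstdint>
  // Slices consumed by a movz/movk sequence, e.g.
  //   movz x0, #:abs_g3:sym
  //   movk x0, #:abs_g2_nc:sym
  //   movk x0, #:abs_g1_nc:sym
  //   movk x0, #:abs_g0_nc:sym
  static uint16_t absGChunk(uint64_t Value, unsigned G) {  // G in [0, 3]
    return uint16_t(Value >> (16 * G));
  }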
-bool ARM64AsmParser::parseVectorList(OperandVector &Operands) { - assert(Parser.getTok().is(AsmToken::LCurly) && "Token is not a Left Bracket"); - SMLoc S = getLoc(); - Parser.Lex(); // Eat left bracket token. - StringRef Kind; - int64_t FirstReg = tryMatchVectorRegister(Kind, true); - if (FirstReg == -1) - return true; - int64_t PrevReg = FirstReg; - unsigned Count = 1; - - if (Parser.getTok().is(AsmToken::Minus)) { - Parser.Lex(); // Eat the minus. - - SMLoc Loc = getLoc(); - StringRef NextKind; - int64_t Reg = tryMatchVectorRegister(NextKind, true); - if (Reg == -1) - return true; - // Any Kind suffices must match on all regs in the list. - if (Kind != NextKind) - return Error(Loc, "mismatched register size suffix"); - - unsigned Space = (PrevReg < Reg) ? (Reg - PrevReg) : (Reg + 32 - PrevReg); - - if (Space == 0 || Space > 3) { - return Error(Loc, "invalid number of vectors"); - } - - Count += Space; - } - else { - while (Parser.getTok().is(AsmToken::Comma)) { - Parser.Lex(); // Eat the comma token. - - SMLoc Loc = getLoc(); - StringRef NextKind; - int64_t Reg = tryMatchVectorRegister(NextKind, true); - if (Reg == -1) - return true; - // Any Kind suffices must match on all regs in the list. - if (Kind != NextKind) - return Error(Loc, "mismatched register size suffix"); - - // Registers must be incremental (with wraparound at 31) - if (getContext().getRegisterInfo()->getEncodingValue(Reg) != - (getContext().getRegisterInfo()->getEncodingValue(PrevReg) + 1) % 32) - return Error(Loc, "registers must be sequential"); - - PrevReg = Reg; - ++Count; - } - } - - if (Parser.getTok().isNot(AsmToken::RCurly)) - return Error(getLoc(), "'}' expected"); - Parser.Lex(); // Eat the '}' token. - - if (Count > 4) - return Error(S, "invalid number of vectors"); - - unsigned NumElements = 0; - char ElementKind = 0; - if (!Kind.empty()) - parseValidVectorKind(Kind, NumElements, ElementKind); - - Operands.push_back(ARM64Operand::CreateVectorList( - FirstReg, Count, NumElements, ElementKind, S, getLoc(), getContext())); - - // If there is an index specifier following the list, parse that too. - if (Parser.getTok().is(AsmToken::LBrac)) { - SMLoc SIdx = getLoc(); - Parser.Lex(); // Eat left bracket token. - - const MCExpr *ImmVal; - if (getParser().parseExpression(ImmVal)) - return false; - const MCConstantExpr *MCE = dyn_cast(ImmVal); - if (!MCE) { - TokError("immediate value expected for vector index"); - return false; - } - - SMLoc E = getLoc(); - if (Parser.getTok().isNot(AsmToken::RBrac)) { - Error(E, "']' expected"); - return false; - } - - Parser.Lex(); // Eat right bracket token. - - Operands.push_back(ARM64Operand::CreateVectorIndex(MCE->getValue(), SIdx, E, - getContext())); - } - return false; -} - -ARM64AsmParser::OperandMatchResultTy -ARM64AsmParser::tryParseGPR64sp0Operand(OperandVector &Operands) { - const AsmToken &Tok = Parser.getTok(); - if (!Tok.is(AsmToken::Identifier)) - return MatchOperand_NoMatch; - - unsigned RegNum = MatchRegisterName(Tok.getString().lower()); - - MCContext &Ctx = getContext(); - const MCRegisterInfo *RI = Ctx.getRegisterInfo(); - if (!RI->getRegClass(ARM64::GPR64spRegClassID).contains(RegNum)) - return MatchOperand_NoMatch; - - SMLoc S = getLoc(); - Parser.Lex(); // Eat register - - if (Parser.getTok().isNot(AsmToken::Comma)) { - Operands.push_back(ARM64Operand::CreateReg(RegNum, false, S, getLoc(), Ctx)); - return MatchOperand_Success; - } - Parser.Lex(); // Eat comma. 
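Vector list ranges above may wrap past V31, which is why the register count is computed modulo 32: "{v30.4s-v1.4s}" still names four consecutive registers (V30, V31, V0, V1), and anything outside two to four registers is rejected. A sketch of that count over register encodings (hypothetical helper):

  // Counts registers in an inclusive range of encodings 0..31, wrapping at 31;
  // the parser rejects any result outside [2, 4].
  static unsigned vectorRangeCount(unsigned First, unsigned Last) {
    return (First < Last ? Last - First : Last + 32 - First) + 1;
  }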
- - if (Parser.getTok().is(AsmToken::Hash)) - Parser.Lex(); // Eat hash - - if (Parser.getTok().isNot(AsmToken::Integer)) { - Error(getLoc(), "index must be absent or #0"); - return MatchOperand_ParseFail; - } - - const MCExpr *ImmVal; - if (Parser.parseExpression(ImmVal) || !isa(ImmVal) || - cast(ImmVal)->getValue() != 0) { - Error(getLoc(), "index must be absent or #0"); - return MatchOperand_ParseFail; - } - - Operands.push_back(ARM64Operand::CreateReg(RegNum, false, S, getLoc(), Ctx)); - return MatchOperand_Success; -} - -/// parseOperand - Parse a arm instruction operand. For now this parses the -/// operand regardless of the mnemonic. -bool ARM64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode, - bool invertCondCode) { - // Check if the current operand has a custom associated parser, if so, try to - // custom parse the operand, or fallback to the general approach. - OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic); - if (ResTy == MatchOperand_Success) - return false; - // If there wasn't a custom match, try the generic matcher below. Otherwise, - // there was a match, but an error occurred, in which case, just return that - // the operand parsing failed. - if (ResTy == MatchOperand_ParseFail) - return true; - - // Nothing custom, so do general case parsing. - SMLoc S, E; - switch (getLexer().getKind()) { - default: { - SMLoc S = getLoc(); - const MCExpr *Expr; - if (parseSymbolicImmVal(Expr)) - return Error(S, "invalid operand"); - - SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1); - Operands.push_back(ARM64Operand::CreateImm(Expr, S, E, getContext())); - return false; - } - case AsmToken::LBrac: { - SMLoc Loc = Parser.getTok().getLoc(); - Operands.push_back(ARM64Operand::CreateToken("[", false, Loc, - getContext())); - Parser.Lex(); // Eat '[' - - // There's no comma after a '[', so we can parse the next operand - // immediately. - return parseOperand(Operands, false, false); - } - case AsmToken::LCurly: - return parseVectorList(Operands); - case AsmToken::Identifier: { - // If we're expecting a Condition Code operand, then just parse that. - if (isCondCode) - return parseCondCode(Operands, invertCondCode); - - // If it's a register name, parse it. - if (!parseRegister(Operands)) - return false; - - // This could be an optional "shift" or "extend" operand. - OperandMatchResultTy GotShift = tryParseOptionalShiftExtend(Operands); - // We can only continue if no tokens were eaten. - if (GotShift != MatchOperand_NoMatch) - return GotShift; - - // This was not a register so parse other operands that start with an - // identifier (like labels) as expressions and create them as immediates. - const MCExpr *IdVal; - S = getLoc(); - if (getParser().parseExpression(IdVal)) - return true; - - E = SMLoc::getFromPointer(getLoc().getPointer() - 1); - Operands.push_back(ARM64Operand::CreateImm(IdVal, S, E, getContext())); - return false; - } - case AsmToken::Integer: - case AsmToken::Real: - case AsmToken::Hash: { - // #42 -> immediate. - S = getLoc(); - if (getLexer().is(AsmToken::Hash)) - Parser.Lex(); - - // Parse a negative sign - bool isNegative = false; - if (Parser.getTok().is(AsmToken::Minus)) { - isNegative = true; - // We need to consume this token only when we have a Real, otherwise - // we let parseSymbolicImmVal take care of it - if (Parser.getLexer().peekTok().is(AsmToken::Real)) - Parser.Lex(); - } - - // The only Real that should come through here is a literal #0.0 for - // the fcmp[e] r, #0.0 instructions. 
They expect raw token operands, - // so convert the value. - const AsmToken &Tok = Parser.getTok(); - if (Tok.is(AsmToken::Real)) { - APFloat RealVal(APFloat::IEEEdouble, Tok.getString()); - uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue(); - if (Mnemonic != "fcmp" && Mnemonic != "fcmpe" && Mnemonic != "fcmeq" && - Mnemonic != "fcmge" && Mnemonic != "fcmgt" && Mnemonic != "fcmle" && - Mnemonic != "fcmlt") - return TokError("unexpected floating point literal"); - else if (IntVal != 0 || isNegative) - return TokError("expected floating-point constant #0.0"); - Parser.Lex(); // Eat the token. - - Operands.push_back( - ARM64Operand::CreateToken("#0", false, S, getContext())); - Operands.push_back( - ARM64Operand::CreateToken(".0", false, S, getContext())); - return false; - } - - const MCExpr *ImmVal; - if (parseSymbolicImmVal(ImmVal)) - return true; - - E = SMLoc::getFromPointer(getLoc().getPointer() - 1); - Operands.push_back(ARM64Operand::CreateImm(ImmVal, S, E, getContext())); - return false; - } - } -} - -/// ParseInstruction - Parse an ARM64 instruction mnemonic followed by its -/// operands. -bool ARM64AsmParser::ParseInstruction(ParseInstructionInfo &Info, - StringRef Name, SMLoc NameLoc, - OperandVector &Operands) { - Name = StringSwitch(Name.lower()) - .Case("beq", "b.eq") - .Case("bne", "b.ne") - .Case("bhs", "b.hs") - .Case("bcs", "b.cs") - .Case("blo", "b.lo") - .Case("bcc", "b.cc") - .Case("bmi", "b.mi") - .Case("bpl", "b.pl") - .Case("bvs", "b.vs") - .Case("bvc", "b.vc") - .Case("bhi", "b.hi") - .Case("bls", "b.ls") - .Case("bge", "b.ge") - .Case("blt", "b.lt") - .Case("bgt", "b.gt") - .Case("ble", "b.le") - .Case("bal", "b.al") - .Case("bnv", "b.nv") - .Default(Name); - - // Create the leading tokens for the mnemonic, split by '.' characters. - size_t Start = 0, Next = Name.find('.'); - StringRef Head = Name.slice(Start, Next); - - // IC, DC, AT, and TLBI instructions are aliases for the SYS instruction. - if (Head == "ic" || Head == "dc" || Head == "at" || Head == "tlbi") { - bool IsError = parseSysAlias(Head, NameLoc, Operands); - if (IsError && getLexer().isNot(AsmToken::EndOfStatement)) - Parser.eatToEndOfStatement(); - return IsError; - } - - Operands.push_back( - ARM64Operand::CreateToken(Head, false, NameLoc, getContext())); - Mnemonic = Head; - - // Handle condition codes for a branch mnemonic - if (Head == "b" && Next != StringRef::npos) { - Start = Next; - Next = Name.find('.', Start + 1); - Head = Name.slice(Start + 1, Next); - - SMLoc SuffixLoc = SMLoc::getFromPointer(NameLoc.getPointer() + - (Head.data() - Name.data())); - ARM64CC::CondCode CC = parseCondCodeString(Head); - if (CC == ARM64CC::Invalid) - return Error(SuffixLoc, "invalid condition code"); - Operands.push_back( - ARM64Operand::CreateToken(".", true, SuffixLoc, getContext())); - Operands.push_back( - ARM64Operand::CreateCondCode(CC, NameLoc, NameLoc, getContext())); - } - - // Add the remaining tokens in the mnemonic. - while (Next != StringRef::npos) { - Start = Next; - Next = Name.find('.', Start + 1); - Head = Name.slice(Start, Next); - SMLoc SuffixLoc = SMLoc::getFromPointer(NameLoc.getPointer() + - (Head.data() - Name.data()) + 1); - Operands.push_back( - ARM64Operand::CreateToken(Head, true, SuffixLoc, getContext())); - } - - // Conditional compare instructions have a Condition Code operand, which needs - // to be parsed and an immediate operand created. 
- bool condCodeFourthOperand = - (Head == "ccmp" || Head == "ccmn" || Head == "fccmp" || - Head == "fccmpe" || Head == "fcsel" || Head == "csel" || - Head == "csinc" || Head == "csinv" || Head == "csneg"); - - // These instructions are aliases to some of the conditional select - // instructions. However, the condition code is inverted in the aliased - // instruction. - // - // FIXME: Is this the correct way to handle these? Or should the parser - // generate the aliased instructions directly? - bool condCodeSecondOperand = (Head == "cset" || Head == "csetm"); - bool condCodeThirdOperand = - (Head == "cinc" || Head == "cinv" || Head == "cneg"); - - // Read the remaining operands. - if (getLexer().isNot(AsmToken::EndOfStatement)) { - // Read the first operand. - if (parseOperand(Operands, false, false)) { - Parser.eatToEndOfStatement(); - return true; - } - - unsigned N = 2; - while (getLexer().is(AsmToken::Comma)) { - Parser.Lex(); // Eat the comma. - - // Parse and remember the operand. - if (parseOperand(Operands, (N == 4 && condCodeFourthOperand) || - (N == 3 && condCodeThirdOperand) || - (N == 2 && condCodeSecondOperand), - condCodeSecondOperand || condCodeThirdOperand)) { - Parser.eatToEndOfStatement(); - return true; - } - - // After successfully parsing some operands there are two special cases to - // consider (i.e. notional operands not separated by commas). Both are due - // to memory specifiers: - // + An RBrac will end an address for load/store/prefetch - // + An '!' will indicate a pre-indexed operation. - // - // It's someone else's responsibility to make sure these tokens are sane - // in the given context! - if (Parser.getTok().is(AsmToken::RBrac)) { - SMLoc Loc = Parser.getTok().getLoc(); - Operands.push_back(ARM64Operand::CreateToken("]", false, Loc, - getContext())); - Parser.Lex(); - } - - if (Parser.getTok().is(AsmToken::Exclaim)) { - SMLoc Loc = Parser.getTok().getLoc(); - Operands.push_back(ARM64Operand::CreateToken("!", false, Loc, - getContext())); - Parser.Lex(); - } - - ++N; - } - } - - if (getLexer().isNot(AsmToken::EndOfStatement)) { - SMLoc Loc = Parser.getTok().getLoc(); - Parser.eatToEndOfStatement(); - return Error(Loc, "unexpected token in argument list"); - } - - Parser.Lex(); // Consume the EndOfStatement - return false; -} - -// FIXME: This entire function is a giant hack to provide us with decent -// operand range validation/diagnostics until TableGen/MC can be extended -// to support autogeneration of this kind of validation. -bool ARM64AsmParser::validateInstruction(MCInst &Inst, - SmallVectorImpl &Loc) { - const MCRegisterInfo *RI = getContext().getRegisterInfo(); - // Check for indexed addressing modes w/ the base register being the - // same as a destination/source register or pair load where - // the Rt == Rt2. All of those are undefined behaviour. 
- switch (Inst.getOpcode()) { - case ARM64::LDPSWpre: - case ARM64::LDPWpost: - case ARM64::LDPWpre: - case ARM64::LDPXpost: - case ARM64::LDPXpre: { - unsigned Rt = Inst.getOperand(1).getReg(); - unsigned Rt2 = Inst.getOperand(2).getReg(); - unsigned Rn = Inst.getOperand(3).getReg(); - if (RI->isSubRegisterEq(Rn, Rt)) - return Error(Loc[0], "unpredictable LDP instruction, writeback base " - "is also a destination"); - if (RI->isSubRegisterEq(Rn, Rt2)) - return Error(Loc[1], "unpredictable LDP instruction, writeback base " - "is also a destination"); - // FALLTHROUGH - } - case ARM64::LDPDi: - case ARM64::LDPQi: - case ARM64::LDPSi: - case ARM64::LDPSWi: - case ARM64::LDPWi: - case ARM64::LDPXi: { - unsigned Rt = Inst.getOperand(0).getReg(); - unsigned Rt2 = Inst.getOperand(1).getReg(); - if (Rt == Rt2) - return Error(Loc[1], "unpredictable LDP instruction, Rt2==Rt"); - break; - } - case ARM64::LDPDpost: - case ARM64::LDPDpre: - case ARM64::LDPQpost: - case ARM64::LDPQpre: - case ARM64::LDPSpost: - case ARM64::LDPSpre: - case ARM64::LDPSWpost: { - unsigned Rt = Inst.getOperand(1).getReg(); - unsigned Rt2 = Inst.getOperand(2).getReg(); - if (Rt == Rt2) - return Error(Loc[1], "unpredictable LDP instruction, Rt2==Rt"); - break; - } - case ARM64::STPDpost: - case ARM64::STPDpre: - case ARM64::STPQpost: - case ARM64::STPQpre: - case ARM64::STPSpost: - case ARM64::STPSpre: - case ARM64::STPWpost: - case ARM64::STPWpre: - case ARM64::STPXpost: - case ARM64::STPXpre: { - unsigned Rt = Inst.getOperand(1).getReg(); - unsigned Rt2 = Inst.getOperand(2).getReg(); - unsigned Rn = Inst.getOperand(3).getReg(); - if (RI->isSubRegisterEq(Rn, Rt)) - return Error(Loc[0], "unpredictable STP instruction, writeback base " - "is also a source"); - if (RI->isSubRegisterEq(Rn, Rt2)) - return Error(Loc[1], "unpredictable STP instruction, writeback base " - "is also a source"); - break; - } - case ARM64::LDRBBpre: - case ARM64::LDRBpre: - case ARM64::LDRHHpre: - case ARM64::LDRHpre: - case ARM64::LDRSBWpre: - case ARM64::LDRSBXpre: - case ARM64::LDRSHWpre: - case ARM64::LDRSHXpre: - case ARM64::LDRSWpre: - case ARM64::LDRWpre: - case ARM64::LDRXpre: - case ARM64::LDRBBpost: - case ARM64::LDRBpost: - case ARM64::LDRHHpost: - case ARM64::LDRHpost: - case ARM64::LDRSBWpost: - case ARM64::LDRSBXpost: - case ARM64::LDRSHWpost: - case ARM64::LDRSHXpost: - case ARM64::LDRSWpost: - case ARM64::LDRWpost: - case ARM64::LDRXpost: { - unsigned Rt = Inst.getOperand(1).getReg(); - unsigned Rn = Inst.getOperand(2).getReg(); - if (RI->isSubRegisterEq(Rn, Rt)) - return Error(Loc[0], "unpredictable LDR instruction, writeback base " - "is also a source"); - break; - } - case ARM64::STRBBpost: - case ARM64::STRBpost: - case ARM64::STRHHpost: - case ARM64::STRHpost: - case ARM64::STRWpost: - case ARM64::STRXpost: - case ARM64::STRBBpre: - case ARM64::STRBpre: - case ARM64::STRHHpre: - case ARM64::STRHpre: - case ARM64::STRWpre: - case ARM64::STRXpre: { - unsigned Rt = Inst.getOperand(1).getReg(); - unsigned Rn = Inst.getOperand(2).getReg(); - if (RI->isSubRegisterEq(Rn, Rt)) - return Error(Loc[0], "unpredictable STR instruction, writeback base " - "is also a source"); - break; - } - } - - // Now check immediate ranges. Separate from the above as there is overlap - // in the instructions being checked and this keeps the nested conditionals - // to a minimum. 
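The writeback checks above reject the architecturally unpredictable forms, e.g. "ldp x0, x1, [x0], #16" (base is also a destination) or "ldp x2, x2, [x3]" (Rt == Rt2). A simplified predicate for the paired-load case, ignoring the sub-register aliasing that the real code handles through isSubRegisterEq (sketch only):

  static bool isUnpredictableLDPWriteback(unsigned Rn, unsigned Rt, unsigned Rt2) {
    return Rn == Rt || Rn == Rt2 || Rt == Rt2;
  }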
- switch (Inst.getOpcode()) { - case ARM64::ADDSWri: - case ARM64::ADDSXri: - case ARM64::ADDWri: - case ARM64::ADDXri: - case ARM64::SUBSWri: - case ARM64::SUBSXri: - case ARM64::SUBWri: - case ARM64::SUBXri: { - // Annoyingly we can't do this in the isAddSubImm predicate, so there is - // some slight duplication here. - if (Inst.getOperand(2).isExpr()) { - const MCExpr *Expr = Inst.getOperand(2).getExpr(); - ARM64MCExpr::VariantKind ELFRefKind; - MCSymbolRefExpr::VariantKind DarwinRefKind; - int64_t Addend; - if (!classifySymbolRef(Expr, ELFRefKind, DarwinRefKind, Addend)) { - return Error(Loc[2], "invalid immediate expression"); - } - - // Only allow these with ADDXri. - if ((DarwinRefKind == MCSymbolRefExpr::VK_PAGEOFF || - DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGEOFF) && - Inst.getOpcode() == ARM64::ADDXri) - return false; - - // Only allow these with ADDXri/ADDWri - if ((ELFRefKind == ARM64MCExpr::VK_LO12 || - ELFRefKind == ARM64MCExpr::VK_DTPREL_HI12 || - ELFRefKind == ARM64MCExpr::VK_DTPREL_LO12 || - ELFRefKind == ARM64MCExpr::VK_DTPREL_LO12_NC || - ELFRefKind == ARM64MCExpr::VK_TPREL_HI12 || - ELFRefKind == ARM64MCExpr::VK_TPREL_LO12 || - ELFRefKind == ARM64MCExpr::VK_TPREL_LO12_NC || - ELFRefKind == ARM64MCExpr::VK_TLSDESC_LO12) && - (Inst.getOpcode() == ARM64::ADDXri || - Inst.getOpcode() == ARM64::ADDWri)) - return false; - - // Don't allow expressions in the immediate field otherwise - return Error(Loc[2], "invalid immediate expression"); - } - return false; - } - default: - return false; - } -} - -bool ARM64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode) { - switch (ErrCode) { - case Match_MissingFeature: - return Error(Loc, - "instruction requires a CPU feature not currently enabled"); - case Match_InvalidOperand: - return Error(Loc, "invalid operand for instruction"); - case Match_InvalidSuffix: - return Error(Loc, "invalid type suffix for instruction"); - case Match_InvalidCondCode: - return Error(Loc, "expected AArch64 condition code"); - case Match_AddSubRegExtendSmall: - return Error(Loc, - "expected '[su]xt[bhw]' or 'lsl' with optional integer in range [0, 4]"); - case Match_AddSubRegExtendLarge: - return Error(Loc, - "expected 'sxtx' 'uxtx' or 'lsl' with optional integer in range [0, 4]"); - case Match_AddSubSecondSource: - return Error(Loc, - "expected compatible register, symbol or integer in range [0, 4095]"); - case Match_LogicalSecondSource: - return Error(Loc, "expected compatible register or logical immediate"); - case Match_InvalidMovImm32Shift: - return Error(Loc, "expected 'lsl' with optional integer 0 or 16"); - case Match_InvalidMovImm64Shift: - return Error(Loc, "expected 'lsl' with optional integer 0, 16, 32 or 48"); - case Match_AddSubRegShift32: - return Error(Loc, - "expected 'lsl', 'lsr' or 'asr' with optional integer in range [0, 31]"); - case Match_AddSubRegShift64: - return Error(Loc, - "expected 'lsl', 'lsr' or 'asr' with optional integer in range [0, 63]"); - case Match_InvalidFPImm: - return Error(Loc, - "expected compatible register or floating-point constant"); - case Match_InvalidMemoryIndexedSImm9: - return Error(Loc, "index must be an integer in range [-256, 255]."); - case Match_InvalidMemoryIndexed4SImm7: - return Error(Loc, "index must be a multiple of 4 in range [-256, 252]."); - case Match_InvalidMemoryIndexed8SImm7: - return Error(Loc, "index must be a multiple of 8 in range [-512, 504]."); - case Match_InvalidMemoryIndexed16SImm7: - return Error(Loc, "index must be a multiple of 16 in range [-1024, 1008]."); - case 
Match_InvalidMemoryWExtend8: - return Error(Loc, - "expected 'uxtw' or 'sxtw' with optional shift of #0"); - case Match_InvalidMemoryWExtend16: - return Error(Loc, - "expected 'uxtw' or 'sxtw' with optional shift of #0 or #1"); - case Match_InvalidMemoryWExtend32: - return Error(Loc, - "expected 'uxtw' or 'sxtw' with optional shift of #0 or #2"); - case Match_InvalidMemoryWExtend64: - return Error(Loc, - "expected 'uxtw' or 'sxtw' with optional shift of #0 or #3"); - case Match_InvalidMemoryWExtend128: - return Error(Loc, - "expected 'uxtw' or 'sxtw' with optional shift of #0 or #4"); - case Match_InvalidMemoryXExtend8: - return Error(Loc, - "expected 'lsl' or 'sxtx' with optional shift of #0"); - case Match_InvalidMemoryXExtend16: - return Error(Loc, - "expected 'lsl' or 'sxtx' with optional shift of #0 or #1"); - case Match_InvalidMemoryXExtend32: - return Error(Loc, - "expected 'lsl' or 'sxtx' with optional shift of #0 or #2"); - case Match_InvalidMemoryXExtend64: - return Error(Loc, - "expected 'lsl' or 'sxtx' with optional shift of #0 or #3"); - case Match_InvalidMemoryXExtend128: - return Error(Loc, - "expected 'lsl' or 'sxtx' with optional shift of #0 or #4"); - case Match_InvalidMemoryIndexed1: - return Error(Loc, "index must be an integer in range [0, 4095]."); - case Match_InvalidMemoryIndexed2: - return Error(Loc, "index must be a multiple of 2 in range [0, 8190]."); - case Match_InvalidMemoryIndexed4: - return Error(Loc, "index must be a multiple of 4 in range [0, 16380]."); - case Match_InvalidMemoryIndexed8: - return Error(Loc, "index must be a multiple of 8 in range [0, 32760]."); - case Match_InvalidMemoryIndexed16: - return Error(Loc, "index must be a multiple of 16 in range [0, 65520]."); - case Match_InvalidImm0_7: - return Error(Loc, "immediate must be an integer in range [0, 7]."); - case Match_InvalidImm0_15: - return Error(Loc, "immediate must be an integer in range [0, 15]."); - case Match_InvalidImm0_31: - return Error(Loc, "immediate must be an integer in range [0, 31]."); - case Match_InvalidImm0_63: - return Error(Loc, "immediate must be an integer in range [0, 63]."); - case Match_InvalidImm0_127: - return Error(Loc, "immediate must be an integer in range [0, 127]."); - case Match_InvalidImm0_65535: - return Error(Loc, "immediate must be an integer in range [0, 65535]."); - case Match_InvalidImm1_8: - return Error(Loc, "immediate must be an integer in range [1, 8]."); - case Match_InvalidImm1_16: - return Error(Loc, "immediate must be an integer in range [1, 16]."); - case Match_InvalidImm1_32: - return Error(Loc, "immediate must be an integer in range [1, 32]."); - case Match_InvalidImm1_64: - return Error(Loc, "immediate must be an integer in range [1, 64]."); - case Match_InvalidIndex1: - return Error(Loc, "expected lane specifier '[1]'"); - case Match_InvalidIndexB: - return Error(Loc, "vector lane must be an integer in range [0, 15]."); - case Match_InvalidIndexH: - return Error(Loc, "vector lane must be an integer in range [0, 7]."); - case Match_InvalidIndexS: - return Error(Loc, "vector lane must be an integer in range [0, 3]."); - case Match_InvalidIndexD: - return Error(Loc, "vector lane must be an integer in range [0, 1]."); - case Match_InvalidLabel: - return Error(Loc, "expected label or encodable integer pc offset"); - case Match_MRS: - return Error(Loc, "expected readable system register"); - case Match_MSR: - return Error(Loc, "expected writable system register or pstate"); - case Match_MnemonicFail: - return Error(Loc, "unrecognized instruction 
mnemonic"); - default: - assert(0 && "unexpected error code!"); - return Error(Loc, "invalid instruction format"); - } -} - -static const char *getSubtargetFeatureName(unsigned Val); - -bool ARM64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, - OperandVector &Operands, - MCStreamer &Out, - unsigned &ErrorInfo, - bool MatchingInlineAsm) { - assert(!Operands.empty() && "Unexpect empty operand list!"); - ARM64Operand *Op = static_cast(Operands[0]); - assert(Op->isToken() && "Leading operand should always be a mnemonic!"); - - StringRef Tok = Op->getToken(); - unsigned NumOperands = Operands.size(); - - if (NumOperands == 4 && Tok == "lsl") { - ARM64Operand *Op2 = static_cast(Operands[2]); - ARM64Operand *Op3 = static_cast(Operands[3]); - if (Op2->isReg() && Op3->isImm()) { - const MCConstantExpr *Op3CE = dyn_cast(Op3->getImm()); - if (Op3CE) { - uint64_t Op3Val = Op3CE->getValue(); - uint64_t NewOp3Val = 0; - uint64_t NewOp4Val = 0; - if (ARM64MCRegisterClasses[ARM64::GPR32allRegClassID].contains( - Op2->getReg())) { - NewOp3Val = (32 - Op3Val) & 0x1f; - NewOp4Val = 31 - Op3Val; - } else { - NewOp3Val = (64 - Op3Val) & 0x3f; - NewOp4Val = 63 - Op3Val; - } - - const MCExpr *NewOp3 = MCConstantExpr::Create(NewOp3Val, getContext()); - const MCExpr *NewOp4 = MCConstantExpr::Create(NewOp4Val, getContext()); - - Operands[0] = ARM64Operand::CreateToken( - "ubfm", false, Op->getStartLoc(), getContext()); - Operands[3] = ARM64Operand::CreateImm(NewOp3, Op3->getStartLoc(), - Op3->getEndLoc(), getContext()); - Operands.push_back(ARM64Operand::CreateImm( - NewOp4, Op3->getStartLoc(), Op3->getEndLoc(), getContext())); - delete Op3; - delete Op; - } - } - } else if (NumOperands == 5) { - // FIXME: Horrible hack to handle the BFI -> BFM, SBFIZ->SBFM, and - // UBFIZ -> UBFM aliases. 
- if (Tok == "bfi" || Tok == "sbfiz" || Tok == "ubfiz") { - ARM64Operand *Op1 = static_cast(Operands[1]); - ARM64Operand *Op3 = static_cast(Operands[3]); - ARM64Operand *Op4 = static_cast(Operands[4]); - - if (Op1->isReg() && Op3->isImm() && Op4->isImm()) { - const MCConstantExpr *Op3CE = dyn_cast(Op3->getImm()); - const MCConstantExpr *Op4CE = dyn_cast(Op4->getImm()); - - if (Op3CE && Op4CE) { - uint64_t Op3Val = Op3CE->getValue(); - uint64_t Op4Val = Op4CE->getValue(); - - uint64_t RegWidth = 0; - if (ARM64MCRegisterClasses[ARM64::GPR64allRegClassID].contains( - Op1->getReg())) - RegWidth = 64; - else - RegWidth = 32; - - if (Op3Val >= RegWidth) - return Error(Op3->getStartLoc(), - "expected integer in range [0, 31]"); - if (Op4Val < 1 || Op4Val > RegWidth) - return Error(Op4->getStartLoc(), - "expected integer in range [1, 32]"); - - uint64_t NewOp3Val = 0; - if (ARM64MCRegisterClasses[ARM64::GPR32allRegClassID].contains( - Op1->getReg())) - NewOp3Val = (32 - Op3Val) & 0x1f; - else - NewOp3Val = (64 - Op3Val) & 0x3f; - - uint64_t NewOp4Val = Op4Val - 1; - - if (NewOp3Val != 0 && NewOp4Val >= NewOp3Val) - return Error(Op4->getStartLoc(), - "requested insert overflows register"); - - const MCExpr *NewOp3 = - MCConstantExpr::Create(NewOp3Val, getContext()); - const MCExpr *NewOp4 = - MCConstantExpr::Create(NewOp4Val, getContext()); - Operands[3] = ARM64Operand::CreateImm(NewOp3, Op3->getStartLoc(), - Op3->getEndLoc(), getContext()); - Operands[4] = ARM64Operand::CreateImm(NewOp4, Op4->getStartLoc(), - Op4->getEndLoc(), getContext()); - if (Tok == "bfi") - Operands[0] = ARM64Operand::CreateToken( - "bfm", false, Op->getStartLoc(), getContext()); - else if (Tok == "sbfiz") - Operands[0] = ARM64Operand::CreateToken( - "sbfm", false, Op->getStartLoc(), getContext()); - else if (Tok == "ubfiz") - Operands[0] = ARM64Operand::CreateToken( - "ubfm", false, Op->getStartLoc(), getContext()); - else - llvm_unreachable("No valid mnemonic for alias?"); - - delete Op; - delete Op3; - delete Op4; - } - } - - // FIXME: Horrible hack to handle the BFXIL->BFM, SBFX->SBFM, and - // UBFX -> UBFM aliases. 
- } else if (NumOperands == 5 && - (Tok == "bfxil" || Tok == "sbfx" || Tok == "ubfx")) { - ARM64Operand *Op1 = static_cast(Operands[1]); - ARM64Operand *Op3 = static_cast(Operands[3]); - ARM64Operand *Op4 = static_cast(Operands[4]); - - if (Op1->isReg() && Op3->isImm() && Op4->isImm()) { - const MCConstantExpr *Op3CE = dyn_cast(Op3->getImm()); - const MCConstantExpr *Op4CE = dyn_cast(Op4->getImm()); - - if (Op3CE && Op4CE) { - uint64_t Op3Val = Op3CE->getValue(); - uint64_t Op4Val = Op4CE->getValue(); - - uint64_t RegWidth = 0; - if (ARM64MCRegisterClasses[ARM64::GPR64allRegClassID].contains( - Op1->getReg())) - RegWidth = 64; - else - RegWidth = 32; - - if (Op3Val >= RegWidth) - return Error(Op3->getStartLoc(), - "expected integer in range [0, 31]"); - if (Op4Val < 1 || Op4Val > RegWidth) - return Error(Op4->getStartLoc(), - "expected integer in range [1, 32]"); - - uint64_t NewOp4Val = Op3Val + Op4Val - 1; - - if (NewOp4Val >= RegWidth || NewOp4Val < Op3Val) - return Error(Op4->getStartLoc(), - "requested extract overflows register"); - - const MCExpr *NewOp4 = - MCConstantExpr::Create(NewOp4Val, getContext()); - Operands[4] = ARM64Operand::CreateImm( - NewOp4, Op4->getStartLoc(), Op4->getEndLoc(), getContext()); - if (Tok == "bfxil") - Operands[0] = ARM64Operand::CreateToken( - "bfm", false, Op->getStartLoc(), getContext()); - else if (Tok == "sbfx") - Operands[0] = ARM64Operand::CreateToken( - "sbfm", false, Op->getStartLoc(), getContext()); - else if (Tok == "ubfx") - Operands[0] = ARM64Operand::CreateToken( - "ubfm", false, Op->getStartLoc(), getContext()); - else - llvm_unreachable("No valid mnemonic for alias?"); - - delete Op; - delete Op4; - } - } - } - } - // FIXME: Horrible hack for sxtw and uxtw with Wn src and Xd dst operands. - // InstAlias can't quite handle this since the reg classes aren't - // subclasses. - if (NumOperands == 3 && (Tok == "sxtw" || Tok == "uxtw")) { - // The source register can be Wn here, but the matcher expects a - // GPR64. Twiddle it here if necessary. - ARM64Operand *Op = static_cast(Operands[2]); - if (Op->isReg()) { - unsigned Reg = getXRegFromWReg(Op->getReg()); - Operands[2] = ARM64Operand::CreateReg(Reg, false, Op->getStartLoc(), - Op->getEndLoc(), getContext()); - delete Op; - } - } - // FIXME: Likewise for sxt[bh] with a Xd dst operand - else if (NumOperands == 3 && (Tok == "sxtb" || Tok == "sxth")) { - ARM64Operand *Op = static_cast(Operands[1]); - if (Op->isReg() && - ARM64MCRegisterClasses[ARM64::GPR64allRegClassID].contains( - Op->getReg())) { - // The source register can be Wn here, but the matcher expects a - // GPR64. Twiddle it here if necessary. - ARM64Operand *Op = static_cast(Operands[2]); - if (Op->isReg()) { - unsigned Reg = getXRegFromWReg(Op->getReg()); - Operands[2] = ARM64Operand::CreateReg(Reg, false, Op->getStartLoc(), - Op->getEndLoc(), getContext()); - delete Op; - } - } - } - // FIXME: Likewise for uxt[bh] with a Xd dst operand - else if (NumOperands == 3 && (Tok == "uxtb" || Tok == "uxth")) { - ARM64Operand *Op = static_cast(Operands[1]); - if (Op->isReg() && - ARM64MCRegisterClasses[ARM64::GPR64allRegClassID].contains( - Op->getReg())) { - // The source register can be Wn here, but the matcher expects a - // GPR32. Twiddle it here if necessary. 
- ARM64Operand *Op = static_cast(Operands[1]); - if (Op->isReg()) { - unsigned Reg = getWRegFromXReg(Op->getReg()); - Operands[1] = ARM64Operand::CreateReg(Reg, false, Op->getStartLoc(), - Op->getEndLoc(), getContext()); - delete Op; - } - } - } - - // Yet another horrible hack to handle FMOV Rd, #0.0 using [WX]ZR. - if (NumOperands == 3 && Tok == "fmov") { - ARM64Operand *RegOp = static_cast(Operands[1]); - ARM64Operand *ImmOp = static_cast(Operands[2]); - if (RegOp->isReg() && ImmOp->isFPImm() && - ImmOp->getFPImm() == (unsigned)-1) { - unsigned zreg = ARM64MCRegisterClasses[ARM64::FPR32RegClassID].contains( - RegOp->getReg()) - ? ARM64::WZR - : ARM64::XZR; - Operands[2] = ARM64Operand::CreateReg(zreg, false, Op->getStartLoc(), - Op->getEndLoc(), getContext()); - delete ImmOp; - } - } - - MCInst Inst; - // First try to match against the secondary set of tables containing the - // short-form NEON instructions (e.g. "fadd.2s v0, v1, v2"). - unsigned MatchResult = - MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm, 1); - - // If that fails, try against the alternate table containing long-form NEON: - // "fadd v0.2s, v1.2s, v2.2s" - if (MatchResult != Match_Success) - MatchResult = - MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm, 0); - - switch (MatchResult) { - case Match_Success: { - // Perform range checking and other semantic validations - SmallVector OperandLocs; - NumOperands = Operands.size(); - for (unsigned i = 1; i < NumOperands; ++i) - OperandLocs.push_back(Operands[i]->getStartLoc()); - if (validateInstruction(Inst, OperandLocs)) - return true; - - Inst.setLoc(IDLoc); - Out.EmitInstruction(Inst, STI); - return false; - } - case Match_MissingFeature: { - assert(ErrorInfo && "Unknown missing feature!"); - // Special case the error message for the very common case where only - // a single subtarget feature is missing (neon, e.g.). - std::string Msg = "instruction requires:"; - unsigned Mask = 1; - for (unsigned i = 0; i < (sizeof(ErrorInfo)*8-1); ++i) { - if (ErrorInfo & Mask) { - Msg += " "; - Msg += getSubtargetFeatureName(ErrorInfo & Mask); - } - Mask <<= 1; - } - return Error(IDLoc, Msg); - } - case Match_MnemonicFail: - return showMatchError(IDLoc, MatchResult); - case Match_InvalidOperand: { - SMLoc ErrorLoc = IDLoc; - if (ErrorInfo != ~0U) { - if (ErrorInfo >= Operands.size()) - return Error(IDLoc, "too few operands for instruction"); - - ErrorLoc = ((ARM64Operand *)Operands[ErrorInfo])->getStartLoc(); - if (ErrorLoc == SMLoc()) - ErrorLoc = IDLoc; - } - // If the match failed on a suffix token operand, tweak the diagnostic - // accordingly. 
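The Match_MissingFeature path above treats ErrorInfo as a bit mask of required-but-disabled subtarget features and names each one in the diagnostic. A self-contained sketch of that walk, with the name callback passed in as a stand-in for the generated getSubtargetFeatureName (illustrative only):

  #include <cstdint>
  #include <string>
  static std::string listMissingFeatures(uint64_t ErrorInfo,
                                         const char *(*FeatureName)(uint64_t)) {
    std::string Msg = "instruction requires:";
    for (uint64_t Mask = 1; Mask; Mask <<= 1)
      if (ErrorInfo & Mask) {
        Msg += " ";
        Msg += FeatureName(ErrorInfo & Mask);
      }
    return Msg;
  }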
- if (((ARM64Operand *)Operands[ErrorInfo])->isToken() && - ((ARM64Operand *)Operands[ErrorInfo])->isTokenSuffix()) - MatchResult = Match_InvalidSuffix; - - return showMatchError(ErrorLoc, MatchResult); - } - case Match_InvalidMemoryIndexed1: - case Match_InvalidMemoryIndexed2: - case Match_InvalidMemoryIndexed4: - case Match_InvalidMemoryIndexed8: - case Match_InvalidMemoryIndexed16: - case Match_InvalidCondCode: - case Match_AddSubRegExtendSmall: - case Match_AddSubRegExtendLarge: - case Match_AddSubSecondSource: - case Match_LogicalSecondSource: - case Match_AddSubRegShift32: - case Match_AddSubRegShift64: - case Match_InvalidMovImm32Shift: - case Match_InvalidMovImm64Shift: - case Match_InvalidFPImm: - case Match_InvalidMemoryWExtend8: - case Match_InvalidMemoryWExtend16: - case Match_InvalidMemoryWExtend32: - case Match_InvalidMemoryWExtend64: - case Match_InvalidMemoryWExtend128: - case Match_InvalidMemoryXExtend8: - case Match_InvalidMemoryXExtend16: - case Match_InvalidMemoryXExtend32: - case Match_InvalidMemoryXExtend64: - case Match_InvalidMemoryXExtend128: - case Match_InvalidMemoryIndexed4SImm7: - case Match_InvalidMemoryIndexed8SImm7: - case Match_InvalidMemoryIndexed16SImm7: - case Match_InvalidMemoryIndexedSImm9: - case Match_InvalidImm0_7: - case Match_InvalidImm0_15: - case Match_InvalidImm0_31: - case Match_InvalidImm0_63: - case Match_InvalidImm0_127: - case Match_InvalidImm0_65535: - case Match_InvalidImm1_8: - case Match_InvalidImm1_16: - case Match_InvalidImm1_32: - case Match_InvalidImm1_64: - case Match_InvalidIndex1: - case Match_InvalidIndexB: - case Match_InvalidIndexH: - case Match_InvalidIndexS: - case Match_InvalidIndexD: - case Match_InvalidLabel: - case Match_MSR: - case Match_MRS: { - // Any time we get here, there's nothing fancy to do. Just get the - // operand SMLoc and display the diagnostic. - SMLoc ErrorLoc = ((ARM64Operand *)Operands[ErrorInfo])->getStartLoc(); - if (ErrorLoc == SMLoc()) - ErrorLoc = IDLoc; - return showMatchError(ErrorLoc, MatchResult); - } - } - - llvm_unreachable("Implement any new match types added!"); - return true; -} - -/// ParseDirective parses the arm specific directives -bool ARM64AsmParser::ParseDirective(AsmToken DirectiveID) { - StringRef IDVal = DirectiveID.getIdentifier(); - SMLoc Loc = DirectiveID.getLoc(); - if (IDVal == ".hword") - return parseDirectiveWord(2, Loc); - if (IDVal == ".word") - return parseDirectiveWord(4, Loc); - if (IDVal == ".xword") - return parseDirectiveWord(8, Loc); - if (IDVal == ".tlsdesccall") - return parseDirectiveTLSDescCall(Loc); - - return parseDirectiveLOH(IDVal, Loc); -} - -/// parseDirectiveWord -/// ::= .word [ expression (, expression)* ] -bool ARM64AsmParser::parseDirectiveWord(unsigned Size, SMLoc L) { - if (getLexer().isNot(AsmToken::EndOfStatement)) { - for (;;) { - const MCExpr *Value; - if (getParser().parseExpression(Value)) - return true; - - getParser().getStreamer().EmitValue(Value, Size); - - if (getLexer().is(AsmToken::EndOfStatement)) - break; - - // FIXME: Improve diagnostic. 
- if (getLexer().isNot(AsmToken::Comma)) - return Error(L, "unexpected token in directive"); - Parser.Lex(); - } - } - - Parser.Lex(); - return false; -} - -// parseDirectiveTLSDescCall: -// ::= .tlsdesccall symbol -bool ARM64AsmParser::parseDirectiveTLSDescCall(SMLoc L) { - StringRef Name; - if (getParser().parseIdentifier(Name)) - return Error(L, "expected symbol after directive"); - - MCSymbol *Sym = getContext().GetOrCreateSymbol(Name); - const MCExpr *Expr = MCSymbolRefExpr::Create(Sym, getContext()); - Expr = ARM64MCExpr::Create(Expr, ARM64MCExpr::VK_TLSDESC, getContext()); - - MCInst Inst; - Inst.setOpcode(ARM64::TLSDESCCALL); - Inst.addOperand(MCOperand::CreateExpr(Expr)); - - getParser().getStreamer().EmitInstruction(Inst, STI); - return false; -} - -/// ::= .loh label1, ..., labelN -/// The number of arguments depends on the loh identifier. -bool ARM64AsmParser::parseDirectiveLOH(StringRef IDVal, SMLoc Loc) { - if (IDVal != MCLOHDirectiveName()) - return true; - MCLOHType Kind; - if (getParser().getTok().isNot(AsmToken::Identifier)) { - if (getParser().getTok().isNot(AsmToken::Integer)) - return TokError("expected an identifier or a number in directive"); - // We successfully get a numeric value for the identifier. - // Check if it is valid. - int64_t Id = getParser().getTok().getIntVal(); - Kind = (MCLOHType)Id; - // Check that Id does not overflow MCLOHType. - if (!isValidMCLOHType(Kind) || Id != Kind) - return TokError("invalid numeric identifier in directive"); - } else { - StringRef Name = getTok().getIdentifier(); - // We successfully parse an identifier. - // Check if it is a recognized one. - int Id = MCLOHNameToId(Name); - - if (Id == -1) - return TokError("invalid identifier in directive"); - Kind = (MCLOHType)Id; - } - // Consume the identifier. - Lex(); - // Get the number of arguments of this LOH. - int NbArgs = MCLOHIdToNbArgs(Kind); - - assert(NbArgs != -1 && "Invalid number of arguments"); - - SmallVector Args; - for (int Idx = 0; Idx < NbArgs; ++Idx) { - StringRef Name; - if (getParser().parseIdentifier(Name)) - return TokError("expected identifier in directive"); - Args.push_back(getContext().GetOrCreateSymbol(Name)); - - if (Idx + 1 == NbArgs) - break; - if (getLexer().isNot(AsmToken::Comma)) - return TokError("unexpected token in '" + Twine(IDVal) + "' directive"); - Lex(); - } - if (getLexer().isNot(AsmToken::EndOfStatement)) - return TokError("unexpected token in '" + Twine(IDVal) + "' directive"); - - getStreamer().EmitLOHDirective((MCLOHType)Kind, Args); - return false; -} - -bool -ARM64AsmParser::classifySymbolRef(const MCExpr *Expr, - ARM64MCExpr::VariantKind &ELFRefKind, - MCSymbolRefExpr::VariantKind &DarwinRefKind, - int64_t &Addend) { - ELFRefKind = ARM64MCExpr::VK_INVALID; - DarwinRefKind = MCSymbolRefExpr::VK_None; - Addend = 0; - - if (const ARM64MCExpr *AE = dyn_cast(Expr)) { - ELFRefKind = AE->getKind(); - Expr = AE->getSubExpr(); - } - - const MCSymbolRefExpr *SE = dyn_cast(Expr); - if (SE) { - // It's a simple symbol reference with no addend. - DarwinRefKind = SE->getKind(); - return true; - } - - const MCBinaryExpr *BE = dyn_cast(Expr); - if (!BE) - return false; - - SE = dyn_cast(BE->getLHS()); - if (!SE) - return false; - DarwinRefKind = SE->getKind(); - - if (BE->getOpcode() != MCBinaryExpr::Add && - BE->getOpcode() != MCBinaryExpr::Sub) - return false; - - // See if the addend is is a constant, otherwise there's more going - // on here than we can deal with. 
- auto AddendExpr = dyn_cast(BE->getRHS()); - if (!AddendExpr) - return false; - - Addend = AddendExpr->getValue(); - if (BE->getOpcode() == MCBinaryExpr::Sub) - Addend = -Addend; - - // It's some symbol reference + a constant addend, but really - // shouldn't use both Darwin and ELF syntax. - return ELFRefKind == ARM64MCExpr::VK_INVALID || - DarwinRefKind == MCSymbolRefExpr::VK_None; -} - -/// Force static initialization. -extern "C" void LLVMInitializeARM64AsmParser() { - RegisterMCAsmParser X(TheARM64leTarget); - RegisterMCAsmParser Y(TheARM64beTarget); - - RegisterMCAsmParser Z(TheAArch64leTarget); - RegisterMCAsmParser W(TheAArch64beTarget); -} - -#define GET_REGISTER_MATCHER -#define GET_SUBTARGET_FEATURE_NAME -#define GET_MATCHER_IMPLEMENTATION -#include "ARM64GenAsmMatcher.inc" - -// Define this matcher function after the auto-generated include so we -// have the match class enum definitions. -unsigned ARM64AsmParser::validateTargetOperandClass(MCParsedAsmOperand *AsmOp, - unsigned Kind) { - ARM64Operand *Op = static_cast(AsmOp); - // If the kind is a token for a literal immediate, check if our asm - // operand matches. This is for InstAliases which have a fixed-value - // immediate in the syntax. - int64_t ExpectedVal; - switch (Kind) { - default: - return Match_InvalidOperand; - case MCK__35_0: - ExpectedVal = 0; - break; - case MCK__35_1: - ExpectedVal = 1; - break; - case MCK__35_12: - ExpectedVal = 12; - break; - case MCK__35_16: - ExpectedVal = 16; - break; - case MCK__35_2: - ExpectedVal = 2; - break; - case MCK__35_24: - ExpectedVal = 24; - break; - case MCK__35_3: - ExpectedVal = 3; - break; - case MCK__35_32: - ExpectedVal = 32; - break; - case MCK__35_4: - ExpectedVal = 4; - break; - case MCK__35_48: - ExpectedVal = 48; - break; - case MCK__35_6: - ExpectedVal = 6; - break; - case MCK__35_64: - ExpectedVal = 64; - break; - case MCK__35_8: - ExpectedVal = 8; - break; - } - if (!Op->isImm()) - return Match_InvalidOperand; - const MCConstantExpr *CE = dyn_cast(Op->getImm()); - if (!CE) - return Match_InvalidOperand; - if (CE->getValue() == ExpectedVal) - return Match_Success; - return Match_InvalidOperand; -} diff --git a/lib/Target/ARM64/AsmParser/CMakeLists.txt b/lib/Target/ARM64/AsmParser/CMakeLists.txt deleted file mode 100644 index 826158b1ed1..00000000000 --- a/lib/Target/ARM64/AsmParser/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. ) - -add_llvm_library(LLVMARM64AsmParser - ARM64AsmParser.cpp - ) - diff --git a/lib/Target/ARM64/AsmParser/LLVMBuild.txt b/lib/Target/ARM64/AsmParser/LLVMBuild.txt deleted file mode 100644 index 9045283e919..00000000000 --- a/lib/Target/ARM64/AsmParser/LLVMBuild.txt +++ /dev/null @@ -1,23 +0,0 @@ -;===- ./lib/Target/ARM64/AsmParser/LLVMBuild.txt ---------------*- Conf -*--===; -; -; The LLVM Compiler Infrastructure -; -; This file is distributed under the University of Illinois Open Source -; License. See LICENSE.TXT for details. -; -;===------------------------------------------------------------------------===; -; -; This is an LLVMBuild description file for the components in this subdirectory. 
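// (Illustrative aside, not part of the patch.) validateTargetOperandClass above
// handles InstAliases whose syntax contains a fixed literal immediate (the
// MCK__35_<N> classes, i.e. "#0", "#8", "#16", ...): it maps the class to its
// expected value and accepts only a constant operand equal to it. A minimal
// standalone sketch of that idea, using invented names (MatchResult,
// LiteralImm, validateFixedImm) rather than the LLVM types:

#include <cstdint>
#include <optional>

enum class MatchResult { Success, InvalidOperand };

// A parsed operand that may or may not be a plain constant immediate.
struct LiteralImm {
  std::optional<int64_t> Value; // empty when the operand is not a constant
};

// Accept the operand only if it is a constant equal to Expected.
static MatchResult validateFixedImm(const LiteralImm &Op, int64_t Expected) {
  if (!Op.Value)
    return MatchResult::InvalidOperand;
  return *Op.Value == Expected ? MatchResult::Success
                               : MatchResult::InvalidOperand;
}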
-; -; For more information on the LLVMBuild system, please see: -; -; http://llvm.org/docs/LLVMBuild.html -; -;===------------------------------------------------------------------------===; - -[component_0] -type = Library -name = ARM64AsmParser -parent = ARM64 -required_libraries = ARM64Desc ARM64Info ARM64Utils MC MCParser Support -add_to_library_groups = ARM64 diff --git a/lib/Target/ARM64/AsmParser/Makefile b/lib/Target/ARM64/AsmParser/Makefile deleted file mode 100644 index d25c47f9af9..00000000000 --- a/lib/Target/ARM64/AsmParser/Makefile +++ /dev/null @@ -1,15 +0,0 @@ -##===- lib/Target/ARM/AsmParser/Makefile -------------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## -LEVEL = ../../../.. -LIBRARYNAME = LLVMARM64AsmParser - -# Hack: we need to include 'main' ARM target directory to grab private headers -CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. - -include $(LEVEL)/Makefile.common diff --git a/lib/Target/ARM64/CMakeLists.txt b/lib/Target/ARM64/CMakeLists.txt deleted file mode 100644 index 56ba3b73294..00000000000 --- a/lib/Target/ARM64/CMakeLists.txt +++ /dev/null @@ -1,51 +0,0 @@ -set(LLVM_TARGET_DEFINITIONS ARM64.td) - -tablegen(LLVM ARM64GenRegisterInfo.inc -gen-register-info) -tablegen(LLVM ARM64GenInstrInfo.inc -gen-instr-info) -tablegen(LLVM ARM64GenMCCodeEmitter.inc -gen-emitter -mc-emitter) -tablegen(LLVM ARM64GenMCPseudoLowering.inc -gen-pseudo-lowering) -tablegen(LLVM ARM64GenAsmWriter.inc -gen-asm-writer) -tablegen(LLVM ARM64GenAsmWriter1.inc -gen-asm-writer -asmwriternum=1) -tablegen(LLVM ARM64GenAsmMatcher.inc -gen-asm-matcher) -tablegen(LLVM ARM64GenDAGISel.inc -gen-dag-isel) -tablegen(LLVM ARM64GenFastISel.inc -gen-fast-isel) -tablegen(LLVM ARM64GenCallingConv.inc -gen-callingconv) -tablegen(LLVM ARM64GenSubtargetInfo.inc -gen-subtarget) -tablegen(LLVM ARM64GenDisassemblerTables.inc -gen-disassembler) -add_public_tablegen_target(ARM64CommonTableGen) - -add_llvm_target(ARM64CodeGen - ARM64AddressTypePromotion.cpp - ARM64AdvSIMDScalarPass.cpp - ARM64AsmPrinter.cpp - ARM64BranchRelaxation.cpp - ARM64CleanupLocalDynamicTLSPass.cpp - ARM64CollectLOH.cpp - ARM64ConditionalCompares.cpp - ARM64DeadRegisterDefinitionsPass.cpp - ARM64ExpandPseudoInsts.cpp - ARM64FastISel.cpp - ARM64FrameLowering.cpp - ARM64ISelDAGToDAG.cpp - ARM64ISelLowering.cpp - ARM64InstrInfo.cpp - ARM64LoadStoreOptimizer.cpp - ARM64MCInstLower.cpp - ARM64PromoteConstant.cpp - ARM64RegisterInfo.cpp - ARM64SelectionDAGInfo.cpp - ARM64StorePairSuppress.cpp - ARM64Subtarget.cpp - ARM64TargetMachine.cpp - ARM64TargetObjectFile.cpp - ARM64TargetTransformInfo.cpp -) - -add_dependencies(LLVMARM64CodeGen intrinsics_gen) - -add_subdirectory(TargetInfo) -add_subdirectory(AsmParser) -add_subdirectory(Disassembler) -add_subdirectory(InstPrinter) -add_subdirectory(MCTargetDesc) -add_subdirectory(Utils) diff --git a/lib/Target/ARM64/Disassembler/ARM64Disassembler.cpp b/lib/Target/ARM64/Disassembler/ARM64Disassembler.cpp deleted file mode 100644 index bb47b3a0982..00000000000 --- a/lib/Target/ARM64/Disassembler/ARM64Disassembler.cpp +++ /dev/null @@ -1,1548 +0,0 @@ -//===- ARM64Disassembler.cpp - Disassembler for ARM64 -----------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. 
See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - -#include "ARM64Disassembler.h" -#include "ARM64ExternalSymbolizer.h" -#include "ARM64Subtarget.h" -#include "MCTargetDesc/ARM64AddressingModes.h" -#include "Utils/ARM64BaseInfo.h" -#include "llvm/MC/MCInst.h" -#include "llvm/MC/MCFixedLenDisassembler.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/MemoryObject.h" -#include "llvm/Support/TargetRegistry.h" -#include "llvm/Support/ErrorHandling.h" - -using namespace llvm; - -#define DEBUG_TYPE "arm64-disassembler" - -// Pull DecodeStatus and its enum values into the global namespace. -typedef llvm::MCDisassembler::DecodeStatus DecodeStatus; - -// Forward declare these because the autogenerated code will reference them. -// Definitions are further down. -static DecodeStatus DecodeFPR128RegisterClass(llvm::MCInst &Inst, - unsigned RegNo, uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeFPR128_loRegisterClass(llvm::MCInst &Inst, - unsigned RegNo, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeFPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeFPR32RegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeFPR16RegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeFPR8RegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeGPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeGPR64spRegisterClass(llvm::MCInst &Inst, - unsigned RegNo, uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeGPR32RegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeGPR32spRegisterClass(llvm::MCInst &Inst, - unsigned RegNo, uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeQQRegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeQQQRegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeQQQQRegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeDDRegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeDDDRegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeDDDDRegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder); - -static DecodeStatus DecodeFixedPointScaleImm32(llvm::MCInst &Inst, unsigned Imm, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeFixedPointScaleImm64(llvm::MCInst &Inst, unsigned Imm, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodePCRelLabel19(llvm::MCInst &Inst, unsigned Imm, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeMemExtend(llvm::MCInst &Inst, unsigned Imm, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeMRSSystemRegister(llvm::MCInst &Inst, unsigned Imm, - uint64_t Address, const void *Decoder); -static 
DecodeStatus DecodeMSRSystemRegister(llvm::MCInst &Inst, unsigned Imm, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeThreeAddrSRegInstruction(llvm::MCInst &Inst, - uint32_t insn, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeMoveImmInstruction(llvm::MCInst &Inst, uint32_t insn, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeUnsignedLdStInstruction(llvm::MCInst &Inst, - uint32_t insn, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeSignedLdStInstruction(llvm::MCInst &Inst, - uint32_t insn, uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeExclusiveLdStInstruction(llvm::MCInst &Inst, - uint32_t insn, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodePairLdStInstruction(llvm::MCInst &Inst, uint32_t insn, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeAddSubERegInstruction(llvm::MCInst &Inst, - uint32_t insn, uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeLogicalImmInstruction(llvm::MCInst &Inst, - uint32_t insn, uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeModImmInstruction(llvm::MCInst &Inst, uint32_t insn, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeModImmTiedInstruction(llvm::MCInst &Inst, - uint32_t insn, uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeAdrInstruction(llvm::MCInst &Inst, uint32_t insn, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeBaseAddSubImm(llvm::MCInst &Inst, uint32_t insn, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeUnconditionalBranch(llvm::MCInst &Inst, uint32_t insn, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeSystemPStateInstruction(llvm::MCInst &Inst, - uint32_t insn, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeTestAndBranch(llvm::MCInst &Inst, uint32_t insn, - uint64_t Address, const void *Decoder); - -static DecodeStatus DecodeFMOVLaneInstruction(llvm::MCInst &Inst, unsigned Insn, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeVecShiftR64Imm(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder); -static DecodeStatus DecodeVecShiftR64ImmNarrow(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, - const void *Decoder); -static DecodeStatus DecodeVecShiftR32Imm(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder); -static DecodeStatus DecodeVecShiftR32ImmNarrow(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, - const void *Decoder); -static DecodeStatus DecodeVecShiftR16Imm(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder); -static DecodeStatus DecodeVecShiftR16ImmNarrow(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, - const void *Decoder); -static DecodeStatus DecodeVecShiftR8Imm(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder); -static DecodeStatus DecodeVecShiftL64Imm(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder); -static DecodeStatus DecodeVecShiftL32Imm(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder); -static DecodeStatus DecodeVecShiftL16Imm(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder); -static DecodeStatus DecodeVecShiftL8Imm(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder); - -static bool Check(DecodeStatus &Out, DecodeStatus In) { - switch (In) { - case MCDisassembler::Success: - // Out stays the 
same. - return true; - case MCDisassembler::SoftFail: - Out = In; - return true; - case MCDisassembler::Fail: - Out = In; - return false; - } - llvm_unreachable("Invalid DecodeStatus!"); -} - -#include "ARM64GenDisassemblerTables.inc" -#include "ARM64GenInstrInfo.inc" - -#define Success llvm::MCDisassembler::Success -#define Fail llvm::MCDisassembler::Fail -#define SoftFail llvm::MCDisassembler::SoftFail - -static MCDisassembler *createARM64Disassembler(const Target &T, - const MCSubtargetInfo &STI, - MCContext &Ctx) { - return new ARM64Disassembler(STI, Ctx); -} - -DecodeStatus ARM64Disassembler::getInstruction(MCInst &MI, uint64_t &Size, - const MemoryObject &Region, - uint64_t Address, - raw_ostream &os, - raw_ostream &cs) const { - CommentStream = &cs; - - uint8_t bytes[4]; - - Size = 0; - // We want to read exactly 4 bytes of data. - if (Region.readBytes(Address, 4, (uint8_t *)bytes) == -1) - return Fail; - Size = 4; - - // Encoded as a small-endian 32-bit word in the stream. - uint32_t insn = - (bytes[3] << 24) | (bytes[2] << 16) | (bytes[1] << 8) | (bytes[0] << 0); - - // Calling the auto-generated decoder function. - return decodeInstruction(DecoderTable32, MI, insn, Address, this, STI); -} - -static MCSymbolizer * -createARM64ExternalSymbolizer(StringRef TT, LLVMOpInfoCallback GetOpInfo, - LLVMSymbolLookupCallback SymbolLookUp, - void *DisInfo, MCContext *Ctx, - MCRelocationInfo *RelInfo) { - return new llvm::ARM64ExternalSymbolizer( - *Ctx, - std::unique_ptr(RelInfo), - GetOpInfo, SymbolLookUp, DisInfo); -} - -extern "C" void LLVMInitializeARM64Disassembler() { - TargetRegistry::RegisterMCDisassembler(TheARM64leTarget, - createARM64Disassembler); - TargetRegistry::RegisterMCDisassembler(TheARM64beTarget, - createARM64Disassembler); - TargetRegistry::RegisterMCSymbolizer(TheARM64leTarget, - createARM64ExternalSymbolizer); - TargetRegistry::RegisterMCSymbolizer(TheARM64beTarget, - createARM64ExternalSymbolizer); - - TargetRegistry::RegisterMCDisassembler(TheAArch64leTarget, - createARM64Disassembler); - TargetRegistry::RegisterMCDisassembler(TheAArch64beTarget, - createARM64Disassembler); - TargetRegistry::RegisterMCSymbolizer(TheAArch64leTarget, - createARM64ExternalSymbolizer); - TargetRegistry::RegisterMCSymbolizer(TheAArch64beTarget, - createARM64ExternalSymbolizer); -} - -static const unsigned FPR128DecoderTable[] = { - ARM64::Q0, ARM64::Q1, ARM64::Q2, ARM64::Q3, ARM64::Q4, ARM64::Q5, - ARM64::Q6, ARM64::Q7, ARM64::Q8, ARM64::Q9, ARM64::Q10, ARM64::Q11, - ARM64::Q12, ARM64::Q13, ARM64::Q14, ARM64::Q15, ARM64::Q16, ARM64::Q17, - ARM64::Q18, ARM64::Q19, ARM64::Q20, ARM64::Q21, ARM64::Q22, ARM64::Q23, - ARM64::Q24, ARM64::Q25, ARM64::Q26, ARM64::Q27, ARM64::Q28, ARM64::Q29, - ARM64::Q30, ARM64::Q31 -}; - -static DecodeStatus DecodeFPR128RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const void *Decoder) { - if (RegNo > 31) - return Fail; - - unsigned Register = FPR128DecoderTable[RegNo]; - Inst.addOperand(MCOperand::CreateReg(Register)); - return Success; -} - -static DecodeStatus DecodeFPR128_loRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const void *Decoder) { - if (RegNo > 15) - return Fail; - return DecodeFPR128RegisterClass(Inst, RegNo, Addr, Decoder); -} - -static const unsigned FPR64DecoderTable[] = { - ARM64::D0, ARM64::D1, ARM64::D2, ARM64::D3, ARM64::D4, ARM64::D5, - ARM64::D6, ARM64::D7, ARM64::D8, ARM64::D9, ARM64::D10, ARM64::D11, - ARM64::D12, ARM64::D13, ARM64::D14, ARM64::D15, ARM64::D16, ARM64::D17, - ARM64::D18, ARM64::D19, 
ARM64::D20, ARM64::D21, ARM64::D22, ARM64::D23, - ARM64::D24, ARM64::D25, ARM64::D26, ARM64::D27, ARM64::D28, ARM64::D29, - ARM64::D30, ARM64::D31 -}; - -static DecodeStatus DecodeFPR64RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const void *Decoder) { - if (RegNo > 31) - return Fail; - - unsigned Register = FPR64DecoderTable[RegNo]; - Inst.addOperand(MCOperand::CreateReg(Register)); - return Success; -} - -static const unsigned FPR32DecoderTable[] = { - ARM64::S0, ARM64::S1, ARM64::S2, ARM64::S3, ARM64::S4, ARM64::S5, - ARM64::S6, ARM64::S7, ARM64::S8, ARM64::S9, ARM64::S10, ARM64::S11, - ARM64::S12, ARM64::S13, ARM64::S14, ARM64::S15, ARM64::S16, ARM64::S17, - ARM64::S18, ARM64::S19, ARM64::S20, ARM64::S21, ARM64::S22, ARM64::S23, - ARM64::S24, ARM64::S25, ARM64::S26, ARM64::S27, ARM64::S28, ARM64::S29, - ARM64::S30, ARM64::S31 -}; - -static DecodeStatus DecodeFPR32RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const void *Decoder) { - if (RegNo > 31) - return Fail; - - unsigned Register = FPR32DecoderTable[RegNo]; - Inst.addOperand(MCOperand::CreateReg(Register)); - return Success; -} - -static const unsigned FPR16DecoderTable[] = { - ARM64::H0, ARM64::H1, ARM64::H2, ARM64::H3, ARM64::H4, ARM64::H5, - ARM64::H6, ARM64::H7, ARM64::H8, ARM64::H9, ARM64::H10, ARM64::H11, - ARM64::H12, ARM64::H13, ARM64::H14, ARM64::H15, ARM64::H16, ARM64::H17, - ARM64::H18, ARM64::H19, ARM64::H20, ARM64::H21, ARM64::H22, ARM64::H23, - ARM64::H24, ARM64::H25, ARM64::H26, ARM64::H27, ARM64::H28, ARM64::H29, - ARM64::H30, ARM64::H31 -}; - -static DecodeStatus DecodeFPR16RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const void *Decoder) { - if (RegNo > 31) - return Fail; - - unsigned Register = FPR16DecoderTable[RegNo]; - Inst.addOperand(MCOperand::CreateReg(Register)); - return Success; -} - -static const unsigned FPR8DecoderTable[] = { - ARM64::B0, ARM64::B1, ARM64::B2, ARM64::B3, ARM64::B4, ARM64::B5, - ARM64::B6, ARM64::B7, ARM64::B8, ARM64::B9, ARM64::B10, ARM64::B11, - ARM64::B12, ARM64::B13, ARM64::B14, ARM64::B15, ARM64::B16, ARM64::B17, - ARM64::B18, ARM64::B19, ARM64::B20, ARM64::B21, ARM64::B22, ARM64::B23, - ARM64::B24, ARM64::B25, ARM64::B26, ARM64::B27, ARM64::B28, ARM64::B29, - ARM64::B30, ARM64::B31 -}; - -static DecodeStatus DecodeFPR8RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const void *Decoder) { - if (RegNo > 31) - return Fail; - - unsigned Register = FPR8DecoderTable[RegNo]; - Inst.addOperand(MCOperand::CreateReg(Register)); - return Success; -} - -static const unsigned GPR64DecoderTable[] = { - ARM64::X0, ARM64::X1, ARM64::X2, ARM64::X3, ARM64::X4, ARM64::X5, - ARM64::X6, ARM64::X7, ARM64::X8, ARM64::X9, ARM64::X10, ARM64::X11, - ARM64::X12, ARM64::X13, ARM64::X14, ARM64::X15, ARM64::X16, ARM64::X17, - ARM64::X18, ARM64::X19, ARM64::X20, ARM64::X21, ARM64::X22, ARM64::X23, - ARM64::X24, ARM64::X25, ARM64::X26, ARM64::X27, ARM64::X28, ARM64::FP, - ARM64::LR, ARM64::XZR -}; - -static DecodeStatus DecodeGPR64RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const void *Decoder) { - if (RegNo > 31) - return Fail; - - unsigned Register = GPR64DecoderTable[RegNo]; - Inst.addOperand(MCOperand::CreateReg(Register)); - return Success; -} - -static DecodeStatus DecodeGPR64spRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const void *Decoder) { - if (RegNo > 31) - return Fail; - unsigned Register = GPR64DecoderTable[RegNo]; - if (Register == ARM64::XZR) - Register = ARM64::SP; - 
Inst.addOperand(MCOperand::CreateReg(Register)); - return Success; -} - -static const unsigned GPR32DecoderTable[] = { - ARM64::W0, ARM64::W1, ARM64::W2, ARM64::W3, ARM64::W4, ARM64::W5, - ARM64::W6, ARM64::W7, ARM64::W8, ARM64::W9, ARM64::W10, ARM64::W11, - ARM64::W12, ARM64::W13, ARM64::W14, ARM64::W15, ARM64::W16, ARM64::W17, - ARM64::W18, ARM64::W19, ARM64::W20, ARM64::W21, ARM64::W22, ARM64::W23, - ARM64::W24, ARM64::W25, ARM64::W26, ARM64::W27, ARM64::W28, ARM64::W29, - ARM64::W30, ARM64::WZR -}; - -static DecodeStatus DecodeGPR32RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const void *Decoder) { - if (RegNo > 31) - return Fail; - - unsigned Register = GPR32DecoderTable[RegNo]; - Inst.addOperand(MCOperand::CreateReg(Register)); - return Success; -} - -static DecodeStatus DecodeGPR32spRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const void *Decoder) { - if (RegNo > 31) - return Fail; - - unsigned Register = GPR32DecoderTable[RegNo]; - if (Register == ARM64::WZR) - Register = ARM64::WSP; - Inst.addOperand(MCOperand::CreateReg(Register)); - return Success; -} - -static const unsigned VectorDecoderTable[] = { - ARM64::Q0, ARM64::Q1, ARM64::Q2, ARM64::Q3, ARM64::Q4, ARM64::Q5, - ARM64::Q6, ARM64::Q7, ARM64::Q8, ARM64::Q9, ARM64::Q10, ARM64::Q11, - ARM64::Q12, ARM64::Q13, ARM64::Q14, ARM64::Q15, ARM64::Q16, ARM64::Q17, - ARM64::Q18, ARM64::Q19, ARM64::Q20, ARM64::Q21, ARM64::Q22, ARM64::Q23, - ARM64::Q24, ARM64::Q25, ARM64::Q26, ARM64::Q27, ARM64::Q28, ARM64::Q29, - ARM64::Q30, ARM64::Q31 -}; - -static DecodeStatus DecodeVectorRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const void *Decoder) { - if (RegNo > 31) - return Fail; - - unsigned Register = VectorDecoderTable[RegNo]; - Inst.addOperand(MCOperand::CreateReg(Register)); - return Success; -} - -static const unsigned QQDecoderTable[] = { - ARM64::Q0_Q1, ARM64::Q1_Q2, ARM64::Q2_Q3, ARM64::Q3_Q4, - ARM64::Q4_Q5, ARM64::Q5_Q6, ARM64::Q6_Q7, ARM64::Q7_Q8, - ARM64::Q8_Q9, ARM64::Q9_Q10, ARM64::Q10_Q11, ARM64::Q11_Q12, - ARM64::Q12_Q13, ARM64::Q13_Q14, ARM64::Q14_Q15, ARM64::Q15_Q16, - ARM64::Q16_Q17, ARM64::Q17_Q18, ARM64::Q18_Q19, ARM64::Q19_Q20, - ARM64::Q20_Q21, ARM64::Q21_Q22, ARM64::Q22_Q23, ARM64::Q23_Q24, - ARM64::Q24_Q25, ARM64::Q25_Q26, ARM64::Q26_Q27, ARM64::Q27_Q28, - ARM64::Q28_Q29, ARM64::Q29_Q30, ARM64::Q30_Q31, ARM64::Q31_Q0 -}; - -static DecodeStatus DecodeQQRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, const void *Decoder) { - if (RegNo > 31) - return Fail; - unsigned Register = QQDecoderTable[RegNo]; - Inst.addOperand(MCOperand::CreateReg(Register)); - return Success; -} - -static const unsigned QQQDecoderTable[] = { - ARM64::Q0_Q1_Q2, ARM64::Q1_Q2_Q3, ARM64::Q2_Q3_Q4, - ARM64::Q3_Q4_Q5, ARM64::Q4_Q5_Q6, ARM64::Q5_Q6_Q7, - ARM64::Q6_Q7_Q8, ARM64::Q7_Q8_Q9, ARM64::Q8_Q9_Q10, - ARM64::Q9_Q10_Q11, ARM64::Q10_Q11_Q12, ARM64::Q11_Q12_Q13, - ARM64::Q12_Q13_Q14, ARM64::Q13_Q14_Q15, ARM64::Q14_Q15_Q16, - ARM64::Q15_Q16_Q17, ARM64::Q16_Q17_Q18, ARM64::Q17_Q18_Q19, - ARM64::Q18_Q19_Q20, ARM64::Q19_Q20_Q21, ARM64::Q20_Q21_Q22, - ARM64::Q21_Q22_Q23, ARM64::Q22_Q23_Q24, ARM64::Q23_Q24_Q25, - ARM64::Q24_Q25_Q26, ARM64::Q25_Q26_Q27, ARM64::Q26_Q27_Q28, - ARM64::Q27_Q28_Q29, ARM64::Q28_Q29_Q30, ARM64::Q29_Q30_Q31, - ARM64::Q30_Q31_Q0, ARM64::Q31_Q0_Q1 -}; - -static DecodeStatus DecodeQQQRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, const void *Decoder) { - if (RegNo > 31) - return Fail; - unsigned Register = QQQDecoderTable[RegNo]; - 
Inst.addOperand(MCOperand::CreateReg(Register)); - return Success; -} - -static const unsigned QQQQDecoderTable[] = { - ARM64::Q0_Q1_Q2_Q3, ARM64::Q1_Q2_Q3_Q4, ARM64::Q2_Q3_Q4_Q5, - ARM64::Q3_Q4_Q5_Q6, ARM64::Q4_Q5_Q6_Q7, ARM64::Q5_Q6_Q7_Q8, - ARM64::Q6_Q7_Q8_Q9, ARM64::Q7_Q8_Q9_Q10, ARM64::Q8_Q9_Q10_Q11, - ARM64::Q9_Q10_Q11_Q12, ARM64::Q10_Q11_Q12_Q13, ARM64::Q11_Q12_Q13_Q14, - ARM64::Q12_Q13_Q14_Q15, ARM64::Q13_Q14_Q15_Q16, ARM64::Q14_Q15_Q16_Q17, - ARM64::Q15_Q16_Q17_Q18, ARM64::Q16_Q17_Q18_Q19, ARM64::Q17_Q18_Q19_Q20, - ARM64::Q18_Q19_Q20_Q21, ARM64::Q19_Q20_Q21_Q22, ARM64::Q20_Q21_Q22_Q23, - ARM64::Q21_Q22_Q23_Q24, ARM64::Q22_Q23_Q24_Q25, ARM64::Q23_Q24_Q25_Q26, - ARM64::Q24_Q25_Q26_Q27, ARM64::Q25_Q26_Q27_Q28, ARM64::Q26_Q27_Q28_Q29, - ARM64::Q27_Q28_Q29_Q30, ARM64::Q28_Q29_Q30_Q31, ARM64::Q29_Q30_Q31_Q0, - ARM64::Q30_Q31_Q0_Q1, ARM64::Q31_Q0_Q1_Q2 -}; - -static DecodeStatus DecodeQQQQRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const void *Decoder) { - if (RegNo > 31) - return Fail; - unsigned Register = QQQQDecoderTable[RegNo]; - Inst.addOperand(MCOperand::CreateReg(Register)); - return Success; -} - -static const unsigned DDDecoderTable[] = { - ARM64::D0_D1, ARM64::D1_D2, ARM64::D2_D3, ARM64::D3_D4, - ARM64::D4_D5, ARM64::D5_D6, ARM64::D6_D7, ARM64::D7_D8, - ARM64::D8_D9, ARM64::D9_D10, ARM64::D10_D11, ARM64::D11_D12, - ARM64::D12_D13, ARM64::D13_D14, ARM64::D14_D15, ARM64::D15_D16, - ARM64::D16_D17, ARM64::D17_D18, ARM64::D18_D19, ARM64::D19_D20, - ARM64::D20_D21, ARM64::D21_D22, ARM64::D22_D23, ARM64::D23_D24, - ARM64::D24_D25, ARM64::D25_D26, ARM64::D26_D27, ARM64::D27_D28, - ARM64::D28_D29, ARM64::D29_D30, ARM64::D30_D31, ARM64::D31_D0 -}; - -static DecodeStatus DecodeDDRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, const void *Decoder) { - if (RegNo > 31) - return Fail; - unsigned Register = DDDecoderTable[RegNo]; - Inst.addOperand(MCOperand::CreateReg(Register)); - return Success; -} - -static const unsigned DDDDecoderTable[] = { - ARM64::D0_D1_D2, ARM64::D1_D2_D3, ARM64::D2_D3_D4, - ARM64::D3_D4_D5, ARM64::D4_D5_D6, ARM64::D5_D6_D7, - ARM64::D6_D7_D8, ARM64::D7_D8_D9, ARM64::D8_D9_D10, - ARM64::D9_D10_D11, ARM64::D10_D11_D12, ARM64::D11_D12_D13, - ARM64::D12_D13_D14, ARM64::D13_D14_D15, ARM64::D14_D15_D16, - ARM64::D15_D16_D17, ARM64::D16_D17_D18, ARM64::D17_D18_D19, - ARM64::D18_D19_D20, ARM64::D19_D20_D21, ARM64::D20_D21_D22, - ARM64::D21_D22_D23, ARM64::D22_D23_D24, ARM64::D23_D24_D25, - ARM64::D24_D25_D26, ARM64::D25_D26_D27, ARM64::D26_D27_D28, - ARM64::D27_D28_D29, ARM64::D28_D29_D30, ARM64::D29_D30_D31, - ARM64::D30_D31_D0, ARM64::D31_D0_D1 -}; - -static DecodeStatus DecodeDDDRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, const void *Decoder) { - if (RegNo > 31) - return Fail; - unsigned Register = DDDDecoderTable[RegNo]; - Inst.addOperand(MCOperand::CreateReg(Register)); - return Success; -} - -static const unsigned DDDDDecoderTable[] = { - ARM64::D0_D1_D2_D3, ARM64::D1_D2_D3_D4, ARM64::D2_D3_D4_D5, - ARM64::D3_D4_D5_D6, ARM64::D4_D5_D6_D7, ARM64::D5_D6_D7_D8, - ARM64::D6_D7_D8_D9, ARM64::D7_D8_D9_D10, ARM64::D8_D9_D10_D11, - ARM64::D9_D10_D11_D12, ARM64::D10_D11_D12_D13, ARM64::D11_D12_D13_D14, - ARM64::D12_D13_D14_D15, ARM64::D13_D14_D15_D16, ARM64::D14_D15_D16_D17, - ARM64::D15_D16_D17_D18, ARM64::D16_D17_D18_D19, ARM64::D17_D18_D19_D20, - ARM64::D18_D19_D20_D21, ARM64::D19_D20_D21_D22, ARM64::D20_D21_D22_D23, - ARM64::D21_D22_D23_D24, ARM64::D22_D23_D24_D25, ARM64::D23_D24_D25_D26, - ARM64::D24_D25_D26_D27, 
ARM64::D25_D26_D27_D28, ARM64::D26_D27_D28_D29, - ARM64::D27_D28_D29_D30, ARM64::D28_D29_D30_D31, ARM64::D29_D30_D31_D0, - ARM64::D30_D31_D0_D1, ARM64::D31_D0_D1_D2 -}; - -static DecodeStatus DecodeDDDDRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const void *Decoder) { - if (RegNo > 31) - return Fail; - unsigned Register = DDDDDecoderTable[RegNo]; - Inst.addOperand(MCOperand::CreateReg(Register)); - return Success; -} - -static DecodeStatus DecodeFixedPointScaleImm32(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, - const void *Decoder) { - // scale{5} is asserted as 1 in tblgen. - Imm |= 0x20; - Inst.addOperand(MCOperand::CreateImm(64 - Imm)); - return Success; -} - -static DecodeStatus DecodeFixedPointScaleImm64(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, - const void *Decoder) { - Inst.addOperand(MCOperand::CreateImm(64 - Imm)); - return Success; -} - -static DecodeStatus DecodePCRelLabel19(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder) { - int64_t ImmVal = Imm; - const ARM64Disassembler *Dis = - static_cast(Decoder); - - // Sign-extend 19-bit immediate. - if (ImmVal & (1 << (19 - 1))) - ImmVal |= ~((1LL << 19) - 1); - - if (!Dis->tryAddingSymbolicOperand(Inst, ImmVal << 2, Addr, - Inst.getOpcode() != ARM64::LDRXl, 0, 4)) - Inst.addOperand(MCOperand::CreateImm(ImmVal)); - return Success; -} - -static DecodeStatus DecodeMemExtend(llvm::MCInst &Inst, unsigned Imm, - uint64_t Address, const void *Decoder) { - Inst.addOperand(MCOperand::CreateImm((Imm >> 1) & 1)); - Inst.addOperand(MCOperand::CreateImm(Imm & 1)); - return Success; -} - -static DecodeStatus DecodeMRSSystemRegister(llvm::MCInst &Inst, unsigned Imm, - uint64_t Address, - const void *Decoder) { - const ARM64Disassembler *Dis = - static_cast(Decoder); - const MCSubtargetInfo &STI = Dis->getSubtargetInfo(); - - Imm |= 0x8000; - Inst.addOperand(MCOperand::CreateImm(Imm)); - - bool ValidNamed; - (void)ARM64SysReg::MRSMapper(STI.getFeatureBits()).toString(Imm, ValidNamed); - - return ValidNamed ? Success : Fail; -} - -static DecodeStatus DecodeMSRSystemRegister(llvm::MCInst &Inst, unsigned Imm, - uint64_t Address, - const void *Decoder) { - const ARM64Disassembler *Dis = - static_cast(Decoder); - const MCSubtargetInfo &STI = Dis->getSubtargetInfo(); - - Imm |= 0x8000; - Inst.addOperand(MCOperand::CreateImm(Imm)); - - bool ValidNamed; - (void)ARM64SysReg::MSRMapper(STI.getFeatureBits()).toString(Imm, ValidNamed); - - return ValidNamed ? Success : Fail; -} - -static DecodeStatus DecodeFMOVLaneInstruction(llvm::MCInst &Inst, unsigned Insn, - uint64_t Address, - const void *Decoder) { - // This decoder exists to add the dummy Lane operand to the MCInst, which must - // be 1 in assembly but has no other real manifestation. 
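// (Illustrative aside, not part of the patch.) Several decoders in this file
// sign-extend a narrow immediate field by hand, e.g. the 19-bit PC-relative
// label in DecodePCRelLabel19 above, and later the 9-, 14-, 21- and 26-bit
// fields. The same idiom as a standalone helper, assuming only <cstdint>
// (signExtend is a name chosen here, not an LLVM API):

#include <cstdint>

// Sign-extend the low Width bits of Value (Width < 64) to a full int64_t.
static int64_t signExtend(uint64_t Value, unsigned Width) {
  const uint64_t SignBit = 1ULL << (Width - 1);
  Value &= (SignBit << 1) - 1; // keep only the low Width bits
  return static_cast<int64_t>((Value ^ SignBit) - SignBit);
}

// e.g. signExtend(0x7FFFF, 19) == -1, matching the
// "if (ImmVal & (1 << (19 - 1))) ImmVal |= ~((1LL << 19) - 1);" pattern above.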
- unsigned Rd = fieldFromInstruction(Insn, 0, 5); - unsigned Rn = fieldFromInstruction(Insn, 5, 5); - unsigned IsToVec = fieldFromInstruction(Insn, 16, 1); - - if (IsToVec) { - DecodeFPR128RegisterClass(Inst, Rd, Address, Decoder); - DecodeGPR64RegisterClass(Inst, Rn, Address, Decoder); - } else { - DecodeGPR64RegisterClass(Inst, Rd, Address, Decoder); - DecodeFPR128RegisterClass(Inst, Rn, Address, Decoder); - } - - // Add the lane - Inst.addOperand(MCOperand::CreateImm(1)); - - return Success; -} - -static DecodeStatus DecodeVecShiftRImm(llvm::MCInst &Inst, unsigned Imm, - unsigned Add) { - Inst.addOperand(MCOperand::CreateImm(Add - Imm)); - return Success; -} - -static DecodeStatus DecodeVecShiftLImm(llvm::MCInst &Inst, unsigned Imm, - unsigned Add) { - Inst.addOperand(MCOperand::CreateImm((Imm + Add) & (Add - 1))); - return Success; -} - -static DecodeStatus DecodeVecShiftR64Imm(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder) { - return DecodeVecShiftRImm(Inst, Imm, 64); -} - -static DecodeStatus DecodeVecShiftR64ImmNarrow(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, - const void *Decoder) { - return DecodeVecShiftRImm(Inst, Imm | 0x20, 64); -} - -static DecodeStatus DecodeVecShiftR32Imm(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder) { - return DecodeVecShiftRImm(Inst, Imm, 32); -} - -static DecodeStatus DecodeVecShiftR32ImmNarrow(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, - const void *Decoder) { - return DecodeVecShiftRImm(Inst, Imm | 0x10, 32); -} - -static DecodeStatus DecodeVecShiftR16Imm(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder) { - return DecodeVecShiftRImm(Inst, Imm, 16); -} - -static DecodeStatus DecodeVecShiftR16ImmNarrow(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, - const void *Decoder) { - return DecodeVecShiftRImm(Inst, Imm | 0x8, 16); -} - -static DecodeStatus DecodeVecShiftR8Imm(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder) { - return DecodeVecShiftRImm(Inst, Imm, 8); -} - -static DecodeStatus DecodeVecShiftL64Imm(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder) { - return DecodeVecShiftLImm(Inst, Imm, 64); -} - -static DecodeStatus DecodeVecShiftL32Imm(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder) { - return DecodeVecShiftLImm(Inst, Imm, 32); -} - -static DecodeStatus DecodeVecShiftL16Imm(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder) { - return DecodeVecShiftLImm(Inst, Imm, 16); -} - -static DecodeStatus DecodeVecShiftL8Imm(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder) { - return DecodeVecShiftLImm(Inst, Imm, 8); -} - -static DecodeStatus DecodeThreeAddrSRegInstruction(llvm::MCInst &Inst, - uint32_t insn, uint64_t Addr, - const void *Decoder) { - unsigned Rd = fieldFromInstruction(insn, 0, 5); - unsigned Rn = fieldFromInstruction(insn, 5, 5); - unsigned Rm = fieldFromInstruction(insn, 16, 5); - unsigned shiftHi = fieldFromInstruction(insn, 22, 2); - unsigned shiftLo = fieldFromInstruction(insn, 10, 6); - unsigned shift = (shiftHi << 6) | shiftLo; - switch (Inst.getOpcode()) { - default: - return Fail; - case ARM64::ADDWrs: - case ARM64::ADDSWrs: - case ARM64::SUBWrs: - case ARM64::SUBSWrs: - // if shift == '11' then ReservedValue() - if (shiftHi == 0x3) - return Fail; - // Deliberate fallthrough - case ARM64::ANDWrs: - case ARM64::ANDSWrs: - case ARM64::BICWrs: - case ARM64::BICSWrs: - case ARM64::ORRWrs: - case ARM64::ORNWrs: - case ARM64::EORWrs: 
- case ARM64::EONWrs: { - // if sf == '0' and imm6<5> == '1' then ReservedValue() - if (shiftLo >> 5 == 1) - return Fail; - DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder); - DecodeGPR32RegisterClass(Inst, Rn, Addr, Decoder); - DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder); - break; - } - case ARM64::ADDXrs: - case ARM64::ADDSXrs: - case ARM64::SUBXrs: - case ARM64::SUBSXrs: - // if shift == '11' then ReservedValue() - if (shiftHi == 0x3) - return Fail; - // Deliberate fallthrough - case ARM64::ANDXrs: - case ARM64::ANDSXrs: - case ARM64::BICXrs: - case ARM64::BICSXrs: - case ARM64::ORRXrs: - case ARM64::ORNXrs: - case ARM64::EORXrs: - case ARM64::EONXrs: - DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder); - DecodeGPR64RegisterClass(Inst, Rn, Addr, Decoder); - DecodeGPR64RegisterClass(Inst, Rm, Addr, Decoder); - break; - } - - Inst.addOperand(MCOperand::CreateImm(shift)); - return Success; -} - -static DecodeStatus DecodeMoveImmInstruction(llvm::MCInst &Inst, uint32_t insn, - uint64_t Addr, - const void *Decoder) { - unsigned Rd = fieldFromInstruction(insn, 0, 5); - unsigned imm = fieldFromInstruction(insn, 5, 16); - unsigned shift = fieldFromInstruction(insn, 21, 2); - shift <<= 4; - switch (Inst.getOpcode()) { - default: - return Fail; - case ARM64::MOVZWi: - case ARM64::MOVNWi: - case ARM64::MOVKWi: - if (shift & (1U << 5)) - return Fail; - DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder); - break; - case ARM64::MOVZXi: - case ARM64::MOVNXi: - case ARM64::MOVKXi: - DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder); - break; - } - - if (Inst.getOpcode() == ARM64::MOVKWi || Inst.getOpcode() == ARM64::MOVKXi) - Inst.addOperand(Inst.getOperand(0)); - - Inst.addOperand(MCOperand::CreateImm(imm)); - Inst.addOperand(MCOperand::CreateImm(shift)); - return Success; -} - -static DecodeStatus DecodeUnsignedLdStInstruction(llvm::MCInst &Inst, - uint32_t insn, uint64_t Addr, - const void *Decoder) { - unsigned Rt = fieldFromInstruction(insn, 0, 5); - unsigned Rn = fieldFromInstruction(insn, 5, 5); - unsigned offset = fieldFromInstruction(insn, 10, 12); - const ARM64Disassembler *Dis = - static_cast(Decoder); - - switch (Inst.getOpcode()) { - default: - return Fail; - case ARM64::PRFMui: - // Rt is an immediate in prefetch. 
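// (An aside for reference, not part of the patch.) All of these decoders slice
// the 32-bit instruction word with fieldFromInstruction(insn, Start, Width),
// as in the Rt/Rn/offset extraction at the top of DecodeUnsignedLdStInstruction
// above. A minimal equivalent (extractField is a name invented here):

#include <cstdint>

// Return the Width-bit field (Width < 32) of Insn starting at bit Start.
static uint32_t extractField(uint32_t Insn, unsigned Start, unsigned Width) {
  return (Insn >> Start) & ((1u << Width) - 1);
}

// For the unsigned load/store encoding that would be, for example:
//   unsigned Rt    = extractField(Insn, 0, 5);   // transfer register
//   unsigned Rn    = extractField(Insn, 5, 5);   // base register
//   unsigned Imm12 = extractField(Insn, 10, 12); // scaled offset field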
- Inst.addOperand(MCOperand::CreateImm(Rt)); - break; - case ARM64::STRBBui: - case ARM64::LDRBBui: - case ARM64::LDRSBWui: - case ARM64::STRHHui: - case ARM64::LDRHHui: - case ARM64::LDRSHWui: - case ARM64::STRWui: - case ARM64::LDRWui: - DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LDRSBXui: - case ARM64::LDRSHXui: - case ARM64::LDRSWui: - case ARM64::STRXui: - case ARM64::LDRXui: - DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LDRQui: - case ARM64::STRQui: - DecodeFPR128RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LDRDui: - case ARM64::STRDui: - DecodeFPR64RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LDRSui: - case ARM64::STRSui: - DecodeFPR32RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LDRHui: - case ARM64::STRHui: - DecodeFPR16RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LDRBui: - case ARM64::STRBui: - DecodeFPR8RegisterClass(Inst, Rt, Addr, Decoder); - break; - } - - DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); - if (!Dis->tryAddingSymbolicOperand(Inst, offset, Addr, Fail, 0, 4)) - Inst.addOperand(MCOperand::CreateImm(offset)); - return Success; -} - -static DecodeStatus DecodeSignedLdStInstruction(llvm::MCInst &Inst, - uint32_t insn, uint64_t Addr, - const void *Decoder) { - unsigned Rt = fieldFromInstruction(insn, 0, 5); - unsigned Rn = fieldFromInstruction(insn, 5, 5); - int64_t offset = fieldFromInstruction(insn, 12, 9); - - // offset is a 9-bit signed immediate, so sign extend it to - // fill the unsigned. - if (offset & (1 << (9 - 1))) - offset |= ~((1LL << 9) - 1); - - // First operand is always the writeback to the address register, if needed. - switch (Inst.getOpcode()) { - default: - break; - case ARM64::LDRSBWpre: - case ARM64::LDRSHWpre: - case ARM64::STRBBpre: - case ARM64::LDRBBpre: - case ARM64::STRHHpre: - case ARM64::LDRHHpre: - case ARM64::STRWpre: - case ARM64::LDRWpre: - case ARM64::LDRSBWpost: - case ARM64::LDRSHWpost: - case ARM64::STRBBpost: - case ARM64::LDRBBpost: - case ARM64::STRHHpost: - case ARM64::LDRHHpost: - case ARM64::STRWpost: - case ARM64::LDRWpost: - case ARM64::LDRSBXpre: - case ARM64::LDRSHXpre: - case ARM64::STRXpre: - case ARM64::LDRSWpre: - case ARM64::LDRXpre: - case ARM64::LDRSBXpost: - case ARM64::LDRSHXpost: - case ARM64::STRXpost: - case ARM64::LDRSWpost: - case ARM64::LDRXpost: - case ARM64::LDRQpre: - case ARM64::STRQpre: - case ARM64::LDRQpost: - case ARM64::STRQpost: - case ARM64::LDRDpre: - case ARM64::STRDpre: - case ARM64::LDRDpost: - case ARM64::STRDpost: - case ARM64::LDRSpre: - case ARM64::STRSpre: - case ARM64::LDRSpost: - case ARM64::STRSpost: - case ARM64::LDRHpre: - case ARM64::STRHpre: - case ARM64::LDRHpost: - case ARM64::STRHpost: - case ARM64::LDRBpre: - case ARM64::STRBpre: - case ARM64::LDRBpost: - case ARM64::STRBpost: - DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); - break; - } - - switch (Inst.getOpcode()) { - default: - return Fail; - case ARM64::PRFUMi: - // Rt is an immediate in prefetch. 
- Inst.addOperand(MCOperand::CreateImm(Rt)); - break; - case ARM64::STURBBi: - case ARM64::LDURBBi: - case ARM64::LDURSBWi: - case ARM64::STURHHi: - case ARM64::LDURHHi: - case ARM64::LDURSHWi: - case ARM64::STURWi: - case ARM64::LDURWi: - case ARM64::LDTRSBWi: - case ARM64::LDTRSHWi: - case ARM64::STTRWi: - case ARM64::LDTRWi: - case ARM64::STTRHi: - case ARM64::LDTRHi: - case ARM64::LDTRBi: - case ARM64::STTRBi: - case ARM64::LDRSBWpre: - case ARM64::LDRSHWpre: - case ARM64::STRBBpre: - case ARM64::LDRBBpre: - case ARM64::STRHHpre: - case ARM64::LDRHHpre: - case ARM64::STRWpre: - case ARM64::LDRWpre: - case ARM64::LDRSBWpost: - case ARM64::LDRSHWpost: - case ARM64::STRBBpost: - case ARM64::LDRBBpost: - case ARM64::STRHHpost: - case ARM64::LDRHHpost: - case ARM64::STRWpost: - case ARM64::LDRWpost: - DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LDURSBXi: - case ARM64::LDURSHXi: - case ARM64::LDURSWi: - case ARM64::STURXi: - case ARM64::LDURXi: - case ARM64::LDTRSBXi: - case ARM64::LDTRSHXi: - case ARM64::LDTRSWi: - case ARM64::STTRXi: - case ARM64::LDTRXi: - case ARM64::LDRSBXpre: - case ARM64::LDRSHXpre: - case ARM64::STRXpre: - case ARM64::LDRSWpre: - case ARM64::LDRXpre: - case ARM64::LDRSBXpost: - case ARM64::LDRSHXpost: - case ARM64::STRXpost: - case ARM64::LDRSWpost: - case ARM64::LDRXpost: - DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LDURQi: - case ARM64::STURQi: - case ARM64::LDRQpre: - case ARM64::STRQpre: - case ARM64::LDRQpost: - case ARM64::STRQpost: - DecodeFPR128RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LDURDi: - case ARM64::STURDi: - case ARM64::LDRDpre: - case ARM64::STRDpre: - case ARM64::LDRDpost: - case ARM64::STRDpost: - DecodeFPR64RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LDURSi: - case ARM64::STURSi: - case ARM64::LDRSpre: - case ARM64::STRSpre: - case ARM64::LDRSpost: - case ARM64::STRSpost: - DecodeFPR32RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LDURHi: - case ARM64::STURHi: - case ARM64::LDRHpre: - case ARM64::STRHpre: - case ARM64::LDRHpost: - case ARM64::STRHpost: - DecodeFPR16RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LDURBi: - case ARM64::STURBi: - case ARM64::LDRBpre: - case ARM64::STRBpre: - case ARM64::LDRBpost: - case ARM64::STRBpost: - DecodeFPR8RegisterClass(Inst, Rt, Addr, Decoder); - break; - } - - DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); - Inst.addOperand(MCOperand::CreateImm(offset)); - - bool IsLoad = fieldFromInstruction(insn, 22, 1); - bool IsIndexed = fieldFromInstruction(insn, 10, 2) != 0; - bool IsFP = fieldFromInstruction(insn, 26, 1); - - // Cannot write back to a transfer register (but xzr != sp). 
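// (Illustrative aside, not part of the patch.) The check just below marks
// pre/post-indexed integer loads whose transfer register equals the base
// register as SoftFail: that combination is unpredictable, while register
// number 31 can never alias because it means SP as a base but XZR as a
// transfer register. The same predicate in isolation (hasWritebackHazard is a
// name invented here):

// True when a writeback load would both load into Rt and write back to Rn
// while Rt and Rn name the same physical register.
static bool hasWritebackHazard(unsigned Rt, unsigned Rn,
                               bool IsLoad, bool IsIndexed, bool IsFP) {
  return IsLoad && IsIndexed && !IsFP && Rn != 31 && Rt == Rn;
}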
- if (IsLoad && IsIndexed && !IsFP && Rn != 31 && Rt == Rn) - return SoftFail; - - return Success; -} - -static DecodeStatus DecodeExclusiveLdStInstruction(llvm::MCInst &Inst, - uint32_t insn, uint64_t Addr, - const void *Decoder) { - unsigned Rt = fieldFromInstruction(insn, 0, 5); - unsigned Rn = fieldFromInstruction(insn, 5, 5); - unsigned Rt2 = fieldFromInstruction(insn, 10, 5); - unsigned Rs = fieldFromInstruction(insn, 16, 5); - - unsigned Opcode = Inst.getOpcode(); - switch (Opcode) { - default: - return Fail; - case ARM64::STLXRW: - case ARM64::STLXRB: - case ARM64::STLXRH: - case ARM64::STXRW: - case ARM64::STXRB: - case ARM64::STXRH: - DecodeGPR32RegisterClass(Inst, Rs, Addr, Decoder); - // FALLTHROUGH - case ARM64::LDARW: - case ARM64::LDARB: - case ARM64::LDARH: - case ARM64::LDAXRW: - case ARM64::LDAXRB: - case ARM64::LDAXRH: - case ARM64::LDXRW: - case ARM64::LDXRB: - case ARM64::LDXRH: - case ARM64::STLRW: - case ARM64::STLRB: - case ARM64::STLRH: - DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::STLXRX: - case ARM64::STXRX: - DecodeGPR32RegisterClass(Inst, Rs, Addr, Decoder); - // FALLTHROUGH - case ARM64::LDARX: - case ARM64::LDAXRX: - case ARM64::LDXRX: - case ARM64::STLRX: - DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::STLXPW: - case ARM64::STXPW: - DecodeGPR32RegisterClass(Inst, Rs, Addr, Decoder); - // FALLTHROUGH - case ARM64::LDAXPW: - case ARM64::LDXPW: - DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder); - DecodeGPR32RegisterClass(Inst, Rt2, Addr, Decoder); - break; - case ARM64::STLXPX: - case ARM64::STXPX: - DecodeGPR32RegisterClass(Inst, Rs, Addr, Decoder); - // FALLTHROUGH - case ARM64::LDAXPX: - case ARM64::LDXPX: - DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); - DecodeGPR64RegisterClass(Inst, Rt2, Addr, Decoder); - break; - } - - DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); - - // You shouldn't load to the same register twice in an instruction... - if ((Opcode == ARM64::LDAXPW || Opcode == ARM64::LDXPW || - Opcode == ARM64::LDAXPX || Opcode == ARM64::LDXPX) && - Rt == Rt2) - return SoftFail; - - return Success; -} - -static DecodeStatus DecodePairLdStInstruction(llvm::MCInst &Inst, uint32_t insn, - uint64_t Addr, - const void *Decoder) { - unsigned Rt = fieldFromInstruction(insn, 0, 5); - unsigned Rn = fieldFromInstruction(insn, 5, 5); - unsigned Rt2 = fieldFromInstruction(insn, 10, 5); - int64_t offset = fieldFromInstruction(insn, 15, 7); - bool IsLoad = fieldFromInstruction(insn, 22, 1); - - // offset is a 7-bit signed immediate, so sign extend it to - // fill the unsigned. - if (offset & (1 << (7 - 1))) - offset |= ~((1LL << 7) - 1); - - unsigned Opcode = Inst.getOpcode(); - bool NeedsDisjointWritebackTransfer = false; - - // First operand is always writeback of base register. 
- switch (Opcode) { - default: - break; - case ARM64::LDPXpost: - case ARM64::STPXpost: - case ARM64::LDPSWpost: - case ARM64::LDPXpre: - case ARM64::STPXpre: - case ARM64::LDPSWpre: - case ARM64::LDPWpost: - case ARM64::STPWpost: - case ARM64::LDPWpre: - case ARM64::STPWpre: - case ARM64::LDPQpost: - case ARM64::STPQpost: - case ARM64::LDPQpre: - case ARM64::STPQpre: - case ARM64::LDPDpost: - case ARM64::STPDpost: - case ARM64::LDPDpre: - case ARM64::STPDpre: - case ARM64::LDPSpost: - case ARM64::STPSpost: - case ARM64::LDPSpre: - case ARM64::STPSpre: - DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); - break; - } - - switch (Opcode) { - default: - return Fail; - case ARM64::LDPXpost: - case ARM64::STPXpost: - case ARM64::LDPSWpost: - case ARM64::LDPXpre: - case ARM64::STPXpre: - case ARM64::LDPSWpre: - NeedsDisjointWritebackTransfer = true; - // Fallthrough - case ARM64::LDNPXi: - case ARM64::STNPXi: - case ARM64::LDPXi: - case ARM64::STPXi: - case ARM64::LDPSWi: - DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); - DecodeGPR64RegisterClass(Inst, Rt2, Addr, Decoder); - break; - case ARM64::LDPWpost: - case ARM64::STPWpost: - case ARM64::LDPWpre: - case ARM64::STPWpre: - NeedsDisjointWritebackTransfer = true; - // Fallthrough - case ARM64::LDNPWi: - case ARM64::STNPWi: - case ARM64::LDPWi: - case ARM64::STPWi: - DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder); - DecodeGPR32RegisterClass(Inst, Rt2, Addr, Decoder); - break; - case ARM64::LDNPQi: - case ARM64::STNPQi: - case ARM64::LDPQpost: - case ARM64::STPQpost: - case ARM64::LDPQi: - case ARM64::STPQi: - case ARM64::LDPQpre: - case ARM64::STPQpre: - DecodeFPR128RegisterClass(Inst, Rt, Addr, Decoder); - DecodeFPR128RegisterClass(Inst, Rt2, Addr, Decoder); - break; - case ARM64::LDNPDi: - case ARM64::STNPDi: - case ARM64::LDPDpost: - case ARM64::STPDpost: - case ARM64::LDPDi: - case ARM64::STPDi: - case ARM64::LDPDpre: - case ARM64::STPDpre: - DecodeFPR64RegisterClass(Inst, Rt, Addr, Decoder); - DecodeFPR64RegisterClass(Inst, Rt2, Addr, Decoder); - break; - case ARM64::LDNPSi: - case ARM64::STNPSi: - case ARM64::LDPSpost: - case ARM64::STPSpost: - case ARM64::LDPSi: - case ARM64::STPSi: - case ARM64::LDPSpre: - case ARM64::STPSpre: - DecodeFPR32RegisterClass(Inst, Rt, Addr, Decoder); - DecodeFPR32RegisterClass(Inst, Rt2, Addr, Decoder); - break; - } - - DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); - Inst.addOperand(MCOperand::CreateImm(offset)); - - // You shouldn't load to the same register twice in an instruction... - if (IsLoad && Rt == Rt2) - return SoftFail; - - // ... or do any operation that writes-back to a transfer register. But note - // that "stp xzr, xzr, [sp], #4" is fine because xzr and sp are different. 
- if (NeedsDisjointWritebackTransfer && Rn != 31 && (Rt == Rn || Rt2 == Rn)) - return SoftFail; - - return Success; -} - -static DecodeStatus DecodeAddSubERegInstruction(llvm::MCInst &Inst, - uint32_t insn, uint64_t Addr, - const void *Decoder) { - unsigned Rd = fieldFromInstruction(insn, 0, 5); - unsigned Rn = fieldFromInstruction(insn, 5, 5); - unsigned Rm = fieldFromInstruction(insn, 16, 5); - unsigned extend = fieldFromInstruction(insn, 10, 6); - - unsigned shift = extend & 0x7; - if (shift > 4) - return Fail; - - switch (Inst.getOpcode()) { - default: - return Fail; - case ARM64::ADDWrx: - case ARM64::SUBWrx: - DecodeGPR32spRegisterClass(Inst, Rd, Addr, Decoder); - DecodeGPR32spRegisterClass(Inst, Rn, Addr, Decoder); - DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder); - break; - case ARM64::ADDSWrx: - case ARM64::SUBSWrx: - DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder); - DecodeGPR32spRegisterClass(Inst, Rn, Addr, Decoder); - DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder); - break; - case ARM64::ADDXrx: - case ARM64::SUBXrx: - DecodeGPR64spRegisterClass(Inst, Rd, Addr, Decoder); - DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); - DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder); - break; - case ARM64::ADDSXrx: - case ARM64::SUBSXrx: - DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder); - DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); - DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder); - break; - case ARM64::ADDXrx64: - case ARM64::SUBXrx64: - DecodeGPR64spRegisterClass(Inst, Rd, Addr, Decoder); - DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); - DecodeGPR64RegisterClass(Inst, Rm, Addr, Decoder); - break; - case ARM64::SUBSXrx64: - case ARM64::ADDSXrx64: - DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder); - DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); - DecodeGPR64RegisterClass(Inst, Rm, Addr, Decoder); - break; - } - - Inst.addOperand(MCOperand::CreateImm(extend)); - return Success; -} - -static DecodeStatus DecodeLogicalImmInstruction(llvm::MCInst &Inst, - uint32_t insn, uint64_t Addr, - const void *Decoder) { - unsigned Rd = fieldFromInstruction(insn, 0, 5); - unsigned Rn = fieldFromInstruction(insn, 5, 5); - unsigned Datasize = fieldFromInstruction(insn, 31, 1); - unsigned imm; - - if (Datasize) { - if (Inst.getOpcode() == ARM64::ANDSXri) - DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder); - else - DecodeGPR64spRegisterClass(Inst, Rd, Addr, Decoder); - DecodeGPR64RegisterClass(Inst, Rn, Addr, Decoder); - imm = fieldFromInstruction(insn, 10, 13); - if (!ARM64_AM::isValidDecodeLogicalImmediate(imm, 64)) - return Fail; - } else { - if (Inst.getOpcode() == ARM64::ANDSWri) - DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder); - else - DecodeGPR32spRegisterClass(Inst, Rd, Addr, Decoder); - DecodeGPR32RegisterClass(Inst, Rn, Addr, Decoder); - imm = fieldFromInstruction(insn, 10, 12); - if (!ARM64_AM::isValidDecodeLogicalImmediate(imm, 32)) - return Fail; - } - Inst.addOperand(MCOperand::CreateImm(imm)); - return Success; -} - -static DecodeStatus DecodeModImmInstruction(llvm::MCInst &Inst, uint32_t insn, - uint64_t Addr, - const void *Decoder) { - unsigned Rd = fieldFromInstruction(insn, 0, 5); - unsigned cmode = fieldFromInstruction(insn, 12, 4); - unsigned imm = fieldFromInstruction(insn, 16, 3) << 5; - imm |= fieldFromInstruction(insn, 5, 5); - - if (Inst.getOpcode() == ARM64::MOVID) - DecodeFPR64RegisterClass(Inst, Rd, Addr, Decoder); - else - DecodeVectorRegisterClass(Inst, Rd, Addr, Decoder); - - Inst.addOperand(MCOperand::CreateImm(imm)); - - 
switch (Inst.getOpcode()) { - default: - break; - case ARM64::MOVIv4i16: - case ARM64::MOVIv8i16: - case ARM64::MVNIv4i16: - case ARM64::MVNIv8i16: - case ARM64::MOVIv2i32: - case ARM64::MOVIv4i32: - case ARM64::MVNIv2i32: - case ARM64::MVNIv4i32: - Inst.addOperand(MCOperand::CreateImm((cmode & 6) << 2)); - break; - case ARM64::MOVIv2s_msl: - case ARM64::MOVIv4s_msl: - case ARM64::MVNIv2s_msl: - case ARM64::MVNIv4s_msl: - Inst.addOperand(MCOperand::CreateImm(cmode & 1 ? 0x110 : 0x108)); - break; - } - - return Success; -} - -static DecodeStatus DecodeModImmTiedInstruction(llvm::MCInst &Inst, - uint32_t insn, uint64_t Addr, - const void *Decoder) { - unsigned Rd = fieldFromInstruction(insn, 0, 5); - unsigned cmode = fieldFromInstruction(insn, 12, 4); - unsigned imm = fieldFromInstruction(insn, 16, 3) << 5; - imm |= fieldFromInstruction(insn, 5, 5); - - // Tied operands added twice. - DecodeVectorRegisterClass(Inst, Rd, Addr, Decoder); - DecodeVectorRegisterClass(Inst, Rd, Addr, Decoder); - - Inst.addOperand(MCOperand::CreateImm(imm)); - Inst.addOperand(MCOperand::CreateImm((cmode & 6) << 2)); - - return Success; -} - -static DecodeStatus DecodeAdrInstruction(llvm::MCInst &Inst, uint32_t insn, - uint64_t Addr, const void *Decoder) { - unsigned Rd = fieldFromInstruction(insn, 0, 5); - int64_t imm = fieldFromInstruction(insn, 5, 19) << 2; - imm |= fieldFromInstruction(insn, 29, 2); - const ARM64Disassembler *Dis = - static_cast(Decoder); - - // Sign-extend the 21-bit immediate. - if (imm & (1 << (21 - 1))) - imm |= ~((1LL << 21) - 1); - - DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder); - if (!Dis->tryAddingSymbolicOperand(Inst, imm, Addr, Fail, 0, 4)) - Inst.addOperand(MCOperand::CreateImm(imm)); - - return Success; -} - -static DecodeStatus DecodeBaseAddSubImm(llvm::MCInst &Inst, uint32_t insn, - uint64_t Addr, const void *Decoder) { - unsigned Rd = fieldFromInstruction(insn, 0, 5); - unsigned Rn = fieldFromInstruction(insn, 5, 5); - unsigned Imm = fieldFromInstruction(insn, 10, 14); - unsigned S = fieldFromInstruction(insn, 29, 1); - unsigned Datasize = fieldFromInstruction(insn, 31, 1); - - unsigned ShifterVal = (Imm >> 12) & 3; - unsigned ImmVal = Imm & 0xFFF; - const ARM64Disassembler *Dis = - static_cast(Decoder); - - if (ShifterVal != 0 && ShifterVal != 1) - return Fail; - - if (Datasize) { - if (Rd == 31 && !S) - DecodeGPR64spRegisterClass(Inst, Rd, Addr, Decoder); - else - DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder); - DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); - } else { - if (Rd == 31 && !S) - DecodeGPR32spRegisterClass(Inst, Rd, Addr, Decoder); - else - DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder); - DecodeGPR32spRegisterClass(Inst, Rn, Addr, Decoder); - } - - if (!Dis->tryAddingSymbolicOperand(Inst, Imm, Addr, Fail, 0, 4)) - Inst.addOperand(MCOperand::CreateImm(ImmVal)); - Inst.addOperand(MCOperand::CreateImm(12 * ShifterVal)); - return Success; -} - -static DecodeStatus DecodeUnconditionalBranch(llvm::MCInst &Inst, uint32_t insn, - uint64_t Addr, - const void *Decoder) { - int64_t imm = fieldFromInstruction(insn, 0, 26); - const ARM64Disassembler *Dis = - static_cast(Decoder); - - // Sign-extend the 26-bit immediate. 
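// (Illustrative aside, not part of the patch.) DecodeAdrInstruction above
// stitches ADR/ADRP's split immediate back together: the low two bits (immlo)
// sit in bits 30:29 of the word and the high 19 bits (immhi) in bits 23:5,
// giving a 21-bit signed value; the 26-bit branch immediate just below is
// sign-extended the same way. A standalone sketch (decodeAdrImm is a name
// invented here):

#include <cstdint>

static int64_t decodeAdrImm(uint32_t Insn) {
  uint64_t ImmLo = (Insn >> 29) & 0x3;    // bits 30:29
  uint64_t ImmHi = (Insn >> 5) & 0x7FFFF; // bits 23:5
  uint64_t Imm21 = (ImmHi << 2) | ImmLo;  // reassembled 21-bit field
  // Sign-extend from 21 bits.
  return static_cast<int64_t>((Imm21 ^ (1ULL << 20)) - (1ULL << 20));
}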
- if (imm & (1 << (26 - 1))) - imm |= ~((1LL << 26) - 1); - - if (!Dis->tryAddingSymbolicOperand(Inst, imm << 2, Addr, true, 0, 4)) - Inst.addOperand(MCOperand::CreateImm(imm)); - - return Success; -} - -static DecodeStatus DecodeSystemPStateInstruction(llvm::MCInst &Inst, - uint32_t insn, uint64_t Addr, - const void *Decoder) { - uint64_t op1 = fieldFromInstruction(insn, 16, 3); - uint64_t op2 = fieldFromInstruction(insn, 5, 3); - uint64_t crm = fieldFromInstruction(insn, 8, 4); - - uint64_t pstate_field = (op1 << 3) | op2; - - Inst.addOperand(MCOperand::CreateImm(pstate_field)); - Inst.addOperand(MCOperand::CreateImm(crm)); - - bool ValidNamed; - (void)ARM64PState::PStateMapper().toString(pstate_field, ValidNamed); - - return ValidNamed ? Success : Fail; -} - -static DecodeStatus DecodeTestAndBranch(llvm::MCInst &Inst, uint32_t insn, - uint64_t Addr, const void *Decoder) { - uint64_t Rt = fieldFromInstruction(insn, 0, 5); - uint64_t bit = fieldFromInstruction(insn, 31, 1) << 5; - bit |= fieldFromInstruction(insn, 19, 5); - int64_t dst = fieldFromInstruction(insn, 5, 14); - const ARM64Disassembler *Dis = - static_cast(Decoder); - - // Sign-extend 14-bit immediate. - if (dst & (1 << (14 - 1))) - dst |= ~((1LL << 14) - 1); - - if (fieldFromInstruction(insn, 31, 1) == 0) - DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder); - else - DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); - Inst.addOperand(MCOperand::CreateImm(bit)); - if (!Dis->tryAddingSymbolicOperand(Inst, dst << 2, Addr, true, 0, 4)) - Inst.addOperand(MCOperand::CreateImm(dst)); - - return Success; -} diff --git a/lib/Target/ARM64/Disassembler/ARM64Disassembler.h b/lib/Target/ARM64/Disassembler/ARM64Disassembler.h deleted file mode 100644 index 8989925f36b..00000000000 --- a/lib/Target/ARM64/Disassembler/ARM64Disassembler.h +++ /dev/null @@ -1,40 +0,0 @@ -//===- ARM64Disassembler.h - Disassembler for ARM64 -------------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - -#ifndef ARM64DISASSEMBLER_H -#define ARM64DISASSEMBLER_H - -#include "llvm/MC/MCDisassembler.h" - -namespace llvm { - -class MCInst; -class MemoryObject; -class raw_ostream; - -class ARM64Disassembler : public MCDisassembler { -public: - ARM64Disassembler(const MCSubtargetInfo &STI, MCContext &Ctx) - : MCDisassembler(STI, Ctx) {} - - ~ARM64Disassembler() {} - - /// getInstruction - See MCDisassembler. - MCDisassembler::DecodeStatus - getInstruction(MCInst &instr, uint64_t &size, const MemoryObject ®ion, - uint64_t address, raw_ostream &vStream, - raw_ostream &cStream) const override; -}; - -} // namespace llvm - -#endif diff --git a/lib/Target/ARM64/Disassembler/ARM64ExternalSymbolizer.cpp b/lib/Target/ARM64/Disassembler/ARM64ExternalSymbolizer.cpp deleted file mode 100644 index 2f8e516d185..00000000000 --- a/lib/Target/ARM64/Disassembler/ARM64ExternalSymbolizer.cpp +++ /dev/null @@ -1,226 +0,0 @@ -//===- ARM64ExternalSymbolizer.cpp - Symbolizer for ARM64 -------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// - -#include "ARM64ExternalSymbolizer.h" -#include "ARM64Subtarget.h" -#include "MCTargetDesc/ARM64AddressingModes.h" -#include "Utils/ARM64BaseInfo.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCInst.h" -#include "llvm/Support/Format.h" -#include "llvm/Support/raw_ostream.h" - -using namespace llvm; - -#define DEBUG_TYPE "arm64-disassembler" - -static MCSymbolRefExpr::VariantKind -getVariant(uint64_t LLVMDisassembler_VariantKind) { - switch (LLVMDisassembler_VariantKind) { - case LLVMDisassembler_VariantKind_None: - return MCSymbolRefExpr::VK_None; - case LLVMDisassembler_VariantKind_ARM64_PAGE: - return MCSymbolRefExpr::VK_PAGE; - case LLVMDisassembler_VariantKind_ARM64_PAGEOFF: - return MCSymbolRefExpr::VK_PAGEOFF; - case LLVMDisassembler_VariantKind_ARM64_GOTPAGE: - return MCSymbolRefExpr::VK_GOTPAGE; - case LLVMDisassembler_VariantKind_ARM64_GOTPAGEOFF: - return MCSymbolRefExpr::VK_GOTPAGEOFF; - case LLVMDisassembler_VariantKind_ARM64_TLVP: - case LLVMDisassembler_VariantKind_ARM64_TLVOFF: - default: - assert(0 && "bad LLVMDisassembler_VariantKind"); - return MCSymbolRefExpr::VK_None; - } -} - -/// tryAddingSymbolicOperand - tryAddingSymbolicOperand tries to add a symbolic -/// operand in place of the immediate Value in the MCInst. The immediate -/// Value has not had any PC adjustment made by the caller. If the instruction -/// is a branch that adds the PC to the immediate Value then isBranch is -/// true, else false. If GetOpInfo is non-null, then it is called to get any -/// symbolic information at the Address for this instruction. If that returns -/// non-zero then the symbolic information it returns is used to create an -/// MCExpr and that is added as an operand to the MCInst. If GetOpInfo() -/// returns zero and isBranch is true then a symbol lookup for -/// Address + Value is done and if a symbol is found an MCExpr is created with -/// that, else an MCExpr with Address + Value is created. If GetOpInfo() -/// returns zero and isBranch is false then the Opcode of the MCInst is -/// tested, and for ADRP and other instructions that help to load pointers -/// a symbol lookup is done to see if it returns a specific reference type -/// to add to the comment stream. This function returns true if it adds -/// an operand to the MCInst and false otherwise. -bool ARM64ExternalSymbolizer::tryAddingSymbolicOperand( - MCInst &MI, - raw_ostream &CommentStream, - int64_t Value, - uint64_t Address, - bool IsBranch, - uint64_t Offset, - uint64_t InstSize) { - // FIXME: This method shares a lot of code with - // MCExternalSymbolizer::tryAddingSymbolicOperand. It may be possible to - // refactor the MCExternalSymbolizer interface to allow more of this - // implementation to be shared.
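The callback protocol described in the comment above is the same one exposed through LLVM's public C disassembler API, so it can be exercised without the private C++ classes in this patch. A hypothetical, minimal client sketch follows; the triple, addresses, byte pattern and symbol name are invented for illustration, and it assumes an LLVM build with this target's disassembler enabled:

#include <llvm-c/Disassembler.h>
#include <llvm-c/Target.h>
#include <cstdint>
#include <cstdio>

// SymbolLookUp callback: pretend the branch target 0x4000 has a name.
static const char *lookupSymbol(void *DisInfo, uint64_t ReferenceValue,
                                uint64_t *ReferenceType, uint64_t ReferencePC,
                                const char **ReferenceName) {
  (void)DisInfo;
  (void)ReferencePC;
  *ReferenceName = nullptr;
  if (*ReferenceType == LLVMDisassembler_ReferenceType_In_Branch &&
      ReferenceValue == 0x4000) {
    *ReferenceType = LLVMDisassembler_ReferenceType_InOut_None;
    return "_my_function"; // becomes the symbolic branch operand
  }
  *ReferenceType = LLVMDisassembler_ReferenceType_InOut_None;
  return nullptr;
}

int main() {
  LLVMInitializeAllTargetInfos();
  LLVMInitializeAllTargetMCs();
  LLVMInitializeAllDisassemblers();

  LLVMDisasmContextRef DC = LLVMCreateDisasm("arm64-apple-darwin", nullptr, 0,
                                             /*GetOpInfo=*/nullptr, lookupSymbol);
  if (!DC)
    return 1;

  // "bl #0x1000" at PC 0x3000 targets 0x4000, so the callback above should
  // be asked about that address.
  uint8_t Bytes[] = {0x00, 0x04, 0x00, 0x94};
  char Text[128];
  if (LLVMDisasmInstruction(DC, Bytes, sizeof(Bytes), /*PC=*/0x3000, Text,
                            sizeof(Text)))
    std::printf("%s\n", Text);

  LLVMDisasmDispose(DC);
  return 0;
}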
- // - struct LLVMOpInfo1 SymbolicOp; - memset(&SymbolicOp, '\0', sizeof(struct LLVMOpInfo1)); - SymbolicOp.Value = Value; - uint64_t ReferenceType; - const char *ReferenceName; - if (!GetOpInfo || - !GetOpInfo(DisInfo, Address, 0 /* Offset */, InstSize, 1, &SymbolicOp)) { - if (IsBranch) { - ReferenceType = LLVMDisassembler_ReferenceType_In_Branch; - const char *Name = SymbolLookUp(DisInfo, Address + Value, &ReferenceType, - Address, &ReferenceName); - if (Name) { - SymbolicOp.AddSymbol.Name = Name; - SymbolicOp.AddSymbol.Present = true; - SymbolicOp.Value = 0; - } else { - SymbolicOp.Value = Address + Value; - } - if (ReferenceType == LLVMDisassembler_ReferenceType_Out_SymbolStub) - CommentStream << "symbol stub for: " << ReferenceName; - else if (ReferenceType == - LLVMDisassembler_ReferenceType_Out_Objc_Message) - CommentStream << "Objc message: " << ReferenceName; - } else if (MI.getOpcode() == ARM64::ADRP) { - ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_ADRP; - // otool expects the fully encoded ADRP instruction to be passed in as - // the value here, so reconstruct it: - const MCRegisterInfo &MCRI = *Ctx.getRegisterInfo(); - uint32_t EncodedInst = 0x90000000; - EncodedInst |= (Value & 0x3) << 29; // immlo - EncodedInst |= ((Value >> 2) & 0x7FFFF) << 5; // immhi - EncodedInst |= MCRI.getEncodingValue(MI.getOperand(0).getReg()); // reg - SymbolLookUp(DisInfo, EncodedInst, &ReferenceType, Address, - &ReferenceName); - CommentStream << format("0x%llx", - 0xfffffffffffff000LL & (Address + Value)); - } else if (MI.getOpcode() == ARM64::ADDXri || - MI.getOpcode() == ARM64::LDRXui || - MI.getOpcode() == ARM64::LDRXl || - MI.getOpcode() == ARM64::ADR) { - if (MI.getOpcode() == ARM64::ADDXri) - ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_ADDXri; - else if (MI.getOpcode() == ARM64::LDRXui) - ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_LDRXui; - if (MI.getOpcode() == ARM64::LDRXl) { - ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_LDRXl; - SymbolLookUp(DisInfo, Address + Value, &ReferenceType, Address, - &ReferenceName); - } else if (MI.getOpcode() == ARM64::ADR) { - ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_ADR; - SymbolLookUp(DisInfo, Address + Value, &ReferenceType, Address, - &ReferenceName); - } else { - const MCRegisterInfo &MCRI = *Ctx.getRegisterInfo(); - // otool expects the fully encoded ADD/LDR instruction to be passed in - // as the value here, so reconstruct it: - unsigned EncodedInst = - MI.getOpcode() == ARM64::ADDXri ? 
0x91000000: 0xF9400000; - EncodedInst |= Value << 10; // imm12 [+ shift:2 for ADD] - EncodedInst |= - MCRI.getEncodingValue(MI.getOperand(1).getReg()) << 5; // Rn - EncodedInst |= MCRI.getEncodingValue(MI.getOperand(0).getReg()); // Rd - - SymbolLookUp(DisInfo, EncodedInst, &ReferenceType, Address, - &ReferenceName); - } - if (ReferenceType == LLVMDisassembler_ReferenceType_Out_LitPool_SymAddr) - CommentStream << "literal pool symbol address: " << ReferenceName; - else if (ReferenceType == - LLVMDisassembler_ReferenceType_Out_LitPool_CstrAddr) - CommentStream << "literal pool for: \"" << ReferenceName << "\""; - else if (ReferenceType == - LLVMDisassembler_ReferenceType_Out_Objc_CFString_Ref) - CommentStream << "Objc cfstring ref: @\"" << ReferenceName << "\""; - else if (ReferenceType == - LLVMDisassembler_ReferenceType_Out_Objc_Message) - CommentStream << "Objc message: " << ReferenceName; - else if (ReferenceType == - LLVMDisassembler_ReferenceType_Out_Objc_Message_Ref) - CommentStream << "Objc message ref: " << ReferenceName; - else if (ReferenceType == - LLVMDisassembler_ReferenceType_Out_Objc_Selector_Ref) - CommentStream << "Objc selector ref: " << ReferenceName; - else if (ReferenceType == - LLVMDisassembler_ReferenceType_Out_Objc_Class_Ref) - CommentStream << "Objc class ref: " << ReferenceName; - // For these instructions, the SymbolLookUp() above is just to get the - // ReferenceType and ReferenceName. We want to make sure not to - // fall through so we don't build an MCExpr to leave the disassembly - // of the immediate values of these instructions to the InstPrinter. - return false; - } else { - return false; - } - } - - const MCExpr *Add = nullptr; - if (SymbolicOp.AddSymbol.Present) { - if (SymbolicOp.AddSymbol.Name) { - StringRef Name(SymbolicOp.AddSymbol.Name); - MCSymbol *Sym = Ctx.GetOrCreateSymbol(Name); - MCSymbolRefExpr::VariantKind Variant = getVariant(SymbolicOp.VariantKind); - if (Variant != MCSymbolRefExpr::VK_None) - Add = MCSymbolRefExpr::Create(Sym, Variant, Ctx); - else - Add = MCSymbolRefExpr::Create(Sym, Ctx); - } else { - Add = MCConstantExpr::Create(SymbolicOp.AddSymbol.Value, Ctx); - } - } - - const MCExpr *Sub = nullptr; - if (SymbolicOp.SubtractSymbol.Present) { - if (SymbolicOp.SubtractSymbol.Name) { - StringRef Name(SymbolicOp.SubtractSymbol.Name); - MCSymbol *Sym = Ctx.GetOrCreateSymbol(Name); - Sub = MCSymbolRefExpr::Create(Sym, Ctx); - } else { - Sub = MCConstantExpr::Create(SymbolicOp.SubtractSymbol.Value, Ctx); - } - } - - const MCExpr *Off = nullptr; - if (SymbolicOp.Value != 0) - Off = MCConstantExpr::Create(SymbolicOp.Value, Ctx); - - const MCExpr *Expr; - if (Sub) { - const MCExpr *LHS; - if (Add) - LHS = MCBinaryExpr::CreateSub(Add, Sub, Ctx); - else - LHS = MCUnaryExpr::CreateMinus(Sub, Ctx); - if (Off) - Expr = MCBinaryExpr::CreateAdd(LHS, Off, Ctx); - else - Expr = LHS; - } else if (Add) { - if (Off) - Expr = MCBinaryExpr::CreateAdd(Add, Off, Ctx); - else - Expr = Add; - } else { - if (Off) - Expr = Off; - else - Expr = MCConstantExpr::Create(0, Ctx); - } - - MI.addOperand(MCOperand::CreateExpr(Expr)); - - return true; -} diff --git a/lib/Target/ARM64/Disassembler/ARM64ExternalSymbolizer.h b/lib/Target/ARM64/Disassembler/ARM64ExternalSymbolizer.h deleted file mode 100644 index 45f07a5e258..00000000000 --- a/lib/Target/ARM64/Disassembler/ARM64ExternalSymbolizer.h +++ /dev/null @@ -1,37 +0,0 @@ -//===- ARM64ExternalSymbolizer.h - Symbolizer for ARM64 ---------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is 
distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// Symbolize ARM64 assembly code during disassembly using callbacks. -// -//===----------------------------------------------------------------------===// - -#ifndef ARM64EXTERNALSYMBOLIZER_H -#define ARM64EXTERNALSYMBOLIZER_H - -#include "llvm/MC/MCExternalSymbolizer.h" - -namespace llvm { - -class ARM64ExternalSymbolizer : public MCExternalSymbolizer { -public: - ARM64ExternalSymbolizer(MCContext &Ctx, - std::unique_ptr RelInfo, - LLVMOpInfoCallback GetOpInfo, - LLVMSymbolLookupCallback SymbolLookUp, void *DisInfo) - : MCExternalSymbolizer(Ctx, std::move(RelInfo), GetOpInfo, SymbolLookUp, - DisInfo) {} - - bool tryAddingSymbolicOperand(MCInst &MI, raw_ostream &CommentStream, - int64_t Value, uint64_t Address, bool IsBranch, - uint64_t Offset, uint64_t InstSize) override; -}; - -} // namespace llvm - -#endif diff --git a/lib/Target/ARM64/Disassembler/CMakeLists.txt b/lib/Target/ARM64/Disassembler/CMakeLists.txt deleted file mode 100644 index 43ade66be14..00000000000 --- a/lib/Target/ARM64/Disassembler/CMakeLists.txt +++ /dev/null @@ -1,14 +0,0 @@ -include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. ) - -add_llvm_library(LLVMARM64Disassembler - ARM64Disassembler.cpp - ARM64ExternalSymbolizer.cpp - ) -# workaround for hanging compilation on MSVC8, 9 and 10 -#if( MSVC_VERSION EQUAL 1400 OR MSVC_VERSION EQUAL 1500 OR MSVC_VERSION EQUAL 1600 ) -#set_property( -# SOURCE ARMDisassembler.cpp -# PROPERTY COMPILE_FLAGS "/Od" -# ) -#endif() -add_dependencies(LLVMARM64Disassembler ARM64CommonTableGen) diff --git a/lib/Target/ARM64/Disassembler/LLVMBuild.txt b/lib/Target/ARM64/Disassembler/LLVMBuild.txt deleted file mode 100644 index 5bbe88ddb49..00000000000 --- a/lib/Target/ARM64/Disassembler/LLVMBuild.txt +++ /dev/null @@ -1,23 +0,0 @@ -;===- ./lib/Target/ARM64/Disassembler/LLVMBuild.txt ------------*- Conf -*--===; -; -; The LLVM Compiler Infrastructure -; -; This file is distributed under the University of Illinois Open Source -; License. See LICENSE.TXT for details. -; -;===------------------------------------------------------------------------===; -; -; This is an LLVMBuild description file for the components in this subdirectory. -; -; For more information on the LLVMBuild system, please see: -; -; http://llvm.org/docs/LLVMBuild.html -; -;===------------------------------------------------------------------------===; - -[component_0] -type = Library -name = ARM64Disassembler -parent = ARM64 -required_libraries = ARM64Info ARM64Utils MC Support -add_to_library_groups = ARM64 diff --git a/lib/Target/ARM64/Disassembler/Makefile b/lib/Target/ARM64/Disassembler/Makefile deleted file mode 100644 index 479d00c2494..00000000000 --- a/lib/Target/ARM64/Disassembler/Makefile +++ /dev/null @@ -1,16 +0,0 @@ -##===- lib/Target/ARM64/Disassembler/Makefile --------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## - -LEVEL = ../../../.. -LIBRARYNAME = LLVMARM64Disassembler - -# Hack: we need to include 'main' arm target directory to grab private headers -CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. 
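The symbolizer earlier in this diff rebuilds the raw ADRP (and ADD/LDR) instruction words so otool-style clients can re-inspect the full encoding. A standalone sketch of just the ADRP bit layout it reassembles (encodeADRP is an illustrative helper name, not an LLVM API):

#include <cstdint>
#include <cstdio>

// ADRP: bit 31 set, immlo in bits 30:29, 0b10000 in bits 28:24,
// immhi in bits 23:5, Rd in bits 4:0 -- the same fields the symbolizer
// ORs into the base opcode 0x90000000.
static uint32_t encodeADRP(int64_t PageDelta, unsigned Rd) {
  uint32_t Inst = 0x90000000;
  Inst |= (static_cast<uint32_t>(PageDelta) & 0x3) << 29;         // immlo
  Inst |= (static_cast<uint32_t>(PageDelta >> 2) & 0x7FFFF) << 5; // immhi
  Inst |= Rd & 0x1F;                                              // Rd
  return Inst;
}

int main() {
  // adrp x0, <one page forward> -> 0xb0000000
  std::printf("%#010x\n", (unsigned)encodeADRP(1, 0));
  return 0;
}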
- -include $(LEVEL)/Makefile.common diff --git a/lib/Target/ARM64/InstPrinter/ARM64InstPrinter.cpp b/lib/Target/ARM64/InstPrinter/ARM64InstPrinter.cpp deleted file mode 100644 index 529b450352e..00000000000 --- a/lib/Target/ARM64/InstPrinter/ARM64InstPrinter.cpp +++ /dev/null @@ -1,1312 +0,0 @@ -//===-- ARM64InstPrinter.cpp - Convert ARM64 MCInst to assembly syntax ----===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This class prints an ARM64 MCInst to a .s file. -// -//===----------------------------------------------------------------------===// - -#include "ARM64InstPrinter.h" -#include "MCTargetDesc/ARM64AddressingModes.h" -#include "Utils/ARM64BaseInfo.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/MC/MCInst.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCRegisterInfo.h" -#include "llvm/Support/Format.h" -#include "llvm/Support/raw_ostream.h" -using namespace llvm; - -#define DEBUG_TYPE "asm-printer" - -#define GET_INSTRUCTION_NAME -#define PRINT_ALIAS_INSTR -#include "ARM64GenAsmWriter.inc" -#define GET_INSTRUCTION_NAME -#define PRINT_ALIAS_INSTR -#include "ARM64GenAsmWriter1.inc" - -ARM64InstPrinter::ARM64InstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, - const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI) - : MCInstPrinter(MAI, MII, MRI) { - // Initialize the set of available features. - setAvailableFeatures(STI.getFeatureBits()); -} - -ARM64AppleInstPrinter::ARM64AppleInstPrinter(const MCAsmInfo &MAI, - const MCInstrInfo &MII, - const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI) - : ARM64InstPrinter(MAI, MII, MRI, STI) {} - -void ARM64InstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { - // This is for .cfi directives. - OS << getRegisterName(RegNo); -} - -void ARM64InstPrinter::printInst(const MCInst *MI, raw_ostream &O, - StringRef Annot) { - // Check for special encodings and print the canonical alias instead. - - unsigned Opcode = MI->getOpcode(); - - if (Opcode == ARM64::SYSxt) - if (printSysAlias(MI, O)) { - printAnnotation(O, Annot); - return; - } - - // SBFM/UBFM should print to a nicer aliased form if possible. - if (Opcode == ARM64::SBFMXri || Opcode == ARM64::SBFMWri || - Opcode == ARM64::UBFMXri || Opcode == ARM64::UBFMWri) { - const MCOperand &Op0 = MI->getOperand(0); - const MCOperand &Op1 = MI->getOperand(1); - const MCOperand &Op2 = MI->getOperand(2); - const MCOperand &Op3 = MI->getOperand(3); - - bool IsSigned = (Opcode == ARM64::SBFMXri || Opcode == ARM64::SBFMWri); - bool Is64Bit = (Opcode == ARM64::SBFMXri || Opcode == ARM64::UBFMXri); - if (Op2.isImm() && Op2.getImm() == 0 && Op3.isImm()) { - const char *AsmMnemonic = nullptr; - - switch (Op3.getImm()) { - default: - break; - case 7: - if (IsSigned) - AsmMnemonic = "sxtb"; - else if (!Is64Bit) - AsmMnemonic = "uxtb"; - break; - case 15: - if (IsSigned) - AsmMnemonic = "sxth"; - else if (!Is64Bit) - AsmMnemonic = "uxth"; - break; - case 31: - // *xtw is only valid for signed 64-bit operations. 
- if (Is64Bit && IsSigned) - AsmMnemonic = "sxtw"; - break; - } - - if (AsmMnemonic) { - O << '\t' << AsmMnemonic << '\t' << getRegisterName(Op0.getReg()) - << ", " << getRegisterName(getWRegFromXReg(Op1.getReg())); - printAnnotation(O, Annot); - return; - } - } - - // All immediate shifts are aliases, implemented using the Bitfield - // instruction. In all cases the immediate shift amount shift must be in - // the range 0 to (reg.size -1). - if (Op2.isImm() && Op3.isImm()) { - const char *AsmMnemonic = nullptr; - int shift = 0; - int64_t immr = Op2.getImm(); - int64_t imms = Op3.getImm(); - if (Opcode == ARM64::UBFMWri && imms != 0x1F && ((imms + 1) == immr)) { - AsmMnemonic = "lsl"; - shift = 31 - imms; - } else if (Opcode == ARM64::UBFMXri && imms != 0x3f && - ((imms + 1 == immr))) { - AsmMnemonic = "lsl"; - shift = 63 - imms; - } else if (Opcode == ARM64::UBFMWri && imms == 0x1f) { - AsmMnemonic = "lsr"; - shift = immr; - } else if (Opcode == ARM64::UBFMXri && imms == 0x3f) { - AsmMnemonic = "lsr"; - shift = immr; - } else if (Opcode == ARM64::SBFMWri && imms == 0x1f) { - AsmMnemonic = "asr"; - shift = immr; - } else if (Opcode == ARM64::SBFMXri && imms == 0x3f) { - AsmMnemonic = "asr"; - shift = immr; - } - if (AsmMnemonic) { - O << '\t' << AsmMnemonic << '\t' << getRegisterName(Op0.getReg()) - << ", " << getRegisterName(Op1.getReg()) << ", #" << shift; - printAnnotation(O, Annot); - return; - } - } - - // SBFIZ/UBFIZ aliases - if (Op2.getImm() > Op3.getImm()) { - O << '\t' << (IsSigned ? "sbfiz" : "ubfiz") << '\t' - << getRegisterName(Op0.getReg()) << ", " << getRegisterName(Op1.getReg()) - << ", #" << (Is64Bit ? 64 : 32) - Op2.getImm() << ", #" << Op3.getImm() + 1; - printAnnotation(O, Annot); - return; - } - - // Otherwise SBFX/UBFX is the preferred form - O << '\t' << (IsSigned ? "sbfx" : "ubfx") << '\t' - << getRegisterName(Op0.getReg()) << ", " << getRegisterName(Op1.getReg()) - << ", #" << Op2.getImm() << ", #" << Op3.getImm() - Op2.getImm() + 1; - printAnnotation(O, Annot); - return; - } - - if (Opcode == ARM64::BFMXri || Opcode == ARM64::BFMWri) { - const MCOperand &Op0 = MI->getOperand(0); // Op1 == Op0 - const MCOperand &Op2 = MI->getOperand(2); - int ImmR = MI->getOperand(3).getImm(); - int ImmS = MI->getOperand(4).getImm(); - - // BFI alias - if (ImmS < ImmR) { - int BitWidth = Opcode == ARM64::BFMXri ? 64 : 32; - int LSB = (BitWidth - ImmR) % BitWidth; - int Width = ImmS + 1; - O << "\tbfi\t" << getRegisterName(Op0.getReg()) << ", " - << getRegisterName(Op2.getReg()) << ", #" << LSB << ", #" << Width; - printAnnotation(O, Annot); - return; - } - - int LSB = ImmR; - int Width = ImmS - ImmR + 1; - // Otherwise BFXIL the preferred form - O << "\tbfxil\t" - << getRegisterName(Op0.getReg()) << ", " << getRegisterName(Op2.getReg()) - << ", #" << LSB << ", #" << Width; - printAnnotation(O, Annot); - return; - } - - // Symbolic operands for MOVZ, MOVN and MOVK already imply a shift - // (e.g. :gottprel_g1: is always going to be "lsl #16") so it should not be - // printed. 
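The bitfield handling above is dense: a single UBFM/SBFM encoding covers the lsl, lsr, asr, *bfiz and *bfx aliases, selected purely from the immr/imms pair. A standalone sketch of the 32-bit unsigned case, mirroring the selection order used above (ubfmAlias32 is an illustrative helper, not part of the printer):

#include <cstdio>
#include <string>

// Given the immr/imms fields of a 32-bit UBFM, return the alias the printer
// above would choose (destination/source registers omitted for brevity).
static std::string ubfmAlias32(unsigned immr, unsigned imms) {
  char Buf[64];
  if (imms != 0x1f && imms + 1 == immr)          // lsl #(31 - imms)
    std::snprintf(Buf, sizeof(Buf), "lsl #%u", 31 - imms);
  else if (imms == 0x1f)                         // lsr #immr
    std::snprintf(Buf, sizeof(Buf), "lsr #%u", immr);
  else if (imms >= immr)                         // ubfx #lsb, #width
    std::snprintf(Buf, sizeof(Buf), "ubfx #%u, #%u", immr, imms - immr + 1);
  else                                           // ubfiz #lsb, #width
    std::snprintf(Buf, sizeof(Buf), "ubfiz #%u, #%u", 32 - immr, imms + 1);
  return Buf;
}

int main() {
  std::printf("%s\n", ubfmAlias32(28, 27).c_str()); // lsl #4
  std::printf("%s\n", ubfmAlias32(4, 31).c_str());  // lsr #4
  std::printf("%s\n", ubfmAlias32(4, 11).c_str());  // ubfx #4, #8
  return 0;
}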
- if ((Opcode == ARM64::MOVZXi || Opcode == ARM64::MOVZWi || - Opcode == ARM64::MOVNXi || Opcode == ARM64::MOVNWi) && - MI->getOperand(1).isExpr()) { - if (Opcode == ARM64::MOVZXi || Opcode == ARM64::MOVZWi) - O << "\tmovz\t"; - else - O << "\tmovn\t"; - - O << getRegisterName(MI->getOperand(0).getReg()) << ", #" - << *MI->getOperand(1).getExpr(); - return; - } - - if ((Opcode == ARM64::MOVKXi || Opcode == ARM64::MOVKWi) && - MI->getOperand(2).isExpr()) { - O << "\tmovk\t" << getRegisterName(MI->getOperand(0).getReg()) << ", #" - << *MI->getOperand(2).getExpr(); - return; - } - - if (!printAliasInstr(MI, O)) - printInstruction(MI, O); - - printAnnotation(O, Annot); -} - -static bool isTblTbxInstruction(unsigned Opcode, StringRef &Layout, - bool &IsTbx) { - switch (Opcode) { - case ARM64::TBXv8i8One: - case ARM64::TBXv8i8Two: - case ARM64::TBXv8i8Three: - case ARM64::TBXv8i8Four: - IsTbx = true; - Layout = ".8b"; - return true; - case ARM64::TBLv8i8One: - case ARM64::TBLv8i8Two: - case ARM64::TBLv8i8Three: - case ARM64::TBLv8i8Four: - IsTbx = false; - Layout = ".8b"; - return true; - case ARM64::TBXv16i8One: - case ARM64::TBXv16i8Two: - case ARM64::TBXv16i8Three: - case ARM64::TBXv16i8Four: - IsTbx = true; - Layout = ".16b"; - return true; - case ARM64::TBLv16i8One: - case ARM64::TBLv16i8Two: - case ARM64::TBLv16i8Three: - case ARM64::TBLv16i8Four: - IsTbx = false; - Layout = ".16b"; - return true; - default: - return false; - } -} - -struct LdStNInstrDesc { - unsigned Opcode; - const char *Mnemonic; - const char *Layout; - int ListOperand; - bool HasLane; - int NaturalOffset; -}; - -static LdStNInstrDesc LdStNInstInfo[] = { - { ARM64::LD1i8, "ld1", ".b", 1, true, 0 }, - { ARM64::LD1i16, "ld1", ".h", 1, true, 0 }, - { ARM64::LD1i32, "ld1", ".s", 1, true, 0 }, - { ARM64::LD1i64, "ld1", ".d", 1, true, 0 }, - { ARM64::LD1i8_POST, "ld1", ".b", 2, true, 1 }, - { ARM64::LD1i16_POST, "ld1", ".h", 2, true, 2 }, - { ARM64::LD1i32_POST, "ld1", ".s", 2, true, 4 }, - { ARM64::LD1i64_POST, "ld1", ".d", 2, true, 8 }, - { ARM64::LD1Rv16b, "ld1r", ".16b", 0, false, 0 }, - { ARM64::LD1Rv8h, "ld1r", ".8h", 0, false, 0 }, - { ARM64::LD1Rv4s, "ld1r", ".4s", 0, false, 0 }, - { ARM64::LD1Rv2d, "ld1r", ".2d", 0, false, 0 }, - { ARM64::LD1Rv8b, "ld1r", ".8b", 0, false, 0 }, - { ARM64::LD1Rv4h, "ld1r", ".4h", 0, false, 0 }, - { ARM64::LD1Rv2s, "ld1r", ".2s", 0, false, 0 }, - { ARM64::LD1Rv1d, "ld1r", ".1d", 0, false, 0 }, - { ARM64::LD1Rv16b_POST, "ld1r", ".16b", 1, false, 1 }, - { ARM64::LD1Rv8h_POST, "ld1r", ".8h", 1, false, 2 }, - { ARM64::LD1Rv4s_POST, "ld1r", ".4s", 1, false, 4 }, - { ARM64::LD1Rv2d_POST, "ld1r", ".2d", 1, false, 8 }, - { ARM64::LD1Rv8b_POST, "ld1r", ".8b", 1, false, 1 }, - { ARM64::LD1Rv4h_POST, "ld1r", ".4h", 1, false, 2 }, - { ARM64::LD1Rv2s_POST, "ld1r", ".2s", 1, false, 4 }, - { ARM64::LD1Rv1d_POST, "ld1r", ".1d", 1, false, 8 }, - { ARM64::LD1Onev16b, "ld1", ".16b", 0, false, 0 }, - { ARM64::LD1Onev8h, "ld1", ".8h", 0, false, 0 }, - { ARM64::LD1Onev4s, "ld1", ".4s", 0, false, 0 }, - { ARM64::LD1Onev2d, "ld1", ".2d", 0, false, 0 }, - { ARM64::LD1Onev8b, "ld1", ".8b", 0, false, 0 }, - { ARM64::LD1Onev4h, "ld1", ".4h", 0, false, 0 }, - { ARM64::LD1Onev2s, "ld1", ".2s", 0, false, 0 }, - { ARM64::LD1Onev1d, "ld1", ".1d", 0, false, 0 }, - { ARM64::LD1Onev16b_POST, "ld1", ".16b", 1, false, 16 }, - { ARM64::LD1Onev8h_POST, "ld1", ".8h", 1, false, 16 }, - { ARM64::LD1Onev4s_POST, "ld1", ".4s", 1, false, 16 }, - { ARM64::LD1Onev2d_POST, "ld1", ".2d", 1, false, 16 }, - { ARM64::LD1Onev8b_POST, 
"ld1", ".8b", 1, false, 8 }, - { ARM64::LD1Onev4h_POST, "ld1", ".4h", 1, false, 8 }, - { ARM64::LD1Onev2s_POST, "ld1", ".2s", 1, false, 8 }, - { ARM64::LD1Onev1d_POST, "ld1", ".1d", 1, false, 8 }, - { ARM64::LD1Twov16b, "ld1", ".16b", 0, false, 0 }, - { ARM64::LD1Twov8h, "ld1", ".8h", 0, false, 0 }, - { ARM64::LD1Twov4s, "ld1", ".4s", 0, false, 0 }, - { ARM64::LD1Twov2d, "ld1", ".2d", 0, false, 0 }, - { ARM64::LD1Twov8b, "ld1", ".8b", 0, false, 0 }, - { ARM64::LD1Twov4h, "ld1", ".4h", 0, false, 0 }, - { ARM64::LD1Twov2s, "ld1", ".2s", 0, false, 0 }, - { ARM64::LD1Twov1d, "ld1", ".1d", 0, false, 0 }, - { ARM64::LD1Twov16b_POST, "ld1", ".16b", 1, false, 32 }, - { ARM64::LD1Twov8h_POST, "ld1", ".8h", 1, false, 32 }, - { ARM64::LD1Twov4s_POST, "ld1", ".4s", 1, false, 32 }, - { ARM64::LD1Twov2d_POST, "ld1", ".2d", 1, false, 32 }, - { ARM64::LD1Twov8b_POST, "ld1", ".8b", 1, false, 16 }, - { ARM64::LD1Twov4h_POST, "ld1", ".4h", 1, false, 16 }, - { ARM64::LD1Twov2s_POST, "ld1", ".2s", 1, false, 16 }, - { ARM64::LD1Twov1d_POST, "ld1", ".1d", 1, false, 16 }, - { ARM64::LD1Threev16b, "ld1", ".16b", 0, false, 0 }, - { ARM64::LD1Threev8h, "ld1", ".8h", 0, false, 0 }, - { ARM64::LD1Threev4s, "ld1", ".4s", 0, false, 0 }, - { ARM64::LD1Threev2d, "ld1", ".2d", 0, false, 0 }, - { ARM64::LD1Threev8b, "ld1", ".8b", 0, false, 0 }, - { ARM64::LD1Threev4h, "ld1", ".4h", 0, false, 0 }, - { ARM64::LD1Threev2s, "ld1", ".2s", 0, false, 0 }, - { ARM64::LD1Threev1d, "ld1", ".1d", 0, false, 0 }, - { ARM64::LD1Threev16b_POST, "ld1", ".16b", 1, false, 48 }, - { ARM64::LD1Threev8h_POST, "ld1", ".8h", 1, false, 48 }, - { ARM64::LD1Threev4s_POST, "ld1", ".4s", 1, false, 48 }, - { ARM64::LD1Threev2d_POST, "ld1", ".2d", 1, false, 48 }, - { ARM64::LD1Threev8b_POST, "ld1", ".8b", 1, false, 24 }, - { ARM64::LD1Threev4h_POST, "ld1", ".4h", 1, false, 24 }, - { ARM64::LD1Threev2s_POST, "ld1", ".2s", 1, false, 24 }, - { ARM64::LD1Threev1d_POST, "ld1", ".1d", 1, false, 24 }, - { ARM64::LD1Fourv16b, "ld1", ".16b", 0, false, 0 }, - { ARM64::LD1Fourv8h, "ld1", ".8h", 0, false, 0 }, - { ARM64::LD1Fourv4s, "ld1", ".4s", 0, false, 0 }, - { ARM64::LD1Fourv2d, "ld1", ".2d", 0, false, 0 }, - { ARM64::LD1Fourv8b, "ld1", ".8b", 0, false, 0 }, - { ARM64::LD1Fourv4h, "ld1", ".4h", 0, false, 0 }, - { ARM64::LD1Fourv2s, "ld1", ".2s", 0, false, 0 }, - { ARM64::LD1Fourv1d, "ld1", ".1d", 0, false, 0 }, - { ARM64::LD1Fourv16b_POST, "ld1", ".16b", 1, false, 64 }, - { ARM64::LD1Fourv8h_POST, "ld1", ".8h", 1, false, 64 }, - { ARM64::LD1Fourv4s_POST, "ld1", ".4s", 1, false, 64 }, - { ARM64::LD1Fourv2d_POST, "ld1", ".2d", 1, false, 64 }, - { ARM64::LD1Fourv8b_POST, "ld1", ".8b", 1, false, 32 }, - { ARM64::LD1Fourv4h_POST, "ld1", ".4h", 1, false, 32 }, - { ARM64::LD1Fourv2s_POST, "ld1", ".2s", 1, false, 32 }, - { ARM64::LD1Fourv1d_POST, "ld1", ".1d", 1, false, 32 }, - { ARM64::LD2i8, "ld2", ".b", 1, true, 0 }, - { ARM64::LD2i16, "ld2", ".h", 1, true, 0 }, - { ARM64::LD2i32, "ld2", ".s", 1, true, 0 }, - { ARM64::LD2i64, "ld2", ".d", 1, true, 0 }, - { ARM64::LD2i8_POST, "ld2", ".b", 2, true, 2 }, - { ARM64::LD2i16_POST, "ld2", ".h", 2, true, 4 }, - { ARM64::LD2i32_POST, "ld2", ".s", 2, true, 8 }, - { ARM64::LD2i64_POST, "ld2", ".d", 2, true, 16 }, - { ARM64::LD2Rv16b, "ld2r", ".16b", 0, false, 0 }, - { ARM64::LD2Rv8h, "ld2r", ".8h", 0, false, 0 }, - { ARM64::LD2Rv4s, "ld2r", ".4s", 0, false, 0 }, - { ARM64::LD2Rv2d, "ld2r", ".2d", 0, false, 0 }, - { ARM64::LD2Rv8b, "ld2r", ".8b", 0, false, 0 }, - { ARM64::LD2Rv4h, "ld2r", ".4h", 0, false, 0 }, - { 
ARM64::LD2Rv2s, "ld2r", ".2s", 0, false, 0 }, - { ARM64::LD2Rv1d, "ld2r", ".1d", 0, false, 0 }, - { ARM64::LD2Rv16b_POST, "ld2r", ".16b", 1, false, 2 }, - { ARM64::LD2Rv8h_POST, "ld2r", ".8h", 1, false, 4 }, - { ARM64::LD2Rv4s_POST, "ld2r", ".4s", 1, false, 8 }, - { ARM64::LD2Rv2d_POST, "ld2r", ".2d", 1, false, 16 }, - { ARM64::LD2Rv8b_POST, "ld2r", ".8b", 1, false, 2 }, - { ARM64::LD2Rv4h_POST, "ld2r", ".4h", 1, false, 4 }, - { ARM64::LD2Rv2s_POST, "ld2r", ".2s", 1, false, 8 }, - { ARM64::LD2Rv1d_POST, "ld2r", ".1d", 1, false, 16 }, - { ARM64::LD2Twov16b, "ld2", ".16b", 0, false, 0 }, - { ARM64::LD2Twov8h, "ld2", ".8h", 0, false, 0 }, - { ARM64::LD2Twov4s, "ld2", ".4s", 0, false, 0 }, - { ARM64::LD2Twov2d, "ld2", ".2d", 0, false, 0 }, - { ARM64::LD2Twov8b, "ld2", ".8b", 0, false, 0 }, - { ARM64::LD2Twov4h, "ld2", ".4h", 0, false, 0 }, - { ARM64::LD2Twov2s, "ld2", ".2s", 0, false, 0 }, - { ARM64::LD2Twov16b_POST, "ld2", ".16b", 1, false, 32 }, - { ARM64::LD2Twov8h_POST, "ld2", ".8h", 1, false, 32 }, - { ARM64::LD2Twov4s_POST, "ld2", ".4s", 1, false, 32 }, - { ARM64::LD2Twov2d_POST, "ld2", ".2d", 1, false, 32 }, - { ARM64::LD2Twov8b_POST, "ld2", ".8b", 1, false, 16 }, - { ARM64::LD2Twov4h_POST, "ld2", ".4h", 1, false, 16 }, - { ARM64::LD2Twov2s_POST, "ld2", ".2s", 1, false, 16 }, - { ARM64::LD3i8, "ld3", ".b", 1, true, 0 }, - { ARM64::LD3i16, "ld3", ".h", 1, true, 0 }, - { ARM64::LD3i32, "ld3", ".s", 1, true, 0 }, - { ARM64::LD3i64, "ld3", ".d", 1, true, 0 }, - { ARM64::LD3i8_POST, "ld3", ".b", 2, true, 3 }, - { ARM64::LD3i16_POST, "ld3", ".h", 2, true, 6 }, - { ARM64::LD3i32_POST, "ld3", ".s", 2, true, 12 }, - { ARM64::LD3i64_POST, "ld3", ".d", 2, true, 24 }, - { ARM64::LD3Rv16b, "ld3r", ".16b", 0, false, 0 }, - { ARM64::LD3Rv8h, "ld3r", ".8h", 0, false, 0 }, - { ARM64::LD3Rv4s, "ld3r", ".4s", 0, false, 0 }, - { ARM64::LD3Rv2d, "ld3r", ".2d", 0, false, 0 }, - { ARM64::LD3Rv8b, "ld3r", ".8b", 0, false, 0 }, - { ARM64::LD3Rv4h, "ld3r", ".4h", 0, false, 0 }, - { ARM64::LD3Rv2s, "ld3r", ".2s", 0, false, 0 }, - { ARM64::LD3Rv1d, "ld3r", ".1d", 0, false, 0 }, - { ARM64::LD3Rv16b_POST, "ld3r", ".16b", 1, false, 3 }, - { ARM64::LD3Rv8h_POST, "ld3r", ".8h", 1, false, 6 }, - { ARM64::LD3Rv4s_POST, "ld3r", ".4s", 1, false, 12 }, - { ARM64::LD3Rv2d_POST, "ld3r", ".2d", 1, false, 24 }, - { ARM64::LD3Rv8b_POST, "ld3r", ".8b", 1, false, 3 }, - { ARM64::LD3Rv4h_POST, "ld3r", ".4h", 1, false, 6 }, - { ARM64::LD3Rv2s_POST, "ld3r", ".2s", 1, false, 12 }, - { ARM64::LD3Rv1d_POST, "ld3r", ".1d", 1, false, 24 }, - { ARM64::LD3Threev16b, "ld3", ".16b", 0, false, 0 }, - { ARM64::LD3Threev8h, "ld3", ".8h", 0, false, 0 }, - { ARM64::LD3Threev4s, "ld3", ".4s", 0, false, 0 }, - { ARM64::LD3Threev2d, "ld3", ".2d", 0, false, 0 }, - { ARM64::LD3Threev8b, "ld3", ".8b", 0, false, 0 }, - { ARM64::LD3Threev4h, "ld3", ".4h", 0, false, 0 }, - { ARM64::LD3Threev2s, "ld3", ".2s", 0, false, 0 }, - { ARM64::LD3Threev16b_POST, "ld3", ".16b", 1, false, 48 }, - { ARM64::LD3Threev8h_POST, "ld3", ".8h", 1, false, 48 }, - { ARM64::LD3Threev4s_POST, "ld3", ".4s", 1, false, 48 }, - { ARM64::LD3Threev2d_POST, "ld3", ".2d", 1, false, 48 }, - { ARM64::LD3Threev8b_POST, "ld3", ".8b", 1, false, 24 }, - { ARM64::LD3Threev4h_POST, "ld3", ".4h", 1, false, 24 }, - { ARM64::LD3Threev2s_POST, "ld3", ".2s", 1, false, 24 }, - { ARM64::LD4i8, "ld4", ".b", 1, true, 0 }, - { ARM64::LD4i16, "ld4", ".h", 1, true, 0 }, - { ARM64::LD4i32, "ld4", ".s", 1, true, 0 }, - { ARM64::LD4i64, "ld4", ".d", 1, true, 0 }, - { ARM64::LD4i8_POST, "ld4", ".b", 2, true, 4 
}, - { ARM64::LD4i16_POST, "ld4", ".h", 2, true, 8 }, - { ARM64::LD4i32_POST, "ld4", ".s", 2, true, 16 }, - { ARM64::LD4i64_POST, "ld4", ".d", 2, true, 32 }, - { ARM64::LD4Rv16b, "ld4r", ".16b", 0, false, 0 }, - { ARM64::LD4Rv8h, "ld4r", ".8h", 0, false, 0 }, - { ARM64::LD4Rv4s, "ld4r", ".4s", 0, false, 0 }, - { ARM64::LD4Rv2d, "ld4r", ".2d", 0, false, 0 }, - { ARM64::LD4Rv8b, "ld4r", ".8b", 0, false, 0 }, - { ARM64::LD4Rv4h, "ld4r", ".4h", 0, false, 0 }, - { ARM64::LD4Rv2s, "ld4r", ".2s", 0, false, 0 }, - { ARM64::LD4Rv1d, "ld4r", ".1d", 0, false, 0 }, - { ARM64::LD4Rv16b_POST, "ld4r", ".16b", 1, false, 4 }, - { ARM64::LD4Rv8h_POST, "ld4r", ".8h", 1, false, 8 }, - { ARM64::LD4Rv4s_POST, "ld4r", ".4s", 1, false, 16 }, - { ARM64::LD4Rv2d_POST, "ld4r", ".2d", 1, false, 32 }, - { ARM64::LD4Rv8b_POST, "ld4r", ".8b", 1, false, 4 }, - { ARM64::LD4Rv4h_POST, "ld4r", ".4h", 1, false, 8 }, - { ARM64::LD4Rv2s_POST, "ld4r", ".2s", 1, false, 16 }, - { ARM64::LD4Rv1d_POST, "ld4r", ".1d", 1, false, 32 }, - { ARM64::LD4Fourv16b, "ld4", ".16b", 0, false, 0 }, - { ARM64::LD4Fourv8h, "ld4", ".8h", 0, false, 0 }, - { ARM64::LD4Fourv4s, "ld4", ".4s", 0, false, 0 }, - { ARM64::LD4Fourv2d, "ld4", ".2d", 0, false, 0 }, - { ARM64::LD4Fourv8b, "ld4", ".8b", 0, false, 0 }, - { ARM64::LD4Fourv4h, "ld4", ".4h", 0, false, 0 }, - { ARM64::LD4Fourv2s, "ld4", ".2s", 0, false, 0 }, - { ARM64::LD4Fourv16b_POST, "ld4", ".16b", 1, false, 64 }, - { ARM64::LD4Fourv8h_POST, "ld4", ".8h", 1, false, 64 }, - { ARM64::LD4Fourv4s_POST, "ld4", ".4s", 1, false, 64 }, - { ARM64::LD4Fourv2d_POST, "ld4", ".2d", 1, false, 64 }, - { ARM64::LD4Fourv8b_POST, "ld4", ".8b", 1, false, 32 }, - { ARM64::LD4Fourv4h_POST, "ld4", ".4h", 1, false, 32 }, - { ARM64::LD4Fourv2s_POST, "ld4", ".2s", 1, false, 32 }, - { ARM64::ST1i8, "st1", ".b", 0, true, 0 }, - { ARM64::ST1i16, "st1", ".h", 0, true, 0 }, - { ARM64::ST1i32, "st1", ".s", 0, true, 0 }, - { ARM64::ST1i64, "st1", ".d", 0, true, 0 }, - { ARM64::ST1i8_POST, "st1", ".b", 1, true, 1 }, - { ARM64::ST1i16_POST, "st1", ".h", 1, true, 2 }, - { ARM64::ST1i32_POST, "st1", ".s", 1, true, 4 }, - { ARM64::ST1i64_POST, "st1", ".d", 1, true, 8 }, - { ARM64::ST1Onev16b, "st1", ".16b", 0, false, 0 }, - { ARM64::ST1Onev8h, "st1", ".8h", 0, false, 0 }, - { ARM64::ST1Onev4s, "st1", ".4s", 0, false, 0 }, - { ARM64::ST1Onev2d, "st1", ".2d", 0, false, 0 }, - { ARM64::ST1Onev8b, "st1", ".8b", 0, false, 0 }, - { ARM64::ST1Onev4h, "st1", ".4h", 0, false, 0 }, - { ARM64::ST1Onev2s, "st1", ".2s", 0, false, 0 }, - { ARM64::ST1Onev1d, "st1", ".1d", 0, false, 0 }, - { ARM64::ST1Onev16b_POST, "st1", ".16b", 1, false, 16 }, - { ARM64::ST1Onev8h_POST, "st1", ".8h", 1, false, 16 }, - { ARM64::ST1Onev4s_POST, "st1", ".4s", 1, false, 16 }, - { ARM64::ST1Onev2d_POST, "st1", ".2d", 1, false, 16 }, - { ARM64::ST1Onev8b_POST, "st1", ".8b", 1, false, 8 }, - { ARM64::ST1Onev4h_POST, "st1", ".4h", 1, false, 8 }, - { ARM64::ST1Onev2s_POST, "st1", ".2s", 1, false, 8 }, - { ARM64::ST1Onev1d_POST, "st1", ".1d", 1, false, 8 }, - { ARM64::ST1Twov16b, "st1", ".16b", 0, false, 0 }, - { ARM64::ST1Twov8h, "st1", ".8h", 0, false, 0 }, - { ARM64::ST1Twov4s, "st1", ".4s", 0, false, 0 }, - { ARM64::ST1Twov2d, "st1", ".2d", 0, false, 0 }, - { ARM64::ST1Twov8b, "st1", ".8b", 0, false, 0 }, - { ARM64::ST1Twov4h, "st1", ".4h", 0, false, 0 }, - { ARM64::ST1Twov2s, "st1", ".2s", 0, false, 0 }, - { ARM64::ST1Twov1d, "st1", ".1d", 0, false, 0 }, - { ARM64::ST1Twov16b_POST, "st1", ".16b", 1, false, 32 }, - { ARM64::ST1Twov8h_POST, "st1", ".8h", 1, false, 32 
}, - { ARM64::ST1Twov4s_POST, "st1", ".4s", 1, false, 32 }, - { ARM64::ST1Twov2d_POST, "st1", ".2d", 1, false, 32 }, - { ARM64::ST1Twov8b_POST, "st1", ".8b", 1, false, 16 }, - { ARM64::ST1Twov4h_POST, "st1", ".4h", 1, false, 16 }, - { ARM64::ST1Twov2s_POST, "st1", ".2s", 1, false, 16 }, - { ARM64::ST1Twov1d_POST, "st1", ".1d", 1, false, 16 }, - { ARM64::ST1Threev16b, "st1", ".16b", 0, false, 0 }, - { ARM64::ST1Threev8h, "st1", ".8h", 0, false, 0 }, - { ARM64::ST1Threev4s, "st1", ".4s", 0, false, 0 }, - { ARM64::ST1Threev2d, "st1", ".2d", 0, false, 0 }, - { ARM64::ST1Threev8b, "st1", ".8b", 0, false, 0 }, - { ARM64::ST1Threev4h, "st1", ".4h", 0, false, 0 }, - { ARM64::ST1Threev2s, "st1", ".2s", 0, false, 0 }, - { ARM64::ST1Threev1d, "st1", ".1d", 0, false, 0 }, - { ARM64::ST1Threev16b_POST, "st1", ".16b", 1, false, 48 }, - { ARM64::ST1Threev8h_POST, "st1", ".8h", 1, false, 48 }, - { ARM64::ST1Threev4s_POST, "st1", ".4s", 1, false, 48 }, - { ARM64::ST1Threev2d_POST, "st1", ".2d", 1, false, 48 }, - { ARM64::ST1Threev8b_POST, "st1", ".8b", 1, false, 24 }, - { ARM64::ST1Threev4h_POST, "st1", ".4h", 1, false, 24 }, - { ARM64::ST1Threev2s_POST, "st1", ".2s", 1, false, 24 }, - { ARM64::ST1Threev1d_POST, "st1", ".1d", 1, false, 24 }, - { ARM64::ST1Fourv16b, "st1", ".16b", 0, false, 0 }, - { ARM64::ST1Fourv8h, "st1", ".8h", 0, false, 0 }, - { ARM64::ST1Fourv4s, "st1", ".4s", 0, false, 0 }, - { ARM64::ST1Fourv2d, "st1", ".2d", 0, false, 0 }, - { ARM64::ST1Fourv8b, "st1", ".8b", 0, false, 0 }, - { ARM64::ST1Fourv4h, "st1", ".4h", 0, false, 0 }, - { ARM64::ST1Fourv2s, "st1", ".2s", 0, false, 0 }, - { ARM64::ST1Fourv1d, "st1", ".1d", 0, false, 0 }, - { ARM64::ST1Fourv16b_POST, "st1", ".16b", 1, false, 64 }, - { ARM64::ST1Fourv8h_POST, "st1", ".8h", 1, false, 64 }, - { ARM64::ST1Fourv4s_POST, "st1", ".4s", 1, false, 64 }, - { ARM64::ST1Fourv2d_POST, "st1", ".2d", 1, false, 64 }, - { ARM64::ST1Fourv8b_POST, "st1", ".8b", 1, false, 32 }, - { ARM64::ST1Fourv4h_POST, "st1", ".4h", 1, false, 32 }, - { ARM64::ST1Fourv2s_POST, "st1", ".2s", 1, false, 32 }, - { ARM64::ST1Fourv1d_POST, "st1", ".1d", 1, false, 32 }, - { ARM64::ST2i8, "st2", ".b", 0, true, 0 }, - { ARM64::ST2i16, "st2", ".h", 0, true, 0 }, - { ARM64::ST2i32, "st2", ".s", 0, true, 0 }, - { ARM64::ST2i64, "st2", ".d", 0, true, 0 }, - { ARM64::ST2i8_POST, "st2", ".b", 1, true, 2 }, - { ARM64::ST2i16_POST, "st2", ".h", 1, true, 4 }, - { ARM64::ST2i32_POST, "st2", ".s", 1, true, 8 }, - { ARM64::ST2i64_POST, "st2", ".d", 1, true, 16 }, - { ARM64::ST2Twov16b, "st2", ".16b", 0, false, 0 }, - { ARM64::ST2Twov8h, "st2", ".8h", 0, false, 0 }, - { ARM64::ST2Twov4s, "st2", ".4s", 0, false, 0 }, - { ARM64::ST2Twov2d, "st2", ".2d", 0, false, 0 }, - { ARM64::ST2Twov8b, "st2", ".8b", 0, false, 0 }, - { ARM64::ST2Twov4h, "st2", ".4h", 0, false, 0 }, - { ARM64::ST2Twov2s, "st2", ".2s", 0, false, 0 }, - { ARM64::ST2Twov16b_POST, "st2", ".16b", 1, false, 32 }, - { ARM64::ST2Twov8h_POST, "st2", ".8h", 1, false, 32 }, - { ARM64::ST2Twov4s_POST, "st2", ".4s", 1, false, 32 }, - { ARM64::ST2Twov2d_POST, "st2", ".2d", 1, false, 32 }, - { ARM64::ST2Twov8b_POST, "st2", ".8b", 1, false, 16 }, - { ARM64::ST2Twov4h_POST, "st2", ".4h", 1, false, 16 }, - { ARM64::ST2Twov2s_POST, "st2", ".2s", 1, false, 16 }, - { ARM64::ST3i8, "st3", ".b", 0, true, 0 }, - { ARM64::ST3i16, "st3", ".h", 0, true, 0 }, - { ARM64::ST3i32, "st3", ".s", 0, true, 0 }, - { ARM64::ST3i64, "st3", ".d", 0, true, 0 }, - { ARM64::ST3i8_POST, "st3", ".b", 1, true, 3 }, - { ARM64::ST3i16_POST, "st3", ".h", 1, true, 
6 }, - { ARM64::ST3i32_POST, "st3", ".s", 1, true, 12 }, - { ARM64::ST3i64_POST, "st3", ".d", 1, true, 24 }, - { ARM64::ST3Threev16b, "st3", ".16b", 0, false, 0 }, - { ARM64::ST3Threev8h, "st3", ".8h", 0, false, 0 }, - { ARM64::ST3Threev4s, "st3", ".4s", 0, false, 0 }, - { ARM64::ST3Threev2d, "st3", ".2d", 0, false, 0 }, - { ARM64::ST3Threev8b, "st3", ".8b", 0, false, 0 }, - { ARM64::ST3Threev4h, "st3", ".4h", 0, false, 0 }, - { ARM64::ST3Threev2s, "st3", ".2s", 0, false, 0 }, - { ARM64::ST3Threev16b_POST, "st3", ".16b", 1, false, 48 }, - { ARM64::ST3Threev8h_POST, "st3", ".8h", 1, false, 48 }, - { ARM64::ST3Threev4s_POST, "st3", ".4s", 1, false, 48 }, - { ARM64::ST3Threev2d_POST, "st3", ".2d", 1, false, 48 }, - { ARM64::ST3Threev8b_POST, "st3", ".8b", 1, false, 24 }, - { ARM64::ST3Threev4h_POST, "st3", ".4h", 1, false, 24 }, - { ARM64::ST3Threev2s_POST, "st3", ".2s", 1, false, 24 }, - { ARM64::ST4i8, "st4", ".b", 0, true, 0 }, - { ARM64::ST4i16, "st4", ".h", 0, true, 0 }, - { ARM64::ST4i32, "st4", ".s", 0, true, 0 }, - { ARM64::ST4i64, "st4", ".d", 0, true, 0 }, - { ARM64::ST4i8_POST, "st4", ".b", 1, true, 4 }, - { ARM64::ST4i16_POST, "st4", ".h", 1, true, 8 }, - { ARM64::ST4i32_POST, "st4", ".s", 1, true, 16 }, - { ARM64::ST4i64_POST, "st4", ".d", 1, true, 32 }, - { ARM64::ST4Fourv16b, "st4", ".16b", 0, false, 0 }, - { ARM64::ST4Fourv8h, "st4", ".8h", 0, false, 0 }, - { ARM64::ST4Fourv4s, "st4", ".4s", 0, false, 0 }, - { ARM64::ST4Fourv2d, "st4", ".2d", 0, false, 0 }, - { ARM64::ST4Fourv8b, "st4", ".8b", 0, false, 0 }, - { ARM64::ST4Fourv4h, "st4", ".4h", 0, false, 0 }, - { ARM64::ST4Fourv2s, "st4", ".2s", 0, false, 0 }, - { ARM64::ST4Fourv16b_POST, "st4", ".16b", 1, false, 64 }, - { ARM64::ST4Fourv8h_POST, "st4", ".8h", 1, false, 64 }, - { ARM64::ST4Fourv4s_POST, "st4", ".4s", 1, false, 64 }, - { ARM64::ST4Fourv2d_POST, "st4", ".2d", 1, false, 64 }, - { ARM64::ST4Fourv8b_POST, "st4", ".8b", 1, false, 32 }, - { ARM64::ST4Fourv4h_POST, "st4", ".4h", 1, false, 32 }, - { ARM64::ST4Fourv2s_POST, "st4", ".2s", 1, false, 32 }, -}; - -static LdStNInstrDesc *getLdStNInstrDesc(unsigned Opcode) { - unsigned Idx; - for (Idx = 0; Idx != array_lengthof(LdStNInstInfo); ++Idx) - if (LdStNInstInfo[Idx].Opcode == Opcode) - return &LdStNInstInfo[Idx]; - - return nullptr; -} - -void ARM64AppleInstPrinter::printInst(const MCInst *MI, raw_ostream &O, - StringRef Annot) { - unsigned Opcode = MI->getOpcode(); - StringRef Layout, Mnemonic; - - bool IsTbx; - if (isTblTbxInstruction(MI->getOpcode(), Layout, IsTbx)) { - O << "\t" << (IsTbx ? "tbx" : "tbl") << Layout << '\t' - << getRegisterName(MI->getOperand(0).getReg(), ARM64::vreg) << ", "; - - unsigned ListOpNum = IsTbx ? 2 : 1; - printVectorList(MI, ListOpNum, O, ""); - - O << ", " - << getRegisterName(MI->getOperand(ListOpNum + 1).getReg(), ARM64::vreg); - printAnnotation(O, Annot); - return; - } - - if (LdStNInstrDesc *LdStDesc = getLdStNInstrDesc(Opcode)) { - O << "\t" << LdStDesc->Mnemonic << LdStDesc->Layout << '\t'; - - // Now onto the operands: first a vector list with possible lane - // specifier. E.g. { v0 }[2] - int OpNum = LdStDesc->ListOperand; - printVectorList(MI, OpNum++, O, ""); - - if (LdStDesc->HasLane) - O << '[' << MI->getOperand(OpNum++).getImm() << ']'; - - // Next the address: [xN] - unsigned AddrReg = MI->getOperand(OpNum++).getReg(); - O << ", [" << getRegisterName(AddrReg) << ']'; - - // Finally, there might be a post-indexed offset. 
- if (LdStDesc->NaturalOffset != 0) { - unsigned Reg = MI->getOperand(OpNum++).getReg(); - if (Reg != ARM64::XZR) - O << ", " << getRegisterName(Reg); - else { - assert(LdStDesc->NaturalOffset && "no offset on post-inc instruction?"); - O << ", #" << LdStDesc->NaturalOffset; - } - } - - printAnnotation(O, Annot); - return; - } - - ARM64InstPrinter::printInst(MI, O, Annot); -} - -bool ARM64InstPrinter::printSysAlias(const MCInst *MI, raw_ostream &O) { -#ifndef NDEBUG - unsigned Opcode = MI->getOpcode(); - assert(Opcode == ARM64::SYSxt && "Invalid opcode for SYS alias!"); -#endif - - const char *Asm = nullptr; - const MCOperand &Op1 = MI->getOperand(0); - const MCOperand &Cn = MI->getOperand(1); - const MCOperand &Cm = MI->getOperand(2); - const MCOperand &Op2 = MI->getOperand(3); - - unsigned Op1Val = Op1.getImm(); - unsigned CnVal = Cn.getImm(); - unsigned CmVal = Cm.getImm(); - unsigned Op2Val = Op2.getImm(); - - if (CnVal == 7) { - switch (CmVal) { - default: - break; - - // IC aliases - case 1: - if (Op1Val == 0 && Op2Val == 0) - Asm = "ic\tialluis"; - break; - case 5: - if (Op1Val == 0 && Op2Val == 0) - Asm = "ic\tiallu"; - else if (Op1Val == 3 && Op2Val == 1) - Asm = "ic\tivau"; - break; - - // DC aliases - case 4: - if (Op1Val == 3 && Op2Val == 1) - Asm = "dc\tzva"; - break; - case 6: - if (Op1Val == 0 && Op2Val == 1) - Asm = "dc\tivac"; - if (Op1Val == 0 && Op2Val == 2) - Asm = "dc\tisw"; - break; - case 10: - if (Op1Val == 3 && Op2Val == 1) - Asm = "dc\tcvac"; - else if (Op1Val == 0 && Op2Val == 2) - Asm = "dc\tcsw"; - break; - case 11: - if (Op1Val == 3 && Op2Val == 1) - Asm = "dc\tcvau"; - break; - case 14: - if (Op1Val == 3 && Op2Val == 1) - Asm = "dc\tcivac"; - else if (Op1Val == 0 && Op2Val == 2) - Asm = "dc\tcisw"; - break; - - // AT aliases - case 8: - switch (Op1Val) { - default: - break; - case 0: - switch (Op2Val) { - default: - break; - case 0: Asm = "at\ts1e1r"; break; - case 1: Asm = "at\ts1e1w"; break; - case 2: Asm = "at\ts1e0r"; break; - case 3: Asm = "at\ts1e0w"; break; - } - break; - case 4: - switch (Op2Val) { - default: - break; - case 0: Asm = "at\ts1e2r"; break; - case 1: Asm = "at\ts1e2w"; break; - case 4: Asm = "at\ts12e1r"; break; - case 5: Asm = "at\ts12e1w"; break; - case 6: Asm = "at\ts12e0r"; break; - case 7: Asm = "at\ts12e0w"; break; - } - break; - case 6: - switch (Op2Val) { - default: - break; - case 0: Asm = "at\ts1e3r"; break; - case 1: Asm = "at\ts1e3w"; break; - } - break; - } - break; - } - } else if (CnVal == 8) { - // TLBI aliases - switch (CmVal) { - default: - break; - case 3: - switch (Op1Val) { - default: - break; - case 0: - switch (Op2Val) { - default: - break; - case 0: Asm = "tlbi\tvmalle1is"; break; - case 1: Asm = "tlbi\tvae1is"; break; - case 2: Asm = "tlbi\taside1is"; break; - case 3: Asm = "tlbi\tvaae1is"; break; - case 5: Asm = "tlbi\tvale1is"; break; - case 7: Asm = "tlbi\tvaale1is"; break; - } - break; - case 4: - switch (Op2Val) { - default: - break; - case 0: Asm = "tlbi\talle2is"; break; - case 1: Asm = "tlbi\tvae2is"; break; - case 4: Asm = "tlbi\talle1is"; break; - case 5: Asm = "tlbi\tvale2is"; break; - case 6: Asm = "tlbi\tvmalls12e1is"; break; - } - break; - case 6: - switch (Op2Val) { - default: - break; - case 0: Asm = "tlbi\talle3is"; break; - case 1: Asm = "tlbi\tvae3is"; break; - case 5: Asm = "tlbi\tvale3is"; break; - } - break; - } - break; - case 0: - switch (Op1Val) { - default: - break; - case 4: - switch (Op2Val) { - default: - break; - case 1: Asm = "tlbi\tipas2e1is"; break; - case 5: Asm = 
"tlbi\tipas2le1is"; break; - } - break; - } - break; - case 4: - switch (Op1Val) { - default: - break; - case 4: - switch (Op2Val) { - default: - break; - case 1: Asm = "tlbi\tipas2e1"; break; - case 5: Asm = "tlbi\tipas2le1"; break; - } - break; - } - break; - case 7: - switch (Op1Val) { - default: - break; - case 0: - switch (Op2Val) { - default: - break; - case 0: Asm = "tlbi\tvmalle1"; break; - case 1: Asm = "tlbi\tvae1"; break; - case 2: Asm = "tlbi\taside1"; break; - case 3: Asm = "tlbi\tvaae1"; break; - case 5: Asm = "tlbi\tvale1"; break; - case 7: Asm = "tlbi\tvaale1"; break; - } - break; - case 4: - switch (Op2Val) { - default: - break; - case 0: Asm = "tlbi\talle2"; break; - case 1: Asm = "tlbi\tvae2"; break; - case 4: Asm = "tlbi\talle1"; break; - case 5: Asm = "tlbi\tvale2"; break; - case 6: Asm = "tlbi\tvmalls12e1"; break; - } - break; - case 6: - switch (Op2Val) { - default: - break; - case 0: Asm = "tlbi\talle3"; break; - case 1: Asm = "tlbi\tvae3"; break; - case 5: Asm = "tlbi\tvale3"; break; - } - break; - } - break; - } - } - - if (Asm) { - unsigned Reg = MI->getOperand(4).getReg(); - - O << '\t' << Asm; - if (StringRef(Asm).lower().find("all") == StringRef::npos) - O << ", " << getRegisterName(Reg); - } - - return Asm != nullptr; -} - -void ARM64InstPrinter::printOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNo); - if (Op.isReg()) { - unsigned Reg = Op.getReg(); - O << getRegisterName(Reg); - } else if (Op.isImm()) { - O << '#' << Op.getImm(); - } else { - assert(Op.isExpr() && "unknown operand kind in printOperand"); - O << *Op.getExpr(); - } -} - -void ARM64InstPrinter::printHexImm(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNo); - O << format("#%#llx", Op.getImm()); -} - -void ARM64InstPrinter::printPostIncOperand(const MCInst *MI, unsigned OpNo, - unsigned Imm, raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNo); - if (Op.isReg()) { - unsigned Reg = Op.getReg(); - if (Reg == ARM64::XZR) - O << "#" << Imm; - else - O << getRegisterName(Reg); - } else - assert(0 && "unknown operand kind in printPostIncOperand64"); -} - -void ARM64InstPrinter::printVRegOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNo); - assert(Op.isReg() && "Non-register vreg operand!"); - unsigned Reg = Op.getReg(); - O << getRegisterName(Reg, ARM64::vreg); -} - -void ARM64InstPrinter::printSysCROperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNo); - assert(Op.isImm() && "System instruction C[nm] operands must be immediates!"); - O << "c" << Op.getImm(); -} - -void ARM64InstPrinter::printAddSubImm(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - const MCOperand &MO = MI->getOperand(OpNum); - if (MO.isImm()) { - unsigned Val = (MO.getImm() & 0xfff); - assert(Val == MO.getImm() && "Add/sub immediate out of range!"); - unsigned Shift = - ARM64_AM::getShiftValue(MI->getOperand(OpNum + 1).getImm()); - O << '#' << Val; - if (Shift != 0) - printShifter(MI, OpNum + 1, O); - - if (CommentStream) - *CommentStream << '=' << (Val << Shift) << '\n'; - } else { - assert(MO.isExpr() && "Unexpected operand type!"); - O << *MO.getExpr(); - printShifter(MI, OpNum + 1, O); - } -} - -void ARM64InstPrinter::printLogicalImm32(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - uint64_t Val = MI->getOperand(OpNum).getImm(); - O << "#0x"; - O.write_hex(ARM64_AM::decodeLogicalImmediate(Val, 
32)); -} - -void ARM64InstPrinter::printLogicalImm64(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - uint64_t Val = MI->getOperand(OpNum).getImm(); - O << "#0x"; - O.write_hex(ARM64_AM::decodeLogicalImmediate(Val, 64)); -} - -void ARM64InstPrinter::printShifter(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - unsigned Val = MI->getOperand(OpNum).getImm(); - // LSL #0 should not be printed. - if (ARM64_AM::getShiftType(Val) == ARM64_AM::LSL && - ARM64_AM::getShiftValue(Val) == 0) - return; - O << ", " << ARM64_AM::getShiftExtendName(ARM64_AM::getShiftType(Val)) << " #" - << ARM64_AM::getShiftValue(Val); -} - -void ARM64InstPrinter::printShiftedRegister(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - O << getRegisterName(MI->getOperand(OpNum).getReg()); - printShifter(MI, OpNum + 1, O); -} - -void ARM64InstPrinter::printExtendedRegister(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - O << getRegisterName(MI->getOperand(OpNum).getReg()); - printArithExtend(MI, OpNum + 1, O); -} - -void ARM64InstPrinter::printArithExtend(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - unsigned Val = MI->getOperand(OpNum).getImm(); - ARM64_AM::ShiftExtendType ExtType = ARM64_AM::getArithExtendType(Val); - unsigned ShiftVal = ARM64_AM::getArithShiftValue(Val); - - // If the destination or first source register operand is [W]SP, print - // UXTW/UXTX as LSL, and if the shift amount is also zero, print nothing at - // all. - if (ExtType == ARM64_AM::UXTW || ExtType == ARM64_AM::UXTX) { - unsigned Dest = MI->getOperand(0).getReg(); - unsigned Src1 = MI->getOperand(1).getReg(); - if ( ((Dest == ARM64::SP || Src1 == ARM64::SP) && - ExtType == ARM64_AM::UXTX) || - ((Dest == ARM64::WSP || Src1 == ARM64::WSP) && - ExtType == ARM64_AM::UXTW) ) { - if (ShiftVal != 0) - O << ", lsl #" << ShiftVal; - return; - } - } - O << ", " << ARM64_AM::getShiftExtendName(ExtType); - if (ShiftVal != 0) - O << " #" << ShiftVal; -} - -void ARM64InstPrinter::printMemExtend(const MCInst *MI, unsigned OpNum, - raw_ostream &O, char SrcRegKind, - unsigned Width) { - unsigned SignExtend = MI->getOperand(OpNum).getImm(); - unsigned DoShift = MI->getOperand(OpNum + 1).getImm(); - - // sxtw, sxtx, uxtw or lsl (== uxtx) - bool IsLSL = !SignExtend && SrcRegKind == 'x'; - if (IsLSL) - O << "lsl"; - else - O << (SignExtend ? 
's' : 'u') << "xt" << SrcRegKind; - - if (DoShift || IsLSL) - O << " #" << Log2_32(Width / 8); -} - -void ARM64InstPrinter::printCondCode(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - ARM64CC::CondCode CC = (ARM64CC::CondCode)MI->getOperand(OpNum).getImm(); - O << ARM64CC::getCondCodeName(CC); -} - -void ARM64InstPrinter::printInverseCondCode(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - ARM64CC::CondCode CC = (ARM64CC::CondCode)MI->getOperand(OpNum).getImm(); - O << ARM64CC::getCondCodeName(ARM64CC::getInvertedCondCode(CC)); -} - -void ARM64InstPrinter::printAMNoIndex(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - O << '[' << getRegisterName(MI->getOperand(OpNum).getReg()) << ']'; -} - -template -void ARM64InstPrinter::printImmScale(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - O << '#' << Scale * MI->getOperand(OpNum).getImm(); -} - -void ARM64InstPrinter::printUImm12Offset(const MCInst *MI, unsigned OpNum, - unsigned Scale, raw_ostream &O) { - const MCOperand MO = MI->getOperand(OpNum); - if (MO.isImm()) { - O << "#" << (MO.getImm() * Scale); - } else { - assert(MO.isExpr() && "Unexpected operand type!"); - O << *MO.getExpr(); - } -} - -void ARM64InstPrinter::printAMIndexedWB(const MCInst *MI, unsigned OpNum, - unsigned Scale, raw_ostream &O) { - const MCOperand MO1 = MI->getOperand(OpNum + 1); - O << '[' << getRegisterName(MI->getOperand(OpNum).getReg()); - if (MO1.isImm()) { - O << ", #" << (MO1.getImm() * Scale); - } else { - assert(MO1.isExpr() && "Unexpected operand type!"); - O << ", " << *MO1.getExpr(); - } - O << ']'; -} - -void ARM64InstPrinter::printPrefetchOp(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - unsigned prfop = MI->getOperand(OpNum).getImm(); - bool Valid; - StringRef Name = ARM64PRFM::PRFMMapper().toString(prfop, Valid); - if (Valid) - O << Name; - else - O << '#' << prfop; -} - -void ARM64InstPrinter::printFPImmOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - const MCOperand &MO = MI->getOperand(OpNum); - float FPImm = MO.isFPImm() ? MO.getFPImm() : ARM64_AM::getFPImmFloat(MO.getImm()); - - // 8 decimal places are enough to perfectly represent permitted floats. 
- O << format("#%.8f", FPImm); -} - -static unsigned getNextVectorRegister(unsigned Reg, unsigned Stride = 1) { - while (Stride--) { - switch (Reg) { - default: - assert(0 && "Vector register expected!"); - case ARM64::Q0: Reg = ARM64::Q1; break; - case ARM64::Q1: Reg = ARM64::Q2; break; - case ARM64::Q2: Reg = ARM64::Q3; break; - case ARM64::Q3: Reg = ARM64::Q4; break; - case ARM64::Q4: Reg = ARM64::Q5; break; - case ARM64::Q5: Reg = ARM64::Q6; break; - case ARM64::Q6: Reg = ARM64::Q7; break; - case ARM64::Q7: Reg = ARM64::Q8; break; - case ARM64::Q8: Reg = ARM64::Q9; break; - case ARM64::Q9: Reg = ARM64::Q10; break; - case ARM64::Q10: Reg = ARM64::Q11; break; - case ARM64::Q11: Reg = ARM64::Q12; break; - case ARM64::Q12: Reg = ARM64::Q13; break; - case ARM64::Q13: Reg = ARM64::Q14; break; - case ARM64::Q14: Reg = ARM64::Q15; break; - case ARM64::Q15: Reg = ARM64::Q16; break; - case ARM64::Q16: Reg = ARM64::Q17; break; - case ARM64::Q17: Reg = ARM64::Q18; break; - case ARM64::Q18: Reg = ARM64::Q19; break; - case ARM64::Q19: Reg = ARM64::Q20; break; - case ARM64::Q20: Reg = ARM64::Q21; break; - case ARM64::Q21: Reg = ARM64::Q22; break; - case ARM64::Q22: Reg = ARM64::Q23; break; - case ARM64::Q23: Reg = ARM64::Q24; break; - case ARM64::Q24: Reg = ARM64::Q25; break; - case ARM64::Q25: Reg = ARM64::Q26; break; - case ARM64::Q26: Reg = ARM64::Q27; break; - case ARM64::Q27: Reg = ARM64::Q28; break; - case ARM64::Q28: Reg = ARM64::Q29; break; - case ARM64::Q29: Reg = ARM64::Q30; break; - case ARM64::Q30: Reg = ARM64::Q31; break; - // Vector lists can wrap around. - case ARM64::Q31: - Reg = ARM64::Q0; - break; - } - } - return Reg; -} - -void ARM64InstPrinter::printVectorList(const MCInst *MI, unsigned OpNum, - raw_ostream &O, StringRef LayoutSuffix) { - unsigned Reg = MI->getOperand(OpNum).getReg(); - - O << "{ "; - - // Work out how many registers there are in the list (if there is an actual - // list). - unsigned NumRegs = 1; - if (MRI.getRegClass(ARM64::DDRegClassID).contains(Reg) || - MRI.getRegClass(ARM64::QQRegClassID).contains(Reg)) - NumRegs = 2; - else if (MRI.getRegClass(ARM64::DDDRegClassID).contains(Reg) || - MRI.getRegClass(ARM64::QQQRegClassID).contains(Reg)) - NumRegs = 3; - else if (MRI.getRegClass(ARM64::DDDDRegClassID).contains(Reg) || - MRI.getRegClass(ARM64::QQQQRegClassID).contains(Reg)) - NumRegs = 4; - - // Now forget about the list and find out what the first register is. - if (unsigned FirstReg = MRI.getSubReg(Reg, ARM64::dsub0)) - Reg = FirstReg; - else if (unsigned FirstReg = MRI.getSubReg(Reg, ARM64::qsub0)) - Reg = FirstReg; - - // If it's a D-reg, we need to promote it to the equivalent Q-reg before - // printing (otherwise getRegisterName fails). 
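getNextVectorRegister above spells out the Q0..Q31 walk as a switch because the generated register enumerators are not guaranteed to be contiguous. If one tracks the register index rather than the enum value (an assumption made only for this sketch), the same wrap-around is plain modular arithmetic:

#include <cstdio>

// Advance a vector register *index* by Stride, wrapping q31 back to q0 the
// way vector lists above are allowed to.
static unsigned nextVectorIndex(unsigned QIndex, unsigned Stride = 1) {
  return (QIndex + Stride) % 32;
}

int main() {
  std::printf("q%u\n", nextVectorIndex(30));    // q31
  std::printf("q%u\n", nextVectorIndex(31));    // q0, wraps
  std::printf("q%u\n", nextVectorIndex(30, 3)); // q1, wraps past the end
  return 0;
}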
- if (MRI.getRegClass(ARM64::FPR64RegClassID).contains(Reg)) { - const MCRegisterClass &FPR128RC = MRI.getRegClass(ARM64::FPR128RegClassID); - Reg = MRI.getMatchingSuperReg(Reg, ARM64::dsub, &FPR128RC); - } - - for (unsigned i = 0; i < NumRegs; ++i, Reg = getNextVectorRegister(Reg)) { - O << getRegisterName(Reg, ARM64::vreg) << LayoutSuffix; - if (i + 1 != NumRegs) - O << ", "; - } - - O << " }"; -} - -void ARM64InstPrinter::printImplicitlyTypedVectorList(const MCInst *MI, - unsigned OpNum, - raw_ostream &O) { - printVectorList(MI, OpNum, O, ""); -} - -template -void ARM64InstPrinter::printTypedVectorList(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - std::string Suffix("."); - if (NumLanes) - Suffix += itostr(NumLanes) + LaneKind; - else - Suffix += LaneKind; - - printVectorList(MI, OpNum, O, Suffix); -} - -void ARM64InstPrinter::printVectorIndex(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - O << "[" << MI->getOperand(OpNum).getImm() << "]"; -} - -void ARM64InstPrinter::printAlignedLabel(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNum); - - // If the label has already been resolved to an immediate offset (say, when - // we're running the disassembler), just print the immediate. - if (Op.isImm()) { - O << "#" << (Op.getImm() << 2); - return; - } - - // If the branch target is simply an address then print it in hex. - const MCConstantExpr *BranchTarget = - dyn_cast(MI->getOperand(OpNum).getExpr()); - int64_t Address; - if (BranchTarget && BranchTarget->EvaluateAsAbsolute(Address)) { - O << "0x"; - O.write_hex(Address); - } else { - // Otherwise, just print the expression. - O << *MI->getOperand(OpNum).getExpr(); - } -} - -void ARM64InstPrinter::printAdrpLabel(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNum); - - // If the label has already been resolved to an immediate offset (say, when - // we're running the disassembler), just print the immediate. - if (Op.isImm()) { - O << "#" << (Op.getImm() << 12); - return; - } - - // Otherwise, just print the expression. 
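printAlignedLabel and printAdrpLabel above apply the two fixed scale factors of the PC-relative forms: immediates handled by printAlignedLabel are stored in 4-byte instruction units, while printAdrpLabel's are stored in 4 KiB pages. A trivial restatement of those conversions (helper names are illustrative):

#include <cstdint>
#include <cstdio>

// Convert the stored immediates back to byte offsets, as the two printers
// above do before emitting them.
static int64_t alignedLabelBytes(int64_t Imm) { return Imm << 2; }  // words
static int64_t adrpLabelBytes(int64_t Imm)    { return Imm << 12; } // pages

int main() {
  std::printf("%#llx\n", (unsigned long long)alignedLabelBytes(0x400)); // 0x1000
  std::printf("%#llx\n", (unsigned long long)adrpLabelBytes(3));        // 0x3000
  return 0;
}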
- O << *MI->getOperand(OpNum).getExpr(); -} - -void ARM64InstPrinter::printBarrierOption(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - unsigned Val = MI->getOperand(OpNo).getImm(); - unsigned Opcode = MI->getOpcode(); - - bool Valid; - StringRef Name; - if (Opcode == ARM64::ISB) - Name = ARM64ISB::ISBMapper().toString(Val, Valid); - else - Name = ARM64DB::DBarrierMapper().toString(Val, Valid); - if (Valid) - O << Name; - else - O << "#" << Val; -} - -void ARM64InstPrinter::printMRSSystemRegister(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - unsigned Val = MI->getOperand(OpNo).getImm(); - - bool Valid; - auto Mapper = ARM64SysReg::MRSMapper(getAvailableFeatures()); - std::string Name = Mapper.toString(Val, Valid); - - if (Valid) - O << StringRef(Name).upper(); -} - -void ARM64InstPrinter::printMSRSystemRegister(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - unsigned Val = MI->getOperand(OpNo).getImm(); - - bool Valid; - auto Mapper = ARM64SysReg::MSRMapper(getAvailableFeatures()); - std::string Name = Mapper.toString(Val, Valid); - - if (Valid) - O << StringRef(Name).upper(); -} - -void ARM64InstPrinter::printSystemPStateField(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - unsigned Val = MI->getOperand(OpNo).getImm(); - - bool Valid; - StringRef Name = ARM64PState::PStateMapper().toString(Val, Valid); - if (Valid) - O << StringRef(Name.str()).upper(); - else - O << "#" << Val; -} - -void ARM64InstPrinter::printSIMDType10Operand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - unsigned RawVal = MI->getOperand(OpNo).getImm(); - uint64_t Val = ARM64_AM::decodeAdvSIMDModImmType10(RawVal); - O << format("#%#016llx", Val); -} diff --git a/lib/Target/ARM64/InstPrinter/ARM64InstPrinter.h b/lib/Target/ARM64/InstPrinter/ARM64InstPrinter.h deleted file mode 100644 index 0fd6f100712..00000000000 --- a/lib/Target/ARM64/InstPrinter/ARM64InstPrinter.h +++ /dev/null @@ -1,140 +0,0 @@ -//===-- ARM64InstPrinter.h - Convert ARM64 MCInst to assembly syntax ------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This class prints an ARM64 MCInst to a .s file. -// -//===----------------------------------------------------------------------===// - -#ifndef ARM64INSTPRINTER_H -#define ARM64INSTPRINTER_H - -#include "MCTargetDesc/ARM64MCTargetDesc.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/MC/MCInstPrinter.h" -#include "llvm/MC/MCSubtargetInfo.h" - -namespace llvm { - -class MCOperand; - -class ARM64InstPrinter : public MCInstPrinter { -public: - ARM64InstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, - const MCRegisterInfo &MRI, const MCSubtargetInfo &STI); - - void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot) override; - void printRegName(raw_ostream &OS, unsigned RegNo) const override; - - // Autogenerated by tblgen. 
- virtual void printInstruction(const MCInst *MI, raw_ostream &O); - virtual bool printAliasInstr(const MCInst *MI, raw_ostream &O); - virtual void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx, - unsigned PrintMethodIdx, raw_ostream &O); - virtual StringRef getRegName(unsigned RegNo) const { - return getRegisterName(RegNo); - } - static const char *getRegisterName(unsigned RegNo, - unsigned AltIdx = ARM64::NoRegAltName); - -protected: - bool printSysAlias(const MCInst *MI, raw_ostream &O); - // Operand printers - void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printHexImm(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printPostIncOperand(const MCInst *MI, unsigned OpNo, unsigned Imm, - raw_ostream &O); - template - void printPostIncOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - printPostIncOperand(MI, OpNo, Amount, O); - } - - void printVRegOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printSysCROperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printAddSubImm(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printLogicalImm32(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printLogicalImm64(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printShifter(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printShiftedRegister(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printExtendedRegister(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printArithExtend(const MCInst *MI, unsigned OpNum, raw_ostream &O); - - void printMemExtend(const MCInst *MI, unsigned OpNum, raw_ostream &O, - char SrcRegKind, unsigned Width); - template - void printMemExtend(const MCInst *MI, unsigned OpNum, raw_ostream &O) { - printMemExtend(MI, OpNum, O, SrcRegKind, Width); - } - - void printCondCode(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printInverseCondCode(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printAlignedLabel(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printUImm12Offset(const MCInst *MI, unsigned OpNum, unsigned Scale, - raw_ostream &O); - void printAMIndexedWB(const MCInst *MI, unsigned OpNum, unsigned Scale, - raw_ostream &O); - - template - void printUImm12Offset(const MCInst *MI, unsigned OpNum, raw_ostream &O) { - printUImm12Offset(MI, OpNum, Scale, O); - } - - template - void printAMIndexedWB(const MCInst *MI, unsigned OpNum, raw_ostream &O) { - printAMIndexedWB(MI, OpNum, BitWidth / 8, O); - } - - void printAMNoIndex(const MCInst *MI, unsigned OpNum, raw_ostream &O); - - template - void printImmScale(const MCInst *MI, unsigned OpNum, raw_ostream &O); - - void printPrefetchOp(const MCInst *MI, unsigned OpNum, raw_ostream &O); - - void printFPImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); - - void printVectorList(const MCInst *MI, unsigned OpNum, raw_ostream &O, - StringRef LayoutSuffix); - - /// Print a list of vector registers where the type suffix is implicit - /// (i.e. attached to the instruction rather than the registers). 
- void printImplicitlyTypedVectorList(const MCInst *MI, unsigned OpNum, - raw_ostream &O); - - template - void printTypedVectorList(const MCInst *MI, unsigned OpNum, raw_ostream &O); - - void printVectorIndex(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printAdrpLabel(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printBarrierOption(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printMSRSystemRegister(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printMRSSystemRegister(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printSystemPStateField(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printSIMDType10Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O); -}; - -class ARM64AppleInstPrinter : public ARM64InstPrinter { -public: - ARM64AppleInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, - const MCRegisterInfo &MRI, const MCSubtargetInfo &STI); - - void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot) override; - - void printInstruction(const MCInst *MI, raw_ostream &O) override; - bool printAliasInstr(const MCInst *MI, raw_ostream &O) override; - virtual void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx, - unsigned PrintMethodIdx, raw_ostream &O); - StringRef getRegName(unsigned RegNo) const override { - return getRegisterName(RegNo); - } - static const char *getRegisterName(unsigned RegNo, - unsigned AltIdx = ARM64::NoRegAltName); -}; -} - -#endif diff --git a/lib/Target/ARM64/InstPrinter/CMakeLists.txt b/lib/Target/ARM64/InstPrinter/CMakeLists.txt deleted file mode 100644 index b8ee12c5541..00000000000 --- a/lib/Target/ARM64/InstPrinter/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. ) - -add_llvm_library(LLVMARM64AsmPrinter - ARM64InstPrinter.cpp - ) - -add_dependencies(LLVMARM64AsmPrinter ARM64CommonTableGen) diff --git a/lib/Target/ARM64/InstPrinter/LLVMBuild.txt b/lib/Target/ARM64/InstPrinter/LLVMBuild.txt deleted file mode 100644 index 7ab43924921..00000000000 --- a/lib/Target/ARM64/InstPrinter/LLVMBuild.txt +++ /dev/null @@ -1,24 +0,0 @@ -;===- ./lib/Target/ARM64/InstPrinter/LLVMBuild.txt -------------*- Conf -*--===; -; -; The LLVM Compiler Infrastructure -; -; This file is distributed under the University of Illinois Open Source -; License. See LICENSE.TXT for details. -; -;===------------------------------------------------------------------------===; -; -; This is an LLVMBuild description file for the components in this subdirectory. -; -; For more information on the LLVMBuild system, please see: -; -; http://llvm.org/docs/LLVMBuild.html -; -;===------------------------------------------------------------------------===; - -[component_0] -type = Library -name = ARM64AsmPrinter -parent = ARM64 -required_libraries = ARM64Utils MC Support -add_to_library_groups = ARM64 - diff --git a/lib/Target/ARM64/InstPrinter/Makefile b/lib/Target/ARM64/InstPrinter/Makefile deleted file mode 100644 index a59efb08465..00000000000 --- a/lib/Target/ARM64/InstPrinter/Makefile +++ /dev/null @@ -1,15 +0,0 @@ -##===- lib/Target/ARM64/AsmPrinter/Makefile ----------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## -LEVEL = ../../../.. 
-LIBRARYNAME = LLVMARM64AsmPrinter - -# Hack: we need to include 'main' arm target directory to grab private headers -CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. - -include $(LEVEL)/Makefile.common diff --git a/lib/Target/ARM64/LLVMBuild.txt b/lib/Target/ARM64/LLVMBuild.txt deleted file mode 100644 index 3d1e56e7ca6..00000000000 --- a/lib/Target/ARM64/LLVMBuild.txt +++ /dev/null @@ -1,35 +0,0 @@ -;===- ./lib/Target/ARM64/LLVMBuild.txt -------------------------*- Conf -*--===; -; -; The LLVM Compiler Infrastructure -; -; This file is distributed under the University of Illinois Open Source -; License. See LICENSE.TXT for details. -; -;===------------------------------------------------------------------------===; -; -; This is an LLVMBuild description file for the components in this subdirectory. -; -; For more information on the LLVMBuild system, please see: -; -; http://llvm.org/docs/LLVMBuild.html -; -;===------------------------------------------------------------------------===; - -[common] -subdirectories = AsmParser Disassembler InstPrinter MCTargetDesc TargetInfo Utils - -[component_0] -type = TargetGroup -name = ARM64 -parent = Target -has_asmparser = 1 -has_asmprinter = 1 -has_disassembler = 1 -has_jit = 1 - -[component_1] -type = Library -name = ARM64CodeGen -parent = ARM64 -required_libraries = ARM64AsmPrinter ARM64Desc ARM64Info ARM64Utils Analysis AsmPrinter CodeGen Core MC Scalar SelectionDAG Support Target -add_to_library_groups = ARM64 diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64AddressingModes.h b/lib/Target/ARM64/MCTargetDesc/ARM64AddressingModes.h deleted file mode 100644 index 53bd3545a59..00000000000 --- a/lib/Target/ARM64/MCTargetDesc/ARM64AddressingModes.h +++ /dev/null @@ -1,738 +0,0 @@ -//===- ARM64AddressingModes.h - ARM64 Addressing Modes ----------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains the ARM64 addressing mode implementation stuff. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_TARGET_ARM64_ARM64ADDRESSINGMODES_H -#define LLVM_TARGET_ARM64_ARM64ADDRESSINGMODES_H - -#include "llvm/ADT/APFloat.h" -#include "llvm/ADT/APInt.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/MathExtras.h" -#include - -namespace llvm { - -/// ARM64_AM - ARM64 Addressing Mode Stuff -namespace ARM64_AM { - -//===----------------------------------------------------------------------===// -// Shifts -// - -enum ShiftExtendType { - InvalidShiftExtend = -1, - LSL = 0, - LSR, - ASR, - ROR, - MSL, - - UXTB, - UXTH, - UXTW, - UXTX, - - SXTB, - SXTH, - SXTW, - SXTX, -}; - -/// getShiftName - Get the string encoding for the shift type. 
-static inline const char *getShiftExtendName(ARM64_AM::ShiftExtendType ST) { - switch (ST) { - default: assert(false && "unhandled shift type!"); - case ARM64_AM::LSL: return "lsl"; - case ARM64_AM::LSR: return "lsr"; - case ARM64_AM::ASR: return "asr"; - case ARM64_AM::ROR: return "ror"; - case ARM64_AM::MSL: return "msl"; - case ARM64_AM::UXTB: return "uxtb"; - case ARM64_AM::UXTH: return "uxth"; - case ARM64_AM::UXTW: return "uxtw"; - case ARM64_AM::UXTX: return "uxtx"; - case ARM64_AM::SXTB: return "sxtb"; - case ARM64_AM::SXTH: return "sxth"; - case ARM64_AM::SXTW: return "sxtw"; - case ARM64_AM::SXTX: return "sxtx"; - } - return nullptr; -} - -/// getShiftType - Extract the shift type. -static inline ARM64_AM::ShiftExtendType getShiftType(unsigned Imm) { - switch ((Imm >> 6) & 0x7) { - default: return ARM64_AM::InvalidShiftExtend; - case 0: return ARM64_AM::LSL; - case 1: return ARM64_AM::LSR; - case 2: return ARM64_AM::ASR; - case 3: return ARM64_AM::ROR; - case 4: return ARM64_AM::MSL; - } -} - -/// getShiftValue - Extract the shift value. -static inline unsigned getShiftValue(unsigned Imm) { - return Imm & 0x3f; -} - -/// getShifterImm - Encode the shift type and amount: -/// imm: 6-bit shift amount -/// shifter: 000 ==> lsl -/// 001 ==> lsr -/// 010 ==> asr -/// 011 ==> ror -/// 100 ==> msl -/// {8-6} = shifter -/// {5-0} = imm -static inline unsigned getShifterImm(ARM64_AM::ShiftExtendType ST, - unsigned Imm) { - assert((Imm & 0x3f) == Imm && "Illegal shifted immedate value!"); - unsigned STEnc = 0; - switch (ST) { - default: llvm_unreachable("Invalid shift requested"); - case ARM64_AM::LSL: STEnc = 0; break; - case ARM64_AM::LSR: STEnc = 1; break; - case ARM64_AM::ASR: STEnc = 2; break; - case ARM64_AM::ROR: STEnc = 3; break; - case ARM64_AM::MSL: STEnc = 4; break; - } - return (STEnc << 6) | (Imm & 0x3f); -} - -//===----------------------------------------------------------------------===// -// Extends -// - -/// getArithShiftValue - get the arithmetic shift value. -static inline unsigned getArithShiftValue(unsigned Imm) { - return Imm & 0x7; -} - -/// getExtendType - Extract the extend type for operands of arithmetic ops. 
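For reference, a minimal C++ restatement of the shifter-operand packing documented above (bits {8-6} hold the shift kind, bits {5-0} the amount). The enum mirrors the header's values, but this is a standalone sketch, not the LLVM code.

#include <cassert>

enum Shift { LSL = 0, LSR, ASR, ROR, MSL };

// Pack a shift kind and a 6-bit amount the same way getShifterImm does.
static unsigned packShifter(Shift ST, unsigned Imm) {
  assert(Imm <= 0x3f && "shift amount must fit in 6 bits");
  return (unsigned(ST) << 6) | Imm;
}

int main() {
  unsigned Enc = packShifter(LSL, 12);   // e.g. the "lsl #12" on ADD (shifted register)
  assert(Enc == 12);                     // LSL encodes as 0, so only the amount is set
  assert(((Enc >> 6) & 0x7) == LSL);     // what getShiftType would recover
  assert((Enc & 0x3f) == 12);            // what getShiftValue would recover
  return 0;
}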
-static inline ARM64_AM::ShiftExtendType getExtendType(unsigned Imm) { - assert((Imm & 0x7) == Imm && "invalid immediate!"); - switch (Imm) { - default: llvm_unreachable("Compiler bug!"); - case 0: return ARM64_AM::UXTB; - case 1: return ARM64_AM::UXTH; - case 2: return ARM64_AM::UXTW; - case 3: return ARM64_AM::UXTX; - case 4: return ARM64_AM::SXTB; - case 5: return ARM64_AM::SXTH; - case 6: return ARM64_AM::SXTW; - case 7: return ARM64_AM::SXTX; - } -} - -static inline ARM64_AM::ShiftExtendType getArithExtendType(unsigned Imm) { - return getExtendType((Imm >> 3) & 0x7); -} - -/// Mapping from extend bits to required operation: -/// shifter: 000 ==> uxtb -/// 001 ==> uxth -/// 010 ==> uxtw -/// 011 ==> uxtx -/// 100 ==> sxtb -/// 101 ==> sxth -/// 110 ==> sxtw -/// 111 ==> sxtx -inline unsigned getExtendEncoding(ARM64_AM::ShiftExtendType ET) { - switch (ET) { - default: llvm_unreachable("Invalid extend type requested"); - case ARM64_AM::UXTB: return 0; break; - case ARM64_AM::UXTH: return 1; break; - case ARM64_AM::UXTW: return 2; break; - case ARM64_AM::UXTX: return 3; break; - case ARM64_AM::SXTB: return 4; break; - case ARM64_AM::SXTH: return 5; break; - case ARM64_AM::SXTW: return 6; break; - case ARM64_AM::SXTX: return 7; break; - } -} - -/// getArithExtendImm - Encode the extend type and shift amount for an -/// arithmetic instruction: -/// imm: 3-bit extend amount -/// {5-3} = shifter -/// {2-0} = imm3 -static inline unsigned getArithExtendImm(ARM64_AM::ShiftExtendType ET, - unsigned Imm) { - assert((Imm & 0x7) == Imm && "Illegal shifted immedate value!"); - return (getExtendEncoding(ET) << 3) | (Imm & 0x7); -} - -/// getMemDoShift - Extract the "do shift" flag value for load/store -/// instructions. -static inline bool getMemDoShift(unsigned Imm) { - return (Imm & 0x1) != 0; -} - -/// getExtendType - Extract the extend type for the offset operand of -/// loads/stores. -static inline ARM64_AM::ShiftExtendType getMemExtendType(unsigned Imm) { - return getExtendType((Imm >> 1) & 0x7); -} - -/// getExtendImm - Encode the extend type and amount for a load/store inst: -/// doshift: should the offset be scaled by the access size -/// shifter: 000 ==> uxtb -/// 001 ==> uxth -/// 010 ==> uxtw -/// 011 ==> uxtx -/// 100 ==> sxtb -/// 101 ==> sxth -/// 110 ==> sxtw -/// 111 ==> sxtx -/// {3-1} = shifter -/// {0} = doshift -static inline unsigned getMemExtendImm(ARM64_AM::ShiftExtendType ET, - bool DoShift) { - return (getExtendEncoding(ET) << 1) | unsigned(DoShift); -} - -static inline uint64_t ror(uint64_t elt, unsigned size) { - return ((elt & 1) << (size-1)) | (elt >> 1); -} - -/// processLogicalImmediate - Determine if an immediate value can be encoded -/// as the immediate operand of a logical instruction for the given register -/// size. If so, return true with "encoding" set to the encoded value in -/// the form N:immr:imms. -static inline bool processLogicalImmediate(uint64_t imm, unsigned regSize, - uint64_t &encoding) { - if (imm == 0ULL || imm == ~0ULL || - (regSize != 64 && (imm >> regSize != 0 || imm == ~0U))) - return false; - - unsigned size = 2; - uint64_t eltVal = imm; - - // First, determine the element size. 
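Similarly, a small sketch of the arithmetic-extend packing described above: bits {5-3} select the extend (uxtb..sxtx) and bits {2-0} the left-shift amount. Values here are only illustrative.

#include <cassert>

enum Extend { UXTB = 0, UXTH, UXTW, UXTX, SXTB, SXTH, SXTW, SXTX };

// Pack an extend kind and 3-bit shift the same way getArithExtendImm does.
static unsigned packArithExtend(Extend ET, unsigned Imm3) {
  assert(Imm3 <= 7 && "shift amount occupies 3 bits");
  return (unsigned(ET) << 3) | Imm3;
}

int main() {
  unsigned Enc = packArithExtend(UXTW, 2);   // e.g. "add x0, x1, w2, uxtw #2"
  assert(Enc == 0x12);
  assert(((Enc >> 3) & 0x7) == UXTW);        // what getArithExtendType recovers
  assert((Enc & 0x7) == 2);                  // what getArithShiftValue recovers
  return 0;
}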
- while (size < regSize) { - unsigned numElts = regSize / size; - unsigned mask = (1ULL << size) - 1; - uint64_t lowestEltVal = imm & mask; - - bool allMatched = true; - for (unsigned i = 1; i < numElts; ++i) { - uint64_t currEltVal = (imm >> (i*size)) & mask; - if (currEltVal != lowestEltVal) { - allMatched = false; - break; - } - } - - if (allMatched) { - eltVal = lowestEltVal; - break; - } - - size *= 2; - } - - // Second, determine the rotation to make the element be: 0^m 1^n. - for (unsigned i = 0; i < size; ++i) { - eltVal = ror(eltVal, size); - uint32_t clz = countLeadingZeros(eltVal) - (64 - size); - uint32_t cto = CountTrailingOnes_64(eltVal); - - if (clz + cto == size) { - // Encode in immr the number of RORs it would take to get *from* this - // element value to our target value, where i+1 is the number of RORs - // to go the opposite direction. - unsigned immr = size - (i + 1); - - // If size has a 1 in the n'th bit, create a value that has zeroes in - // bits [0, n] and ones above that. - uint64_t nimms = ~(size-1) << 1; - - // Or the CTO value into the low bits, which must be below the Nth bit - // bit mentioned above. - nimms |= (cto-1); - - // Extract the seventh bit and toggle it to create the N field. - unsigned N = ((nimms >> 6) & 1) ^ 1; - - encoding = (N << 12) | (immr << 6) | (nimms & 0x3f); - return true; - } - } - - return false; -} - -/// isLogicalImmediate - Return true if the immediate is valid for a logical -/// immediate instruction of the given register size. Return false otherwise. -static inline bool isLogicalImmediate(uint64_t imm, unsigned regSize) { - uint64_t encoding; - return processLogicalImmediate(imm, regSize, encoding); -} - -/// encodeLogicalImmediate - Return the encoded immediate value for a logical -/// immediate instruction of the given register size. -static inline uint64_t encodeLogicalImmediate(uint64_t imm, unsigned regSize) { - uint64_t encoding = 0; - bool res = processLogicalImmediate(imm, regSize, encoding); - assert(res && "invalid logical immediate"); - (void)res; - return encoding; -} - -/// decodeLogicalImmediate - Decode a logical immediate value in the form -/// "N:immr:imms" (where the immr and imms fields are each 6 bits) into the -/// integer value it represents with regSize bits. -static inline uint64_t decodeLogicalImmediate(uint64_t val, unsigned regSize) { - // Extract the N, imms, and immr fields. - unsigned N = (val >> 12) & 1; - unsigned immr = (val >> 6) & 0x3f; - unsigned imms = val & 0x3f; - - assert((regSize == 64 || N == 0) && "undefined logical immediate encoding"); - int len = 31 - countLeadingZeros((N << 6) | (~imms & 0x3f)); - assert(len >= 0 && "undefined logical immediate encoding"); - unsigned size = (1 << len); - unsigned R = immr & (size - 1); - unsigned S = imms & (size - 1); - assert(S != size - 1 && "undefined logical immediate encoding"); - uint64_t pattern = (1ULL << (S + 1)) - 1; - for (unsigned i = 0; i < R; ++i) - pattern = ror(pattern, size); - - // Replicate the pattern to fill the regSize. - while (size != regSize) { - pattern |= (pattern << size); - size *= 2; - } - return pattern; -} - -/// isValidDecodeLogicalImmediate - Check to see if the logical immediate value -/// in the form "N:immr:imms" (where the immr and imms fields are each 6 bits) -/// is a valid encoding for an integer value with regSize bits. -static inline bool isValidDecodeLogicalImmediate(uint64_t val, - unsigned regSize) { - // Extract the N and imms fields needed for checking. 
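To make the N:immr:imms scheme above concrete, here is a self-contained decode of one known encoding. It mirrors the decodeLogicalImmediate logic but is not the LLVM function; __builtin_clz assumes a GCC/Clang-compatible compiler.

#include <cassert>
#include <cstdint>

// Rotate a Size-bit element right by one, as the header's ror() helper does.
static uint64_t rorOnce(uint64_t Elt, unsigned Size) {
  return ((Elt & 1) << (Size - 1)) | (Elt >> 1);
}

static uint64_t decodeLogicalImm(uint64_t Val, unsigned RegSize) {
  unsigned N = (Val >> 12) & 1;
  unsigned Immr = (Val >> 6) & 0x3f;
  unsigned Imms = Val & 0x3f;

  unsigned Hi = (N << 6) | (~Imms & 0x3f);
  int Len = 31 - __builtin_clz(Hi);          // index of the highest set bit
  unsigned Size = 1u << Len;                 // element size in bits
  unsigned R = Immr & (Size - 1);
  unsigned S = Imms & (Size - 1);

  uint64_t Pattern = (1ULL << (S + 1)) - 1;  // S+1 trailing ones
  for (unsigned I = 0; I < R; ++I)
    Pattern = rorOnce(Pattern, Size);

  while (Size != RegSize) {                  // replicate the element to full width
    Pattern |= Pattern << Size;
    Size *= 2;
  }
  return Pattern;
}

int main() {
  // N:immr:imms = 0:000000:111100 describes a 2-bit element "01" replicated
  // across a 32-bit register, i.e. 0x55555555.
  uint64_t Val = (0u << 12) | (0u << 6) | 0x3c;
  assert(decodeLogicalImm(Val, 32) == 0x55555555ULL);
  return 0;
}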
- unsigned N = (val >> 12) & 1; - unsigned imms = val & 0x3f; - - if (regSize == 32 && N != 0) // undefined logical immediate encoding - return false; - int len = 31 - countLeadingZeros((N << 6) | (~imms & 0x3f)); - if (len < 0) // undefined logical immediate encoding - return false; - unsigned size = (1 << len); - unsigned S = imms & (size - 1); - if (S == size - 1) // undefined logical immediate encoding - return false; - - return true; -} - -//===----------------------------------------------------------------------===// -// Floating-point Immediates -// -static inline float getFPImmFloat(unsigned Imm) { - // We expect an 8-bit binary encoding of a floating-point number here. - union { - uint32_t I; - float F; - } FPUnion; - - uint8_t Sign = (Imm >> 7) & 0x1; - uint8_t Exp = (Imm >> 4) & 0x7; - uint8_t Mantissa = Imm & 0xf; - - // 8-bit FP iEEEE Float Encoding - // abcd efgh aBbbbbbc defgh000 00000000 00000000 - // - // where B = NOT(b); - - FPUnion.I = 0; - FPUnion.I |= Sign << 31; - FPUnion.I |= ((Exp & 0x4) != 0 ? 0 : 1) << 30; - FPUnion.I |= ((Exp & 0x4) != 0 ? 0x1f : 0) << 25; - FPUnion.I |= (Exp & 0x3) << 23; - FPUnion.I |= Mantissa << 19; - return FPUnion.F; -} - -/// getFP32Imm - Return an 8-bit floating-point version of the 32-bit -/// floating-point value. If the value cannot be represented as an 8-bit -/// floating-point value, then return -1. -static inline int getFP32Imm(const APInt &Imm) { - uint32_t Sign = Imm.lshr(31).getZExtValue() & 1; - int32_t Exp = (Imm.lshr(23).getSExtValue() & 0xff) - 127; // -126 to 127 - int64_t Mantissa = Imm.getZExtValue() & 0x7fffff; // 23 bits - - // We can handle 4 bits of mantissa. - // mantissa = (16+UInt(e:f:g:h))/16. - if (Mantissa & 0x7ffff) - return -1; - Mantissa >>= 19; - if ((Mantissa & 0xf) != Mantissa) - return -1; - - // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3 - if (Exp < -3 || Exp > 4) - return -1; - Exp = ((Exp+3) & 0x7) ^ 4; - - return ((int)Sign << 7) | (Exp << 4) | Mantissa; -} - -static inline int getFP32Imm(const APFloat &FPImm) { - return getFP32Imm(FPImm.bitcastToAPInt()); -} - -/// getFP64Imm - Return an 8-bit floating-point version of the 64-bit -/// floating-point value. If the value cannot be represented as an 8-bit -/// floating-point value, then return -1. -static inline int getFP64Imm(const APInt &Imm) { - uint64_t Sign = Imm.lshr(63).getZExtValue() & 1; - int64_t Exp = (Imm.lshr(52).getSExtValue() & 0x7ff) - 1023; // -1022 to 1023 - uint64_t Mantissa = Imm.getZExtValue() & 0xfffffffffffffULL; - - // We can handle 4 bits of mantissa. - // mantissa = (16+UInt(e:f:g:h))/16. 
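A standalone check of the 8-bit FP immediate layout described above (sign:1, exp:3, mantissa:4). The helper is a simplified stand-in for getFPImmFloat, not the LLVM routine itself.

#include <cassert>
#include <cstdint>
#include <cstring>

// Expand the 8-bit FMOV immediate into an IEEE-754 single.
static float decodeFP8(uint8_t Imm) {
  uint32_t Sign = (Imm >> 7) & 0x1;
  uint32_t Exp = (Imm >> 4) & 0x7;
  uint32_t Mantissa = Imm & 0xf;

  uint32_t I = Sign << 31;
  I |= ((Exp & 0x4) ? 0u : 1u) << 30;      // NOT(b)
  I |= ((Exp & 0x4) ? 0x1fu : 0u) << 25;   // b replicated five times
  I |= (Exp & 0x3) << 23;                  // c:d
  I |= Mantissa << 19;                     // e:f:g:h

  float F;
  std::memcpy(&F, &I, sizeof(F));
  return F;
}

int main() {
  assert(decodeFP8(0x70) == 1.0f);    // 0x70 expands to 0x3f800000
  assert(decodeFP8(0xF0) == -1.0f);   // same pattern with the sign bit set
  return 0;
}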
- if (Mantissa & 0xffffffffffffULL) - return -1; - Mantissa >>= 48; - if ((Mantissa & 0xf) != Mantissa) - return -1; - - // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3 - if (Exp < -3 || Exp > 4) - return -1; - Exp = ((Exp+3) & 0x7) ^ 4; - - return ((int)Sign << 7) | (Exp << 4) | Mantissa; -} - -static inline int getFP64Imm(const APFloat &FPImm) { - return getFP64Imm(FPImm.bitcastToAPInt()); -} - -//===--------------------------------------------------------------------===// -// AdvSIMD Modified Immediates -//===--------------------------------------------------------------------===// - -// 0x00 0x00 0x00 abcdefgh 0x00 0x00 0x00 abcdefgh -static inline bool isAdvSIMDModImmType1(uint64_t Imm) { - return ((Imm >> 32) == (Imm & 0xffffffffULL)) && - ((Imm & 0xffffff00ffffff00ULL) == 0); -} - -static inline uint8_t encodeAdvSIMDModImmType1(uint64_t Imm) { - return (Imm & 0xffULL); -} - -static inline uint64_t decodeAdvSIMDModImmType1(uint8_t Imm) { - uint64_t EncVal = Imm; - return (EncVal << 32) | EncVal; -} - -// 0x00 0x00 abcdefgh 0x00 0x00 0x00 abcdefgh 0x00 -static inline bool isAdvSIMDModImmType2(uint64_t Imm) { - return ((Imm >> 32) == (Imm & 0xffffffffULL)) && - ((Imm & 0xffff00ffffff00ffULL) == 0); -} - -static inline uint8_t encodeAdvSIMDModImmType2(uint64_t Imm) { - return (Imm & 0xff00ULL) >> 8; -} - -static inline uint64_t decodeAdvSIMDModImmType2(uint8_t Imm) { - uint64_t EncVal = Imm; - return (EncVal << 40) | (EncVal << 8); -} - -// 0x00 abcdefgh 0x00 0x00 0x00 abcdefgh 0x00 0x00 -static inline bool isAdvSIMDModImmType3(uint64_t Imm) { - return ((Imm >> 32) == (Imm & 0xffffffffULL)) && - ((Imm & 0xff00ffffff00ffffULL) == 0); -} - -static inline uint8_t encodeAdvSIMDModImmType3(uint64_t Imm) { - return (Imm & 0xff0000ULL) >> 16; -} - -static inline uint64_t decodeAdvSIMDModImmType3(uint8_t Imm) { - uint64_t EncVal = Imm; - return (EncVal << 48) | (EncVal << 16); -} - -// abcdefgh 0x00 0x00 0x00 abcdefgh 0x00 0x00 0x00 -static inline bool isAdvSIMDModImmType4(uint64_t Imm) { - return ((Imm >> 32) == (Imm & 0xffffffffULL)) && - ((Imm & 0x00ffffff00ffffffULL) == 0); -} - -static inline uint8_t encodeAdvSIMDModImmType4(uint64_t Imm) { - return (Imm & 0xff000000ULL) >> 24; -} - -static inline uint64_t decodeAdvSIMDModImmType4(uint8_t Imm) { - uint64_t EncVal = Imm; - return (EncVal << 56) | (EncVal << 24); -} - -// 0x00 abcdefgh 0x00 abcdefgh 0x00 abcdefgh 0x00 abcdefgh -static inline bool isAdvSIMDModImmType5(uint64_t Imm) { - return ((Imm >> 32) == (Imm & 0xffffffffULL)) && - (((Imm & 0x00ff0000ULL) >> 16) == (Imm & 0x000000ffULL)) && - ((Imm & 0xff00ff00ff00ff00ULL) == 0); -} - -static inline uint8_t encodeAdvSIMDModImmType5(uint64_t Imm) { - return (Imm & 0xffULL); -} - -static inline uint64_t decodeAdvSIMDModImmType5(uint8_t Imm) { - uint64_t EncVal = Imm; - return (EncVal << 48) | (EncVal << 32) | (EncVal << 16) | EncVal; -} - -// abcdefgh 0x00 abcdefgh 0x00 abcdefgh 0x00 abcdefgh 0x00 -static inline bool isAdvSIMDModImmType6(uint64_t Imm) { - return ((Imm >> 32) == (Imm & 0xffffffffULL)) && - (((Imm & 0xff000000ULL) >> 16) == (Imm & 0x0000ff00ULL)) && - ((Imm & 0x00ff00ff00ff00ffULL) == 0); -} - -static inline uint8_t encodeAdvSIMDModImmType6(uint64_t Imm) { - return (Imm & 0xff00ULL) >> 8; -} - -static inline uint64_t decodeAdvSIMDModImmType6(uint8_t Imm) { - uint64_t EncVal = Imm; - return (EncVal << 56) | (EncVal << 40) | (EncVal << 24) | (EncVal << 8); -} - -// 0x00 0x00 abcdefgh 0xFF 0x00 0x00 abcdefgh 0xFF -static inline bool isAdvSIMDModImmType7(uint64_t 
Imm) { - return ((Imm >> 32) == (Imm & 0xffffffffULL)) && - ((Imm & 0xffff00ffffff00ffULL) == 0x000000ff000000ffULL); -} - -static inline uint8_t encodeAdvSIMDModImmType7(uint64_t Imm) { - return (Imm & 0xff00ULL) >> 8; -} - -static inline uint64_t decodeAdvSIMDModImmType7(uint8_t Imm) { - uint64_t EncVal = Imm; - return (EncVal << 40) | (EncVal << 8) | 0x000000ff000000ffULL; -} - -// 0x00 abcdefgh 0xFF 0xFF 0x00 abcdefgh 0xFF 0xFF -static inline bool isAdvSIMDModImmType8(uint64_t Imm) { - return ((Imm >> 32) == (Imm & 0xffffffffULL)) && - ((Imm & 0xff00ffffff00ffffULL) == 0x0000ffff0000ffffULL); -} - -static inline uint64_t decodeAdvSIMDModImmType8(uint8_t Imm) { - uint64_t EncVal = Imm; - return (EncVal << 48) | (EncVal << 16) | 0x0000ffff0000ffffULL; -} - -static inline uint8_t encodeAdvSIMDModImmType8(uint64_t Imm) { - return (Imm & 0x00ff0000ULL) >> 16; -} - -// abcdefgh abcdefgh abcdefgh abcdefgh abcdefgh abcdefgh abcdefgh abcdefgh -static inline bool isAdvSIMDModImmType9(uint64_t Imm) { - return ((Imm >> 32) == (Imm & 0xffffffffULL)) && - ((Imm >> 48) == (Imm & 0x0000ffffULL)) && - ((Imm >> 56) == (Imm & 0x000000ffULL)); -} - -static inline uint8_t encodeAdvSIMDModImmType9(uint64_t Imm) { - return (Imm & 0xffULL); -} - -static inline uint64_t decodeAdvSIMDModImmType9(uint8_t Imm) { - uint64_t EncVal = Imm; - EncVal |= (EncVal << 8); - EncVal |= (EncVal << 16); - EncVal |= (EncVal << 32); - return EncVal; -} - -// aaaaaaaa bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh -// cmode: 1110, op: 1 -static inline bool isAdvSIMDModImmType10(uint64_t Imm) { - uint64_t ByteA = Imm & 0xff00000000000000ULL; - uint64_t ByteB = Imm & 0x00ff000000000000ULL; - uint64_t ByteC = Imm & 0x0000ff0000000000ULL; - uint64_t ByteD = Imm & 0x000000ff00000000ULL; - uint64_t ByteE = Imm & 0x00000000ff000000ULL; - uint64_t ByteF = Imm & 0x0000000000ff0000ULL; - uint64_t ByteG = Imm & 0x000000000000ff00ULL; - uint64_t ByteH = Imm & 0x00000000000000ffULL; - - return (ByteA == 0ULL || ByteA == 0xff00000000000000ULL) && - (ByteB == 0ULL || ByteB == 0x00ff000000000000ULL) && - (ByteC == 0ULL || ByteC == 0x0000ff0000000000ULL) && - (ByteD == 0ULL || ByteD == 0x000000ff00000000ULL) && - (ByteE == 0ULL || ByteE == 0x00000000ff000000ULL) && - (ByteF == 0ULL || ByteF == 0x0000000000ff0000ULL) && - (ByteG == 0ULL || ByteG == 0x000000000000ff00ULL) && - (ByteH == 0ULL || ByteH == 0x00000000000000ffULL); -} - -static inline uint8_t encodeAdvSIMDModImmType10(uint64_t Imm) { - uint8_t BitA = (Imm & 0xff00000000000000ULL) != 0; - uint8_t BitB = (Imm & 0x00ff000000000000ULL) != 0; - uint8_t BitC = (Imm & 0x0000ff0000000000ULL) != 0; - uint8_t BitD = (Imm & 0x000000ff00000000ULL) != 0; - uint8_t BitE = (Imm & 0x00000000ff000000ULL) != 0; - uint8_t BitF = (Imm & 0x0000000000ff0000ULL) != 0; - uint8_t BitG = (Imm & 0x000000000000ff00ULL) != 0; - uint8_t BitH = (Imm & 0x00000000000000ffULL) != 0; - - uint8_t EncVal = BitA; - EncVal <<= 1; - EncVal |= BitB; - EncVal <<= 1; - EncVal |= BitC; - EncVal <<= 1; - EncVal |= BitD; - EncVal <<= 1; - EncVal |= BitE; - EncVal <<= 1; - EncVal |= BitF; - EncVal <<= 1; - EncVal |= BitG; - EncVal <<= 1; - EncVal |= BitH; - return EncVal; -} - -static inline uint64_t decodeAdvSIMDModImmType10(uint8_t Imm) { - uint64_t EncVal = 0; - if (Imm & 0x80) EncVal |= 0xff00000000000000ULL; - if (Imm & 0x40) EncVal |= 0x00ff000000000000ULL; - if (Imm & 0x20) EncVal |= 0x0000ff0000000000ULL; - if (Imm & 0x10) EncVal |= 0x000000ff00000000ULL; - if (Imm & 0x08) EncVal |= 0x00000000ff000000ULL; - 
if (Imm & 0x04) EncVal |= 0x0000000000ff0000ULL; - if (Imm & 0x02) EncVal |= 0x000000000000ff00ULL; - if (Imm & 0x01) EncVal |= 0x00000000000000ffULL; - return EncVal; -} - -// aBbbbbbc defgh000 0x00 0x00 aBbbbbbc defgh000 0x00 0x00 -static inline bool isAdvSIMDModImmType11(uint64_t Imm) { - uint64_t BString = (Imm & 0x7E000000ULL) >> 25; - return ((Imm >> 32) == (Imm & 0xffffffffULL)) && - (BString == 0x1f || BString == 0x20) && - ((Imm & 0x0007ffff0007ffffULL) == 0); -} - -static inline uint8_t encodeAdvSIMDModImmType11(uint64_t Imm) { - uint8_t BitA = (Imm & 0x80000000ULL) != 0; - uint8_t BitB = (Imm & 0x20000000ULL) != 0; - uint8_t BitC = (Imm & 0x01000000ULL) != 0; - uint8_t BitD = (Imm & 0x00800000ULL) != 0; - uint8_t BitE = (Imm & 0x00400000ULL) != 0; - uint8_t BitF = (Imm & 0x00200000ULL) != 0; - uint8_t BitG = (Imm & 0x00100000ULL) != 0; - uint8_t BitH = (Imm & 0x00080000ULL) != 0; - - uint8_t EncVal = BitA; - EncVal <<= 1; - EncVal |= BitB; - EncVal <<= 1; - EncVal |= BitC; - EncVal <<= 1; - EncVal |= BitD; - EncVal <<= 1; - EncVal |= BitE; - EncVal <<= 1; - EncVal |= BitF; - EncVal <<= 1; - EncVal |= BitG; - EncVal <<= 1; - EncVal |= BitH; - return EncVal; -} - -static inline uint64_t decodeAdvSIMDModImmType11(uint8_t Imm) { - uint64_t EncVal = 0; - if (Imm & 0x80) EncVal |= 0x80000000ULL; - if (Imm & 0x40) EncVal |= 0x3e000000ULL; - else EncVal |= 0x40000000ULL; - if (Imm & 0x20) EncVal |= 0x01000000ULL; - if (Imm & 0x10) EncVal |= 0x00800000ULL; - if (Imm & 0x08) EncVal |= 0x00400000ULL; - if (Imm & 0x04) EncVal |= 0x00200000ULL; - if (Imm & 0x02) EncVal |= 0x00100000ULL; - if (Imm & 0x01) EncVal |= 0x00080000ULL; - return (EncVal << 32) | EncVal; -} - -// aBbbbbbb bbcdefgh 0x00 0x00 0x00 0x00 0x00 0x00 -static inline bool isAdvSIMDModImmType12(uint64_t Imm) { - uint64_t BString = (Imm & 0x7fc0000000000000ULL) >> 54; - return ((BString == 0xff || BString == 0x100) && - ((Imm & 0x0000ffffffffffffULL) == 0)); -} - -static inline uint8_t encodeAdvSIMDModImmType12(uint64_t Imm) { - uint8_t BitA = (Imm & 0x8000000000000000ULL) != 0; - uint8_t BitB = (Imm & 0x0040000000000000ULL) != 0; - uint8_t BitC = (Imm & 0x0020000000000000ULL) != 0; - uint8_t BitD = (Imm & 0x0010000000000000ULL) != 0; - uint8_t BitE = (Imm & 0x0008000000000000ULL) != 0; - uint8_t BitF = (Imm & 0x0004000000000000ULL) != 0; - uint8_t BitG = (Imm & 0x0002000000000000ULL) != 0; - uint8_t BitH = (Imm & 0x0001000000000000ULL) != 0; - - uint8_t EncVal = BitA; - EncVal <<= 1; - EncVal |= BitB; - EncVal <<= 1; - EncVal |= BitC; - EncVal <<= 1; - EncVal |= BitD; - EncVal <<= 1; - EncVal |= BitE; - EncVal <<= 1; - EncVal |= BitF; - EncVal <<= 1; - EncVal |= BitG; - EncVal <<= 1; - EncVal |= BitH; - return EncVal; -} - -static inline uint64_t decodeAdvSIMDModImmType12(uint8_t Imm) { - uint64_t EncVal = 0; - if (Imm & 0x80) EncVal |= 0x8000000000000000ULL; - if (Imm & 0x40) EncVal |= 0x3fc0000000000000ULL; - else EncVal |= 0x4000000000000000ULL; - if (Imm & 0x20) EncVal |= 0x0020000000000000ULL; - if (Imm & 0x10) EncVal |= 0x0010000000000000ULL; - if (Imm & 0x08) EncVal |= 0x0008000000000000ULL; - if (Imm & 0x04) EncVal |= 0x0004000000000000ULL; - if (Imm & 0x02) EncVal |= 0x0002000000000000ULL; - if (Imm & 0x01) EncVal |= 0x0001000000000000ULL; - return (EncVal << 32) | EncVal; -} - -} // end namespace ARM64_AM - -} // end namespace llvm - -#endif diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64AsmBackend.cpp b/lib/Target/ARM64/MCTargetDesc/ARM64AsmBackend.cpp deleted file mode 100644 index ba5025ab620..00000000000 --- 
a/lib/Target/ARM64/MCTargetDesc/ARM64AsmBackend.cpp +++ /dev/null @@ -1,564 +0,0 @@ -//===-- ARM64AsmBackend.cpp - ARM64 Assembler Backend ---------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#include "ARM64.h" -#include "ARM64RegisterInfo.h" -#include "MCTargetDesc/ARM64FixupKinds.h" -#include "llvm/ADT/Triple.h" -#include "llvm/MC/MCAsmBackend.h" -#include "llvm/MC/MCDirectives.h" -#include "llvm/MC/MCFixupKindInfo.h" -#include "llvm/MC/MCObjectWriter.h" -#include "llvm/MC/MCSectionMachO.h" -#include "llvm/MC/MCSectionELF.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/MachO.h" -using namespace llvm; - -namespace { - -class ARM64AsmBackend : public MCAsmBackend { - static const unsigned PCRelFlagVal = - MCFixupKindInfo::FKF_IsAlignedDownTo32Bits | MCFixupKindInfo::FKF_IsPCRel; - -public: - ARM64AsmBackend(const Target &T) : MCAsmBackend() {} - - unsigned getNumFixupKinds() const override { - return ARM64::NumTargetFixupKinds; - } - - const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override { - const static MCFixupKindInfo Infos[ARM64::NumTargetFixupKinds] = { - // This table *must* be in the order that the fixup_* kinds are defined in - // ARM64FixupKinds.h. - // - // Name Offset (bits) Size (bits) Flags - { "fixup_arm64_pcrel_adr_imm21", 0, 32, PCRelFlagVal }, - { "fixup_arm64_pcrel_adrp_imm21", 0, 32, PCRelFlagVal }, - { "fixup_arm64_add_imm12", 10, 12, 0 }, - { "fixup_arm64_ldst_imm12_scale1", 10, 12, 0 }, - { "fixup_arm64_ldst_imm12_scale2", 10, 12, 0 }, - { "fixup_arm64_ldst_imm12_scale4", 10, 12, 0 }, - { "fixup_arm64_ldst_imm12_scale8", 10, 12, 0 }, - { "fixup_arm64_ldst_imm12_scale16", 10, 12, 0 }, - { "fixup_arm64_ldr_pcrel_imm19", 5, 19, PCRelFlagVal }, - { "fixup_arm64_movw", 5, 16, 0 }, - { "fixup_arm64_pcrel_branch14", 5, 14, PCRelFlagVal }, - { "fixup_arm64_pcrel_branch19", 5, 19, PCRelFlagVal }, - { "fixup_arm64_pcrel_branch26", 0, 26, PCRelFlagVal }, - { "fixup_arm64_pcrel_call26", 0, 26, PCRelFlagVal }, - { "fixup_arm64_tlsdesc_call", 0, 0, 0 } - }; - - if (Kind < FirstTargetFixupKind) - return MCAsmBackend::getFixupKindInfo(Kind); - - assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() && - "Invalid kind!"); - return Infos[Kind - FirstTargetFixupKind]; - } - - void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t Value, bool IsPCRel) const override; - - bool mayNeedRelaxation(const MCInst &Inst) const override; - bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, - const MCRelaxableFragment *DF, - const MCAsmLayout &Layout) const override; - void relaxInstruction(const MCInst &Inst, MCInst &Res) const override; - bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override; - - void HandleAssemblerFlag(MCAssemblerFlag Flag) {} - - unsigned getPointerSize() const { return 8; } -}; - -} // end anonymous namespace - -/// \brief The number of bytes the fixup may change. 
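To illustrate what the fixup table above encodes: fixup_arm64_add_imm12 occupies 12 bits starting at bit 10 of the instruction word, so applyFixup shifts the adjusted value left by that TargetOffset and ORs it into the fragment bytes. A minimal sketch; the opcode skeleton here is only an example, not taken from the backend.

#include <cassert>
#include <cstdint>

int main() {
  uint32_t Inst = 0x91000000;       // illustrative ADD (immediate) skeleton
  uint64_t FixupValue = 0x123;      // resolved 12-bit immediate
  unsigned TargetOffset = 10;       // from the fixup_arm64_add_imm12 entry

  Inst |= uint32_t(FixupValue << TargetOffset);   // what applyFixup effectively does
  assert(((Inst >> 10) & 0xfff) == 0x123);        // value landed in bits [21:10]
  return 0;
}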
-static unsigned getFixupKindNumBytes(unsigned Kind) { - switch (Kind) { - default: - assert(0 && "Unknown fixup kind!"); - - case ARM64::fixup_arm64_tlsdesc_call: - return 0; - - case FK_Data_1: - return 1; - - case FK_Data_2: - case ARM64::fixup_arm64_movw: - return 2; - - case ARM64::fixup_arm64_pcrel_branch14: - case ARM64::fixup_arm64_add_imm12: - case ARM64::fixup_arm64_ldst_imm12_scale1: - case ARM64::fixup_arm64_ldst_imm12_scale2: - case ARM64::fixup_arm64_ldst_imm12_scale4: - case ARM64::fixup_arm64_ldst_imm12_scale8: - case ARM64::fixup_arm64_ldst_imm12_scale16: - case ARM64::fixup_arm64_ldr_pcrel_imm19: - case ARM64::fixup_arm64_pcrel_branch19: - return 3; - - case ARM64::fixup_arm64_pcrel_adr_imm21: - case ARM64::fixup_arm64_pcrel_adrp_imm21: - case ARM64::fixup_arm64_pcrel_branch26: - case ARM64::fixup_arm64_pcrel_call26: - case FK_Data_4: - return 4; - - case FK_Data_8: - return 8; - } -} - -static unsigned AdrImmBits(unsigned Value) { - unsigned lo2 = Value & 0x3; - unsigned hi19 = (Value & 0x1ffffc) >> 2; - return (hi19 << 5) | (lo2 << 29); -} - -static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value) { - int64_t SignedValue = static_cast(Value); - switch (Kind) { - default: - assert(false && "Unknown fixup kind!"); - case ARM64::fixup_arm64_pcrel_adr_imm21: - if (SignedValue > 2097151 || SignedValue < -2097152) - report_fatal_error("fixup value out of range"); - return AdrImmBits(Value & 0x1fffffULL); - case ARM64::fixup_arm64_pcrel_adrp_imm21: - return AdrImmBits((Value & 0x1fffff000ULL) >> 12); - case ARM64::fixup_arm64_ldr_pcrel_imm19: - case ARM64::fixup_arm64_pcrel_branch19: - // Signed 21-bit immediate - if (SignedValue > 2097151 || SignedValue < -2097152) - report_fatal_error("fixup value out of range"); - // Low two bits are not encoded. - return (Value >> 2) & 0x7ffff; - case ARM64::fixup_arm64_add_imm12: - case ARM64::fixup_arm64_ldst_imm12_scale1: - // Unsigned 12-bit immediate - if (Value >= 0x1000) - report_fatal_error("invalid imm12 fixup value"); - return Value; - case ARM64::fixup_arm64_ldst_imm12_scale2: - // Unsigned 12-bit immediate which gets multiplied by 2 - if (Value & 1 || Value >= 0x2000) - report_fatal_error("invalid imm12 fixup value"); - return Value >> 1; - case ARM64::fixup_arm64_ldst_imm12_scale4: - // Unsigned 12-bit immediate which gets multiplied by 4 - if (Value & 3 || Value >= 0x4000) - report_fatal_error("invalid imm12 fixup value"); - return Value >> 2; - case ARM64::fixup_arm64_ldst_imm12_scale8: - // Unsigned 12-bit immediate which gets multiplied by 8 - if (Value & 7 || Value >= 0x8000) - report_fatal_error("invalid imm12 fixup value"); - return Value >> 3; - case ARM64::fixup_arm64_ldst_imm12_scale16: - // Unsigned 12-bit immediate which gets multiplied by 16 - if (Value & 15 || Value >= 0x10000) - report_fatal_error("invalid imm12 fixup value"); - return Value >> 4; - case ARM64::fixup_arm64_movw: - report_fatal_error("no resolvable MOVZ/MOVK fixups supported yet"); - return Value; - case ARM64::fixup_arm64_pcrel_branch14: - // Signed 16-bit immediate - if (SignedValue > 32767 || SignedValue < -32768) - report_fatal_error("fixup value out of range"); - // Low two bits are not encoded (4-byte alignment assumed). 
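A small illustration of the imm12 scaling performed by adjustFixupValue: for an 8-byte load/store fixup the resolved byte offset is range-checked and divided by the access size before encoding. The offset value is made up.

#include <cassert>
#include <cstdint>

int main() {
  uint64_t Value = 0x40;                        // resolved byte offset
  assert((Value & 7) == 0 && Value < 0x8000);   // the checks the backend performs
  uint64_t Imm12 = Value >> 3;                  // scale for an 8-byte access
  assert(Imm12 == 8);                           // offset 0x40 encodes as imm12 = 8
  return 0;
}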
- if (Value & 0x3) - report_fatal_error("fixup not sufficiently aligned"); - return (Value >> 2) & 0x3fff; - case ARM64::fixup_arm64_pcrel_branch26: - case ARM64::fixup_arm64_pcrel_call26: - // Signed 28-bit immediate - if (SignedValue > 134217727 || SignedValue < -134217728) - report_fatal_error("fixup value out of range"); - // Low two bits are not encoded (4-byte alignment assumed). - if (Value & 0x3) - report_fatal_error("fixup not sufficiently aligned"); - return (Value >> 2) & 0x3ffffff; - case FK_Data_1: - case FK_Data_2: - case FK_Data_4: - case FK_Data_8: - return Value; - } -} - -void ARM64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data, - unsigned DataSize, uint64_t Value, - bool IsPCRel) const { - unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind()); - if (!Value) - return; // Doesn't change encoding. - MCFixupKindInfo Info = getFixupKindInfo(Fixup.getKind()); - // Apply any target-specific value adjustments. - Value = adjustFixupValue(Fixup.getKind(), Value); - - // Shift the value into position. - Value <<= Info.TargetOffset; - - unsigned Offset = Fixup.getOffset(); - assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!"); - - // For each byte of the fragment that the fixup touches, mask in the - // bits from the fixup value. - for (unsigned i = 0; i != NumBytes; ++i) - Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff); -} - -bool ARM64AsmBackend::mayNeedRelaxation(const MCInst &Inst) const { - return false; -} - -bool ARM64AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, - const MCRelaxableFragment *DF, - const MCAsmLayout &Layout) const { - // FIXME: This isn't correct for ARM64. Just moving the "generic" logic - // into the targets for now. - // - // Relax if the value is too big for a (signed) i8. - return int64_t(Value) != int64_t(int8_t(Value)); -} - -void ARM64AsmBackend::relaxInstruction(const MCInst &Inst, MCInst &Res) const { - assert(false && "ARM64AsmBackend::relaxInstruction() unimplemented"); -} - -bool ARM64AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { - // If the count is not 4-byte aligned, we must be writing data into the text - // section (otherwise we have unaligned instructions, and thus have far - // bigger problems), so just write zeros instead. - if ((Count & 3) != 0) { - for (uint64_t i = 0, e = (Count & 3); i != e; ++i) - OW->Write8(0); - } - - // We are properly aligned, so write NOPs as requested. - Count /= 4; - for (uint64_t i = 0; i != Count; ++i) - OW->Write32(0xd503201f); - return true; -} - -namespace { - -namespace CU { - -/// \brief Compact unwind encoding values. -enum CompactUnwindEncodings { - /// \brief A "frameless" leaf function, where no non-volatile registers are - /// saved. The return remains in LR throughout the function. - UNWIND_ARM64_MODE_FRAMELESS = 0x02000000, - - /// \brief No compact unwind encoding available. Instead the low 23-bits of - /// the compact unwind encoding is the offset of the DWARF FDE in the - /// __eh_frame section. This mode is never used in object files. It is only - /// generated by the linker in final linked images, which have only DWARF info - /// for a function. - UNWIND_ARM64_MODE_DWARF = 0x03000000, - - /// \brief This is a standard arm64 prologue where FP/LR are immediately - /// pushed on the stack, then SP is copied to FP. If there are any - /// non-volatile register saved, they are copied into the stack fame in pairs - /// in a contiguous ranger right below the saved FP/LR pair. 
Any subset of the - /// five X pairs and four D pairs can be saved, but the memory layout must be - /// in register number order. - UNWIND_ARM64_MODE_FRAME = 0x04000000, - - /// \brief Frame register pair encodings. - UNWIND_ARM64_FRAME_X19_X20_PAIR = 0x00000001, - UNWIND_ARM64_FRAME_X21_X22_PAIR = 0x00000002, - UNWIND_ARM64_FRAME_X23_X24_PAIR = 0x00000004, - UNWIND_ARM64_FRAME_X25_X26_PAIR = 0x00000008, - UNWIND_ARM64_FRAME_X27_X28_PAIR = 0x00000010, - UNWIND_ARM64_FRAME_D8_D9_PAIR = 0x00000100, - UNWIND_ARM64_FRAME_D10_D11_PAIR = 0x00000200, - UNWIND_ARM64_FRAME_D12_D13_PAIR = 0x00000400, - UNWIND_ARM64_FRAME_D14_D15_PAIR = 0x00000800 -}; - -} // end CU namespace - -// FIXME: This should be in a separate file. -class DarwinARM64AsmBackend : public ARM64AsmBackend { - const MCRegisterInfo &MRI; - - /// \brief Encode compact unwind stack adjustment for frameless functions. - /// See UNWIND_ARM64_FRAMELESS_STACK_SIZE_MASK in compact_unwind_encoding.h. - /// The stack size always needs to be 16 byte aligned. - uint32_t encodeStackAdjustment(uint32_t StackSize) const { - return (StackSize / 16) << 12; - } - -public: - DarwinARM64AsmBackend(const Target &T, const MCRegisterInfo &MRI) - : ARM64AsmBackend(T), MRI(MRI) {} - - MCObjectWriter *createObjectWriter(raw_ostream &OS) const override { - return createARM64MachObjectWriter(OS, MachO::CPU_TYPE_ARM64, - MachO::CPU_SUBTYPE_ARM64_ALL); - } - - bool doesSectionRequireSymbols(const MCSection &Section) const override { - // Any section for which the linker breaks things into atoms needs to - // preserve symbols, including assembler local symbols, to identify - // those atoms. These sections are: - // Sections of type: - // - // S_CSTRING_LITERALS (e.g. __cstring) - // S_LITERAL_POINTERS (e.g. objc selector pointers) - // S_16BYTE_LITERALS, S_8BYTE_LITERALS, S_4BYTE_LITERALS - // - // Sections named: - // - // __TEXT,__eh_frame - // __TEXT,__ustring - // __DATA,__cfstring - // __DATA,__objc_classrefs - // __DATA,__objc_catlist - // - // FIXME: It would be better if the compiler used actual linker local - // symbols for each of these sections rather than preserving what - // are ostensibly assembler local symbols. - const MCSectionMachO &SMO = static_cast(Section); - return (SMO.getType() == MachO::S_CSTRING_LITERALS || - SMO.getType() == MachO::S_4BYTE_LITERALS || - SMO.getType() == MachO::S_8BYTE_LITERALS || - SMO.getType() == MachO::S_16BYTE_LITERALS || - SMO.getType() == MachO::S_LITERAL_POINTERS || - (SMO.getSegmentName() == "__TEXT" && - (SMO.getSectionName() == "__eh_frame" || - SMO.getSectionName() == "__ustring")) || - (SMO.getSegmentName() == "__DATA" && - (SMO.getSectionName() == "__cfstring" || - SMO.getSectionName() == "__objc_classrefs" || - SMO.getSectionName() == "__objc_catlist"))); - } - - /// \brief Generate the compact unwind encoding from the CFI directives. - uint32_t generateCompactUnwindEncoding( - ArrayRef Instrs) const override { - if (Instrs.empty()) - return CU::UNWIND_ARM64_MODE_FRAMELESS; - - bool HasFP = false; - unsigned StackSize = 0; - - uint32_t CompactUnwindEncoding = 0; - for (size_t i = 0, e = Instrs.size(); i != e; ++i) { - const MCCFIInstruction &Inst = Instrs[i]; - - switch (Inst.getOperation()) { - default: - // Cannot handle this directive: bail out. - return CU::UNWIND_ARM64_MODE_DWARF; - case MCCFIInstruction::OpDefCfa: { - // Defines a frame pointer. 
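For the frameless case handled at the end of generateCompactUnwindEncoding, the encoding is just the FRAMELESS mode bits plus the 16-byte-scaled stack size placed at bit 12. A quick numeric sketch using the CU constant shown above; the stack size is an example value.

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t UNWIND_ARM64_MODE_FRAMELESS = 0x02000000;
  uint32_t StackSize = 64;   // bytes; must be 16-byte aligned

  // encodeStackAdjustment(StackSize) == (StackSize / 16) << 12
  uint32_t Encoding = UNWIND_ARM64_MODE_FRAMELESS | ((StackSize / 16) << 12);
  assert(Encoding == 0x02004000);
  return 0;
}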
- assert(getXRegFromWReg(MRI.getLLVMRegNum(Inst.getRegister(), true)) == - ARM64::FP && - "Invalid frame pointer!"); - assert(i + 2 < e && "Insufficient CFI instructions to define a frame!"); - - const MCCFIInstruction &LRPush = Instrs[++i]; - assert(LRPush.getOperation() == MCCFIInstruction::OpOffset && - "Link register not pushed!"); - const MCCFIInstruction &FPPush = Instrs[++i]; - assert(FPPush.getOperation() == MCCFIInstruction::OpOffset && - "Frame pointer not pushed!"); - - unsigned LRReg = MRI.getLLVMRegNum(LRPush.getRegister(), true); - unsigned FPReg = MRI.getLLVMRegNum(FPPush.getRegister(), true); - - LRReg = getXRegFromWReg(LRReg); - FPReg = getXRegFromWReg(FPReg); - - assert(LRReg == ARM64::LR && FPReg == ARM64::FP && - "Pushing invalid registers for frame!"); - - // Indicate that the function has a frame. - CompactUnwindEncoding |= CU::UNWIND_ARM64_MODE_FRAME; - HasFP = true; - break; - } - case MCCFIInstruction::OpDefCfaOffset: { - assert(StackSize == 0 && "We already have the CFA offset!"); - StackSize = std::abs(Inst.getOffset()); - break; - } - case MCCFIInstruction::OpOffset: { - // Registers are saved in pairs. We expect there to be two consecutive - // `.cfi_offset' instructions with the appropriate registers specified. - unsigned Reg1 = MRI.getLLVMRegNum(Inst.getRegister(), true); - if (i + 1 == e) - return CU::UNWIND_ARM64_MODE_DWARF; - - const MCCFIInstruction &Inst2 = Instrs[++i]; - if (Inst2.getOperation() != MCCFIInstruction::OpOffset) - return CU::UNWIND_ARM64_MODE_DWARF; - unsigned Reg2 = MRI.getLLVMRegNum(Inst2.getRegister(), true); - - // N.B. The encodings must be in register number order, and the X - // registers before the D registers. - - // X19/X20 pair = 0x00000001, - // X21/X22 pair = 0x00000002, - // X23/X24 pair = 0x00000004, - // X25/X26 pair = 0x00000008, - // X27/X28 pair = 0x00000010 - Reg1 = getXRegFromWReg(Reg1); - Reg2 = getXRegFromWReg(Reg2); - - if (Reg1 == ARM64::X19 && Reg2 == ARM64::X20 && - (CompactUnwindEncoding & 0xF1E) == 0) - CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_X19_X20_PAIR; - else if (Reg1 == ARM64::X21 && Reg2 == ARM64::X22 && - (CompactUnwindEncoding & 0xF1C) == 0) - CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_X21_X22_PAIR; - else if (Reg1 == ARM64::X23 && Reg2 == ARM64::X24 && - (CompactUnwindEncoding & 0xF18) == 0) - CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_X23_X24_PAIR; - else if (Reg1 == ARM64::X25 && Reg2 == ARM64::X26 && - (CompactUnwindEncoding & 0xF10) == 0) - CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_X25_X26_PAIR; - else if (Reg1 == ARM64::X27 && Reg2 == ARM64::X28 && - (CompactUnwindEncoding & 0xF00) == 0) - CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_X27_X28_PAIR; - else { - Reg1 = getDRegFromBReg(Reg1); - Reg2 = getDRegFromBReg(Reg2); - - // D8/D9 pair = 0x00000100, - // D10/D11 pair = 0x00000200, - // D12/D13 pair = 0x00000400, - // D14/D15 pair = 0x00000800 - if (Reg1 == ARM64::D8 && Reg2 == ARM64::D9 && - (CompactUnwindEncoding & 0xE00) == 0) - CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_D8_D9_PAIR; - else if (Reg1 == ARM64::D10 && Reg2 == ARM64::D11 && - (CompactUnwindEncoding & 0xC00) == 0) - CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_D10_D11_PAIR; - else if (Reg1 == ARM64::D12 && Reg2 == ARM64::D13 && - (CompactUnwindEncoding & 0x800) == 0) - CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_D12_D13_PAIR; - else if (Reg1 == ARM64::D14 && Reg2 == ARM64::D15) - CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_D14_D15_PAIR; - else - // A pair was pushed which we cannot handle. 
- return CU::UNWIND_ARM64_MODE_DWARF; - } - - break; - } - } - } - - if (!HasFP) { - // With compact unwind info we can only represent stack adjustments of up - // to 65520 bytes. - if (StackSize > 65520) - return CU::UNWIND_ARM64_MODE_DWARF; - - CompactUnwindEncoding |= CU::UNWIND_ARM64_MODE_FRAMELESS; - CompactUnwindEncoding |= encodeStackAdjustment(StackSize); - } - - return CompactUnwindEncoding; - } -}; - -} // end anonymous namespace - -namespace { - -class ELFARM64AsmBackend : public ARM64AsmBackend { -public: - uint8_t OSABI; - bool IsLittleEndian; - - ELFARM64AsmBackend(const Target &T, uint8_t OSABI, bool IsLittleEndian) - : ARM64AsmBackend(T), OSABI(OSABI), IsLittleEndian(IsLittleEndian) {} - - MCObjectWriter *createObjectWriter(raw_ostream &OS) const override { - return createARM64ELFObjectWriter(OS, OSABI, IsLittleEndian); - } - - void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout, - const MCFixup &Fixup, const MCFragment *DF, - const MCValue &Target, uint64_t &Value, - bool &IsResolved) override; - - void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t Value, bool IsPCRel) const override; -}; - -void ELFARM64AsmBackend::processFixupValue(const MCAssembler &Asm, - const MCAsmLayout &Layout, - const MCFixup &Fixup, - const MCFragment *DF, - const MCValue &Target, - uint64_t &Value, bool &IsResolved) { - // The ADRP instruction adds some multiple of 0x1000 to the current PC & - // ~0xfff. This means that the required offset to reach a symbol can vary by - // up to one step depending on where the ADRP is in memory. For example: - // - // ADRP x0, there - // there: - // - // If the ADRP occurs at address 0xffc then "there" will be at 0x1000 and - // we'll need that as an offset. At any other address "there" will be in the - // same page as the ADRP and the instruction should encode 0x0. Assuming the - // section isn't 0x1000-aligned, we therefore need to delegate this decision - // to the linker -- a relocation! 
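A numeric restatement of the ADRP comment above: the instruction materialises page(target) - page(pc), so the immediate needed for the same symbol differs with the page the ADRP itself sits in. The addresses are invented for the example.

#include <cassert>
#include <cstdint>

// Page delta an ADRP at PC would need to reach Target (4 KiB pages).
static int64_t adrpDelta(uint64_t PC, uint64_t Target) {
  return int64_t(Target & ~0xfffULL) - int64_t(PC & ~0xfffULL);
}

int main() {
  uint64_t There = 0x1000;                              // label just after the ADRP
  assert(adrpDelta(/*PC=*/0x0ffc, There) == 0x1000);    // ADRP at 0xffc: one page away
  assert(adrpDelta(/*PC=*/0x1000, There) == 0);         // ADRP in the same page: zero
  return 0;
}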
- if ((uint32_t)Fixup.getKind() == ARM64::fixup_arm64_pcrel_adrp_imm21) - IsResolved = false; -} - -void ELFARM64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data, - unsigned DataSize, uint64_t Value, - bool IsPCRel) const { - // store fixups in .eh_frame section in big endian order - if (!IsLittleEndian && Fixup.getKind() == FK_Data_4) { - const MCSection *Sec = Fixup.getValue()->FindAssociatedSection(); - const MCSectionELF *SecELF = static_cast(Sec); - if (SecELF->getSectionName() == ".eh_frame") - Value = ByteSwap_32(unsigned(Value)); - } - ARM64AsmBackend::applyFixup (Fixup, Data, DataSize, Value, IsPCRel); -} -} - -MCAsmBackend *llvm::createARM64leAsmBackend(const Target &T, - const MCRegisterInfo &MRI, - StringRef TT, StringRef CPU) { - Triple TheTriple(TT); - - if (TheTriple.isOSDarwin()) - return new DarwinARM64AsmBackend(T, MRI); - - assert(TheTriple.isOSBinFormatELF() && "Expect either MachO or ELF target"); - return new ELFARM64AsmBackend(T, TheTriple.getOS(), /*IsLittleEndian=*/true); -} - -MCAsmBackend *llvm::createARM64beAsmBackend(const Target &T, - const MCRegisterInfo &MRI, - StringRef TT, StringRef CPU) { - Triple TheTriple(TT); - - assert(TheTriple.isOSBinFormatELF() && "Big endian is only supported for ELF targets!"); - return new ELFARM64AsmBackend(T, TheTriple.getOS(), /*IsLittleEndian=*/false); -} diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64ELFObjectWriter.cpp b/lib/Target/ARM64/MCTargetDesc/ARM64ELFObjectWriter.cpp deleted file mode 100644 index 0990a701bc8..00000000000 --- a/lib/Target/ARM64/MCTargetDesc/ARM64ELFObjectWriter.cpp +++ /dev/null @@ -1,255 +0,0 @@ -//===-- ARM64ELFObjectWriter.cpp - ARM64 ELF Writer -----------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file handles ELF-specific object emission, converting LLVM's internal -// fixups into the appropriate relocations. 
-// -//===----------------------------------------------------------------------===// - -#include "MCTargetDesc/ARM64FixupKinds.h" -#include "MCTargetDesc/ARM64MCExpr.h" -#include "MCTargetDesc/ARM64MCTargetDesc.h" -#include "llvm/MC/MCELFObjectWriter.h" -#include "llvm/MC/MCValue.h" -#include "llvm/Support/ErrorHandling.h" - -using namespace llvm; - -namespace { -class ARM64ELFObjectWriter : public MCELFObjectTargetWriter { -public: - ARM64ELFObjectWriter(uint8_t OSABI, bool IsLittleEndian); - - virtual ~ARM64ELFObjectWriter(); - -protected: - unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup, - bool IsPCRel) const override; - -private: -}; -} - -ARM64ELFObjectWriter::ARM64ELFObjectWriter(uint8_t OSABI, bool IsLittleEndian) - : MCELFObjectTargetWriter(/*Is64Bit*/ true, OSABI, ELF::EM_AARCH64, - /*HasRelocationAddend*/ true) {} - -ARM64ELFObjectWriter::~ARM64ELFObjectWriter() {} - -unsigned ARM64ELFObjectWriter::GetRelocType(const MCValue &Target, - const MCFixup &Fixup, - bool IsPCRel) const { - ARM64MCExpr::VariantKind RefKind = - static_cast(Target.getRefKind()); - ARM64MCExpr::VariantKind SymLoc = ARM64MCExpr::getSymbolLoc(RefKind); - bool IsNC = ARM64MCExpr::isNotChecked(RefKind); - - assert((!Target.getSymA() || - Target.getSymA()->getKind() == MCSymbolRefExpr::VK_None) && - "Should only be expression-level modifiers here"); - - assert((!Target.getSymB() || - Target.getSymB()->getKind() == MCSymbolRefExpr::VK_None) && - "Should only be expression-level modifiers here"); - - if (IsPCRel) { - switch ((unsigned)Fixup.getKind()) { - case FK_Data_2: - return ELF::R_AARCH64_PREL16; - case FK_Data_4: - return ELF::R_AARCH64_PREL32; - case FK_Data_8: - return ELF::R_AARCH64_PREL64; - case ARM64::fixup_arm64_pcrel_adr_imm21: - assert(SymLoc == ARM64MCExpr::VK_NONE && "unexpected ADR relocation"); - return ELF::R_AARCH64_ADR_PREL_LO21; - case ARM64::fixup_arm64_pcrel_adrp_imm21: - if (SymLoc == ARM64MCExpr::VK_ABS && !IsNC) - return ELF::R_AARCH64_ADR_PREL_PG_HI21; - if (SymLoc == ARM64MCExpr::VK_GOT && !IsNC) - return ELF::R_AARCH64_ADR_GOT_PAGE; - if (SymLoc == ARM64MCExpr::VK_GOTTPREL && !IsNC) - return ELF::R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21; - if (SymLoc == ARM64MCExpr::VK_TLSDESC && !IsNC) - return ELF::R_AARCH64_TLSDESC_ADR_PAGE; - llvm_unreachable("invalid symbol kind for ADRP relocation"); - case ARM64::fixup_arm64_pcrel_branch26: - return ELF::R_AARCH64_JUMP26; - case ARM64::fixup_arm64_pcrel_call26: - return ELF::R_AARCH64_CALL26; - case ARM64::fixup_arm64_ldr_pcrel_imm19: - if (SymLoc == ARM64MCExpr::VK_GOTTPREL) - return ELF::R_AARCH64_TLSIE_LD_GOTTPREL_PREL19; - return ELF::R_AARCH64_LD_PREL_LO19; - case ARM64::fixup_arm64_pcrel_branch14: - return ELF::R_AARCH64_TSTBR14; - case ARM64::fixup_arm64_pcrel_branch19: - return ELF::R_AARCH64_CONDBR19; - default: - llvm_unreachable("Unsupported pc-relative fixup kind"); - } - } else { - switch ((unsigned)Fixup.getKind()) { - case FK_Data_2: - return ELF::R_AARCH64_ABS16; - case FK_Data_4: - return ELF::R_AARCH64_ABS32; - case FK_Data_8: - return ELF::R_AARCH64_ABS64; - case ARM64::fixup_arm64_add_imm12: - if (RefKind == ARM64MCExpr::VK_DTPREL_HI12) - return ELF::R_AARCH64_TLSLD_ADD_DTPREL_HI12; - if (RefKind == ARM64MCExpr::VK_TPREL_HI12) - return ELF::R_AARCH64_TLSLE_ADD_TPREL_HI12; - if (RefKind == ARM64MCExpr::VK_DTPREL_LO12_NC) - return ELF::R_AARCH64_TLSLD_ADD_DTPREL_LO12_NC; - if (RefKind == ARM64MCExpr::VK_DTPREL_LO12) - return ELF::R_AARCH64_TLSLD_ADD_DTPREL_LO12; - if (RefKind == ARM64MCExpr::VK_TPREL_LO12_NC) 
- return ELF::R_AARCH64_TLSLE_ADD_TPREL_LO12_NC; - if (RefKind == ARM64MCExpr::VK_TPREL_LO12) - return ELF::R_AARCH64_TLSLE_ADD_TPREL_LO12; - if (RefKind == ARM64MCExpr::VK_TLSDESC_LO12) - return ELF::R_AARCH64_TLSDESC_ADD_LO12_NC; - if (SymLoc == ARM64MCExpr::VK_ABS && IsNC) - return ELF::R_AARCH64_ADD_ABS_LO12_NC; - - report_fatal_error("invalid fixup for add (uimm12) instruction"); - return 0; - case ARM64::fixup_arm64_ldst_imm12_scale1: - if (SymLoc == ARM64MCExpr::VK_ABS && IsNC) - return ELF::R_AARCH64_LDST8_ABS_LO12_NC; - if (SymLoc == ARM64MCExpr::VK_DTPREL && !IsNC) - return ELF::R_AARCH64_TLSLD_LDST8_DTPREL_LO12; - if (SymLoc == ARM64MCExpr::VK_DTPREL && IsNC) - return ELF::R_AARCH64_TLSLD_LDST8_DTPREL_LO12_NC; - if (SymLoc == ARM64MCExpr::VK_TPREL && !IsNC) - return ELF::R_AARCH64_TLSLE_LDST8_TPREL_LO12; - if (SymLoc == ARM64MCExpr::VK_TPREL && IsNC) - return ELF::R_AARCH64_TLSLE_LDST8_TPREL_LO12_NC; - - report_fatal_error("invalid fixup for 8-bit load/store instruction"); - return 0; - case ARM64::fixup_arm64_ldst_imm12_scale2: - if (SymLoc == ARM64MCExpr::VK_ABS && IsNC) - return ELF::R_AARCH64_LDST16_ABS_LO12_NC; - if (SymLoc == ARM64MCExpr::VK_DTPREL && !IsNC) - return ELF::R_AARCH64_TLSLD_LDST16_DTPREL_LO12; - if (SymLoc == ARM64MCExpr::VK_DTPREL && IsNC) - return ELF::R_AARCH64_TLSLD_LDST16_DTPREL_LO12_NC; - if (SymLoc == ARM64MCExpr::VK_TPREL && !IsNC) - return ELF::R_AARCH64_TLSLE_LDST16_TPREL_LO12; - if (SymLoc == ARM64MCExpr::VK_TPREL && IsNC) - return ELF::R_AARCH64_TLSLE_LDST16_TPREL_LO12_NC; - - report_fatal_error("invalid fixup for 16-bit load/store instruction"); - return 0; - case ARM64::fixup_arm64_ldst_imm12_scale4: - if (SymLoc == ARM64MCExpr::VK_ABS && IsNC) - return ELF::R_AARCH64_LDST32_ABS_LO12_NC; - if (SymLoc == ARM64MCExpr::VK_DTPREL && !IsNC) - return ELF::R_AARCH64_TLSLD_LDST32_DTPREL_LO12; - if (SymLoc == ARM64MCExpr::VK_DTPREL && IsNC) - return ELF::R_AARCH64_TLSLD_LDST32_DTPREL_LO12_NC; - if (SymLoc == ARM64MCExpr::VK_TPREL && !IsNC) - return ELF::R_AARCH64_TLSLE_LDST32_TPREL_LO12; - if (SymLoc == ARM64MCExpr::VK_TPREL && IsNC) - return ELF::R_AARCH64_TLSLE_LDST32_TPREL_LO12_NC; - - report_fatal_error("invalid fixup for 32-bit load/store instruction"); - return 0; - case ARM64::fixup_arm64_ldst_imm12_scale8: - if (SymLoc == ARM64MCExpr::VK_ABS && IsNC) - return ELF::R_AARCH64_LDST64_ABS_LO12_NC; - if (SymLoc == ARM64MCExpr::VK_GOT && IsNC) - return ELF::R_AARCH64_LD64_GOT_LO12_NC; - if (SymLoc == ARM64MCExpr::VK_DTPREL && !IsNC) - return ELF::R_AARCH64_TLSLD_LDST64_DTPREL_LO12; - if (SymLoc == ARM64MCExpr::VK_DTPREL && IsNC) - return ELF::R_AARCH64_TLSLD_LDST64_DTPREL_LO12_NC; - if (SymLoc == ARM64MCExpr::VK_TPREL && !IsNC) - return ELF::R_AARCH64_TLSLE_LDST64_TPREL_LO12; - if (SymLoc == ARM64MCExpr::VK_TPREL && IsNC) - return ELF::R_AARCH64_TLSLE_LDST64_TPREL_LO12_NC; - if (SymLoc == ARM64MCExpr::VK_GOTTPREL && IsNC) - return ELF::R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC; - if (SymLoc == ARM64MCExpr::VK_TLSDESC && IsNC) - return ELF::R_AARCH64_TLSDESC_LD64_LO12_NC; - - report_fatal_error("invalid fixup for 64-bit load/store instruction"); - return 0; - case ARM64::fixup_arm64_ldst_imm12_scale16: - if (SymLoc == ARM64MCExpr::VK_ABS && IsNC) - return ELF::R_AARCH64_LDST128_ABS_LO12_NC; - - report_fatal_error("invalid fixup for 128-bit load/store instruction"); - return 0; - case ARM64::fixup_arm64_movw: - if (RefKind == ARM64MCExpr::VK_ABS_G3) - return ELF::R_AARCH64_MOVW_UABS_G3; - if (RefKind == ARM64MCExpr::VK_ABS_G2) - return 
ELF::R_AARCH64_MOVW_UABS_G2; - if (RefKind == ARM64MCExpr::VK_ABS_G2_S) - return ELF::R_AARCH64_MOVW_SABS_G2; - if (RefKind == ARM64MCExpr::VK_ABS_G2_NC) - return ELF::R_AARCH64_MOVW_UABS_G2_NC; - if (RefKind == ARM64MCExpr::VK_ABS_G1) - return ELF::R_AARCH64_MOVW_UABS_G1; - if (RefKind == ARM64MCExpr::VK_ABS_G1_S) - return ELF::R_AARCH64_MOVW_SABS_G1; - if (RefKind == ARM64MCExpr::VK_ABS_G1_NC) - return ELF::R_AARCH64_MOVW_UABS_G1_NC; - if (RefKind == ARM64MCExpr::VK_ABS_G0) - return ELF::R_AARCH64_MOVW_UABS_G0; - if (RefKind == ARM64MCExpr::VK_ABS_G0_S) - return ELF::R_AARCH64_MOVW_SABS_G0; - if (RefKind == ARM64MCExpr::VK_ABS_G0_NC) - return ELF::R_AARCH64_MOVW_UABS_G0_NC; - if (RefKind == ARM64MCExpr::VK_DTPREL_G2) - return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G2; - if (RefKind == ARM64MCExpr::VK_DTPREL_G1) - return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G1; - if (RefKind == ARM64MCExpr::VK_DTPREL_G1_NC) - return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G1_NC; - if (RefKind == ARM64MCExpr::VK_DTPREL_G0) - return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G0; - if (RefKind == ARM64MCExpr::VK_DTPREL_G0_NC) - return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G0_NC; - if (RefKind == ARM64MCExpr::VK_TPREL_G2) - return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G2; - if (RefKind == ARM64MCExpr::VK_TPREL_G1) - return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G1; - if (RefKind == ARM64MCExpr::VK_TPREL_G1_NC) - return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G1_NC; - if (RefKind == ARM64MCExpr::VK_TPREL_G0) - return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G0; - if (RefKind == ARM64MCExpr::VK_TPREL_G0_NC) - return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G0_NC; - if (RefKind == ARM64MCExpr::VK_GOTTPREL_G1) - return ELF::R_AARCH64_TLSIE_MOVW_GOTTPREL_G1; - if (RefKind == ARM64MCExpr::VK_GOTTPREL_G0_NC) - return ELF::R_AARCH64_TLSIE_MOVW_GOTTPREL_G0_NC; - report_fatal_error("invalid fixup for movz/movk instruction"); - return 0; - case ARM64::fixup_arm64_tlsdesc_call: - return ELF::R_AARCH64_TLSDESC_CALL; - default: - llvm_unreachable("Unknown ELF relocation type"); - } - } - - llvm_unreachable("Unimplemented fixup -> relocation"); -} - -MCObjectWriter *llvm::createARM64ELFObjectWriter(raw_ostream &OS, - uint8_t OSABI, - bool IsLittleEndian) { - MCELFObjectTargetWriter *MOTW = new ARM64ELFObjectWriter(OSABI, IsLittleEndian); - return createELFObjectWriter(MOTW, OS, IsLittleEndian); -} diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64ELFStreamer.cpp b/lib/Target/ARM64/MCTargetDesc/ARM64ELFStreamer.cpp deleted file mode 100644 index adbf8307972..00000000000 --- a/lib/Target/ARM64/MCTargetDesc/ARM64ELFStreamer.cpp +++ /dev/null @@ -1,160 +0,0 @@ -//===- lib/MC/ARM64ELFStreamer.cpp - ELF Object Output for ARM64 ----------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file assembles .s files and emits AArch64 ELF .o object files. Different -// from generic ELF streamer in emitting mapping symbols ($x and $d) to delimit -// regions of data and code. 
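
For readers skimming the deleted object writer above: GetRelocType is essentially a pure function of the fixup kind, the symbol-location part of the modifier, and the NC flag. A minimal standalone C++ sketch of two of the cases it handles, using invented plain enums rather than the real LLVM/ELF types, so the shape of the mapping is visible without the surrounding machinery:

#include <cstdio>

// Invented stand-ins for the symbol locations and ELF reloc codes; only the
// mapping below mirrors the deleted GetRelocType cases.
enum SymLoc { SL_ABS, SL_GOT };
enum Reloc { ADD_ABS_LO12_NC, LD64_GOT_LO12_NC, UNSUPPORTED };

// "add x0, x0, :lo12:sym"        -> R_AARCH64_ADD_ABS_LO12_NC
static Reloc pickAddImm12(SymLoc Loc, bool IsNC) {
  return (Loc == SL_ABS && IsNC) ? ADD_ABS_LO12_NC : UNSUPPORTED;
}

// "ldr x0, [x0, :got_lo12:sym]"  -> R_AARCH64_LD64_GOT_LO12_NC
static Reloc pickLdst64(SymLoc Loc, bool IsNC) {
  return (Loc == SL_GOT && IsNC) ? LD64_GOT_LO12_NC : UNSUPPORTED;
}

int main() {
  std::printf("%d %d\n", pickAddImm12(SL_ABS, true), pickLdst64(SL_GOT, true));
  return 0;
}
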
-// -//===----------------------------------------------------------------------===// - -#include "llvm/MC/MCELFStreamer.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/Twine.h" -#include "llvm/MC/MCAsmBackend.h" -#include "llvm/MC/MCAssembler.h" -#include "llvm/MC/MCCodeEmitter.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCELF.h" -#include "llvm/MC/MCELFStreamer.h" -#include "llvm/MC/MCELFSymbolFlags.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCInst.h" -#include "llvm/MC/MCObjectStreamer.h" -#include "llvm/MC/MCSection.h" -#include "llvm/MC/MCSectionELF.h" -#include "llvm/MC/MCStreamer.h" -#include "llvm/MC/MCSymbol.h" -#include "llvm/MC/MCValue.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/ELF.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/raw_ostream.h" - -using namespace llvm; - -namespace { - -/// Extend the generic ELFStreamer class so that it can emit mapping symbols at -/// the appropriate points in the object files. These symbols are defined in the -/// AArch64 ELF ABI: -/// infocenter.arm.com/help/topic/com.arm.doc.ihi0056a/IHI0056A_aaelf64.pdf -/// -/// In brief: $x or $d should be emitted at the start of each contiguous region -/// of A64 code or data in a section. In practice, this emission does not rely -/// on explicit assembler directives but on inherent properties of the -/// directives doing the emission (e.g. ".byte" is data, "add x0, x0, x0" an -/// instruction). -/// -/// As a result this system is orthogonal to the DataRegion infrastructure used -/// by MachO. Beware! -class ARM64ELFStreamer : public MCELFStreamer { -public: - ARM64ELFStreamer(MCContext &Context, MCAsmBackend &TAB, raw_ostream &OS, - MCCodeEmitter *Emitter) - : MCELFStreamer(Context, TAB, OS, Emitter), MappingSymbolCounter(0), - LastEMS(EMS_None) {} - - ~ARM64ELFStreamer() {} - - void ChangeSection(const MCSection *Section, - const MCExpr *Subsection) override { - // We have to keep track of the mapping symbol state of any sections we - // use. Each one should start off as EMS_None, which is provided as the - // default constructor by DenseMap::lookup. - LastMappingSymbols[getPreviousSection().first] = LastEMS; - LastEMS = LastMappingSymbols.lookup(Section); - - MCELFStreamer::ChangeSection(Section, Subsection); - } - - /// This function is the one used to emit instruction data into the ELF - /// streamer. We override it to add the appropriate mapping symbol if - /// necessary. - void EmitInstruction(const MCInst &Inst, - const MCSubtargetInfo &STI) override { - EmitA64MappingSymbol(); - MCELFStreamer::EmitInstruction(Inst, STI); - } - - /// This is one of the functions used to emit data into an ELF section, so the - /// ARM64 streamer overrides it to add the appropriate mapping symbol ($d) - /// if necessary. - void EmitBytes(StringRef Data) override { - EmitDataMappingSymbol(); - MCELFStreamer::EmitBytes(Data); - } - - /// This is one of the functions used to emit data into an ELF section, so the - /// ARM64 streamer overrides it to add the appropriate mapping symbol ($d) - /// if necessary. 
- void EmitValueImpl(const MCExpr *Value, unsigned Size, - const SMLoc &Loc) override { - EmitDataMappingSymbol(); - MCELFStreamer::EmitValueImpl(Value, Size); - } - -private: - enum ElfMappingSymbol { - EMS_None, - EMS_A64, - EMS_Data - }; - - void EmitDataMappingSymbol() { - if (LastEMS == EMS_Data) - return; - EmitMappingSymbol("$d"); - LastEMS = EMS_Data; - } - - void EmitA64MappingSymbol() { - if (LastEMS == EMS_A64) - return; - EmitMappingSymbol("$x"); - LastEMS = EMS_A64; - } - - void EmitMappingSymbol(StringRef Name) { - MCSymbol *Start = getContext().CreateTempSymbol(); - EmitLabel(Start); - - MCSymbol *Symbol = getContext().GetOrCreateSymbol( - Name + "." + Twine(MappingSymbolCounter++)); - - MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*Symbol); - MCELF::SetType(SD, ELF::STT_NOTYPE); - MCELF::SetBinding(SD, ELF::STB_LOCAL); - SD.setExternal(false); - Symbol->setSection(*getCurrentSection().first); - - const MCExpr *Value = MCSymbolRefExpr::Create(Start, getContext()); - Symbol->setVariableValue(Value); - } - - int64_t MappingSymbolCounter; - - DenseMap LastMappingSymbols; - ElfMappingSymbol LastEMS; - - /// @} -}; -} - -namespace llvm { -MCELFStreamer *createARM64ELFStreamer(MCContext &Context, MCAsmBackend &TAB, - raw_ostream &OS, MCCodeEmitter *Emitter, - bool RelaxAll, bool NoExecStack) { - ARM64ELFStreamer *S = new ARM64ELFStreamer(Context, TAB, OS, Emitter); - if (RelaxAll) - S->getAssembler().setRelaxAll(true); - if (NoExecStack) - S->getAssembler().setNoExecStack(true); - return S; -} -} diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64ELFStreamer.h b/lib/Target/ARM64/MCTargetDesc/ARM64ELFStreamer.h deleted file mode 100644 index 72dadbc50aa..00000000000 --- a/lib/Target/ARM64/MCTargetDesc/ARM64ELFStreamer.h +++ /dev/null @@ -1,26 +0,0 @@ -//===-- ARM64ELFStreamer.h - ELF Streamer for ARM64 -------------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements ELF streamer information for the ARM64 backend. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_AARCH64_ELF_STREAMER_H -#define LLVM_AARCH64_ELF_STREAMER_H - -#include "llvm/MC/MCELFStreamer.h" - -namespace llvm { - -MCELFStreamer *createARM64ELFStreamer(MCContext &Context, MCAsmBackend &TAB, - raw_ostream &OS, MCCodeEmitter *Emitter, - bool RelaxAll, bool NoExecStack); -} - -#endif // ARM64_ELF_STREAMER_H diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64FixupKinds.h b/lib/Target/ARM64/MCTargetDesc/ARM64FixupKinds.h deleted file mode 100644 index 7106b314ea2..00000000000 --- a/lib/Target/ARM64/MCTargetDesc/ARM64FixupKinds.h +++ /dev/null @@ -1,76 +0,0 @@ -//===-- ARM64FixupKinds.h - ARM64 Specific Fixup Entries --------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_ARM64FIXUPKINDS_H -#define LLVM_ARM64FIXUPKINDS_H - -#include "llvm/MC/MCFixup.h" - -namespace llvm { -namespace ARM64 { - -enum Fixups { - // fixup_arm64_pcrel_adr_imm21 - A 21-bit pc-relative immediate inserted into - // an ADR instruction. 
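
The mapping-symbol handling in the deleted ARM64ELFStreamer above reduces to a two-state tracker per section: emit "$x" before the first instruction of a code run, "$d" before the first byte of a data run, and nothing while the emission kind is unchanged. A standalone sketch of just that tracker (the class name and interface here are invented for illustration):

#include <cstdio>

// Mirrors the ElfMappingSymbol / LastEMS bookkeeping from the deleted streamer.
enum MappingState { MS_None, MS_A64, MS_Data };

struct MappingTracker {
  MappingState Last = MS_None;

  // Mapping symbol to emit before an instruction, or nullptr if none is needed.
  const char *beforeInstruction() {
    if (Last == MS_A64)
      return nullptr;
    Last = MS_A64;
    return "$x";
  }

  // Mapping symbol to emit before raw data, or nullptr if none is needed.
  const char *beforeData() {
    if (Last == MS_Data)
      return nullptr;
    Last = MS_Data;
    return "$d";
  }
};

int main() {
  MappingTracker T;
  std::printf("%s\n", T.beforeInstruction());            // "$x": a code run starts
  std::printf("%d\n", T.beforeInstruction() == nullptr);  // 1: still in code, no symbol
  std::printf("%s\n", T.beforeData());                    // "$d": a data run starts
  return 0;
}
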
- fixup_arm64_pcrel_adr_imm21 = FirstTargetFixupKind, - - // fixup_arm64_pcrel_adrp_imm21 - A 21-bit pc-relative immediate inserted into - // an ADRP instruction. - fixup_arm64_pcrel_adrp_imm21, - - // fixup_arm64_imm12 - 12-bit fixup for add/sub instructions. - // No alignment adjustment. All value bits are encoded. - fixup_arm64_add_imm12, - - // fixup_arm64_ldst_imm12_* - unsigned 12-bit fixups for load and - // store instructions. - fixup_arm64_ldst_imm12_scale1, - fixup_arm64_ldst_imm12_scale2, - fixup_arm64_ldst_imm12_scale4, - fixup_arm64_ldst_imm12_scale8, - fixup_arm64_ldst_imm12_scale16, - - // fixup_arm64_ldr_pcrel_imm19 - The high 19 bits of a 21-bit pc-relative - // immediate. Same encoding as fixup_arm64_pcrel_adrhi, except this is used by - // pc-relative loads and generates relocations directly when necessary. - fixup_arm64_ldr_pcrel_imm19, - - // FIXME: comment - fixup_arm64_movw, - - // fixup_arm64_pcrel_imm14 - The high 14 bits of a 21-bit pc-relative - // immediate. - fixup_arm64_pcrel_branch14, - - // fixup_arm64_pcrel_branch19 - The high 19 bits of a 21-bit pc-relative - // immediate. Same encoding as fixup_arm64_pcrel_adrhi, except this is use by - // b.cc and generates relocations directly when necessary. - fixup_arm64_pcrel_branch19, - - // fixup_arm64_pcrel_branch26 - The high 26 bits of a 28-bit pc-relative - // immediate. - fixup_arm64_pcrel_branch26, - - // fixup_arm64_pcrel_call26 - The high 26 bits of a 28-bit pc-relative - // immediate. Distinguished from branch26 only on ELF. - fixup_arm64_pcrel_call26, - - // fixup_arm64_tlsdesc_call - zero-space placeholder for the ELF - // R_AARCH64_TLSDESC_CALL relocation. - fixup_arm64_tlsdesc_call, - - // Marker - LastTargetFixupKind, - NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind -}; - -} // end namespace ARM64 -} // end namespace llvm - -#endif diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64MCAsmInfo.cpp b/lib/Target/ARM64/MCTargetDesc/ARM64MCAsmInfo.cpp deleted file mode 100644 index e211d3428bf..00000000000 --- a/lib/Target/ARM64/MCTargetDesc/ARM64MCAsmInfo.cpp +++ /dev/null @@ -1,99 +0,0 @@ -//===-- ARM64MCAsmInfo.cpp - ARM64 asm properties -----------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains the declarations of the ARM64MCAsmInfo properties. -// -//===----------------------------------------------------------------------===// - -#include "ARM64MCAsmInfo.h" -#include "llvm/ADT/Triple.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCStreamer.h" -#include "llvm/Support/CommandLine.h" -using namespace llvm; - -enum AsmWriterVariantTy { - Default = -1, - Generic = 0, - Apple = 1 -}; - -static cl::opt AsmWriterVariant( - "arm64-neon-syntax", cl::init(Default), - cl::desc("Choose style of NEON code to emit from ARM64 backend:"), - cl::values(clEnumValN(Generic, "generic", "Emit generic NEON assembly"), - clEnumValN(Apple, "apple", "Emit Apple-style NEON assembly"), - clEnumValEnd)); - -ARM64MCAsmInfoDarwin::ARM64MCAsmInfoDarwin() { - // We prefer NEON instructions to be printed in the short form. - AssemblerDialect = AsmWriterVariant == Default ? 
1 : AsmWriterVariant; - - PrivateGlobalPrefix = "L"; - SeparatorString = "%%"; - CommentString = ";"; - PointerSize = CalleeSaveStackSlotSize = 8; - - AlignmentIsInBytes = false; - UsesELFSectionDirectiveForBSS = true; - SupportsDebugInformation = true; - UseDataRegionDirectives = true; - - ExceptionsType = ExceptionHandling::DwarfCFI; -} - -const MCExpr *ARM64MCAsmInfoDarwin::getExprForPersonalitySymbol( - const MCSymbol *Sym, unsigned Encoding, MCStreamer &Streamer) const { - // On Darwin, we can reference dwarf symbols with foo@GOT-., which - // is an indirect pc-relative reference. The default implementation - // won't reference using the GOT, so we need this target-specific - // version. - MCContext &Context = Streamer.getContext(); - const MCExpr *Res = - MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOT, Context); - MCSymbol *PCSym = Context.CreateTempSymbol(); - Streamer.EmitLabel(PCSym); - const MCExpr *PC = MCSymbolRefExpr::Create(PCSym, Context); - return MCBinaryExpr::CreateSub(Res, PC, Context); -} - -ARM64MCAsmInfoELF::ARM64MCAsmInfoELF(StringRef TT) { - Triple T(TT); - if (T.getArch() == Triple::arm64_be) - IsLittleEndian = false; - - // We prefer NEON instructions to be printed in the short form. - AssemblerDialect = AsmWriterVariant == Default ? 0 : AsmWriterVariant; - - PointerSize = 8; - - // ".comm align is in bytes but .align is pow-2." - AlignmentIsInBytes = false; - - CommentString = "//"; - PrivateGlobalPrefix = ".L"; - Code32Directive = ".code\t32"; - - Data16bitsDirective = "\t.hword\t"; - Data32bitsDirective = "\t.word\t"; - Data64bitsDirective = "\t.xword\t"; - - UseDataRegionDirectives = false; - - WeakRefDirective = "\t.weak\t"; - - HasLEB128 = true; - SupportsDebugInformation = true; - - // Exceptions handling - ExceptionsType = ExceptionHandling::DwarfCFI; - - UseIntegratedAssembler = true; -} diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64MCAsmInfo.h b/lib/Target/ARM64/MCTargetDesc/ARM64MCAsmInfo.h deleted file mode 100644 index 324bc39560f..00000000000 --- a/lib/Target/ARM64/MCTargetDesc/ARM64MCAsmInfo.h +++ /dev/null @@ -1,36 +0,0 @@ -//=====-- ARM64MCAsmInfo.h - ARM64 asm properties -----------*- C++ -*--====// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains the declaration of the ARM64MCAsmInfo class. 
-// -//===----------------------------------------------------------------------===// - -#ifndef ARM64TARGETASMINFO_H -#define ARM64TARGETASMINFO_H - -#include "llvm/MC/MCAsmInfoDarwin.h" - -namespace llvm { -class Target; -class StringRef; -class MCStreamer; -struct ARM64MCAsmInfoDarwin : public MCAsmInfoDarwin { - explicit ARM64MCAsmInfoDarwin(); - const MCExpr * - getExprForPersonalitySymbol(const MCSymbol *Sym, unsigned Encoding, - MCStreamer &Streamer) const override; -}; - -struct ARM64MCAsmInfoELF : public MCAsmInfo { - explicit ARM64MCAsmInfoELF(StringRef TT); -}; - -} // namespace llvm - -#endif diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64MCCodeEmitter.cpp b/lib/Target/ARM64/MCTargetDesc/ARM64MCCodeEmitter.cpp deleted file mode 100644 index 0db08f422e4..00000000000 --- a/lib/Target/ARM64/MCTargetDesc/ARM64MCCodeEmitter.cpp +++ /dev/null @@ -1,658 +0,0 @@ -//===-- ARM64/ARM64MCCodeEmitter.cpp - Convert ARM64 code to machine code -===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements the ARM64MCCodeEmitter class. -// -//===----------------------------------------------------------------------===// - -#include "MCTargetDesc/ARM64AddressingModes.h" -#include "MCTargetDesc/ARM64FixupKinds.h" -#include "MCTargetDesc/ARM64MCExpr.h" -#include "Utils/ARM64BaseInfo.h" -#include "llvm/MC/MCCodeEmitter.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCInst.h" -#include "llvm/MC/MCInstrInfo.h" -#include "llvm/MC/MCRegisterInfo.h" -#include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/Support/raw_ostream.h" -using namespace llvm; - -#define DEBUG_TYPE "mccodeemitter" - -STATISTIC(MCNumEmitted, "Number of MC instructions emitted."); -STATISTIC(MCNumFixups, "Number of MC fixups created."); - -namespace { - -class ARM64MCCodeEmitter : public MCCodeEmitter { - MCContext &Ctx; - - ARM64MCCodeEmitter(const ARM64MCCodeEmitter &); // DO NOT IMPLEMENT - void operator=(const ARM64MCCodeEmitter &); // DO NOT IMPLEMENT -public: - ARM64MCCodeEmitter(const MCInstrInfo &mcii, const MCSubtargetInfo &sti, - MCContext &ctx) - : Ctx(ctx) {} - - ~ARM64MCCodeEmitter() {} - - // getBinaryCodeForInstr - TableGen'erated function for getting the - // binary encoding for an instruction. - uint64_t getBinaryCodeForInstr(const MCInst &MI, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - /// getMachineOpValue - Return binary encoding of operand. If the machine - /// operand requires relocation, record the relocation and return zero. - unsigned getMachineOpValue(const MCInst &MI, const MCOperand &MO, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - /// getLdStUImm12OpValue - Return encoding info for 12-bit unsigned immediate - /// attached to a load, store or prfm instruction. If operand requires a - /// relocation, record it and return zero in that part of the encoding. - template - uint32_t getLdStUImm12OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - /// getAdrLabelOpValue - Return encoding info for 21-bit immediate ADR label - /// target. 
- uint32_t getAdrLabelOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - /// getAddSubImmOpValue - Return encoding for the 12-bit immediate value and - /// the 2-bit shift field. - uint32_t getAddSubImmOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - /// getCondBranchTargetOpValue - Return the encoded value for a conditional - /// branch target. - uint32_t getCondBranchTargetOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - /// getLoadLiteralOpValue - Return the encoded value for a load-literal - /// pc-relative address. - uint32_t getLoadLiteralOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - /// getMemExtendOpValue - Return the encoded value for a reg-extend load/store - /// instruction: bit 0 is whether a shift is present, bit 1 is whether the - /// operation is a sign extend (as opposed to a zero extend). - uint32_t getMemExtendOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - /// getTestBranchTargetOpValue - Return the encoded value for a test-bit-and- - /// branch target. - uint32_t getTestBranchTargetOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - /// getBranchTargetOpValue - Return the encoded value for an unconditional - /// branch target. - uint32_t getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - /// getMoveWideImmOpValue - Return the encoded value for the immediate operand - /// of a MOVZ or MOVK instruction. - uint32_t getMoveWideImmOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - /// getVecShifterOpValue - Return the encoded value for the vector shifter. - uint32_t getVecShifterOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - /// getMoveVecShifterOpValue - Return the encoded value for the vector move - /// shifter (MSL). - uint32_t getMoveVecShifterOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - /// getFixedPointScaleOpValue - Return the encoded value for the - // FP-to-fixed-point scale factor. 
- uint32_t getFixedPointScaleOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - uint32_t getVecShiftR64OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - uint32_t getVecShiftR32OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - uint32_t getVecShiftR16OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - uint32_t getVecShiftR8OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - uint32_t getVecShiftL64OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - uint32_t getVecShiftL32OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - uint32_t getVecShiftL16OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - uint32_t getVecShiftL8OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - /// getSIMDShift64OpValue - Return the encoded value for the - // shift-by-immediate AdvSIMD instructions. - uint32_t getSIMDShift64OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - uint32_t getSIMDShift64_32OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - uint32_t getSIMDShift32OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - uint32_t getSIMDShift16OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - unsigned fixMOVZ(const MCInst &MI, unsigned EncodedValue, - const MCSubtargetInfo &STI) const; - - void EmitByte(unsigned char C, raw_ostream &OS) const { OS << (char)C; } - - void EmitConstant(uint64_t Val, unsigned Size, raw_ostream &OS) const { - // Output the constant in little endian byte order. - for (unsigned i = 0; i != Size; ++i) { - EmitByte(Val & 255, OS); - Val >>= 8; - } - } - - void EncodeInstruction(const MCInst &MI, raw_ostream &OS, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const override; - - unsigned fixMulHigh(const MCInst &MI, unsigned EncodedValue, - const MCSubtargetInfo &STI) const; - - template unsigned - fixLoadStoreExclusive(const MCInst &MI, unsigned EncodedValue, - const MCSubtargetInfo &STI) const; - - unsigned fixOneOperandFPComparison(const MCInst &MI, unsigned EncodedValue, - const MCSubtargetInfo &STI) const; -}; - -} // end anonymous namespace - -MCCodeEmitter *llvm::createARM64MCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI, - MCContext &Ctx) { - return new ARM64MCCodeEmitter(MCII, STI, Ctx); -} - -/// getMachineOpValue - Return binary encoding of operand. If the machine -/// operand requires relocation, record the relocation and return zero. 
-unsigned -ARM64MCCodeEmitter::getMachineOpValue(const MCInst &MI, const MCOperand &MO, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - if (MO.isReg()) - return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg()); - else { - assert(MO.isImm() && "did not expect relocated expression"); - return static_cast(MO.getImm()); - } - - assert(0 && "Unable to encode MCOperand!"); - return 0; -} - -template uint32_t -ARM64MCCodeEmitter::getLdStUImm12OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - uint32_t ImmVal = 0; - - if (MO.isImm()) - ImmVal = static_cast(MO.getImm()); - else { - assert(MO.isExpr() && "unable to encode load/store imm operand"); - MCFixupKind Kind = MCFixupKind(FixupKind); - Fixups.push_back(MCFixup::Create(0, MO.getExpr(), Kind, MI.getLoc())); - ++MCNumFixups; - } - - return ImmVal; -} - -/// getAdrLabelOpValue - Return encoding info for 21-bit immediate ADR label -/// target. -uint32_t -ARM64MCCodeEmitter::getAdrLabelOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - - // If the destination is an immediate, we have nothing to do. - if (MO.isImm()) - return MO.getImm(); - assert(MO.isExpr() && "Unexpected target type!"); - const MCExpr *Expr = MO.getExpr(); - - MCFixupKind Kind = MI.getOpcode() == ARM64::ADR - ? MCFixupKind(ARM64::fixup_arm64_pcrel_adr_imm21) - : MCFixupKind(ARM64::fixup_arm64_pcrel_adrp_imm21); - Fixups.push_back(MCFixup::Create(0, Expr, Kind, MI.getLoc())); - - MCNumFixups += 1; - - // All of the information is in the fixup. - return 0; -} - -/// getAddSubImmOpValue - Return encoding for the 12-bit immediate value and -/// the 2-bit shift field. The shift field is stored in bits 13-14 of the -/// return value. -uint32_t -ARM64MCCodeEmitter::getAddSubImmOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - // Suboperands are [imm, shifter]. - const MCOperand &MO = MI.getOperand(OpIdx); - const MCOperand &MO1 = MI.getOperand(OpIdx + 1); - assert(ARM64_AM::getShiftType(MO1.getImm()) == ARM64_AM::LSL && - "unexpected shift type for add/sub immediate"); - unsigned ShiftVal = ARM64_AM::getShiftValue(MO1.getImm()); - assert((ShiftVal == 0 || ShiftVal == 12) && - "unexpected shift value for add/sub immediate"); - if (MO.isImm()) - return MO.getImm() | (ShiftVal == 0 ? 0 : (1 << 12)); - assert(MO.isExpr() && "Unable to encode MCOperand!"); - const MCExpr *Expr = MO.getExpr(); - - // Encode the 12 bits of the fixup. - MCFixupKind Kind = MCFixupKind(ARM64::fixup_arm64_add_imm12); - Fixups.push_back(MCFixup::Create(0, Expr, Kind, MI.getLoc())); - - ++MCNumFixups; - - return 0; -} - -/// getCondBranchTargetOpValue - Return the encoded value for a conditional -/// branch target. -uint32_t ARM64MCCodeEmitter::getCondBranchTargetOpValue( - const MCInst &MI, unsigned OpIdx, SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - - // If the destination is an immediate, we have nothing to do. - if (MO.isImm()) - return MO.getImm(); - assert(MO.isExpr() && "Unexpected target type!"); - - MCFixupKind Kind = MCFixupKind(ARM64::fixup_arm64_pcrel_branch19); - Fixups.push_back(MCFixup::Create(0, MO.getExpr(), Kind, MI.getLoc())); - - ++MCNumFixups; - - // All of the information is in the fixup. 
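
As a concrete instance of getAddSubImmOpValue above: per the deleted code, the 12-bit immediate lands in bits 0-11 of the returned value and an LSL #12 shifter sets bit 12 (the "bits 13-14" wording in the doc comment does not match the code). A standalone sketch of that packing:

#include <cassert>
#include <cstdint>
#include <cstdio>

// Packs an add/sub immediate the way the deleted getAddSubImmOpValue does:
// 12-bit value in the low bits, plus a flag bit for an LSL #12 shifter.
static uint32_t encodeAddSubImm(uint32_t Imm12, unsigned ShiftVal) {
  assert(Imm12 < (1u << 12) && "immediate must fit in 12 bits");
  assert((ShiftVal == 0 || ShiftVal == 12) && "only LSL #0 or LSL #12 is legal");
  return Imm12 | (ShiftVal == 0 ? 0 : (1u << 12));
}

int main() {
  std::printf("0x%x\n", encodeAddSubImm(0x123, 0));  // add x0, x1, #0x123          -> 0x123
  std::printf("0x%x\n", encodeAddSubImm(0x123, 12)); // add x0, x1, #0x123, lsl #12 -> 0x1123
  return 0;
}
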
- return 0; -} - -/// getLoadLiteralOpValue - Return the encoded value for a load-literal -/// pc-relative address. -uint32_t -ARM64MCCodeEmitter::getLoadLiteralOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - - // If the destination is an immediate, we have nothing to do. - if (MO.isImm()) - return MO.getImm(); - assert(MO.isExpr() && "Unexpected target type!"); - - MCFixupKind Kind = MCFixupKind(ARM64::fixup_arm64_ldr_pcrel_imm19); - Fixups.push_back(MCFixup::Create(0, MO.getExpr(), Kind, MI.getLoc())); - - ++MCNumFixups; - - // All of the information is in the fixup. - return 0; -} - -uint32_t -ARM64MCCodeEmitter::getMemExtendOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - unsigned SignExtend = MI.getOperand(OpIdx).getImm(); - unsigned DoShift = MI.getOperand(OpIdx + 1).getImm(); - return (SignExtend << 1) | DoShift; -} - -uint32_t -ARM64MCCodeEmitter::getMoveWideImmOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - - if (MO.isImm()) - return MO.getImm(); - assert(MO.isExpr() && "Unexpected movz/movk immediate"); - - Fixups.push_back(MCFixup::Create( - 0, MO.getExpr(), MCFixupKind(ARM64::fixup_arm64_movw), MI.getLoc())); - - ++MCNumFixups; - - return 0; -} - -/// getTestBranchTargetOpValue - Return the encoded value for a test-bit-and- -/// branch target. -uint32_t ARM64MCCodeEmitter::getTestBranchTargetOpValue( - const MCInst &MI, unsigned OpIdx, SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - - // If the destination is an immediate, we have nothing to do. - if (MO.isImm()) - return MO.getImm(); - assert(MO.isExpr() && "Unexpected ADR target type!"); - - MCFixupKind Kind = MCFixupKind(ARM64::fixup_arm64_pcrel_branch14); - Fixups.push_back(MCFixup::Create(0, MO.getExpr(), Kind, MI.getLoc())); - - ++MCNumFixups; - - // All of the information is in the fixup. - return 0; -} - -/// getBranchTargetOpValue - Return the encoded value for an unconditional -/// branch target. -uint32_t -ARM64MCCodeEmitter::getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - - // If the destination is an immediate, we have nothing to do. - if (MO.isImm()) - return MO.getImm(); - assert(MO.isExpr() && "Unexpected ADR target type!"); - - MCFixupKind Kind = MI.getOpcode() == ARM64::BL - ? MCFixupKind(ARM64::fixup_arm64_pcrel_call26) - : MCFixupKind(ARM64::fixup_arm64_pcrel_branch26); - Fixups.push_back(MCFixup::Create(0, MO.getExpr(), Kind, MI.getLoc())); - - ++MCNumFixups; - - // All of the information is in the fixup. 
- return 0; -} - -/// getVecShifterOpValue - Return the encoded value for the vector shifter: -/// -/// 00 -> 0 -/// 01 -> 8 -/// 10 -> 16 -/// 11 -> 24 -uint32_t -ARM64MCCodeEmitter::getVecShifterOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && "Expected an immediate value for the shift amount!"); - - switch (MO.getImm()) { - default: - break; - case 0: - return 0; - case 8: - return 1; - case 16: - return 2; - case 24: - return 3; - } - - assert(false && "Invalid value for vector shift amount!"); - return 0; -} - -uint32_t -ARM64MCCodeEmitter::getSIMDShift64OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && "Expected an immediate value for the shift amount!"); - return 64 - (MO.getImm()); -} - -uint32_t -ARM64MCCodeEmitter::getSIMDShift64_32OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && "Expected an immediate value for the shift amount!"); - return 64 - (MO.getImm() | 32); -} - -uint32_t -ARM64MCCodeEmitter::getSIMDShift32OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && "Expected an immediate value for the shift amount!"); - return 32 - (MO.getImm() | 16); -} - -uint32_t -ARM64MCCodeEmitter::getSIMDShift16OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && "Expected an immediate value for the shift amount!"); - return 16 - (MO.getImm() | 8); -} - -/// getFixedPointScaleOpValue - Return the encoded value for the -// FP-to-fixed-point scale factor. 
-uint32_t ARM64MCCodeEmitter::getFixedPointScaleOpValue( - const MCInst &MI, unsigned OpIdx, SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && "Expected an immediate value for the scale amount!"); - return 64 - MO.getImm(); -} - -uint32_t -ARM64MCCodeEmitter::getVecShiftR64OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && "Expected an immediate value for the scale amount!"); - return 64 - MO.getImm(); -} - -uint32_t -ARM64MCCodeEmitter::getVecShiftR32OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && "Expected an immediate value for the scale amount!"); - return 32 - MO.getImm(); -} - -uint32_t -ARM64MCCodeEmitter::getVecShiftR16OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && "Expected an immediate value for the scale amount!"); - return 16 - MO.getImm(); -} - -uint32_t -ARM64MCCodeEmitter::getVecShiftR8OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && "Expected an immediate value for the scale amount!"); - return 8 - MO.getImm(); -} - -uint32_t -ARM64MCCodeEmitter::getVecShiftL64OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && "Expected an immediate value for the scale amount!"); - return MO.getImm() - 64; -} - -uint32_t -ARM64MCCodeEmitter::getVecShiftL32OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && "Expected an immediate value for the scale amount!"); - return MO.getImm() - 32; -} - -uint32_t -ARM64MCCodeEmitter::getVecShiftL16OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && "Expected an immediate value for the scale amount!"); - return MO.getImm() - 16; -} - -uint32_t -ARM64MCCodeEmitter::getVecShiftL8OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && "Expected an immediate value for the scale amount!"); - return MO.getImm() - 8; -} - -/// getMoveVecShifterOpValue - Return the encoded value for the vector move -/// shifter (MSL). -uint32_t -ARM64MCCodeEmitter::getMoveVecShifterOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && - "Expected an immediate value for the move shift amount!"); - unsigned ShiftVal = ARM64_AM::getShiftValue(MO.getImm()); - assert((ShiftVal == 8 || ShiftVal == 16) && "Invalid shift amount!"); - return ShiftVal == 8 ? 
0 : 1; -} - -unsigned ARM64MCCodeEmitter::fixMOVZ(const MCInst &MI, unsigned EncodedValue, - const MCSubtargetInfo &STI) const { - // If one of the signed fixup kinds is applied to a MOVZ instruction, the - // eventual result could be either a MOVZ or a MOVN. It's the MCCodeEmitter's - // job to ensure that any bits possibly affected by this are 0. This means we - // must zero out bit 30 (essentially emitting a MOVN). - MCOperand UImm16MO = MI.getOperand(1); - - // Nothing to do if there's no fixup. - if (UImm16MO.isImm()) - return EncodedValue; - - const ARM64MCExpr *A64E = cast(UImm16MO.getExpr()); - switch (A64E->getKind()) { - case ARM64MCExpr::VK_DTPREL_G2: - case ARM64MCExpr::VK_DTPREL_G1: - case ARM64MCExpr::VK_DTPREL_G0: - case ARM64MCExpr::VK_GOTTPREL_G1: - case ARM64MCExpr::VK_TPREL_G2: - case ARM64MCExpr::VK_TPREL_G1: - case ARM64MCExpr::VK_TPREL_G0: - return EncodedValue & ~(1u << 30); - default: - // Nothing to do for an unsigned fixup. - return EncodedValue; - } - - - return EncodedValue & ~(1u << 30); -} - -void ARM64MCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - if (MI.getOpcode() == ARM64::TLSDESCCALL) { - // This is a directive which applies an R_AARCH64_TLSDESC_CALL to the - // following (BLR) instruction. It doesn't emit any code itself so it - // doesn't go through the normal TableGenerated channels. - MCFixupKind Fixup = MCFixupKind(ARM64::fixup_arm64_tlsdesc_call); - Fixups.push_back(MCFixup::Create(0, MI.getOperand(0).getExpr(), Fixup)); - return; - } - - uint64_t Binary = getBinaryCodeForInstr(MI, Fixups, STI); - EmitConstant(Binary, 4, OS); - ++MCNumEmitted; // Keep track of the # of mi's emitted. -} - -unsigned -ARM64MCCodeEmitter::fixMulHigh(const MCInst &MI, - unsigned EncodedValue, - const MCSubtargetInfo &STI) const { - // The Ra field of SMULH and UMULH is unused: it should be assembled as 31 - // (i.e. all bits 1) but is ignored by the processor. - EncodedValue |= 0x1f << 10; - return EncodedValue; -} - -template unsigned -ARM64MCCodeEmitter::fixLoadStoreExclusive(const MCInst &MI, - unsigned EncodedValue, - const MCSubtargetInfo &STI) const { - if (!hasRs) EncodedValue |= 0x001F0000; - if (!hasRt2) EncodedValue |= 0x00007C00; - - return EncodedValue; -} - -unsigned -ARM64MCCodeEmitter::fixOneOperandFPComparison(const MCInst &MI, - unsigned EncodedValue, - const MCSubtargetInfo &STI) const { - // The Rm field of FCMP and friends is unused - it should be assembled - // as 0, but is ignored by the processor. - EncodedValue &= ~(0x1f << 16); - return EncodedValue; -} - -#include "ARM64GenMCCodeEmitter.inc" diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64MCExpr.cpp b/lib/Target/ARM64/MCTargetDesc/ARM64MCExpr.cpp deleted file mode 100644 index efa820b097f..00000000000 --- a/lib/Target/ARM64/MCTargetDesc/ARM64MCExpr.cpp +++ /dev/null @@ -1,174 +0,0 @@ -//===-- ARM64MCExpr.cpp - ARM64 specific MC expression classes --------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains the implementation of the assembly expression modifiers -// accepted by the AArch64 architecture (e.g. ":lo12:", ":gottprel_g1:", ...). 
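
The bit-30 handling in fixMOVZ above is easier to see numerically: MOVZ and MOVN share the move-wide encoding and differ in the opc field, so clearing bit 30 leaves the fixup free to materialize either form once the signed TLS offset is known. A standalone sketch of the masking (the sample encoding is the 32-bit "movz w0, #0" pattern, quoted from memory rather than from this patch):

#include <cstdint>
#include <cstdio>

// Mirrors fixMOVZ: with a signed TLS modifier attached, the emitter must not
// bake in the MOVZ/MOVN choice, so it zeroes bit 30 of the encoded instruction.
static uint32_t neutralizeMovzOpc(uint32_t EncodedValue) {
  return EncodedValue & ~(1u << 30);
}

int main() {
  const uint32_t Movz = 0x52800000; // assumed encoding of "movz w0, #0" (bit 30 set)
  std::printf("0x%08x -> 0x%08x\n", Movz, neutralizeMovzOpc(Movz));
  // Expected: 0x52800000 -> 0x12800000, the corresponding MOVN pattern.
  return 0;
}
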
-// -//===----------------------------------------------------------------------===// - -#include "ARM64MCExpr.h" -#include "llvm/MC/MCAssembler.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCELF.h" -#include "llvm/MC/MCSymbol.h" -#include "llvm/MC/MCValue.h" -#include "llvm/Object/ELF.h" -#include "llvm/Support/ErrorHandling.h" - -using namespace llvm; - -#define DEBUG_TYPE "aarch64symbolrefexpr" - -const ARM64MCExpr *ARM64MCExpr::Create(const MCExpr *Expr, VariantKind Kind, - MCContext &Ctx) { - return new (Ctx) ARM64MCExpr(Expr, Kind); -} - -StringRef ARM64MCExpr::getVariantKindName() const { - switch (static_cast(getKind())) { - case VK_CALL: return ""; - case VK_LO12: return ":lo12:"; - case VK_ABS_G3: return ":abs_g3:"; - case VK_ABS_G2: return ":abs_g2:"; - case VK_ABS_G2_S: return ":abs_g2_s:"; - case VK_ABS_G2_NC: return ":abs_g2_nc:"; - case VK_ABS_G1: return ":abs_g1:"; - case VK_ABS_G1_S: return ":abs_g1_s:"; - case VK_ABS_G1_NC: return ":abs_g1_nc:"; - case VK_ABS_G0: return ":abs_g0:"; - case VK_ABS_G0_S: return ":abs_g0_s:"; - case VK_ABS_G0_NC: return ":abs_g0_nc:"; - case VK_DTPREL_G2: return ":dtprel_g2:"; - case VK_DTPREL_G1: return ":dtprel_g1:"; - case VK_DTPREL_G1_NC: return ":dtprel_g1_nc:"; - case VK_DTPREL_G0: return ":dtprel_g0:"; - case VK_DTPREL_G0_NC: return ":dtprel_g0_nc:"; - case VK_DTPREL_HI12: return ":dtprel_hi12:"; - case VK_DTPREL_LO12: return ":dtprel_lo12:"; - case VK_DTPREL_LO12_NC: return ":dtprel_lo12_nc:"; - case VK_TPREL_G2: return ":tprel_g2:"; - case VK_TPREL_G1: return ":tprel_g1:"; - case VK_TPREL_G1_NC: return ":tprel_g1_nc:"; - case VK_TPREL_G0: return ":tprel_g0:"; - case VK_TPREL_G0_NC: return ":tprel_g0_nc:"; - case VK_TPREL_HI12: return ":tprel_hi12:"; - case VK_TPREL_LO12: return ":tprel_lo12:"; - case VK_TPREL_LO12_NC: return ":tprel_lo12_nc:"; - case VK_TLSDESC_LO12: return ":tlsdesc_lo12:"; - case VK_ABS_PAGE: return ""; - case VK_GOT_PAGE: return ":got:"; - case VK_GOT_LO12: return ":got_lo12:"; - case VK_GOTTPREL_PAGE: return ":gottprel:"; - case VK_GOTTPREL_LO12_NC: return ":gottprel_lo12:"; - case VK_GOTTPREL_G1: return ":gottprel_g1:"; - case VK_GOTTPREL_G0_NC: return ":gottprel_g0_nc:"; - case VK_TLSDESC: return ""; - case VK_TLSDESC_PAGE: return ":tlsdesc:"; - default: - llvm_unreachable("Invalid ELF symbol kind"); - } -} - -void ARM64MCExpr::PrintImpl(raw_ostream &OS) const { - if (getKind() != VK_NONE) - OS << getVariantKindName(); - OS << *Expr; -} - -// FIXME: This basically copies MCObjectStreamer::AddValueSymbols. Perhaps -// that method should be made public? -// FIXME: really do above: now that two backends are using it. 
-static void AddValueSymbolsImpl(const MCExpr *Value, MCAssembler *Asm) { - switch (Value->getKind()) { - case MCExpr::Target: - llvm_unreachable("Can't handle nested target expr!"); - break; - - case MCExpr::Constant: - break; - - case MCExpr::Binary: { - const MCBinaryExpr *BE = cast(Value); - AddValueSymbolsImpl(BE->getLHS(), Asm); - AddValueSymbolsImpl(BE->getRHS(), Asm); - break; - } - - case MCExpr::SymbolRef: - Asm->getOrCreateSymbolData(cast(Value)->getSymbol()); - break; - - case MCExpr::Unary: - AddValueSymbolsImpl(cast(Value)->getSubExpr(), Asm); - break; - } -} - -void ARM64MCExpr::AddValueSymbols(MCAssembler *Asm) const { - AddValueSymbolsImpl(getSubExpr(), Asm); -} - -const MCSection *ARM64MCExpr::FindAssociatedSection() const { - llvm_unreachable("FIXME: what goes here?"); -} - -bool ARM64MCExpr::EvaluateAsRelocatableImpl(MCValue &Res, - const MCAsmLayout *Layout) const { - if (!getSubExpr()->EvaluateAsRelocatable(Res, Layout)) - return false; - - Res = - MCValue::get(Res.getSymA(), Res.getSymB(), Res.getConstant(), getKind()); - - return true; -} - -static void fixELFSymbolsInTLSFixupsImpl(const MCExpr *Expr, MCAssembler &Asm) { - switch (Expr->getKind()) { - case MCExpr::Target: - llvm_unreachable("Can't handle nested target expression"); - break; - case MCExpr::Constant: - break; - - case MCExpr::Binary: { - const MCBinaryExpr *BE = cast(Expr); - fixELFSymbolsInTLSFixupsImpl(BE->getLHS(), Asm); - fixELFSymbolsInTLSFixupsImpl(BE->getRHS(), Asm); - break; - } - - case MCExpr::SymbolRef: { - // We're known to be under a TLS fixup, so any symbol should be - // modified. There should be only one. - const MCSymbolRefExpr &SymRef = *cast(Expr); - MCSymbolData &SD = Asm.getOrCreateSymbolData(SymRef.getSymbol()); - MCELF::SetType(SD, ELF::STT_TLS); - break; - } - - case MCExpr::Unary: - fixELFSymbolsInTLSFixupsImpl(cast(Expr)->getSubExpr(), Asm); - break; - } -} - -void ARM64MCExpr::fixELFSymbolsInTLSFixups(MCAssembler &Asm) const { - switch (getSymbolLoc(Kind)) { - default: - return; - case VK_DTPREL: - case VK_GOTTPREL: - case VK_TPREL: - case VK_TLSDESC: - break; - } - - fixELFSymbolsInTLSFixupsImpl(getSubExpr(), Asm); -} diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64MCExpr.h b/lib/Target/ARM64/MCTargetDesc/ARM64MCExpr.h deleted file mode 100644 index d8325465178..00000000000 --- a/lib/Target/ARM64/MCTargetDesc/ARM64MCExpr.h +++ /dev/null @@ -1,168 +0,0 @@ -//=---- ARM64MCExpr.h - ARM64 specific MC expression classes ------*- C++ -*-=// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file describes ARM64-specific MCExprs, used for modifiers like -// ":lo12:" or ":gottprel_g1:". -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_ARM64MCEXPR_H -#define LLVM_ARM64MCEXPR_H - -#include "llvm/MC/MCExpr.h" -#include "llvm/Support/ErrorHandling.h" - -namespace llvm { - -class ARM64MCExpr : public MCTargetExpr { -public: - enum VariantKind { - VK_NONE = 0x000, - - // Symbol locations specifying (roughly speaking) what calculation should be - // performed to construct the final address for the relocated - // symbol. E.g. direct, via the GOT, ... 
- VK_ABS = 0x001, - VK_SABS = 0x002, - VK_GOT = 0x003, - VK_DTPREL = 0x004, - VK_GOTTPREL = 0x005, - VK_TPREL = 0x006, - VK_TLSDESC = 0x007, - VK_SymLocBits = 0x00f, - - // Variants specifying which part of the final address calculation is - // used. E.g. the low 12 bits for an ADD/LDR, the middle 16 bits for a - // MOVZ/MOVK. - VK_PAGE = 0x010, - VK_PAGEOFF = 0x020, - VK_HI12 = 0x030, - VK_G0 = 0x040, - VK_G1 = 0x050, - VK_G2 = 0x060, - VK_G3 = 0x070, - VK_AddressFragBits = 0x0f0, - - // Whether the final relocation is a checked one (where a linker should - // perform a range-check on the final address) or not. Note that this field - // is unfortunately sometimes omitted from the assembly syntax. E.g. :lo12: - // on its own is a non-checked relocation. We side with ELF on being - // explicit about this! - VK_NC = 0x100, - - // Convenience definitions for referring to specific textual representations - // of relocation specifiers. Note that this means the "_NC" is sometimes - // omitted in line with assembly syntax here (VK_LO12 rather than VK_LO12_NC - // since a user would write ":lo12:"). - VK_CALL = VK_ABS, - VK_ABS_PAGE = VK_ABS | VK_PAGE, - VK_ABS_G3 = VK_ABS | VK_G3, - VK_ABS_G2 = VK_ABS | VK_G2, - VK_ABS_G2_S = VK_SABS | VK_G2, - VK_ABS_G2_NC = VK_ABS | VK_G2 | VK_NC, - VK_ABS_G1 = VK_ABS | VK_G1, - VK_ABS_G1_S = VK_SABS | VK_G1, - VK_ABS_G1_NC = VK_ABS | VK_G1 | VK_NC, - VK_ABS_G0 = VK_ABS | VK_G0, - VK_ABS_G0_S = VK_SABS | VK_G0, - VK_ABS_G0_NC = VK_ABS | VK_G0 | VK_NC, - VK_LO12 = VK_ABS | VK_PAGEOFF | VK_NC, - VK_GOT_LO12 = VK_GOT | VK_PAGEOFF | VK_NC, - VK_GOT_PAGE = VK_GOT | VK_PAGE, - VK_DTPREL_G2 = VK_DTPREL | VK_G2, - VK_DTPREL_G1 = VK_DTPREL | VK_G1, - VK_DTPREL_G1_NC = VK_DTPREL | VK_G1 | VK_NC, - VK_DTPREL_G0 = VK_DTPREL | VK_G0, - VK_DTPREL_G0_NC = VK_DTPREL | VK_G0 | VK_NC, - VK_DTPREL_HI12 = VK_DTPREL | VK_HI12, - VK_DTPREL_LO12 = VK_DTPREL | VK_PAGEOFF, - VK_DTPREL_LO12_NC = VK_DTPREL | VK_PAGEOFF | VK_NC, - VK_GOTTPREL_PAGE = VK_GOTTPREL | VK_PAGE, - VK_GOTTPREL_LO12_NC = VK_GOTTPREL | VK_PAGEOFF | VK_NC, - VK_GOTTPREL_G1 = VK_GOTTPREL | VK_G1, - VK_GOTTPREL_G0_NC = VK_GOTTPREL | VK_G0 | VK_NC, - VK_TPREL_G2 = VK_TPREL | VK_G2, - VK_TPREL_G1 = VK_TPREL | VK_G1, - VK_TPREL_G1_NC = VK_TPREL | VK_G1 | VK_NC, - VK_TPREL_G0 = VK_TPREL | VK_G0, - VK_TPREL_G0_NC = VK_TPREL | VK_G0 | VK_NC, - VK_TPREL_HI12 = VK_TPREL | VK_HI12, - VK_TPREL_LO12 = VK_TPREL | VK_PAGEOFF, - VK_TPREL_LO12_NC = VK_TPREL | VK_PAGEOFF | VK_NC, - VK_TLSDESC_LO12 = VK_TLSDESC | VK_PAGEOFF | VK_NC, - VK_TLSDESC_PAGE = VK_TLSDESC | VK_PAGE, - - VK_INVALID = 0xfff - }; - -private: - const MCExpr *Expr; - const VariantKind Kind; - - explicit ARM64MCExpr(const MCExpr *Expr, VariantKind Kind) - : Expr(Expr), Kind(Kind) {} - -public: - /// @name Construction - /// @{ - - static const ARM64MCExpr *Create(const MCExpr *Expr, VariantKind Kind, - MCContext &Ctx); - - /// @} - /// @name Accessors - /// @{ - - /// Get the kind of this expression. - VariantKind getKind() const { return static_cast(Kind); } - - /// Get the expression this modifier applies to. - const MCExpr *getSubExpr() const { return Expr; } - - /// @} - /// @name VariantKind information extractors. 
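
All of the composed VK_* constants above follow the one layout spelled out at the top of the enum: symbol location in the low nibble, address fragment in the next nibble, NC flag at 0x100, which is what the extractor helpers declared next rely on. A standalone sketch of the decomposition, reusing the same constant values:

#include <cstdint>
#include <cstdio>

// Same bit layout as ARM64MCExpr::VariantKind (values copied from the enum above).
enum : uint32_t {
  VK_DTPREL = 0x004,            // symbol location
  VK_PAGEOFF = 0x020,           // address fragment (low 12 bits, ":lo12:" style)
  VK_NC = 0x100,                // "not checked" flag
  VK_SymLocBits = 0x00f,
  VK_AddressFragBits = 0x0f0,
  VK_DTPREL_LO12_NC = VK_DTPREL | VK_PAGEOFF | VK_NC, // ":dtprel_lo12_nc:" -> 0x124
};

int main() {
  const uint32_t K = VK_DTPREL_LO12_NC;
  std::printf("symloc=0x%x frag=0x%x nc=%d\n",
              K & VK_SymLocBits,        // 0x04 -> DTPREL
              K & VK_AddressFragBits,   // 0x20 -> PAGEOFF
              (K & VK_NC) != 0);        // 1    -> unchecked variant
  return 0;
}
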
- /// @{ - - static VariantKind getSymbolLoc(VariantKind Kind) { - return static_cast(Kind & VK_SymLocBits); - } - - static VariantKind getAddressFrag(VariantKind Kind) { - return static_cast(Kind & VK_AddressFragBits); - } - - static bool isNotChecked(VariantKind Kind) { return Kind & VK_NC; } - - /// @} - - /// Convert the variant kind into an ELF-appropriate modifier - /// (e.g. ":got:", ":lo12:"). - StringRef getVariantKindName() const; - - void PrintImpl(raw_ostream &OS) const override; - - void AddValueSymbols(MCAssembler *) const override; - - const MCSection *FindAssociatedSection() const override; - - bool EvaluateAsRelocatableImpl(MCValue &Res, - const MCAsmLayout *Layout) const override; - - void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override; - - static bool classof(const MCExpr *E) { - return E->getKind() == MCExpr::Target; - } - - static bool classof(const ARM64MCExpr *) { return true; } - -}; -} // end namespace llvm - -#endif diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64MCTargetDesc.cpp b/lib/Target/ARM64/MCTargetDesc/ARM64MCTargetDesc.cpp deleted file mode 100644 index 079d3588f6e..00000000000 --- a/lib/Target/ARM64/MCTargetDesc/ARM64MCTargetDesc.cpp +++ /dev/null @@ -1,210 +0,0 @@ -//===-- ARM64MCTargetDesc.cpp - ARM64 Target Descriptions -------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file provides ARM64 specific target descriptions. -// -//===----------------------------------------------------------------------===// - -#include "ARM64MCTargetDesc.h" -#include "ARM64ELFStreamer.h" -#include "ARM64MCAsmInfo.h" -#include "InstPrinter/ARM64InstPrinter.h" -#include "llvm/MC/MCCodeGenInfo.h" -#include "llvm/MC/MCInstrInfo.h" -#include "llvm/MC/MCRegisterInfo.h" -#include "llvm/MC/MCStreamer.h" -#include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/TargetRegistry.h" - -using namespace llvm; - -#define GET_INSTRINFO_MC_DESC -#include "ARM64GenInstrInfo.inc" - -#define GET_SUBTARGETINFO_MC_DESC -#include "ARM64GenSubtargetInfo.inc" - -#define GET_REGINFO_MC_DESC -#include "ARM64GenRegisterInfo.inc" - -static MCInstrInfo *createARM64MCInstrInfo() { - MCInstrInfo *X = new MCInstrInfo(); - InitARM64MCInstrInfo(X); - return X; -} - -static MCSubtargetInfo *createARM64MCSubtargetInfo(StringRef TT, StringRef CPU, - StringRef FS) { - MCSubtargetInfo *X = new MCSubtargetInfo(); - - if (CPU.empty()) - CPU = "generic"; - - InitARM64MCSubtargetInfo(X, TT, CPU, FS); - return X; -} - -static MCRegisterInfo *createARM64MCRegisterInfo(StringRef Triple) { - MCRegisterInfo *X = new MCRegisterInfo(); - InitARM64MCRegisterInfo(X, ARM64::LR); - return X; -} - -static MCAsmInfo *createARM64MCAsmInfo(const MCRegisterInfo &MRI, - StringRef TT) { - Triple TheTriple(TT); - - MCAsmInfo *MAI; - if (TheTriple.isOSDarwin()) - MAI = new ARM64MCAsmInfoDarwin(); - else { - assert(TheTriple.isOSBinFormatELF() && "Only expect Darwin or ELF"); - MAI = new ARM64MCAsmInfoELF(TT); - } - - // Initial state of the frame pointer is SP. 
- unsigned Reg = MRI.getDwarfRegNum(ARM64::SP, true); - MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(nullptr, Reg, 0); - MAI->addInitialFrameState(Inst); - - return MAI; -} - -static MCCodeGenInfo *createARM64MCCodeGenInfo(StringRef TT, Reloc::Model RM, - CodeModel::Model CM, - CodeGenOpt::Level OL) { - Triple TheTriple(TT); - assert((TheTriple.isOSBinFormatELF() || TheTriple.isOSBinFormatMachO()) && - "Only expect Darwin and ELF targets"); - - if (CM == CodeModel::Default) - CM = CodeModel::Small; - // The default MCJIT memory managers make no guarantees about where they can - // find an executable page; JITed code needs to be able to refer to globals - // no matter how far away they are. - else if (CM == CodeModel::JITDefault) - CM = CodeModel::Large; - else if (CM != CodeModel::Small && CM != CodeModel::Large) - report_fatal_error("Only small and large code models are allowed on ARM64"); - - // ARM64 Darwin is always PIC. - if (TheTriple.isOSDarwin()) - RM = Reloc::PIC_; - // On ELF platforms the default static relocation model has a smart enough - // linker to cope with referencing external symbols defined in a shared - // library. Hence DynamicNoPIC doesn't need to be promoted to PIC. - else if (RM == Reloc::Default || RM == Reloc::DynamicNoPIC) - RM = Reloc::Static; - - MCCodeGenInfo *X = new MCCodeGenInfo(); - X->InitMCCodeGenInfo(RM, CM, OL); - return X; -} - -static MCInstPrinter *createARM64MCInstPrinter(const Target &T, - unsigned SyntaxVariant, - const MCAsmInfo &MAI, - const MCInstrInfo &MII, - const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI) { - if (SyntaxVariant == 0) - return new ARM64InstPrinter(MAI, MII, MRI, STI); - if (SyntaxVariant == 1) - return new ARM64AppleInstPrinter(MAI, MII, MRI, STI); - - return nullptr; -} - -static MCStreamer *createMCStreamer(const Target &T, StringRef TT, - MCContext &Ctx, MCAsmBackend &TAB, - raw_ostream &OS, MCCodeEmitter *Emitter, - const MCSubtargetInfo &STI, bool RelaxAll, - bool NoExecStack) { - Triple TheTriple(TT); - - if (TheTriple.isOSDarwin()) - return createMachOStreamer(Ctx, TAB, OS, Emitter, RelaxAll, - /*LabelSections*/ true); - - return createARM64ELFStreamer(Ctx, TAB, OS, Emitter, RelaxAll, NoExecStack); -} - -// Force static initialization. -extern "C" void LLVMInitializeARM64TargetMC() { - // Register the MC asm info. - RegisterMCAsmInfoFn X(TheARM64leTarget, createARM64MCAsmInfo); - RegisterMCAsmInfoFn Y(TheARM64beTarget, createARM64MCAsmInfo); - RegisterMCAsmInfoFn Z(TheAArch64leTarget, createARM64MCAsmInfo); - RegisterMCAsmInfoFn W(TheAArch64beTarget, createARM64MCAsmInfo); - - // Register the MC codegen info. - TargetRegistry::RegisterMCCodeGenInfo(TheARM64leTarget, - createARM64MCCodeGenInfo); - TargetRegistry::RegisterMCCodeGenInfo(TheARM64beTarget, - createARM64MCCodeGenInfo); - TargetRegistry::RegisterMCCodeGenInfo(TheAArch64leTarget, - createARM64MCCodeGenInfo); - TargetRegistry::RegisterMCCodeGenInfo(TheAArch64beTarget, - createARM64MCCodeGenInfo); - - // Register the MC instruction info. - TargetRegistry::RegisterMCInstrInfo(TheARM64leTarget, createARM64MCInstrInfo); - TargetRegistry::RegisterMCInstrInfo(TheARM64beTarget, createARM64MCInstrInfo); - TargetRegistry::RegisterMCInstrInfo(TheAArch64leTarget, createARM64MCInstrInfo); - TargetRegistry::RegisterMCInstrInfo(TheAArch64beTarget, createARM64MCInstrInfo); - - // Register the MC register info. 
- TargetRegistry::RegisterMCRegInfo(TheARM64leTarget, createARM64MCRegisterInfo); - TargetRegistry::RegisterMCRegInfo(TheARM64beTarget, createARM64MCRegisterInfo); - TargetRegistry::RegisterMCRegInfo(TheAArch64leTarget, createARM64MCRegisterInfo); - TargetRegistry::RegisterMCRegInfo(TheAArch64beTarget, createARM64MCRegisterInfo); - - // Register the MC subtarget info. - TargetRegistry::RegisterMCSubtargetInfo(TheARM64leTarget, - createARM64MCSubtargetInfo); - TargetRegistry::RegisterMCSubtargetInfo(TheARM64beTarget, - createARM64MCSubtargetInfo); - TargetRegistry::RegisterMCSubtargetInfo(TheAArch64leTarget, - createARM64MCSubtargetInfo); - TargetRegistry::RegisterMCSubtargetInfo(TheAArch64beTarget, - createARM64MCSubtargetInfo); - - // Register the asm backend. - TargetRegistry::RegisterMCAsmBackend(TheARM64leTarget, createARM64leAsmBackend); - TargetRegistry::RegisterMCAsmBackend(TheARM64beTarget, createARM64beAsmBackend); - TargetRegistry::RegisterMCAsmBackend(TheAArch64leTarget, createARM64leAsmBackend); - TargetRegistry::RegisterMCAsmBackend(TheAArch64beTarget, createARM64beAsmBackend); - - // Register the MC Code Emitter - TargetRegistry::RegisterMCCodeEmitter(TheARM64leTarget, - createARM64MCCodeEmitter); - TargetRegistry::RegisterMCCodeEmitter(TheARM64beTarget, - createARM64MCCodeEmitter); - TargetRegistry::RegisterMCCodeEmitter(TheAArch64leTarget, - createARM64MCCodeEmitter); - TargetRegistry::RegisterMCCodeEmitter(TheAArch64beTarget, - createARM64MCCodeEmitter); - - // Register the object streamer. - TargetRegistry::RegisterMCObjectStreamer(TheARM64leTarget, createMCStreamer); - TargetRegistry::RegisterMCObjectStreamer(TheARM64beTarget, createMCStreamer); - TargetRegistry::RegisterMCObjectStreamer(TheAArch64leTarget, createMCStreamer); - TargetRegistry::RegisterMCObjectStreamer(TheAArch64beTarget, createMCStreamer); - - // Register the MCInstPrinter. - TargetRegistry::RegisterMCInstPrinter(TheARM64leTarget, - createARM64MCInstPrinter); - TargetRegistry::RegisterMCInstPrinter(TheARM64beTarget, - createARM64MCInstPrinter); - TargetRegistry::RegisterMCInstPrinter(TheAArch64leTarget, - createARM64MCInstPrinter); - TargetRegistry::RegisterMCInstPrinter(TheAArch64beTarget, - createARM64MCInstPrinter); -} diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64MCTargetDesc.h b/lib/Target/ARM64/MCTargetDesc/ARM64MCTargetDesc.h deleted file mode 100644 index f2e9c17a378..00000000000 --- a/lib/Target/ARM64/MCTargetDesc/ARM64MCTargetDesc.h +++ /dev/null @@ -1,68 +0,0 @@ -//===-- ARM64MCTargetDesc.h - ARM64 Target Descriptions ---------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file provides ARM64 specific target descriptions. 
-// -//===----------------------------------------------------------------------===// - -#ifndef ARM64MCTARGETDESC_H -#define ARM64MCTARGETDESC_H - -#include "llvm/Support/DataTypes.h" -#include - -namespace llvm { -class MCAsmBackend; -class MCCodeEmitter; -class MCContext; -class MCInstrInfo; -class MCRegisterInfo; -class MCObjectWriter; -class MCSubtargetInfo; -class StringRef; -class Target; -class raw_ostream; - -extern Target TheARM64leTarget; -extern Target TheARM64beTarget; -extern Target TheAArch64leTarget; -extern Target TheAArch64beTarget; - -MCCodeEmitter *createARM64MCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI, - MCContext &Ctx); -MCAsmBackend *createARM64leAsmBackend(const Target &T, const MCRegisterInfo &MRI, - StringRef TT, StringRef CPU); -MCAsmBackend *createARM64beAsmBackend(const Target &T, const MCRegisterInfo &MRI, - StringRef TT, StringRef CPU); - - MCObjectWriter *createARM64ELFObjectWriter(raw_ostream &OS, uint8_t OSABI, - bool IsLittleEndian); - -MCObjectWriter *createARM64MachObjectWriter(raw_ostream &OS, uint32_t CPUType, - uint32_t CPUSubtype); - -} // End llvm namespace - -// Defines symbolic names for ARM64 registers. This defines a mapping from -// register name to register number. -// -#define GET_REGINFO_ENUM -#include "ARM64GenRegisterInfo.inc" - -// Defines symbolic names for the ARM64 instructions. -// -#define GET_INSTRINFO_ENUM -#include "ARM64GenInstrInfo.inc" - -#define GET_SUBTARGETINFO_ENUM -#include "ARM64GenSubtargetInfo.inc" - -#endif diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64MachObjectWriter.cpp b/lib/Target/ARM64/MCTargetDesc/ARM64MachObjectWriter.cpp deleted file mode 100644 index 1c48159bbe9..00000000000 --- a/lib/Target/ARM64/MCTargetDesc/ARM64MachObjectWriter.cpp +++ /dev/null @@ -1,395 +0,0 @@ -//===-- ARMMachObjectWriter.cpp - ARM Mach Object Writer ------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// - -#include "MCTargetDesc/ARM64FixupKinds.h" -#include "MCTargetDesc/ARM64MCTargetDesc.h" -#include "llvm/MC/MCAssembler.h" -#include "llvm/MC/MCAsmLayout.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCFixup.h" -#include "llvm/MC/MCMachObjectWriter.h" -#include "llvm/MC/MCSectionMachO.h" -#include "llvm/MC/MCValue.h" -#include "llvm/ADT/Twine.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/MachO.h" -using namespace llvm; - -namespace { -class ARM64MachObjectWriter : public MCMachObjectTargetWriter { - bool getARM64FixupKindMachOInfo(const MCFixup &Fixup, unsigned &RelocType, - const MCSymbolRefExpr *Sym, - unsigned &Log2Size, const MCAssembler &Asm); - -public: - ARM64MachObjectWriter(uint32_t CPUType, uint32_t CPUSubtype) - : MCMachObjectTargetWriter(true /* is64Bit */, CPUType, CPUSubtype, - /*UseAggressiveSymbolFolding=*/true) {} - - void RecordRelocation(MachObjectWriter *Writer, const MCAssembler &Asm, - const MCAsmLayout &Layout, const MCFragment *Fragment, - const MCFixup &Fixup, MCValue Target, - uint64_t &FixedValue) override; -}; -} - -bool ARM64MachObjectWriter::getARM64FixupKindMachOInfo( - const MCFixup &Fixup, unsigned &RelocType, const MCSymbolRefExpr *Sym, - unsigned &Log2Size, const MCAssembler &Asm) { - RelocType = unsigned(MachO::ARM64_RELOC_UNSIGNED); - Log2Size = ~0U; - - switch ((unsigned)Fixup.getKind()) { - default: - return false; - - case FK_Data_1: - Log2Size = llvm::Log2_32(1); - return true; - case FK_Data_2: - Log2Size = llvm::Log2_32(2); - return true; - case FK_Data_4: - Log2Size = llvm::Log2_32(4); - if (Sym->getKind() == MCSymbolRefExpr::VK_GOT) - RelocType = unsigned(MachO::ARM64_RELOC_POINTER_TO_GOT); - return true; - case FK_Data_8: - Log2Size = llvm::Log2_32(8); - if (Sym->getKind() == MCSymbolRefExpr::VK_GOT) - RelocType = unsigned(MachO::ARM64_RELOC_POINTER_TO_GOT); - return true; - case ARM64::fixup_arm64_add_imm12: - case ARM64::fixup_arm64_ldst_imm12_scale1: - case ARM64::fixup_arm64_ldst_imm12_scale2: - case ARM64::fixup_arm64_ldst_imm12_scale4: - case ARM64::fixup_arm64_ldst_imm12_scale8: - case ARM64::fixup_arm64_ldst_imm12_scale16: - Log2Size = llvm::Log2_32(4); - switch (Sym->getKind()) { - default: - assert(0 && "Unexpected symbol reference variant kind!"); - case MCSymbolRefExpr::VK_PAGEOFF: - RelocType = unsigned(MachO::ARM64_RELOC_PAGEOFF12); - return true; - case MCSymbolRefExpr::VK_GOTPAGEOFF: - RelocType = unsigned(MachO::ARM64_RELOC_GOT_LOAD_PAGEOFF12); - return true; - case MCSymbolRefExpr::VK_TLVPPAGEOFF: - RelocType = unsigned(MachO::ARM64_RELOC_TLVP_LOAD_PAGEOFF12); - return true; - } - case ARM64::fixup_arm64_pcrel_adrp_imm21: - Log2Size = llvm::Log2_32(4); - // This encompasses the relocation for the whole 21-bit value. 
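[Editor's sketch] getARM64FixupKindMachOInfo above reports every fixup's width as a log2 value because Mach-O relocation entries store the width of the patched field in a 2-bit r_length slot; data fixups use their byte size and the instruction fixups all use 4 bytes. A standalone sketch of that calculation (the helper name is illustrative; the real code calls llvm::Log2_32):

    #include <cassert>

    // Width of the patched field, encoded as log2(bytes) for Mach-O r_length.
    static unsigned log2Size(unsigned Bytes) {
      unsigned L = 0;
      while ((1u << L) < Bytes)
        ++L;
      return L;
    }

    int main() {
      assert(log2Size(1) == 0); // FK_Data_1
      assert(log2Size(2) == 1); // FK_Data_2
      assert(log2Size(4) == 2); // FK_Data_4 and the 32-bit instruction fixups
      assert(log2Size(8) == 3); // FK_Data_8
      return 0;
    }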
- switch (Sym->getKind()) { - default: - Asm.getContext().FatalError(Fixup.getLoc(), - "ADR/ADRP relocations must be GOT relative"); - case MCSymbolRefExpr::VK_PAGE: - RelocType = unsigned(MachO::ARM64_RELOC_PAGE21); - return true; - case MCSymbolRefExpr::VK_GOTPAGE: - RelocType = unsigned(MachO::ARM64_RELOC_GOT_LOAD_PAGE21); - return true; - case MCSymbolRefExpr::VK_TLVPPAGE: - RelocType = unsigned(MachO::ARM64_RELOC_TLVP_LOAD_PAGE21); - return true; - } - return true; - case ARM64::fixup_arm64_pcrel_branch26: - case ARM64::fixup_arm64_pcrel_call26: - Log2Size = llvm::Log2_32(4); - RelocType = unsigned(MachO::ARM64_RELOC_BRANCH26); - return true; - } -} - -void ARM64MachObjectWriter::RecordRelocation( - MachObjectWriter *Writer, const MCAssembler &Asm, const MCAsmLayout &Layout, - const MCFragment *Fragment, const MCFixup &Fixup, MCValue Target, - uint64_t &FixedValue) { - unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind()); - - // See . - uint32_t FixupOffset = Layout.getFragmentOffset(Fragment); - unsigned Log2Size = 0; - int64_t Value = 0; - unsigned Index = 0; - unsigned IsExtern = 0; - unsigned Type = 0; - unsigned Kind = Fixup.getKind(); - - FixupOffset += Fixup.getOffset(); - - // ARM64 pcrel relocation addends do not include the section offset. - if (IsPCRel) - FixedValue += FixupOffset; - - // ADRP fixups use relocations for the whole symbol value and only - // put the addend in the instruction itself. Clear out any value the - // generic code figured out from the sybmol definition. - if (Kind == ARM64::fixup_arm64_pcrel_adrp_imm21) - FixedValue = 0; - - // imm19 relocations are for conditional branches, which require - // assembler local symbols. If we got here, that's not what we have, - // so complain loudly. - if (Kind == ARM64::fixup_arm64_pcrel_branch19) { - Asm.getContext().FatalError(Fixup.getLoc(), - "conditional branch requires assembler-local" - " label. '" + - Target.getSymA()->getSymbol().getName() + - "' is external."); - return; - } - - // 14-bit branch relocations should only target internal labels, and so - // should never get here. - if (Kind == ARM64::fixup_arm64_pcrel_branch14) { - Asm.getContext().FatalError(Fixup.getLoc(), - "Invalid relocation on conditional branch!"); - return; - } - - if (!getARM64FixupKindMachOInfo(Fixup, Type, Target.getSymA(), Log2Size, - Asm)) { - Asm.getContext().FatalError(Fixup.getLoc(), "unknown ARM64 fixup kind!"); - return; - } - - Value = Target.getConstant(); - - if (Target.isAbsolute()) { // constant - // FIXME: Should this always be extern? - // SymbolNum of 0 indicates the absolute section. - Type = MachO::ARM64_RELOC_UNSIGNED; - Index = 0; - - if (IsPCRel) { - IsExtern = 1; - Asm.getContext().FatalError(Fixup.getLoc(), - "PC relative absolute relocation!"); - - // FIXME: x86_64 sets the type to a branch reloc here. Should we do - // something similar? - } - } else if (Target.getSymB()) { // A - B + constant - const MCSymbol *A = &Target.getSymA()->getSymbol(); - const MCSymbolData &A_SD = Asm.getSymbolData(*A); - const MCSymbolData *A_Base = Asm.getAtom(&A_SD); - - const MCSymbol *B = &Target.getSymB()->getSymbol(); - const MCSymbolData &B_SD = Asm.getSymbolData(*B); - const MCSymbolData *B_Base = Asm.getAtom(&B_SD); - - // Check for "_foo@got - .", which comes through here as: - // Ltmp0: - // ... 
_foo@got - Ltmp0 - if (Target.getSymA()->getKind() == MCSymbolRefExpr::VK_GOT && - Target.getSymB()->getKind() == MCSymbolRefExpr::VK_None && - Layout.getSymbolOffset(&B_SD) == - Layout.getFragmentOffset(Fragment) + Fixup.getOffset()) { - // SymB is the PC, so use a PC-rel pointer-to-GOT relocation. - Index = A_Base->getIndex(); - IsExtern = 1; - Type = MachO::ARM64_RELOC_POINTER_TO_GOT; - IsPCRel = 1; - MachO::any_relocation_info MRE; - MRE.r_word0 = FixupOffset; - MRE.r_word1 = ((Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | - (IsExtern << 27) | (Type << 28)); - Writer->addRelocation(Fragment->getParent(), MRE); - return; - } else if (Target.getSymA()->getKind() != MCSymbolRefExpr::VK_None || - Target.getSymB()->getKind() != MCSymbolRefExpr::VK_None) - // Otherwise, neither symbol can be modified. - Asm.getContext().FatalError(Fixup.getLoc(), - "unsupported relocation of modified symbol"); - - // We don't support PCrel relocations of differences. - if (IsPCRel) - Asm.getContext().FatalError(Fixup.getLoc(), - "unsupported pc-relative relocation of " - "difference"); - - // ARM64 always uses external relocations. If there is no symbol to use as - // a base address (a local symbol with no preceding non-local symbol), - // error out. - // - // FIXME: We should probably just synthesize an external symbol and use - // that. - if (!A_Base) - Asm.getContext().FatalError( - Fixup.getLoc(), - "unsupported relocation of local symbol '" + A->getName() + - "'. Must have non-local symbol earlier in section."); - if (!B_Base) - Asm.getContext().FatalError( - Fixup.getLoc(), - "unsupported relocation of local symbol '" + B->getName() + - "'. Must have non-local symbol earlier in section."); - - if (A_Base == B_Base && A_Base) - Asm.getContext().FatalError(Fixup.getLoc(), - "unsupported relocation with identical base"); - - Value += (!A_SD.getFragment() ? 0 - : Writer->getSymbolAddress(&A_SD, Layout)) - - (!A_Base || !A_Base->getFragment() - ? 0 - : Writer->getSymbolAddress(A_Base, Layout)); - Value -= (!B_SD.getFragment() ? 0 - : Writer->getSymbolAddress(&B_SD, Layout)) - - (!B_Base || !B_Base->getFragment() - ? 0 - : Writer->getSymbolAddress(B_Base, Layout)); - - Index = A_Base->getIndex(); - IsExtern = 1; - Type = MachO::ARM64_RELOC_UNSIGNED; - - MachO::any_relocation_info MRE; - MRE.r_word0 = FixupOffset; - MRE.r_word1 = ((Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | - (IsExtern << 27) | (Type << 28)); - Writer->addRelocation(Fragment->getParent(), MRE); - - Index = B_Base->getIndex(); - IsExtern = 1; - Type = MachO::ARM64_RELOC_SUBTRACTOR; - } else { // A + constant - const MCSymbol *Symbol = &Target.getSymA()->getSymbol(); - const MCSymbolData &SD = Asm.getSymbolData(*Symbol); - const MCSymbolData *Base = Asm.getAtom(&SD); - const MCSectionMachO &Section = static_cast( - Fragment->getParent()->getSection()); - - // If the symbol is a variable and we weren't able to get a Base for it - // (i.e., it's not in the symbol table associated with a section) resolve - // the relocation based its expansion instead. - if (Symbol->isVariable() && !Base) { - // If the evaluation is an absolute value, just use that directly - // to keep things easy. - int64_t Res; - if (SD.getSymbol().getVariableValue()->EvaluateAsAbsolute( - Res, Layout, Writer->getSectionAddressMap())) { - FixedValue = Res; - return; - } - - // FIXME: Will the Target we already have ever have any data in it - // we need to preserve and merge with the new Target? How about - // the FixedValue? 
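[Editor's sketch] Every record emitted by RecordRelocation above packs the same five fields into r_word1 of MachO::any_relocation_info: a 24-bit symbol or section index, the pc-relative bit at 24, a 2-bit log2 size at 25, the external bit at 27 and a 4-bit type at 28. A standalone helper mirroring those shifts (the function name is illustrative):

    #include <cstdint>

    // r_word1 layout used throughout RecordRelocation above:
    // bits 0-23 index, bit 24 IsPCRel, bits 25-26 Log2Size,
    // bit 27 IsExtern, bits 28-31 relocation type.
    static uint32_t packRWord1(uint32_t Index, bool IsPCRel, uint32_t Log2Size,
                               bool IsExtern, uint32_t Type) {
      return (Index << 0) | (uint32_t(IsPCRel) << 24) | (Log2Size << 25) |
             (uint32_t(IsExtern) << 27) | (Type << 28);
    }

    int main() {
      // Arbitrary example: index 5, pc-relative, 4-byte field, external, type 2.
      return packRWord1(5, true, 2, true, 2) == 0x2d000005u ? 0 : 1;
    }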
- if (!Symbol->getVariableValue()->EvaluateAsRelocatable(Target, &Layout)) - Asm.getContext().FatalError(Fixup.getLoc(), - "unable to resolve variable '" + - Symbol->getName() + "'"); - return RecordRelocation(Writer, Asm, Layout, Fragment, Fixup, Target, - FixedValue); - } - - // Relocations inside debug sections always use local relocations when - // possible. This seems to be done because the debugger doesn't fully - // understand relocation entries and expects to find values that - // have already been fixed up. - if (Symbol->isInSection()) { - if (Section.hasAttribute(MachO::S_ATTR_DEBUG)) - Base = nullptr; - } - - // ARM64 uses external relocations as much as possible. For debug sections, - // and for pointer-sized relocations (.quad), we allow section relocations. - // It's code sections that run into trouble. - if (Base) { - Index = Base->getIndex(); - IsExtern = 1; - - // Add the local offset, if needed. - if (Base != &SD) - Value += Layout.getSymbolOffset(&SD) - Layout.getSymbolOffset(Base); - } else if (Symbol->isInSection()) { - // Pointer-sized relocations can use a local relocation. Otherwise, - // we have to be in a debug info section. - if (!Section.hasAttribute(MachO::S_ATTR_DEBUG) && Log2Size != 3) - Asm.getContext().FatalError( - Fixup.getLoc(), - "unsupported relocation of local symbol '" + Symbol->getName() + - "'. Must have non-local symbol earlier in section."); - // Adjust the relocation to be section-relative. - // The index is the section ordinal (1-based). - const MCSectionData &SymSD = - Asm.getSectionData(SD.getSymbol().getSection()); - Index = SymSD.getOrdinal() + 1; - IsExtern = 0; - Value += Writer->getSymbolAddress(&SD, Layout); - - if (IsPCRel) - Value -= Writer->getFragmentAddress(Fragment, Layout) + - Fixup.getOffset() + (1ULL << Log2Size); - } else { - // Resolve constant variables. - if (SD.getSymbol().isVariable()) { - int64_t Res; - if (SD.getSymbol().getVariableValue()->EvaluateAsAbsolute( - Res, Layout, Writer->getSectionAddressMap())) { - FixedValue = Res; - return; - } - } - Asm.getContext().FatalError(Fixup.getLoc(), - "unsupported relocation of variable '" + - Symbol->getName() + "'"); - } - } - - // If the relocation kind is Branch26, Page21, or Pageoff12, any addend - // is represented via an Addend relocation, not encoded directly into - // the instruction. - if ((Type == MachO::ARM64_RELOC_BRANCH26 || - Type == MachO::ARM64_RELOC_PAGE21 || - Type == MachO::ARM64_RELOC_PAGEOFF12) && - Value) { - assert((Value & 0xff000000) == 0 && "Added relocation out of range!"); - - MachO::any_relocation_info MRE; - MRE.r_word0 = FixupOffset; - MRE.r_word1 = ((Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | - (IsExtern << 27) | (Type << 28)); - Writer->addRelocation(Fragment->getParent(), MRE); - - // Now set up the Addend relocation. - Type = MachO::ARM64_RELOC_ADDEND; - Index = Value; - IsPCRel = 0; - Log2Size = 2; - IsExtern = 0; - - // Put zero into the instruction itself. The addend is in the relocation. - Value = 0; - } - - // If there's any addend left to handle, encode it in the instruction. 
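[Editor's sketch] For BRANCH26, PAGE21 and PAGEOFF12 relocations the addend cannot be encoded in the instruction, so the writer above pairs the primary record with an ARM64_RELOC_ADDEND record that carries the value in its index field (not pc-relative, not external, Log2Size 2) and leaves zero in the instruction itself. A standalone sketch of that split; the struct and function names are illustrative:

    #include <cassert>
    #include <cstdint>

    // What ends up in the companion ARM64_RELOC_ADDEND record.
    struct AddendInfo {
      uint32_t Index;    // the addend itself rides in the 24-bit index field
      unsigned Log2Size; // 2: a 32-bit instruction word
      bool IsPCRel, IsExtern;
    };

    static AddendInfo splitOutAddend(int64_t &Value) {
      assert((Value & 0xff000000) == 0 && "addend relocation out of range");
      AddendInfo A = { uint32_t(Value), 2, false, false };
      Value = 0;         // nothing is left to encode in the instruction
      return A;
    }

    int main() {
      int64_t V = 0x123;
      AddendInfo A = splitOutAddend(V);
      return (A.Index == 0x123 && V == 0) ? 0 : 1;
    }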
- FixedValue = Value; - - // struct relocation_info (8 bytes) - MachO::any_relocation_info MRE; - MRE.r_word0 = FixupOffset; - MRE.r_word1 = ((Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | - (IsExtern << 27) | (Type << 28)); - Writer->addRelocation(Fragment->getParent(), MRE); -} - -MCObjectWriter *llvm::createARM64MachObjectWriter(raw_ostream &OS, - uint32_t CPUType, - uint32_t CPUSubtype) { - return createMachObjectWriter(new ARM64MachObjectWriter(CPUType, CPUSubtype), - OS, /*IsLittleEndian=*/true); -} diff --git a/lib/Target/ARM64/MCTargetDesc/CMakeLists.txt b/lib/Target/ARM64/MCTargetDesc/CMakeLists.txt deleted file mode 100644 index f8665bcfe94..00000000000 --- a/lib/Target/ARM64/MCTargetDesc/CMakeLists.txt +++ /dev/null @@ -1,14 +0,0 @@ -add_llvm_library(LLVMARM64Desc - ARM64AsmBackend.cpp - ARM64ELFObjectWriter.cpp - ARM64ELFStreamer.cpp - ARM64MCAsmInfo.cpp - ARM64MCCodeEmitter.cpp - ARM64MCExpr.cpp - ARM64MCTargetDesc.cpp - ARM64MachObjectWriter.cpp -) -add_dependencies(LLVMARM64Desc ARM64CommonTableGen) - -# Hack: we need to include 'main' target directory to grab private headers -include_directories(${CMAKE_CURRENT_SOURCE_DIR}/.. ${CMAKE_CURRENT_BINARY_DIR}/..) diff --git a/lib/Target/ARM64/MCTargetDesc/LLVMBuild.txt b/lib/Target/ARM64/MCTargetDesc/LLVMBuild.txt deleted file mode 100644 index e4c74d285d4..00000000000 --- a/lib/Target/ARM64/MCTargetDesc/LLVMBuild.txt +++ /dev/null @@ -1,24 +0,0 @@ -;===- ./lib/Target/ARM64/MCTargetDesc/LLVMBuild.txt ------------*- Conf -*--===; -; -; The LLVM Compiler Infrastructure -; -; This file is distributed under the University of Illinois Open Source -; License. See LICENSE.TXT for details. -; -;===------------------------------------------------------------------------===; -; -; This is an LLVMBuild description file for the components in this subdirectory. -; -; For more information on the LLVMBuild system, please see: -; -; http://llvm.org/docs/LLVMBuild.html -; -;===------------------------------------------------------------------------===; - -[component_0] -type = Library -name = ARM64Desc -parent = ARM64 -required_libraries = ARM64AsmPrinter ARM64Info MC Support -add_to_library_groups = ARM64 - diff --git a/lib/Target/ARM64/MCTargetDesc/Makefile b/lib/Target/ARM64/MCTargetDesc/Makefile deleted file mode 100644 index 013cc633f66..00000000000 --- a/lib/Target/ARM64/MCTargetDesc/Makefile +++ /dev/null @@ -1,16 +0,0 @@ -##===- lib/Target/ARM64/TargetDesc/Makefile ----------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## - -LEVEL = ../../../.. -LIBRARYNAME = LLVMARM64Desc - -# Hack: we need to include 'main' target directory to grab private headers -CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. - -include $(LEVEL)/Makefile.common diff --git a/lib/Target/ARM64/Makefile b/lib/Target/ARM64/Makefile deleted file mode 100644 index cfb05d2a87b..00000000000 --- a/lib/Target/ARM64/Makefile +++ /dev/null @@ -1,25 +0,0 @@ -##===- lib/Target/ARM64/Makefile ---------------------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## - -LEVEL = ../../.. 
-LIBRARYNAME = LLVMARM64CodeGen -TARGET = ARM64 - -# Make sure that tblgen is run, first thing. -BUILT_SOURCES = ARM64GenRegisterInfo.inc ARM64GenInstrInfo.inc \ - ARM64GenAsmWriter.inc ARM64GenAsmWriter1.inc \ - ARM64GenDAGISel.inc \ - ARM64GenCallingConv.inc ARM64GenAsmMatcher.inc \ - ARM64GenSubtargetInfo.inc ARM64GenMCCodeEmitter.inc \ - ARM64GenFastISel.inc ARM64GenDisassemblerTables.inc \ - ARM64GenMCPseudoLowering.inc - -DIRS = TargetInfo InstPrinter AsmParser Disassembler MCTargetDesc Utils - -include $(LEVEL)/Makefile.common diff --git a/lib/Target/ARM64/TargetInfo/ARM64TargetInfo.cpp b/lib/Target/ARM64/TargetInfo/ARM64TargetInfo.cpp deleted file mode 100644 index 247566825ab..00000000000 --- a/lib/Target/ARM64/TargetInfo/ARM64TargetInfo.cpp +++ /dev/null @@ -1,31 +0,0 @@ -//===-- ARM64TargetInfo.cpp - ARM64 Target Implementation -----------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#include "llvm/ADT/Triple.h" -#include "llvm/Support/TargetRegistry.h" -using namespace llvm; - -namespace llvm { -Target TheARM64leTarget; -Target TheARM64beTarget; -Target TheAArch64leTarget; -Target TheAArch64beTarget; -} // end namespace llvm - -extern "C" void LLVMInitializeARM64TargetInfo() { - RegisterTarget X(TheARM64leTarget, "arm64", - "ARM64 (little endian)"); - RegisterTarget Y(TheARM64beTarget, "arm64_be", - "ARM64 (big endian)"); - - RegisterTarget Z( - TheAArch64leTarget, "aarch64", "ARM64 (little endian)"); - RegisterTarget W( - TheAArch64beTarget, "aarch64_be", "ARM64 (big endian)"); -} diff --git a/lib/Target/ARM64/TargetInfo/CMakeLists.txt b/lib/Target/ARM64/TargetInfo/CMakeLists.txt deleted file mode 100644 index a0142c40713..00000000000 --- a/lib/Target/ARM64/TargetInfo/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. ) - -add_llvm_library(LLVMARM64Info - ARM64TargetInfo.cpp - ) - -add_dependencies(LLVMARM64Info ARM64CommonTableGen) diff --git a/lib/Target/ARM64/TargetInfo/LLVMBuild.txt b/lib/Target/ARM64/TargetInfo/LLVMBuild.txt deleted file mode 100644 index b9ecb706952..00000000000 --- a/lib/Target/ARM64/TargetInfo/LLVMBuild.txt +++ /dev/null @@ -1,23 +0,0 @@ -;===- ./lib/Target/ARM64/TargetInfo/LLVMBuild.txt --------------*- Conf -*--===; -; -; The LLVM Compiler Infrastructure -; -; This file is distributed under the University of Illinois Open Source -; License. See LICENSE.TXT for details. -; -;===------------------------------------------------------------------------===; -; -; This is an LLVMBuild description file for the components in this subdirectory. 
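[Editor's sketch] LLVMInitializeARM64TargetInfo above registers a separate Target object for each accepted spelling and endianness: "arm64", "arm64_be", "aarch64" and "aarch64_be". A small sketch of how the two little-endian spellings look from the triple side, assuming an LLVM tree of this vintage in which arm64 and aarch64 are still distinct Triple::ArchType values:

    #include "llvm/ADT/Triple.h"
    #include <cassert>

    int main() {
      // Two spellings of the same 64-bit architecture; each has its own
      // ArchType and, via the registrations above, its own Target object.
      llvm::Triple IOSTriple("arm64-apple-ios");
      llvm::Triple LinuxTriple("aarch64-none-linux-gnu");
      assert(IOSTriple.getArch() == llvm::Triple::arm64);
      assert(LinuxTriple.getArch() == llvm::Triple::aarch64);
      return 0;
    }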
-; -; For more information on the LLVMBuild system, please see: -; -; http://llvm.org/docs/LLVMBuild.html -; -;===------------------------------------------------------------------------===; - -[component_0] -type = Library -name = ARM64Info -parent = ARM64 -required_libraries = Support -add_to_library_groups = ARM64 diff --git a/lib/Target/ARM64/TargetInfo/Makefile b/lib/Target/ARM64/TargetInfo/Makefile deleted file mode 100644 index 2d5a1a087a5..00000000000 --- a/lib/Target/ARM64/TargetInfo/Makefile +++ /dev/null @@ -1,15 +0,0 @@ -##===- lib/Target/ARM64/TargetInfo/Makefile ----------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## -LEVEL = ../../../.. -LIBRARYNAME = LLVMARM64Info - -# Hack: we need to include 'main' target directory to grab private headers -CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. - -include $(LEVEL)/Makefile.common diff --git a/lib/Target/ARM64/Utils/ARM64BaseInfo.cpp b/lib/Target/ARM64/Utils/ARM64BaseInfo.cpp deleted file mode 100644 index 5142d18c23c..00000000000 --- a/lib/Target/ARM64/Utils/ARM64BaseInfo.cpp +++ /dev/null @@ -1,901 +0,0 @@ -//===-- ARM64BaseInfo.cpp - ARM64 Base encoding information------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file provides basic encoding and assembly information for ARM64. -// -//===----------------------------------------------------------------------===// -#include "ARM64BaseInfo.h" -#include "llvm/ADT/APFloat.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/Support/Regex.h" - -using namespace llvm; - -StringRef ARM64NamedImmMapper::toString(uint32_t Value, bool &Valid) const { - for (unsigned i = 0; i < NumPairs; ++i) { - if (Pairs[i].Value == Value) { - Valid = true; - return Pairs[i].Name; - } - } - - Valid = false; - return StringRef(); -} - -uint32_t ARM64NamedImmMapper::fromString(StringRef Name, bool &Valid) const { - std::string LowerCaseName = Name.lower(); - for (unsigned i = 0; i < NumPairs; ++i) { - if (Pairs[i].Name == LowerCaseName) { - Valid = true; - return Pairs[i].Value; - } - } - - Valid = false; - return -1; -} - -bool ARM64NamedImmMapper::validImm(uint32_t Value) const { - return Value < TooBigImm; -} - -const ARM64NamedImmMapper::Mapping ARM64AT::ATMapper::ATPairs[] = { - {"s1e1r", S1E1R}, - {"s1e2r", S1E2R}, - {"s1e3r", S1E3R}, - {"s1e1w", S1E1W}, - {"s1e2w", S1E2W}, - {"s1e3w", S1E3W}, - {"s1e0r", S1E0R}, - {"s1e0w", S1E0W}, - {"s12e1r", S12E1R}, - {"s12e1w", S12E1W}, - {"s12e0r", S12E0R}, - {"s12e0w", S12E0W}, -}; - -ARM64AT::ATMapper::ATMapper() - : ARM64NamedImmMapper(ATPairs, 0) {} - -const ARM64NamedImmMapper::Mapping ARM64DB::DBarrierMapper::DBarrierPairs[] = { - {"oshld", OSHLD}, - {"oshst", OSHST}, - {"osh", OSH}, - {"nshld", NSHLD}, - {"nshst", NSHST}, - {"nsh", NSH}, - {"ishld", ISHLD}, - {"ishst", ISHST}, - {"ish", ISH}, - {"ld", LD}, - {"st", ST}, - {"sy", SY} -}; - -ARM64DB::DBarrierMapper::DBarrierMapper() - : ARM64NamedImmMapper(DBarrierPairs, 16u) {} - -const ARM64NamedImmMapper::Mapping ARM64DC::DCMapper::DCPairs[] = { - {"zva", ZVA}, - {"ivac", IVAC}, - {"isw", ISW}, - {"cvac", CVAC}, - {"csw", 
CSW}, - {"cvau", CVAU}, - {"civac", CIVAC}, - {"cisw", CISW} -}; - -ARM64DC::DCMapper::DCMapper() - : ARM64NamedImmMapper(DCPairs, 0) {} - -const ARM64NamedImmMapper::Mapping ARM64IC::ICMapper::ICPairs[] = { - {"ialluis", IALLUIS}, - {"iallu", IALLU}, - {"ivau", IVAU} -}; - -ARM64IC::ICMapper::ICMapper() - : ARM64NamedImmMapper(ICPairs, 0) {} - -const ARM64NamedImmMapper::Mapping ARM64ISB::ISBMapper::ISBPairs[] = { - {"sy", SY}, -}; - -ARM64ISB::ISBMapper::ISBMapper() - : ARM64NamedImmMapper(ISBPairs, 16) {} - -const ARM64NamedImmMapper::Mapping ARM64PRFM::PRFMMapper::PRFMPairs[] = { - {"pldl1keep", PLDL1KEEP}, - {"pldl1strm", PLDL1STRM}, - {"pldl2keep", PLDL2KEEP}, - {"pldl2strm", PLDL2STRM}, - {"pldl3keep", PLDL3KEEP}, - {"pldl3strm", PLDL3STRM}, - {"plil1keep", PLIL1KEEP}, - {"plil1strm", PLIL1STRM}, - {"plil2keep", PLIL2KEEP}, - {"plil2strm", PLIL2STRM}, - {"plil3keep", PLIL3KEEP}, - {"plil3strm", PLIL3STRM}, - {"pstl1keep", PSTL1KEEP}, - {"pstl1strm", PSTL1STRM}, - {"pstl2keep", PSTL2KEEP}, - {"pstl2strm", PSTL2STRM}, - {"pstl3keep", PSTL3KEEP}, - {"pstl3strm", PSTL3STRM} -}; - -ARM64PRFM::PRFMMapper::PRFMMapper() - : ARM64NamedImmMapper(PRFMPairs, 32) {} - -const ARM64NamedImmMapper::Mapping ARM64PState::PStateMapper::PStatePairs[] = { - {"spsel", SPSel}, - {"daifset", DAIFSet}, - {"daifclr", DAIFClr} -}; - -ARM64PState::PStateMapper::PStateMapper() - : ARM64NamedImmMapper(PStatePairs, 0) {} - -const ARM64NamedImmMapper::Mapping ARM64SysReg::MRSMapper::MRSPairs[] = { - {"mdccsr_el0", MDCCSR_EL0}, - {"dbgdtrrx_el0", DBGDTRRX_EL0}, - {"mdrar_el1", MDRAR_EL1}, - {"oslsr_el1", OSLSR_EL1}, - {"dbgauthstatus_el1", DBGAUTHSTATUS_EL1}, - {"pmceid0_el0", PMCEID0_EL0}, - {"pmceid1_el0", PMCEID1_EL0}, - {"midr_el1", MIDR_EL1}, - {"ccsidr_el1", CCSIDR_EL1}, - {"clidr_el1", CLIDR_EL1}, - {"ctr_el0", CTR_EL0}, - {"mpidr_el1", MPIDR_EL1}, - {"revidr_el1", REVIDR_EL1}, - {"aidr_el1", AIDR_EL1}, - {"dczid_el0", DCZID_EL0}, - {"id_pfr0_el1", ID_PFR0_EL1}, - {"id_pfr1_el1", ID_PFR1_EL1}, - {"id_dfr0_el1", ID_DFR0_EL1}, - {"id_afr0_el1", ID_AFR0_EL1}, - {"id_mmfr0_el1", ID_MMFR0_EL1}, - {"id_mmfr1_el1", ID_MMFR1_EL1}, - {"id_mmfr2_el1", ID_MMFR2_EL1}, - {"id_mmfr3_el1", ID_MMFR3_EL1}, - {"id_isar0_el1", ID_ISAR0_EL1}, - {"id_isar1_el1", ID_ISAR1_EL1}, - {"id_isar2_el1", ID_ISAR2_EL1}, - {"id_isar3_el1", ID_ISAR3_EL1}, - {"id_isar4_el1", ID_ISAR4_EL1}, - {"id_isar5_el1", ID_ISAR5_EL1}, - {"id_aa64pfr0_el1", ID_AARM64PFR0_EL1}, - {"id_aa64pfr1_el1", ID_AARM64PFR1_EL1}, - {"id_aa64dfr0_el1", ID_AARM64DFR0_EL1}, - {"id_aa64dfr1_el1", ID_AARM64DFR1_EL1}, - {"id_aa64afr0_el1", ID_AARM64AFR0_EL1}, - {"id_aa64afr1_el1", ID_AARM64AFR1_EL1}, - {"id_aa64isar0_el1", ID_AARM64ISAR0_EL1}, - {"id_aa64isar1_el1", ID_AARM64ISAR1_EL1}, - {"id_aa64mmfr0_el1", ID_AARM64MMFR0_EL1}, - {"id_aa64mmfr1_el1", ID_AARM64MMFR1_EL1}, - {"mvfr0_el1", MVFR0_EL1}, - {"mvfr1_el1", MVFR1_EL1}, - {"mvfr2_el1", MVFR2_EL1}, - {"rvbar_el1", RVBAR_EL1}, - {"rvbar_el2", RVBAR_EL2}, - {"rvbar_el3", RVBAR_EL3}, - {"isr_el1", ISR_EL1}, - {"cntpct_el0", CNTPCT_EL0}, - {"cntvct_el0", CNTVCT_EL0}, - - // Trace registers - {"trcstatr", TRCSTATR}, - {"trcidr8", TRCIDR8}, - {"trcidr9", TRCIDR9}, - {"trcidr10", TRCIDR10}, - {"trcidr11", TRCIDR11}, - {"trcidr12", TRCIDR12}, - {"trcidr13", TRCIDR13}, - {"trcidr0", TRCIDR0}, - {"trcidr1", TRCIDR1}, - {"trcidr2", TRCIDR2}, - {"trcidr3", TRCIDR3}, - {"trcidr4", TRCIDR4}, - {"trcidr5", TRCIDR5}, - {"trcidr6", TRCIDR6}, - {"trcidr7", TRCIDR7}, - {"trcoslsr", TRCOSLSR}, - {"trcpdsr", TRCPDSR}, - {"trcdevaff0", 
TRCDEVAFF0}, - {"trcdevaff1", TRCDEVAFF1}, - {"trclsr", TRCLSR}, - {"trcauthstatus", TRCAUTHSTATUS}, - {"trcdevarch", TRCDEVARCH}, - {"trcdevid", TRCDEVID}, - {"trcdevtype", TRCDEVTYPE}, - {"trcpidr4", TRCPIDR4}, - {"trcpidr5", TRCPIDR5}, - {"trcpidr6", TRCPIDR6}, - {"trcpidr7", TRCPIDR7}, - {"trcpidr0", TRCPIDR0}, - {"trcpidr1", TRCPIDR1}, - {"trcpidr2", TRCPIDR2}, - {"trcpidr3", TRCPIDR3}, - {"trccidr0", TRCCIDR0}, - {"trccidr1", TRCCIDR1}, - {"trccidr2", TRCCIDR2}, - {"trccidr3", TRCCIDR3}, - - // GICv3 registers - {"icc_iar1_el1", ICC_IAR1_EL1}, - {"icc_iar0_el1", ICC_IAR0_EL1}, - {"icc_hppir1_el1", ICC_HPPIR1_EL1}, - {"icc_hppir0_el1", ICC_HPPIR0_EL1}, - {"icc_rpr_el1", ICC_RPR_EL1}, - {"ich_vtr_el2", ICH_VTR_EL2}, - {"ich_eisr_el2", ICH_EISR_EL2}, - {"ich_elsr_el2", ICH_ELSR_EL2} -}; - -ARM64SysReg::MRSMapper::MRSMapper(uint64_t FeatureBits) - : SysRegMapper(FeatureBits) { - InstPairs = &MRSPairs[0]; - NumInstPairs = llvm::array_lengthof(MRSPairs); -} - -const ARM64NamedImmMapper::Mapping ARM64SysReg::MSRMapper::MSRPairs[] = { - {"dbgdtrtx_el0", DBGDTRTX_EL0}, - {"oslar_el1", OSLAR_EL1}, - {"pmswinc_el0", PMSWINC_EL0}, - - // Trace registers - {"trcoslar", TRCOSLAR}, - {"trclar", TRCLAR}, - - // GICv3 registers - {"icc_eoir1_el1", ICC_EOIR1_EL1}, - {"icc_eoir0_el1", ICC_EOIR0_EL1}, - {"icc_dir_el1", ICC_DIR_EL1}, - {"icc_sgi1r_el1", ICC_SGI1R_EL1}, - {"icc_asgi1r_el1", ICC_ASGI1R_EL1}, - {"icc_sgi0r_el1", ICC_SGI0R_EL1} -}; - -ARM64SysReg::MSRMapper::MSRMapper(uint64_t FeatureBits) - : SysRegMapper(FeatureBits) { - InstPairs = &MSRPairs[0]; - NumInstPairs = llvm::array_lengthof(MSRPairs); -} - - -const ARM64NamedImmMapper::Mapping ARM64SysReg::SysRegMapper::SysRegPairs[] = { - {"osdtrrx_el1", OSDTRRX_EL1}, - {"osdtrtx_el1", OSDTRTX_EL1}, - {"teecr32_el1", TEECR32_EL1}, - {"mdccint_el1", MDCCINT_EL1}, - {"mdscr_el1", MDSCR_EL1}, - {"dbgdtr_el0", DBGDTR_EL0}, - {"oseccr_el1", OSECCR_EL1}, - {"dbgvcr32_el2", DBGVCR32_EL2}, - {"dbgbvr0_el1", DBGBVR0_EL1}, - {"dbgbvr1_el1", DBGBVR1_EL1}, - {"dbgbvr2_el1", DBGBVR2_EL1}, - {"dbgbvr3_el1", DBGBVR3_EL1}, - {"dbgbvr4_el1", DBGBVR4_EL1}, - {"dbgbvr5_el1", DBGBVR5_EL1}, - {"dbgbvr6_el1", DBGBVR6_EL1}, - {"dbgbvr7_el1", DBGBVR7_EL1}, - {"dbgbvr8_el1", DBGBVR8_EL1}, - {"dbgbvr9_el1", DBGBVR9_EL1}, - {"dbgbvr10_el1", DBGBVR10_EL1}, - {"dbgbvr11_el1", DBGBVR11_EL1}, - {"dbgbvr12_el1", DBGBVR12_EL1}, - {"dbgbvr13_el1", DBGBVR13_EL1}, - {"dbgbvr14_el1", DBGBVR14_EL1}, - {"dbgbvr15_el1", DBGBVR15_EL1}, - {"dbgbcr0_el1", DBGBCR0_EL1}, - {"dbgbcr1_el1", DBGBCR1_EL1}, - {"dbgbcr2_el1", DBGBCR2_EL1}, - {"dbgbcr3_el1", DBGBCR3_EL1}, - {"dbgbcr4_el1", DBGBCR4_EL1}, - {"dbgbcr5_el1", DBGBCR5_EL1}, - {"dbgbcr6_el1", DBGBCR6_EL1}, - {"dbgbcr7_el1", DBGBCR7_EL1}, - {"dbgbcr8_el1", DBGBCR8_EL1}, - {"dbgbcr9_el1", DBGBCR9_EL1}, - {"dbgbcr10_el1", DBGBCR10_EL1}, - {"dbgbcr11_el1", DBGBCR11_EL1}, - {"dbgbcr12_el1", DBGBCR12_EL1}, - {"dbgbcr13_el1", DBGBCR13_EL1}, - {"dbgbcr14_el1", DBGBCR14_EL1}, - {"dbgbcr15_el1", DBGBCR15_EL1}, - {"dbgwvr0_el1", DBGWVR0_EL1}, - {"dbgwvr1_el1", DBGWVR1_EL1}, - {"dbgwvr2_el1", DBGWVR2_EL1}, - {"dbgwvr3_el1", DBGWVR3_EL1}, - {"dbgwvr4_el1", DBGWVR4_EL1}, - {"dbgwvr5_el1", DBGWVR5_EL1}, - {"dbgwvr6_el1", DBGWVR6_EL1}, - {"dbgwvr7_el1", DBGWVR7_EL1}, - {"dbgwvr8_el1", DBGWVR8_EL1}, - {"dbgwvr9_el1", DBGWVR9_EL1}, - {"dbgwvr10_el1", DBGWVR10_EL1}, - {"dbgwvr11_el1", DBGWVR11_EL1}, - {"dbgwvr12_el1", DBGWVR12_EL1}, - {"dbgwvr13_el1", DBGWVR13_EL1}, - {"dbgwvr14_el1", DBGWVR14_EL1}, - {"dbgwvr15_el1", DBGWVR15_EL1}, - {"dbgwcr0_el1", 
DBGWCR0_EL1}, - {"dbgwcr1_el1", DBGWCR1_EL1}, - {"dbgwcr2_el1", DBGWCR2_EL1}, - {"dbgwcr3_el1", DBGWCR3_EL1}, - {"dbgwcr4_el1", DBGWCR4_EL1}, - {"dbgwcr5_el1", DBGWCR5_EL1}, - {"dbgwcr6_el1", DBGWCR6_EL1}, - {"dbgwcr7_el1", DBGWCR7_EL1}, - {"dbgwcr8_el1", DBGWCR8_EL1}, - {"dbgwcr9_el1", DBGWCR9_EL1}, - {"dbgwcr10_el1", DBGWCR10_EL1}, - {"dbgwcr11_el1", DBGWCR11_EL1}, - {"dbgwcr12_el1", DBGWCR12_EL1}, - {"dbgwcr13_el1", DBGWCR13_EL1}, - {"dbgwcr14_el1", DBGWCR14_EL1}, - {"dbgwcr15_el1", DBGWCR15_EL1}, - {"teehbr32_el1", TEEHBR32_EL1}, - {"osdlr_el1", OSDLR_EL1}, - {"dbgprcr_el1", DBGPRCR_EL1}, - {"dbgclaimset_el1", DBGCLAIMSET_EL1}, - {"dbgclaimclr_el1", DBGCLAIMCLR_EL1}, - {"csselr_el1", CSSELR_EL1}, - {"vpidr_el2", VPIDR_EL2}, - {"vmpidr_el2", VMPIDR_EL2}, - {"sctlr_el1", SCTLR_EL1}, - {"sctlr_el2", SCTLR_EL2}, - {"sctlr_el3", SCTLR_EL3}, - {"actlr_el1", ACTLR_EL1}, - {"actlr_el2", ACTLR_EL2}, - {"actlr_el3", ACTLR_EL3}, - {"cpacr_el1", CPACR_EL1}, - {"hcr_el2", HCR_EL2}, - {"scr_el3", SCR_EL3}, - {"mdcr_el2", MDCR_EL2}, - {"sder32_el3", SDER32_EL3}, - {"cptr_el2", CPTR_EL2}, - {"cptr_el3", CPTR_EL3}, - {"hstr_el2", HSTR_EL2}, - {"hacr_el2", HACR_EL2}, - {"mdcr_el3", MDCR_EL3}, - {"ttbr0_el1", TTBR0_EL1}, - {"ttbr0_el2", TTBR0_EL2}, - {"ttbr0_el3", TTBR0_EL3}, - {"ttbr1_el1", TTBR1_EL1}, - {"tcr_el1", TCR_EL1}, - {"tcr_el2", TCR_EL2}, - {"tcr_el3", TCR_EL3}, - {"vttbr_el2", VTTBR_EL2}, - {"vtcr_el2", VTCR_EL2}, - {"dacr32_el2", DACR32_EL2}, - {"spsr_el1", SPSR_EL1}, - {"spsr_el2", SPSR_EL2}, - {"spsr_el3", SPSR_EL3}, - {"elr_el1", ELR_EL1}, - {"elr_el2", ELR_EL2}, - {"elr_el3", ELR_EL3}, - {"sp_el0", SP_EL0}, - {"sp_el1", SP_EL1}, - {"sp_el2", SP_EL2}, - {"spsel", SPSel}, - {"nzcv", NZCV}, - {"daif", DAIF}, - {"currentel", CurrentEL}, - {"spsr_irq", SPSR_irq}, - {"spsr_abt", SPSR_abt}, - {"spsr_und", SPSR_und}, - {"spsr_fiq", SPSR_fiq}, - {"fpcr", FPCR}, - {"fpsr", FPSR}, - {"dspsr_el0", DSPSR_EL0}, - {"dlr_el0", DLR_EL0}, - {"ifsr32_el2", IFSR32_EL2}, - {"afsr0_el1", AFSR0_EL1}, - {"afsr0_el2", AFSR0_EL2}, - {"afsr0_el3", AFSR0_EL3}, - {"afsr1_el1", AFSR1_EL1}, - {"afsr1_el2", AFSR1_EL2}, - {"afsr1_el3", AFSR1_EL3}, - {"esr_el1", ESR_EL1}, - {"esr_el2", ESR_EL2}, - {"esr_el3", ESR_EL3}, - {"fpexc32_el2", FPEXC32_EL2}, - {"far_el1", FAR_EL1}, - {"far_el2", FAR_EL2}, - {"far_el3", FAR_EL3}, - {"hpfar_el2", HPFAR_EL2}, - {"par_el1", PAR_EL1}, - {"pmcr_el0", PMCR_EL0}, - {"pmcntenset_el0", PMCNTENSET_EL0}, - {"pmcntenclr_el0", PMCNTENCLR_EL0}, - {"pmovsclr_el0", PMOVSCLR_EL0}, - {"pmselr_el0", PMSELR_EL0}, - {"pmccntr_el0", PMCCNTR_EL0}, - {"pmxevtyper_el0", PMXEVTYPER_EL0}, - {"pmxevcntr_el0", PMXEVCNTR_EL0}, - {"pmuserenr_el0", PMUSERENR_EL0}, - {"pmintenset_el1", PMINTENSET_EL1}, - {"pmintenclr_el1", PMINTENCLR_EL1}, - {"pmovsset_el0", PMOVSSET_EL0}, - {"mair_el1", MAIR_EL1}, - {"mair_el2", MAIR_EL2}, - {"mair_el3", MAIR_EL3}, - {"amair_el1", AMAIR_EL1}, - {"amair_el2", AMAIR_EL2}, - {"amair_el3", AMAIR_EL3}, - {"vbar_el1", VBAR_EL1}, - {"vbar_el2", VBAR_EL2}, - {"vbar_el3", VBAR_EL3}, - {"rmr_el1", RMR_EL1}, - {"rmr_el2", RMR_EL2}, - {"rmr_el3", RMR_EL3}, - {"contextidr_el1", CONTEXTIDR_EL1}, - {"tpidr_el0", TPIDR_EL0}, - {"tpidr_el2", TPIDR_EL2}, - {"tpidr_el3", TPIDR_EL3}, - {"tpidrro_el0", TPIDRRO_EL0}, - {"tpidr_el1", TPIDR_EL1}, - {"cntfrq_el0", CNTFRQ_EL0}, - {"cntvoff_el2", CNTVOFF_EL2}, - {"cntkctl_el1", CNTKCTL_EL1}, - {"cnthctl_el2", CNTHCTL_EL2}, - {"cntp_tval_el0", CNTP_TVAL_EL0}, - {"cnthp_tval_el2", CNTHP_TVAL_EL2}, - {"cntps_tval_el1", CNTPS_TVAL_EL1}, - {"cntp_ctl_el0", 
CNTP_CTL_EL0}, - {"cnthp_ctl_el2", CNTHP_CTL_EL2}, - {"cntps_ctl_el1", CNTPS_CTL_EL1}, - {"cntp_cval_el0", CNTP_CVAL_EL0}, - {"cnthp_cval_el2", CNTHP_CVAL_EL2}, - {"cntps_cval_el1", CNTPS_CVAL_EL1}, - {"cntv_tval_el0", CNTV_TVAL_EL0}, - {"cntv_ctl_el0", CNTV_CTL_EL0}, - {"cntv_cval_el0", CNTV_CVAL_EL0}, - {"pmevcntr0_el0", PMEVCNTR0_EL0}, - {"pmevcntr1_el0", PMEVCNTR1_EL0}, - {"pmevcntr2_el0", PMEVCNTR2_EL0}, - {"pmevcntr3_el0", PMEVCNTR3_EL0}, - {"pmevcntr4_el0", PMEVCNTR4_EL0}, - {"pmevcntr5_el0", PMEVCNTR5_EL0}, - {"pmevcntr6_el0", PMEVCNTR6_EL0}, - {"pmevcntr7_el0", PMEVCNTR7_EL0}, - {"pmevcntr8_el0", PMEVCNTR8_EL0}, - {"pmevcntr9_el0", PMEVCNTR9_EL0}, - {"pmevcntr10_el0", PMEVCNTR10_EL0}, - {"pmevcntr11_el0", PMEVCNTR11_EL0}, - {"pmevcntr12_el0", PMEVCNTR12_EL0}, - {"pmevcntr13_el0", PMEVCNTR13_EL0}, - {"pmevcntr14_el0", PMEVCNTR14_EL0}, - {"pmevcntr15_el0", PMEVCNTR15_EL0}, - {"pmevcntr16_el0", PMEVCNTR16_EL0}, - {"pmevcntr17_el0", PMEVCNTR17_EL0}, - {"pmevcntr18_el0", PMEVCNTR18_EL0}, - {"pmevcntr19_el0", PMEVCNTR19_EL0}, - {"pmevcntr20_el0", PMEVCNTR20_EL0}, - {"pmevcntr21_el0", PMEVCNTR21_EL0}, - {"pmevcntr22_el0", PMEVCNTR22_EL0}, - {"pmevcntr23_el0", PMEVCNTR23_EL0}, - {"pmevcntr24_el0", PMEVCNTR24_EL0}, - {"pmevcntr25_el0", PMEVCNTR25_EL0}, - {"pmevcntr26_el0", PMEVCNTR26_EL0}, - {"pmevcntr27_el0", PMEVCNTR27_EL0}, - {"pmevcntr28_el0", PMEVCNTR28_EL0}, - {"pmevcntr29_el0", PMEVCNTR29_EL0}, - {"pmevcntr30_el0", PMEVCNTR30_EL0}, - {"pmccfiltr_el0", PMCCFILTR_EL0}, - {"pmevtyper0_el0", PMEVTYPER0_EL0}, - {"pmevtyper1_el0", PMEVTYPER1_EL0}, - {"pmevtyper2_el0", PMEVTYPER2_EL0}, - {"pmevtyper3_el0", PMEVTYPER3_EL0}, - {"pmevtyper4_el0", PMEVTYPER4_EL0}, - {"pmevtyper5_el0", PMEVTYPER5_EL0}, - {"pmevtyper6_el0", PMEVTYPER6_EL0}, - {"pmevtyper7_el0", PMEVTYPER7_EL0}, - {"pmevtyper8_el0", PMEVTYPER8_EL0}, - {"pmevtyper9_el0", PMEVTYPER9_EL0}, - {"pmevtyper10_el0", PMEVTYPER10_EL0}, - {"pmevtyper11_el0", PMEVTYPER11_EL0}, - {"pmevtyper12_el0", PMEVTYPER12_EL0}, - {"pmevtyper13_el0", PMEVTYPER13_EL0}, - {"pmevtyper14_el0", PMEVTYPER14_EL0}, - {"pmevtyper15_el0", PMEVTYPER15_EL0}, - {"pmevtyper16_el0", PMEVTYPER16_EL0}, - {"pmevtyper17_el0", PMEVTYPER17_EL0}, - {"pmevtyper18_el0", PMEVTYPER18_EL0}, - {"pmevtyper19_el0", PMEVTYPER19_EL0}, - {"pmevtyper20_el0", PMEVTYPER20_EL0}, - {"pmevtyper21_el0", PMEVTYPER21_EL0}, - {"pmevtyper22_el0", PMEVTYPER22_EL0}, - {"pmevtyper23_el0", PMEVTYPER23_EL0}, - {"pmevtyper24_el0", PMEVTYPER24_EL0}, - {"pmevtyper25_el0", PMEVTYPER25_EL0}, - {"pmevtyper26_el0", PMEVTYPER26_EL0}, - {"pmevtyper27_el0", PMEVTYPER27_EL0}, - {"pmevtyper28_el0", PMEVTYPER28_EL0}, - {"pmevtyper29_el0", PMEVTYPER29_EL0}, - {"pmevtyper30_el0", PMEVTYPER30_EL0}, - - // Trace registers - {"trcprgctlr", TRCPRGCTLR}, - {"trcprocselr", TRCPROCSELR}, - {"trcconfigr", TRCCONFIGR}, - {"trcauxctlr", TRCAUXCTLR}, - {"trceventctl0r", TRCEVENTCTL0R}, - {"trceventctl1r", TRCEVENTCTL1R}, - {"trcstallctlr", TRCSTALLCTLR}, - {"trctsctlr", TRCTSCTLR}, - {"trcsyncpr", TRCSYNCPR}, - {"trcccctlr", TRCCCCTLR}, - {"trcbbctlr", TRCBBCTLR}, - {"trctraceidr", TRCTRACEIDR}, - {"trcqctlr", TRCQCTLR}, - {"trcvictlr", TRCVICTLR}, - {"trcviiectlr", TRCVIIECTLR}, - {"trcvissctlr", TRCVISSCTLR}, - {"trcvipcssctlr", TRCVIPCSSCTLR}, - {"trcvdctlr", TRCVDCTLR}, - {"trcvdsacctlr", TRCVDSACCTLR}, - {"trcvdarcctlr", TRCVDARCCTLR}, - {"trcseqevr0", TRCSEQEVR0}, - {"trcseqevr1", TRCSEQEVR1}, - {"trcseqevr2", TRCSEQEVR2}, - {"trcseqrstevr", TRCSEQRSTEVR}, - {"trcseqstr", TRCSEQSTR}, - {"trcextinselr", TRCEXTINSELR}, - 
{"trccntrldvr0", TRCCNTRLDVR0}, - {"trccntrldvr1", TRCCNTRLDVR1}, - {"trccntrldvr2", TRCCNTRLDVR2}, - {"trccntrldvr3", TRCCNTRLDVR3}, - {"trccntctlr0", TRCCNTCTLR0}, - {"trccntctlr1", TRCCNTCTLR1}, - {"trccntctlr2", TRCCNTCTLR2}, - {"trccntctlr3", TRCCNTCTLR3}, - {"trccntvr0", TRCCNTVR0}, - {"trccntvr1", TRCCNTVR1}, - {"trccntvr2", TRCCNTVR2}, - {"trccntvr3", TRCCNTVR3}, - {"trcimspec0", TRCIMSPEC0}, - {"trcimspec1", TRCIMSPEC1}, - {"trcimspec2", TRCIMSPEC2}, - {"trcimspec3", TRCIMSPEC3}, - {"trcimspec4", TRCIMSPEC4}, - {"trcimspec5", TRCIMSPEC5}, - {"trcimspec6", TRCIMSPEC6}, - {"trcimspec7", TRCIMSPEC7}, - {"trcrsctlr2", TRCRSCTLR2}, - {"trcrsctlr3", TRCRSCTLR3}, - {"trcrsctlr4", TRCRSCTLR4}, - {"trcrsctlr5", TRCRSCTLR5}, - {"trcrsctlr6", TRCRSCTLR6}, - {"trcrsctlr7", TRCRSCTLR7}, - {"trcrsctlr8", TRCRSCTLR8}, - {"trcrsctlr9", TRCRSCTLR9}, - {"trcrsctlr10", TRCRSCTLR10}, - {"trcrsctlr11", TRCRSCTLR11}, - {"trcrsctlr12", TRCRSCTLR12}, - {"trcrsctlr13", TRCRSCTLR13}, - {"trcrsctlr14", TRCRSCTLR14}, - {"trcrsctlr15", TRCRSCTLR15}, - {"trcrsctlr16", TRCRSCTLR16}, - {"trcrsctlr17", TRCRSCTLR17}, - {"trcrsctlr18", TRCRSCTLR18}, - {"trcrsctlr19", TRCRSCTLR19}, - {"trcrsctlr20", TRCRSCTLR20}, - {"trcrsctlr21", TRCRSCTLR21}, - {"trcrsctlr22", TRCRSCTLR22}, - {"trcrsctlr23", TRCRSCTLR23}, - {"trcrsctlr24", TRCRSCTLR24}, - {"trcrsctlr25", TRCRSCTLR25}, - {"trcrsctlr26", TRCRSCTLR26}, - {"trcrsctlr27", TRCRSCTLR27}, - {"trcrsctlr28", TRCRSCTLR28}, - {"trcrsctlr29", TRCRSCTLR29}, - {"trcrsctlr30", TRCRSCTLR30}, - {"trcrsctlr31", TRCRSCTLR31}, - {"trcssccr0", TRCSSCCR0}, - {"trcssccr1", TRCSSCCR1}, - {"trcssccr2", TRCSSCCR2}, - {"trcssccr3", TRCSSCCR3}, - {"trcssccr4", TRCSSCCR4}, - {"trcssccr5", TRCSSCCR5}, - {"trcssccr6", TRCSSCCR6}, - {"trcssccr7", TRCSSCCR7}, - {"trcsscsr0", TRCSSCSR0}, - {"trcsscsr1", TRCSSCSR1}, - {"trcsscsr2", TRCSSCSR2}, - {"trcsscsr3", TRCSSCSR3}, - {"trcsscsr4", TRCSSCSR4}, - {"trcsscsr5", TRCSSCSR5}, - {"trcsscsr6", TRCSSCSR6}, - {"trcsscsr7", TRCSSCSR7}, - {"trcsspcicr0", TRCSSPCICR0}, - {"trcsspcicr1", TRCSSPCICR1}, - {"trcsspcicr2", TRCSSPCICR2}, - {"trcsspcicr3", TRCSSPCICR3}, - {"trcsspcicr4", TRCSSPCICR4}, - {"trcsspcicr5", TRCSSPCICR5}, - {"trcsspcicr6", TRCSSPCICR6}, - {"trcsspcicr7", TRCSSPCICR7}, - {"trcpdcr", TRCPDCR}, - {"trcacvr0", TRCACVR0}, - {"trcacvr1", TRCACVR1}, - {"trcacvr2", TRCACVR2}, - {"trcacvr3", TRCACVR3}, - {"trcacvr4", TRCACVR4}, - {"trcacvr5", TRCACVR5}, - {"trcacvr6", TRCACVR6}, - {"trcacvr7", TRCACVR7}, - {"trcacvr8", TRCACVR8}, - {"trcacvr9", TRCACVR9}, - {"trcacvr10", TRCACVR10}, - {"trcacvr11", TRCACVR11}, - {"trcacvr12", TRCACVR12}, - {"trcacvr13", TRCACVR13}, - {"trcacvr14", TRCACVR14}, - {"trcacvr15", TRCACVR15}, - {"trcacatr0", TRCACATR0}, - {"trcacatr1", TRCACATR1}, - {"trcacatr2", TRCACATR2}, - {"trcacatr3", TRCACATR3}, - {"trcacatr4", TRCACATR4}, - {"trcacatr5", TRCACATR5}, - {"trcacatr6", TRCACATR6}, - {"trcacatr7", TRCACATR7}, - {"trcacatr8", TRCACATR8}, - {"trcacatr9", TRCACATR9}, - {"trcacatr10", TRCACATR10}, - {"trcacatr11", TRCACATR11}, - {"trcacatr12", TRCACATR12}, - {"trcacatr13", TRCACATR13}, - {"trcacatr14", TRCACATR14}, - {"trcacatr15", TRCACATR15}, - {"trcdvcvr0", TRCDVCVR0}, - {"trcdvcvr1", TRCDVCVR1}, - {"trcdvcvr2", TRCDVCVR2}, - {"trcdvcvr3", TRCDVCVR3}, - {"trcdvcvr4", TRCDVCVR4}, - {"trcdvcvr5", TRCDVCVR5}, - {"trcdvcvr6", TRCDVCVR6}, - {"trcdvcvr7", TRCDVCVR7}, - {"trcdvcmr0", TRCDVCMR0}, - {"trcdvcmr1", TRCDVCMR1}, - {"trcdvcmr2", TRCDVCMR2}, - {"trcdvcmr3", TRCDVCMR3}, - {"trcdvcmr4", TRCDVCMR4}, - {"trcdvcmr5", 
TRCDVCMR5}, - {"trcdvcmr6", TRCDVCMR6}, - {"trcdvcmr7", TRCDVCMR7}, - {"trccidcvr0", TRCCIDCVR0}, - {"trccidcvr1", TRCCIDCVR1}, - {"trccidcvr2", TRCCIDCVR2}, - {"trccidcvr3", TRCCIDCVR3}, - {"trccidcvr4", TRCCIDCVR4}, - {"trccidcvr5", TRCCIDCVR5}, - {"trccidcvr6", TRCCIDCVR6}, - {"trccidcvr7", TRCCIDCVR7}, - {"trcvmidcvr0", TRCVMIDCVR0}, - {"trcvmidcvr1", TRCVMIDCVR1}, - {"trcvmidcvr2", TRCVMIDCVR2}, - {"trcvmidcvr3", TRCVMIDCVR3}, - {"trcvmidcvr4", TRCVMIDCVR4}, - {"trcvmidcvr5", TRCVMIDCVR5}, - {"trcvmidcvr6", TRCVMIDCVR6}, - {"trcvmidcvr7", TRCVMIDCVR7}, - {"trccidcctlr0", TRCCIDCCTLR0}, - {"trccidcctlr1", TRCCIDCCTLR1}, - {"trcvmidcctlr0", TRCVMIDCCTLR0}, - {"trcvmidcctlr1", TRCVMIDCCTLR1}, - {"trcitctrl", TRCITCTRL}, - {"trcclaimset", TRCCLAIMSET}, - {"trcclaimclr", TRCCLAIMCLR}, - - // GICv3 registers - {"icc_bpr1_el1", ICC_BPR1_EL1}, - {"icc_bpr0_el1", ICC_BPR0_EL1}, - {"icc_pmr_el1", ICC_PMR_EL1}, - {"icc_ctlr_el1", ICC_CTLR_EL1}, - {"icc_ctlr_el3", ICC_CTLR_EL3}, - {"icc_sre_el1", ICC_SRE_EL1}, - {"icc_sre_el2", ICC_SRE_EL2}, - {"icc_sre_el3", ICC_SRE_EL3}, - {"icc_igrpen0_el1", ICC_IGRPEN0_EL1}, - {"icc_igrpen1_el1", ICC_IGRPEN1_EL1}, - {"icc_igrpen1_el3", ICC_IGRPEN1_EL3}, - {"icc_seien_el1", ICC_SEIEN_EL1}, - {"icc_ap0r0_el1", ICC_AP0R0_EL1}, - {"icc_ap0r1_el1", ICC_AP0R1_EL1}, - {"icc_ap0r2_el1", ICC_AP0R2_EL1}, - {"icc_ap0r3_el1", ICC_AP0R3_EL1}, - {"icc_ap1r0_el1", ICC_AP1R0_EL1}, - {"icc_ap1r1_el1", ICC_AP1R1_EL1}, - {"icc_ap1r2_el1", ICC_AP1R2_EL1}, - {"icc_ap1r3_el1", ICC_AP1R3_EL1}, - {"ich_ap0r0_el2", ICH_AP0R0_EL2}, - {"ich_ap0r1_el2", ICH_AP0R1_EL2}, - {"ich_ap0r2_el2", ICH_AP0R2_EL2}, - {"ich_ap0r3_el2", ICH_AP0R3_EL2}, - {"ich_ap1r0_el2", ICH_AP1R0_EL2}, - {"ich_ap1r1_el2", ICH_AP1R1_EL2}, - {"ich_ap1r2_el2", ICH_AP1R2_EL2}, - {"ich_ap1r3_el2", ICH_AP1R3_EL2}, - {"ich_hcr_el2", ICH_HCR_EL2}, - {"ich_misr_el2", ICH_MISR_EL2}, - {"ich_vmcr_el2", ICH_VMCR_EL2}, - {"ich_vseir_el2", ICH_VSEIR_EL2}, - {"ich_lr0_el2", ICH_LR0_EL2}, - {"ich_lr1_el2", ICH_LR1_EL2}, - {"ich_lr2_el2", ICH_LR2_EL2}, - {"ich_lr3_el2", ICH_LR3_EL2}, - {"ich_lr4_el2", ICH_LR4_EL2}, - {"ich_lr5_el2", ICH_LR5_EL2}, - {"ich_lr6_el2", ICH_LR6_EL2}, - {"ich_lr7_el2", ICH_LR7_EL2}, - {"ich_lr8_el2", ICH_LR8_EL2}, - {"ich_lr9_el2", ICH_LR9_EL2}, - {"ich_lr10_el2", ICH_LR10_EL2}, - {"ich_lr11_el2", ICH_LR11_EL2}, - {"ich_lr12_el2", ICH_LR12_EL2}, - {"ich_lr13_el2", ICH_LR13_EL2}, - {"ich_lr14_el2", ICH_LR14_EL2}, - {"ich_lr15_el2", ICH_LR15_EL2} -}; - -const ARM64NamedImmMapper::Mapping -ARM64SysReg::SysRegMapper::CycloneSysRegPairs[] = { - {"cpm_ioacc_ctl_el3", CPM_IOACC_CTL_EL3} -}; - -uint32_t -ARM64SysReg::SysRegMapper::fromString(StringRef Name, bool &Valid) const { - std::string NameLower = Name.lower(); - - // First search the registers shared by all - for (unsigned i = 0; i < array_lengthof(SysRegPairs); ++i) { - if (SysRegPairs[i].Name == NameLower) { - Valid = true; - return SysRegPairs[i].Value; - } - } - - // Next search for target specific registers - if (FeatureBits & ARM64::ProcCyclone) { - for (unsigned i = 0; i < array_lengthof(CycloneSysRegPairs); ++i) { - if (CycloneSysRegPairs[i].Name == NameLower) { - Valid = true; - return CycloneSysRegPairs[i].Value; - } - } - } - - // Now try the instruction-specific registers (either read-only or - // write-only). 
- for (unsigned i = 0; i < NumInstPairs; ++i) { - if (InstPairs[i].Name == NameLower) { - Valid = true; - return InstPairs[i].Value; - } - } - - // Try to parse an S____ register name, where the bits - // are: 11 xxx 1x11 xxxx xxx - Regex GenericRegPattern("^s3_([0-7])_c(1[15])_c([0-9]|1[0-5])_([0-7])$"); - - SmallVector Ops; - if (!GenericRegPattern.match(NameLower, &Ops)) { - Valid = false; - return -1; - } - - uint32_t Op0 = 3, Op1 = 0, CRn = 0, CRm = 0, Op2 = 0; - uint32_t Bits; - Ops[1].getAsInteger(10, Op1); - Ops[2].getAsInteger(10, CRn); - Ops[3].getAsInteger(10, CRm); - Ops[4].getAsInteger(10, Op2); - Bits = (Op0 << 14) | (Op1 << 11) | (CRn << 7) | (CRm << 3) | Op2; - - Valid = true; - return Bits; -} - -std::string -ARM64SysReg::SysRegMapper::toString(uint32_t Bits, bool &Valid) const { - // First search the registers shared by all - for (unsigned i = 0; i < array_lengthof(SysRegPairs); ++i) { - if (SysRegPairs[i].Value == Bits) { - Valid = true; - return SysRegPairs[i].Name; - } - } - - // Next search for target specific registers - if (FeatureBits & ARM64::ProcCyclone) { - for (unsigned i = 0; i < array_lengthof(CycloneSysRegPairs); ++i) { - if (CycloneSysRegPairs[i].Value == Bits) { - Valid = true; - return CycloneSysRegPairs[i].Name; - } - } - } - - // Now try the instruction-specific registers (either read-only or - // write-only). - for (unsigned i = 0; i < NumInstPairs; ++i) { - if (InstPairs[i].Value == Bits) { - Valid = true; - return InstPairs[i].Name; - } - } - - uint32_t Op0 = (Bits >> 14) & 0x3; - uint32_t Op1 = (Bits >> 11) & 0x7; - uint32_t CRn = (Bits >> 7) & 0xf; - uint32_t CRm = (Bits >> 3) & 0xf; - uint32_t Op2 = Bits & 0x7; - - // Only combinations matching: 11 xxx 1x11 xxxx xxx are valid for a generic - // name. - if (Op0 != 3 || (CRn != 11 && CRn != 15)) { - Valid = false; - return ""; - } - - assert(Op0 == 3 && (CRn == 11 || CRn == 15) && "Invalid generic sysreg"); - - Valid = true; - return "s3_" + utostr(Op1) + "_c" + utostr(CRn) - + "_c" + utostr(CRm) + "_" + utostr(Op2); -} - -const ARM64NamedImmMapper::Mapping ARM64TLBI::TLBIMapper::TLBIPairs[] = { - {"ipas2e1is", IPAS2E1IS}, - {"ipas2le1is", IPAS2LE1IS}, - {"vmalle1is", VMALLE1IS}, - {"alle2is", ALLE2IS}, - {"alle3is", ALLE3IS}, - {"vae1is", VAE1IS}, - {"vae2is", VAE2IS}, - {"vae3is", VAE3IS}, - {"aside1is", ASIDE1IS}, - {"vaae1is", VAAE1IS}, - {"alle1is", ALLE1IS}, - {"vale1is", VALE1IS}, - {"vale2is", VALE2IS}, - {"vale3is", VALE3IS}, - {"vmalls12e1is", VMALLS12E1IS}, - {"vaale1is", VAALE1IS}, - {"ipas2e1", IPAS2E1}, - {"ipas2le1", IPAS2LE1}, - {"vmalle1", VMALLE1}, - {"alle2", ALLE2}, - {"alle3", ALLE3}, - {"vae1", VAE1}, - {"vae2", VAE2}, - {"vae3", VAE3}, - {"aside1", ASIDE1}, - {"vaae1", VAAE1}, - {"alle1", ALLE1}, - {"vale1", VALE1}, - {"vale2", VALE2}, - {"vale3", VALE3}, - {"vmalls12e1", VMALLS12E1}, - {"vaale1", VAALE1} -}; - -ARM64TLBI::TLBIMapper::TLBIMapper() - : ARM64NamedImmMapper(TLBIPairs, 0) {} diff --git a/lib/Target/ARM64/Utils/ARM64BaseInfo.h b/lib/Target/ARM64/Utils/ARM64BaseInfo.h deleted file mode 100644 index 8075d6b37c9..00000000000 --- a/lib/Target/ARM64/Utils/ARM64BaseInfo.h +++ /dev/null @@ -1,1294 +0,0 @@ -//===-- ARM64BaseInfo.h - Top level definitions for ARM64 -------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
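[Editor's sketch] When a name misses every table, SysRegMapper::fromString above falls back to the generic s<op0>_<op1>_c<CRn>_c<CRm>_<op2> spelling (with op0 fixed at 3 and CRn restricted to 11 or 15) and packs the five fields as (Op0<<14) | (Op1<<11) | (CRn<<7) | (CRm<<3) | Op2; toString reverses the same layout. A standalone sketch with one worked value (the helper name is illustrative):

    #include <cassert>
    #include <cstdint>

    // Field packing used by the generic system-register spelling above.
    static uint32_t encodeSysReg(uint32_t Op0, uint32_t Op1, uint32_t CRn,
                                 uint32_t CRm, uint32_t Op2) {
      return (Op0 << 14) | (Op1 << 11) | (CRn << 7) | (CRm << 3) | Op2;
    }

    int main() {
      // "s3_0_c15_c7_2" packs to 0xc7ba; toString prints such a value back in
      // the same generic form unless it happens to match a named entry.
      assert(encodeSysReg(3, 0, 15, 7, 2) == 0xc7ba);
      return 0;
    }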
-// -//===----------------------------------------------------------------------===// -// -// This file contains small standalone helper functions and enum definitions for -// the ARM64 target useful for the compiler back-end and the MC libraries. -// As such, it deliberately does not include references to LLVM core -// code gen types, passes, etc.. -// -//===----------------------------------------------------------------------===// - -#ifndef ARM64BASEINFO_H -#define ARM64BASEINFO_H - -// FIXME: Is it easiest to fix this layering violation by moving the .inc -// #includes from ARM64MCTargetDesc.h to here? -#include "MCTargetDesc/ARM64MCTargetDesc.h" // For ARM64::X0 and friends. -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/StringSwitch.h" -#include "llvm/Support/ErrorHandling.h" - -namespace llvm { - -inline static unsigned getWRegFromXReg(unsigned Reg) { - switch (Reg) { - case ARM64::X0: return ARM64::W0; - case ARM64::X1: return ARM64::W1; - case ARM64::X2: return ARM64::W2; - case ARM64::X3: return ARM64::W3; - case ARM64::X4: return ARM64::W4; - case ARM64::X5: return ARM64::W5; - case ARM64::X6: return ARM64::W6; - case ARM64::X7: return ARM64::W7; - case ARM64::X8: return ARM64::W8; - case ARM64::X9: return ARM64::W9; - case ARM64::X10: return ARM64::W10; - case ARM64::X11: return ARM64::W11; - case ARM64::X12: return ARM64::W12; - case ARM64::X13: return ARM64::W13; - case ARM64::X14: return ARM64::W14; - case ARM64::X15: return ARM64::W15; - case ARM64::X16: return ARM64::W16; - case ARM64::X17: return ARM64::W17; - case ARM64::X18: return ARM64::W18; - case ARM64::X19: return ARM64::W19; - case ARM64::X20: return ARM64::W20; - case ARM64::X21: return ARM64::W21; - case ARM64::X22: return ARM64::W22; - case ARM64::X23: return ARM64::W23; - case ARM64::X24: return ARM64::W24; - case ARM64::X25: return ARM64::W25; - case ARM64::X26: return ARM64::W26; - case ARM64::X27: return ARM64::W27; - case ARM64::X28: return ARM64::W28; - case ARM64::FP: return ARM64::W29; - case ARM64::LR: return ARM64::W30; - case ARM64::SP: return ARM64::WSP; - case ARM64::XZR: return ARM64::WZR; - } - // For anything else, return it unchanged. - return Reg; -} - -inline static unsigned getXRegFromWReg(unsigned Reg) { - switch (Reg) { - case ARM64::W0: return ARM64::X0; - case ARM64::W1: return ARM64::X1; - case ARM64::W2: return ARM64::X2; - case ARM64::W3: return ARM64::X3; - case ARM64::W4: return ARM64::X4; - case ARM64::W5: return ARM64::X5; - case ARM64::W6: return ARM64::X6; - case ARM64::W7: return ARM64::X7; - case ARM64::W8: return ARM64::X8; - case ARM64::W9: return ARM64::X9; - case ARM64::W10: return ARM64::X10; - case ARM64::W11: return ARM64::X11; - case ARM64::W12: return ARM64::X12; - case ARM64::W13: return ARM64::X13; - case ARM64::W14: return ARM64::X14; - case ARM64::W15: return ARM64::X15; - case ARM64::W16: return ARM64::X16; - case ARM64::W17: return ARM64::X17; - case ARM64::W18: return ARM64::X18; - case ARM64::W19: return ARM64::X19; - case ARM64::W20: return ARM64::X20; - case ARM64::W21: return ARM64::X21; - case ARM64::W22: return ARM64::X22; - case ARM64::W23: return ARM64::X23; - case ARM64::W24: return ARM64::X24; - case ARM64::W25: return ARM64::X25; - case ARM64::W26: return ARM64::X26; - case ARM64::W27: return ARM64::X27; - case ARM64::W28: return ARM64::X28; - case ARM64::W29: return ARM64::FP; - case ARM64::W30: return ARM64::LR; - case ARM64::WSP: return ARM64::SP; - case ARM64::WZR: return ARM64::XZR; - } - // For anything else, return it unchanged. 
- return Reg; -} - -static inline unsigned getBRegFromDReg(unsigned Reg) { - switch (Reg) { - case ARM64::D0: return ARM64::B0; - case ARM64::D1: return ARM64::B1; - case ARM64::D2: return ARM64::B2; - case ARM64::D3: return ARM64::B3; - case ARM64::D4: return ARM64::B4; - case ARM64::D5: return ARM64::B5; - case ARM64::D6: return ARM64::B6; - case ARM64::D7: return ARM64::B7; - case ARM64::D8: return ARM64::B8; - case ARM64::D9: return ARM64::B9; - case ARM64::D10: return ARM64::B10; - case ARM64::D11: return ARM64::B11; - case ARM64::D12: return ARM64::B12; - case ARM64::D13: return ARM64::B13; - case ARM64::D14: return ARM64::B14; - case ARM64::D15: return ARM64::B15; - case ARM64::D16: return ARM64::B16; - case ARM64::D17: return ARM64::B17; - case ARM64::D18: return ARM64::B18; - case ARM64::D19: return ARM64::B19; - case ARM64::D20: return ARM64::B20; - case ARM64::D21: return ARM64::B21; - case ARM64::D22: return ARM64::B22; - case ARM64::D23: return ARM64::B23; - case ARM64::D24: return ARM64::B24; - case ARM64::D25: return ARM64::B25; - case ARM64::D26: return ARM64::B26; - case ARM64::D27: return ARM64::B27; - case ARM64::D28: return ARM64::B28; - case ARM64::D29: return ARM64::B29; - case ARM64::D30: return ARM64::B30; - case ARM64::D31: return ARM64::B31; - } - // For anything else, return it unchanged. - return Reg; -} - - -static inline unsigned getDRegFromBReg(unsigned Reg) { - switch (Reg) { - case ARM64::B0: return ARM64::D0; - case ARM64::B1: return ARM64::D1; - case ARM64::B2: return ARM64::D2; - case ARM64::B3: return ARM64::D3; - case ARM64::B4: return ARM64::D4; - case ARM64::B5: return ARM64::D5; - case ARM64::B6: return ARM64::D6; - case ARM64::B7: return ARM64::D7; - case ARM64::B8: return ARM64::D8; - case ARM64::B9: return ARM64::D9; - case ARM64::B10: return ARM64::D10; - case ARM64::B11: return ARM64::D11; - case ARM64::B12: return ARM64::D12; - case ARM64::B13: return ARM64::D13; - case ARM64::B14: return ARM64::D14; - case ARM64::B15: return ARM64::D15; - case ARM64::B16: return ARM64::D16; - case ARM64::B17: return ARM64::D17; - case ARM64::B18: return ARM64::D18; - case ARM64::B19: return ARM64::D19; - case ARM64::B20: return ARM64::D20; - case ARM64::B21: return ARM64::D21; - case ARM64::B22: return ARM64::D22; - case ARM64::B23: return ARM64::D23; - case ARM64::B24: return ARM64::D24; - case ARM64::B25: return ARM64::D25; - case ARM64::B26: return ARM64::D26; - case ARM64::B27: return ARM64::D27; - case ARM64::B28: return ARM64::D28; - case ARM64::B29: return ARM64::D29; - case ARM64::B30: return ARM64::D30; - case ARM64::B31: return ARM64::D31; - } - // For anything else, return it unchanged. - return Reg; -} - -namespace ARM64CC { - -// The CondCodes constants map directly to the 4-bit encoding of the condition -// field for predicated instructions. 
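[Editor's sketch] The getWRegFromXReg, getXRegFromWReg, getBRegFromDReg and getDRegFromBReg helpers above are exhaustive switches from one register class to its alias in another width, with an identity fallback for anything that has no counterpart. A miniature standalone sketch of that shape; the enum and its values are illustrative stand-ins, not the tablegen'd register numbers:

    #include <cassert>

    // Arbitrary stand-in register enum, not the real generated values.
    enum Reg { X0, X1, SP, XZR, NZCV, W0, W1, WSP, WZR };

    // Same shape as getWRegFromXReg above: map each 64-bit register to its
    // 32-bit alias and return anything unrecognized unchanged.
    static Reg toWReg(Reg R) {
      switch (R) {
      case X0:  return W0;
      case X1:  return W1;
      case SP:  return WSP;
      case XZR: return WZR;
      default:  return R;
      }
    }

    int main() {
      assert(toWReg(X0) == W0);
      assert(toWReg(SP) == WSP);
      assert(toWReg(NZCV) == NZCV); // identity fallback
      return 0;
    }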
-enum CondCode { // Meaning (integer) Meaning (floating-point) - EQ = 0x0, // Equal Equal - NE = 0x1, // Not equal Not equal, or unordered - HS = 0x2, // Unsigned higher or same >, ==, or unordered - LO = 0x3, // Unsigned lower Less than - MI = 0x4, // Minus, negative Less than - PL = 0x5, // Plus, positive or zero >, ==, or unordered - VS = 0x6, // Overflow Unordered - VC = 0x7, // No overflow Not unordered - HI = 0x8, // Unsigned higher Greater than, or unordered - LS = 0x9, // Unsigned lower or same Less than or equal - GE = 0xa, // Greater than or equal Greater than or equal - LT = 0xb, // Less than Less than, or unordered - GT = 0xc, // Greater than Greater than - LE = 0xd, // Less than or equal <, ==, or unordered - AL = 0xe, // Always (unconditional) Always (unconditional) - NV = 0xf, // Always (unconditional) Always (unconditional) - // Note the NV exists purely to disassemble 0b1111. Execution is "always". - Invalid -}; - -inline static const char *getCondCodeName(CondCode Code) { - switch (Code) { - default: llvm_unreachable("Unknown condition code"); - case EQ: return "eq"; - case NE: return "ne"; - case HS: return "hs"; - case LO: return "lo"; - case MI: return "mi"; - case PL: return "pl"; - case VS: return "vs"; - case VC: return "vc"; - case HI: return "hi"; - case LS: return "ls"; - case GE: return "ge"; - case LT: return "lt"; - case GT: return "gt"; - case LE: return "le"; - case AL: return "al"; - case NV: return "nv"; - } -} - -inline static CondCode getInvertedCondCode(CondCode Code) { - switch (Code) { - default: llvm_unreachable("Unknown condition code"); - case EQ: return NE; - case NE: return EQ; - case HS: return LO; - case LO: return HS; - case MI: return PL; - case PL: return MI; - case VS: return VC; - case VC: return VS; - case HI: return LS; - case LS: return HI; - case GE: return LT; - case LT: return GE; - case GT: return LE; - case LE: return GT; - } -} - -/// Given a condition code, return NZCV flags that would satisfy that condition. -/// The flag bits are in the format expected by the ccmp instructions. -/// Note that many different flag settings can satisfy a given condition code, -/// this function just returns one of them. -inline static unsigned getNZCVToSatisfyCondCode(CondCode Code) { - // NZCV flags encoded as expected by ccmp instructions, ARMv8 ISA 5.5.7. - enum { N = 8, Z = 4, C = 2, V = 1 }; - switch (Code) { - default: llvm_unreachable("Unknown condition code"); - case EQ: return Z; // Z == 1 - case NE: return 0; // Z == 0 - case HS: return C; // C == 1 - case LO: return 0; // C == 0 - case MI: return N; // N == 1 - case PL: return 0; // N == 0 - case VS: return V; // V == 1 - case VC: return 0; // V == 0 - case HI: return C; // C == 1 && Z == 0 - case LS: return 0; // C == 0 || Z == 1 - case GE: return 0; // N == V - case LT: return N; // N != V - case GT: return 0; // Z == 0 && N == V - case LE: return Z; // Z == 1 || N != V - } -} -} // end namespace ARM64CC - -/// Instances of this class can perform bidirectional mapping from random -/// identifier strings to operand encodings. For example "MSR" takes a named -/// system-register which must be encoded somehow and decoded for printing. This -/// central location means that the information for those transformations is not -/// duplicated and remains in sync. -/// -/// FIXME: currently the algorithm is a completely unoptimised linear -/// search. Obviously this could be improved, but we would probably want to work -/// out just how often these instructions are emitted before working on it. 
It -/// might even be optimal to just reorder the tables for the common instructions -/// rather than changing the algorithm. -struct ARM64NamedImmMapper { - struct Mapping { - const char *Name; - uint32_t Value; - }; - - template - ARM64NamedImmMapper(const Mapping (&Pairs)[N], uint32_t TooBigImm) - : Pairs(&Pairs[0]), NumPairs(N), TooBigImm(TooBigImm) {} - - StringRef toString(uint32_t Value, bool &Valid) const; - uint32_t fromString(StringRef Name, bool &Valid) const; - - /// Many of the instructions allow an alternative assembly form consisting of - /// a simple immediate. Currently the only valid forms are ranges [0, N) where - /// N being 0 indicates no immediate syntax-form is allowed. - bool validImm(uint32_t Value) const; -protected: - const Mapping *Pairs; - size_t NumPairs; - uint32_t TooBigImm; -}; - -namespace ARM64AT { - enum ATValues { - Invalid = -1, // Op0 Op1 CRn CRm Op2 - S1E1R = 0x43c0, // 01 000 0111 1000 000 - S1E2R = 0x63c0, // 01 100 0111 1000 000 - S1E3R = 0x73c0, // 01 110 0111 1000 000 - S1E1W = 0x43c1, // 01 000 0111 1000 001 - S1E2W = 0x63c1, // 01 100 0111 1000 001 - S1E3W = 0x73c1, // 01 110 0111 1000 001 - S1E0R = 0x43c2, // 01 000 0111 1000 010 - S1E0W = 0x43c3, // 01 000 0111 1000 011 - S12E1R = 0x63c4, // 01 100 0111 1000 100 - S12E1W = 0x63c5, // 01 100 0111 1000 101 - S12E0R = 0x63c6, // 01 100 0111 1000 110 - S12E0W = 0x63c7 // 01 100 0111 1000 111 - }; - - struct ATMapper : ARM64NamedImmMapper { - const static Mapping ATPairs[]; - - ATMapper(); - }; - -} -namespace ARM64DB { - enum DBValues { - Invalid = -1, - OSHLD = 0x1, - OSHST = 0x2, - OSH = 0x3, - NSHLD = 0x5, - NSHST = 0x6, - NSH = 0x7, - ISHLD = 0x9, - ISHST = 0xa, - ISH = 0xb, - LD = 0xd, - ST = 0xe, - SY = 0xf - }; - - struct DBarrierMapper : ARM64NamedImmMapper { - const static Mapping DBarrierPairs[]; - - DBarrierMapper(); - }; -} - -namespace ARM64DC { - enum DCValues { - Invalid = -1, // Op1 CRn CRm Op2 - ZVA = 0x5ba1, // 01 011 0111 0100 001 - IVAC = 0x43b1, // 01 000 0111 0110 001 - ISW = 0x43b2, // 01 000 0111 0110 010 - CVAC = 0x5bd1, // 01 011 0111 1010 001 - CSW = 0x43d2, // 01 000 0111 1010 010 - CVAU = 0x5bd9, // 01 011 0111 1011 001 - CIVAC = 0x5bf1, // 01 011 0111 1110 001 - CISW = 0x43f2 // 01 000 0111 1110 010 - }; - - struct DCMapper : ARM64NamedImmMapper { - const static Mapping DCPairs[]; - - DCMapper(); - }; - -} - -namespace ARM64IC { - enum ICValues { - Invalid = -1, // Op1 CRn CRm Op2 - IALLUIS = 0x0388, // 000 0111 0001 000 - IALLU = 0x03a8, // 000 0111 0101 000 - IVAU = 0x1ba9 // 011 0111 0101 001 - }; - - - struct ICMapper : ARM64NamedImmMapper { - const static Mapping ICPairs[]; - - ICMapper(); - }; - - static inline bool NeedsRegister(ICValues Val) { - return Val == IVAU; - } -} - -namespace ARM64ISB { - enum ISBValues { - Invalid = -1, - SY = 0xf - }; - struct ISBMapper : ARM64NamedImmMapper { - const static Mapping ISBPairs[]; - - ISBMapper(); - }; -} - -namespace ARM64PRFM { - enum PRFMValues { - Invalid = -1, - PLDL1KEEP = 0x00, - PLDL1STRM = 0x01, - PLDL2KEEP = 0x02, - PLDL2STRM = 0x03, - PLDL3KEEP = 0x04, - PLDL3STRM = 0x05, - PLIL1KEEP = 0x08, - PLIL1STRM = 0x09, - PLIL2KEEP = 0x0a, - PLIL2STRM = 0x0b, - PLIL3KEEP = 0x0c, - PLIL3STRM = 0x0d, - PSTL1KEEP = 0x10, - PSTL1STRM = 0x11, - PSTL2KEEP = 0x12, - PSTL2STRM = 0x13, - PSTL3KEEP = 0x14, - PSTL3STRM = 0x15 - }; - - struct PRFMMapper : ARM64NamedImmMapper { - const static Mapping PRFMPairs[]; - - PRFMMapper(); - }; -} - -namespace ARM64PState { - enum PStateValues { - Invalid = -1, - SPSel = 0x05, - DAIFSet 
= 0x1e, - DAIFClr = 0x1f - }; - - struct PStateMapper : ARM64NamedImmMapper { - const static Mapping PStatePairs[]; - - PStateMapper(); - }; - -} - -namespace ARM64SE { - enum ShiftExtSpecifiers { - Invalid = -1, - LSL, - MSL, - LSR, - ASR, - ROR, - - UXTB, - UXTH, - UXTW, - UXTX, - - SXTB, - SXTH, - SXTW, - SXTX - }; -} - -namespace ARM64Layout { - enum VectorLayout { - Invalid = -1, - VL_8B, - VL_4H, - VL_2S, - VL_1D, - - VL_16B, - VL_8H, - VL_4S, - VL_2D, - - // Bare layout for the 128-bit vector - // (only show ".b", ".h", ".s", ".d" without vector number) - VL_B, - VL_H, - VL_S, - VL_D - }; -} - -inline static const char * -ARM64VectorLayoutToString(ARM64Layout::VectorLayout Layout) { - switch (Layout) { - case ARM64Layout::VL_8B: return ".8b"; - case ARM64Layout::VL_4H: return ".4h"; - case ARM64Layout::VL_2S: return ".2s"; - case ARM64Layout::VL_1D: return ".1d"; - case ARM64Layout::VL_16B: return ".16b"; - case ARM64Layout::VL_8H: return ".8h"; - case ARM64Layout::VL_4S: return ".4s"; - case ARM64Layout::VL_2D: return ".2d"; - case ARM64Layout::VL_B: return ".b"; - case ARM64Layout::VL_H: return ".h"; - case ARM64Layout::VL_S: return ".s"; - case ARM64Layout::VL_D: return ".d"; - default: llvm_unreachable("Unknown Vector Layout"); - } -} - -inline static ARM64Layout::VectorLayout -ARM64StringToVectorLayout(StringRef LayoutStr) { - return StringSwitch(LayoutStr) - .Case(".8b", ARM64Layout::VL_8B) - .Case(".4h", ARM64Layout::VL_4H) - .Case(".2s", ARM64Layout::VL_2S) - .Case(".1d", ARM64Layout::VL_1D) - .Case(".16b", ARM64Layout::VL_16B) - .Case(".8h", ARM64Layout::VL_8H) - .Case(".4s", ARM64Layout::VL_4S) - .Case(".2d", ARM64Layout::VL_2D) - .Case(".b", ARM64Layout::VL_B) - .Case(".h", ARM64Layout::VL_H) - .Case(".s", ARM64Layout::VL_S) - .Case(".d", ARM64Layout::VL_D) - .Default(ARM64Layout::Invalid); -} - -namespace ARM64SysReg { - enum SysRegROValues { - MDCCSR_EL0 = 0x9808, // 10 011 0000 0001 000 - DBGDTRRX_EL0 = 0x9828, // 10 011 0000 0101 000 - MDRAR_EL1 = 0x8080, // 10 000 0001 0000 000 - OSLSR_EL1 = 0x808c, // 10 000 0001 0001 100 - DBGAUTHSTATUS_EL1 = 0x83f6, // 10 000 0111 1110 110 - PMCEID0_EL0 = 0xdce6, // 11 011 1001 1100 110 - PMCEID1_EL0 = 0xdce7, // 11 011 1001 1100 111 - MIDR_EL1 = 0xc000, // 11 000 0000 0000 000 - CCSIDR_EL1 = 0xc800, // 11 001 0000 0000 000 - CLIDR_EL1 = 0xc801, // 11 001 0000 0000 001 - CTR_EL0 = 0xd801, // 11 011 0000 0000 001 - MPIDR_EL1 = 0xc005, // 11 000 0000 0000 101 - REVIDR_EL1 = 0xc006, // 11 000 0000 0000 110 - AIDR_EL1 = 0xc807, // 11 001 0000 0000 111 - DCZID_EL0 = 0xd807, // 11 011 0000 0000 111 - ID_PFR0_EL1 = 0xc008, // 11 000 0000 0001 000 - ID_PFR1_EL1 = 0xc009, // 11 000 0000 0001 001 - ID_DFR0_EL1 = 0xc00a, // 11 000 0000 0001 010 - ID_AFR0_EL1 = 0xc00b, // 11 000 0000 0001 011 - ID_MMFR0_EL1 = 0xc00c, // 11 000 0000 0001 100 - ID_MMFR1_EL1 = 0xc00d, // 11 000 0000 0001 101 - ID_MMFR2_EL1 = 0xc00e, // 11 000 0000 0001 110 - ID_MMFR3_EL1 = 0xc00f, // 11 000 0000 0001 111 - ID_ISAR0_EL1 = 0xc010, // 11 000 0000 0010 000 - ID_ISAR1_EL1 = 0xc011, // 11 000 0000 0010 001 - ID_ISAR2_EL1 = 0xc012, // 11 000 0000 0010 010 - ID_ISAR3_EL1 = 0xc013, // 11 000 0000 0010 011 - ID_ISAR4_EL1 = 0xc014, // 11 000 0000 0010 100 - ID_ISAR5_EL1 = 0xc015, // 11 000 0000 0010 101 - ID_AARM64PFR0_EL1 = 0xc020, // 11 000 0000 0100 000 - ID_AARM64PFR1_EL1 = 0xc021, // 11 000 0000 0100 001 - ID_AARM64DFR0_EL1 = 0xc028, // 11 000 0000 0101 000 - ID_AARM64DFR1_EL1 = 0xc029, // 11 000 0000 0101 001 - ID_AARM64AFR0_EL1 = 0xc02c, // 11 000 0000 0101 100 - 
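(For reference while reading the removed header above: the ARM64NamedImmMapper comment describes a bidirectional name/value lookup implemented as an unoptimised linear scan, but only declares toString/fromString. The following stand-alone C++ sketch models that behaviour; it is not LLVM code, and it reuses just two encodings from the AT table above as sample pairs.)

// Minimal stand-in for the NamedImmMapper idea: a linear scan over
// (name, value) pairs, usable in both directions. The two pairs are the
// S1E1R/S1E2R encodings from the AT table above, used purely as examples.
#include <cstdint>
#include <cstring>
#include <cstdio>

struct Mapping { const char *Name; uint32_t Value; };

static const Mapping Pairs[] = {{"s1e1r", 0x43c0}, {"s1e2r", 0x63c0}};

static const char *toString(uint32_t Value, bool &Valid) {
  for (const Mapping &M : Pairs)
    if (M.Value == Value) { Valid = true; return M.Name; }
  Valid = false;
  return nullptr;
}

static uint32_t fromString(const char *Name, bool &Valid) {
  for (const Mapping &M : Pairs)
    if (std::strcmp(M.Name, Name) == 0) { Valid = true; return M.Value; }
  Valid = false;
  return 0;
}

int main() {
  bool Valid;
  std::printf("%#x\n", (unsigned)fromString("s1e1r", Valid)); // 0x43c0
  std::printf("%s\n", toString(0x63c0, Valid));               // s1e2r
  return 0;
}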
ID_AARM64AFR1_EL1 = 0xc02d, // 11 000 0000 0101 101 - ID_AARM64ISAR0_EL1 = 0xc030, // 11 000 0000 0110 000 - ID_AARM64ISAR1_EL1 = 0xc031, // 11 000 0000 0110 001 - ID_AARM64MMFR0_EL1 = 0xc038, // 11 000 0000 0111 000 - ID_AARM64MMFR1_EL1 = 0xc039, // 11 000 0000 0111 001 - MVFR0_EL1 = 0xc018, // 11 000 0000 0011 000 - MVFR1_EL1 = 0xc019, // 11 000 0000 0011 001 - MVFR2_EL1 = 0xc01a, // 11 000 0000 0011 010 - RVBAR_EL1 = 0xc601, // 11 000 1100 0000 001 - RVBAR_EL2 = 0xe601, // 11 100 1100 0000 001 - RVBAR_EL3 = 0xf601, // 11 110 1100 0000 001 - ISR_EL1 = 0xc608, // 11 000 1100 0001 000 - CNTPCT_EL0 = 0xdf01, // 11 011 1110 0000 001 - CNTVCT_EL0 = 0xdf02, // 11 011 1110 0000 010 - - // Trace registers - TRCSTATR = 0x8818, // 10 001 0000 0011 000 - TRCIDR8 = 0x8806, // 10 001 0000 0000 110 - TRCIDR9 = 0x880e, // 10 001 0000 0001 110 - TRCIDR10 = 0x8816, // 10 001 0000 0010 110 - TRCIDR11 = 0x881e, // 10 001 0000 0011 110 - TRCIDR12 = 0x8826, // 10 001 0000 0100 110 - TRCIDR13 = 0x882e, // 10 001 0000 0101 110 - TRCIDR0 = 0x8847, // 10 001 0000 1000 111 - TRCIDR1 = 0x884f, // 10 001 0000 1001 111 - TRCIDR2 = 0x8857, // 10 001 0000 1010 111 - TRCIDR3 = 0x885f, // 10 001 0000 1011 111 - TRCIDR4 = 0x8867, // 10 001 0000 1100 111 - TRCIDR5 = 0x886f, // 10 001 0000 1101 111 - TRCIDR6 = 0x8877, // 10 001 0000 1110 111 - TRCIDR7 = 0x887f, // 10 001 0000 1111 111 - TRCOSLSR = 0x888c, // 10 001 0001 0001 100 - TRCPDSR = 0x88ac, // 10 001 0001 0101 100 - TRCDEVAFF0 = 0x8bd6, // 10 001 0111 1010 110 - TRCDEVAFF1 = 0x8bde, // 10 001 0111 1011 110 - TRCLSR = 0x8bee, // 10 001 0111 1101 110 - TRCAUTHSTATUS = 0x8bf6, // 10 001 0111 1110 110 - TRCDEVARCH = 0x8bfe, // 10 001 0111 1111 110 - TRCDEVID = 0x8b97, // 10 001 0111 0010 111 - TRCDEVTYPE = 0x8b9f, // 10 001 0111 0011 111 - TRCPIDR4 = 0x8ba7, // 10 001 0111 0100 111 - TRCPIDR5 = 0x8baf, // 10 001 0111 0101 111 - TRCPIDR6 = 0x8bb7, // 10 001 0111 0110 111 - TRCPIDR7 = 0x8bbf, // 10 001 0111 0111 111 - TRCPIDR0 = 0x8bc7, // 10 001 0111 1000 111 - TRCPIDR1 = 0x8bcf, // 10 001 0111 1001 111 - TRCPIDR2 = 0x8bd7, // 10 001 0111 1010 111 - TRCPIDR3 = 0x8bdf, // 10 001 0111 1011 111 - TRCCIDR0 = 0x8be7, // 10 001 0111 1100 111 - TRCCIDR1 = 0x8bef, // 10 001 0111 1101 111 - TRCCIDR2 = 0x8bf7, // 10 001 0111 1110 111 - TRCCIDR3 = 0x8bff, // 10 001 0111 1111 111 - - // GICv3 registers - ICC_IAR1_EL1 = 0xc660, // 11 000 1100 1100 000 - ICC_IAR0_EL1 = 0xc640, // 11 000 1100 1000 000 - ICC_HPPIR1_EL1 = 0xc662, // 11 000 1100 1100 010 - ICC_HPPIR0_EL1 = 0xc642, // 11 000 1100 1000 010 - ICC_RPR_EL1 = 0xc65b, // 11 000 1100 1011 011 - ICH_VTR_EL2 = 0xe659, // 11 100 1100 1011 001 - ICH_EISR_EL2 = 0xe65b, // 11 100 1100 1011 011 - ICH_ELSR_EL2 = 0xe65d // 11 100 1100 1011 101 - }; - - enum SysRegWOValues { - DBGDTRTX_EL0 = 0x9828, // 10 011 0000 0101 000 - OSLAR_EL1 = 0x8084, // 10 000 0001 0000 100 - PMSWINC_EL0 = 0xdce4, // 11 011 1001 1100 100 - - // Trace Registers - TRCOSLAR = 0x8884, // 10 001 0001 0000 100 - TRCLAR = 0x8be6, // 10 001 0111 1100 110 - - // GICv3 registers - ICC_EOIR1_EL1 = 0xc661, // 11 000 1100 1100 001 - ICC_EOIR0_EL1 = 0xc641, // 11 000 1100 1000 001 - ICC_DIR_EL1 = 0xc659, // 11 000 1100 1011 001 - ICC_SGI1R_EL1 = 0xc65d, // 11 000 1100 1011 101 - ICC_ASGI1R_EL1 = 0xc65e, // 11 000 1100 1011 110 - ICC_SGI0R_EL1 = 0xc65f // 11 000 1100 1011 111 - }; - - enum SysRegValues { - Invalid = -1, // Op0 Op1 CRn CRm Op2 - OSDTRRX_EL1 = 0x8002, // 10 000 0000 0000 010 - OSDTRTX_EL1 = 0x801a, // 10 000 0000 0011 010 - TEECR32_EL1 = 0x9000, // 10 010 0000 
0000 000 - MDCCINT_EL1 = 0x8010, // 10 000 0000 0010 000 - MDSCR_EL1 = 0x8012, // 10 000 0000 0010 010 - DBGDTR_EL0 = 0x9820, // 10 011 0000 0100 000 - OSECCR_EL1 = 0x8032, // 10 000 0000 0110 010 - DBGVCR32_EL2 = 0xa038, // 10 100 0000 0111 000 - DBGBVR0_EL1 = 0x8004, // 10 000 0000 0000 100 - DBGBVR1_EL1 = 0x800c, // 10 000 0000 0001 100 - DBGBVR2_EL1 = 0x8014, // 10 000 0000 0010 100 - DBGBVR3_EL1 = 0x801c, // 10 000 0000 0011 100 - DBGBVR4_EL1 = 0x8024, // 10 000 0000 0100 100 - DBGBVR5_EL1 = 0x802c, // 10 000 0000 0101 100 - DBGBVR6_EL1 = 0x8034, // 10 000 0000 0110 100 - DBGBVR7_EL1 = 0x803c, // 10 000 0000 0111 100 - DBGBVR8_EL1 = 0x8044, // 10 000 0000 1000 100 - DBGBVR9_EL1 = 0x804c, // 10 000 0000 1001 100 - DBGBVR10_EL1 = 0x8054, // 10 000 0000 1010 100 - DBGBVR11_EL1 = 0x805c, // 10 000 0000 1011 100 - DBGBVR12_EL1 = 0x8064, // 10 000 0000 1100 100 - DBGBVR13_EL1 = 0x806c, // 10 000 0000 1101 100 - DBGBVR14_EL1 = 0x8074, // 10 000 0000 1110 100 - DBGBVR15_EL1 = 0x807c, // 10 000 0000 1111 100 - DBGBCR0_EL1 = 0x8005, // 10 000 0000 0000 101 - DBGBCR1_EL1 = 0x800d, // 10 000 0000 0001 101 - DBGBCR2_EL1 = 0x8015, // 10 000 0000 0010 101 - DBGBCR3_EL1 = 0x801d, // 10 000 0000 0011 101 - DBGBCR4_EL1 = 0x8025, // 10 000 0000 0100 101 - DBGBCR5_EL1 = 0x802d, // 10 000 0000 0101 101 - DBGBCR6_EL1 = 0x8035, // 10 000 0000 0110 101 - DBGBCR7_EL1 = 0x803d, // 10 000 0000 0111 101 - DBGBCR8_EL1 = 0x8045, // 10 000 0000 1000 101 - DBGBCR9_EL1 = 0x804d, // 10 000 0000 1001 101 - DBGBCR10_EL1 = 0x8055, // 10 000 0000 1010 101 - DBGBCR11_EL1 = 0x805d, // 10 000 0000 1011 101 - DBGBCR12_EL1 = 0x8065, // 10 000 0000 1100 101 - DBGBCR13_EL1 = 0x806d, // 10 000 0000 1101 101 - DBGBCR14_EL1 = 0x8075, // 10 000 0000 1110 101 - DBGBCR15_EL1 = 0x807d, // 10 000 0000 1111 101 - DBGWVR0_EL1 = 0x8006, // 10 000 0000 0000 110 - DBGWVR1_EL1 = 0x800e, // 10 000 0000 0001 110 - DBGWVR2_EL1 = 0x8016, // 10 000 0000 0010 110 - DBGWVR3_EL1 = 0x801e, // 10 000 0000 0011 110 - DBGWVR4_EL1 = 0x8026, // 10 000 0000 0100 110 - DBGWVR5_EL1 = 0x802e, // 10 000 0000 0101 110 - DBGWVR6_EL1 = 0x8036, // 10 000 0000 0110 110 - DBGWVR7_EL1 = 0x803e, // 10 000 0000 0111 110 - DBGWVR8_EL1 = 0x8046, // 10 000 0000 1000 110 - DBGWVR9_EL1 = 0x804e, // 10 000 0000 1001 110 - DBGWVR10_EL1 = 0x8056, // 10 000 0000 1010 110 - DBGWVR11_EL1 = 0x805e, // 10 000 0000 1011 110 - DBGWVR12_EL1 = 0x8066, // 10 000 0000 1100 110 - DBGWVR13_EL1 = 0x806e, // 10 000 0000 1101 110 - DBGWVR14_EL1 = 0x8076, // 10 000 0000 1110 110 - DBGWVR15_EL1 = 0x807e, // 10 000 0000 1111 110 - DBGWCR0_EL1 = 0x8007, // 10 000 0000 0000 111 - DBGWCR1_EL1 = 0x800f, // 10 000 0000 0001 111 - DBGWCR2_EL1 = 0x8017, // 10 000 0000 0010 111 - DBGWCR3_EL1 = 0x801f, // 10 000 0000 0011 111 - DBGWCR4_EL1 = 0x8027, // 10 000 0000 0100 111 - DBGWCR5_EL1 = 0x802f, // 10 000 0000 0101 111 - DBGWCR6_EL1 = 0x8037, // 10 000 0000 0110 111 - DBGWCR7_EL1 = 0x803f, // 10 000 0000 0111 111 - DBGWCR8_EL1 = 0x8047, // 10 000 0000 1000 111 - DBGWCR9_EL1 = 0x804f, // 10 000 0000 1001 111 - DBGWCR10_EL1 = 0x8057, // 10 000 0000 1010 111 - DBGWCR11_EL1 = 0x805f, // 10 000 0000 1011 111 - DBGWCR12_EL1 = 0x8067, // 10 000 0000 1100 111 - DBGWCR13_EL1 = 0x806f, // 10 000 0000 1101 111 - DBGWCR14_EL1 = 0x8077, // 10 000 0000 1110 111 - DBGWCR15_EL1 = 0x807f, // 10 000 0000 1111 111 - TEEHBR32_EL1 = 0x9080, // 10 010 0001 0000 000 - OSDLR_EL1 = 0x809c, // 10 000 0001 0011 100 - DBGPRCR_EL1 = 0x80a4, // 10 000 0001 0100 100 - DBGCLAIMSET_EL1 = 0x83c6, // 10 000 0111 1000 110 - DBGCLAIMCLR_EL1 
= 0x83ce, // 10 000 0111 1001 110 - CSSELR_EL1 = 0xd000, // 11 010 0000 0000 000 - VPIDR_EL2 = 0xe000, // 11 100 0000 0000 000 - VMPIDR_EL2 = 0xe005, // 11 100 0000 0000 101 - CPACR_EL1 = 0xc082, // 11 000 0001 0000 010 - SCTLR_EL1 = 0xc080, // 11 000 0001 0000 000 - SCTLR_EL2 = 0xe080, // 11 100 0001 0000 000 - SCTLR_EL3 = 0xf080, // 11 110 0001 0000 000 - ACTLR_EL1 = 0xc081, // 11 000 0001 0000 001 - ACTLR_EL2 = 0xe081, // 11 100 0001 0000 001 - ACTLR_EL3 = 0xf081, // 11 110 0001 0000 001 - HCR_EL2 = 0xe088, // 11 100 0001 0001 000 - SCR_EL3 = 0xf088, // 11 110 0001 0001 000 - MDCR_EL2 = 0xe089, // 11 100 0001 0001 001 - SDER32_EL3 = 0xf089, // 11 110 0001 0001 001 - CPTR_EL2 = 0xe08a, // 11 100 0001 0001 010 - CPTR_EL3 = 0xf08a, // 11 110 0001 0001 010 - HSTR_EL2 = 0xe08b, // 11 100 0001 0001 011 - HACR_EL2 = 0xe08f, // 11 100 0001 0001 111 - MDCR_EL3 = 0xf099, // 11 110 0001 0011 001 - TTBR0_EL1 = 0xc100, // 11 000 0010 0000 000 - TTBR0_EL2 = 0xe100, // 11 100 0010 0000 000 - TTBR0_EL3 = 0xf100, // 11 110 0010 0000 000 - TTBR1_EL1 = 0xc101, // 11 000 0010 0000 001 - TCR_EL1 = 0xc102, // 11 000 0010 0000 010 - TCR_EL2 = 0xe102, // 11 100 0010 0000 010 - TCR_EL3 = 0xf102, // 11 110 0010 0000 010 - VTTBR_EL2 = 0xe108, // 11 100 0010 0001 000 - VTCR_EL2 = 0xe10a, // 11 100 0010 0001 010 - DACR32_EL2 = 0xe180, // 11 100 0011 0000 000 - SPSR_EL1 = 0xc200, // 11 000 0100 0000 000 - SPSR_EL2 = 0xe200, // 11 100 0100 0000 000 - SPSR_EL3 = 0xf200, // 11 110 0100 0000 000 - ELR_EL1 = 0xc201, // 11 000 0100 0000 001 - ELR_EL2 = 0xe201, // 11 100 0100 0000 001 - ELR_EL3 = 0xf201, // 11 110 0100 0000 001 - SP_EL0 = 0xc208, // 11 000 0100 0001 000 - SP_EL1 = 0xe208, // 11 100 0100 0001 000 - SP_EL2 = 0xf208, // 11 110 0100 0001 000 - SPSel = 0xc210, // 11 000 0100 0010 000 - NZCV = 0xda10, // 11 011 0100 0010 000 - DAIF = 0xda11, // 11 011 0100 0010 001 - CurrentEL = 0xc212, // 11 000 0100 0010 010 - SPSR_irq = 0xe218, // 11 100 0100 0011 000 - SPSR_abt = 0xe219, // 11 100 0100 0011 001 - SPSR_und = 0xe21a, // 11 100 0100 0011 010 - SPSR_fiq = 0xe21b, // 11 100 0100 0011 011 - FPCR = 0xda20, // 11 011 0100 0100 000 - FPSR = 0xda21, // 11 011 0100 0100 001 - DSPSR_EL0 = 0xda28, // 11 011 0100 0101 000 - DLR_EL0 = 0xda29, // 11 011 0100 0101 001 - IFSR32_EL2 = 0xe281, // 11 100 0101 0000 001 - AFSR0_EL1 = 0xc288, // 11 000 0101 0001 000 - AFSR0_EL2 = 0xe288, // 11 100 0101 0001 000 - AFSR0_EL3 = 0xf288, // 11 110 0101 0001 000 - AFSR1_EL1 = 0xc289, // 11 000 0101 0001 001 - AFSR1_EL2 = 0xe289, // 11 100 0101 0001 001 - AFSR1_EL3 = 0xf289, // 11 110 0101 0001 001 - ESR_EL1 = 0xc290, // 11 000 0101 0010 000 - ESR_EL2 = 0xe290, // 11 100 0101 0010 000 - ESR_EL3 = 0xf290, // 11 110 0101 0010 000 - FPEXC32_EL2 = 0xe298, // 11 100 0101 0011 000 - FAR_EL1 = 0xc300, // 11 000 0110 0000 000 - FAR_EL2 = 0xe300, // 11 100 0110 0000 000 - FAR_EL3 = 0xf300, // 11 110 0110 0000 000 - HPFAR_EL2 = 0xe304, // 11 100 0110 0000 100 - PAR_EL1 = 0xc3a0, // 11 000 0111 0100 000 - PMCR_EL0 = 0xdce0, // 11 011 1001 1100 000 - PMCNTENSET_EL0 = 0xdce1, // 11 011 1001 1100 001 - PMCNTENCLR_EL0 = 0xdce2, // 11 011 1001 1100 010 - PMOVSCLR_EL0 = 0xdce3, // 11 011 1001 1100 011 - PMSELR_EL0 = 0xdce5, // 11 011 1001 1100 101 - PMCCNTR_EL0 = 0xdce8, // 11 011 1001 1101 000 - PMXEVTYPER_EL0 = 0xdce9, // 11 011 1001 1101 001 - PMXEVCNTR_EL0 = 0xdcea, // 11 011 1001 1101 010 - PMUSERENR_EL0 = 0xdcf0, // 11 011 1001 1110 000 - PMINTENSET_EL1 = 0xc4f1, // 11 000 1001 1110 001 - PMINTENCLR_EL1 = 0xc4f2, // 11 000 1001 1110 010 - 
PMOVSSET_EL0 = 0xdcf3, // 11 011 1001 1110 011 - MAIR_EL1 = 0xc510, // 11 000 1010 0010 000 - MAIR_EL2 = 0xe510, // 11 100 1010 0010 000 - MAIR_EL3 = 0xf510, // 11 110 1010 0010 000 - AMAIR_EL1 = 0xc518, // 11 000 1010 0011 000 - AMAIR_EL2 = 0xe518, // 11 100 1010 0011 000 - AMAIR_EL3 = 0xf518, // 11 110 1010 0011 000 - VBAR_EL1 = 0xc600, // 11 000 1100 0000 000 - VBAR_EL2 = 0xe600, // 11 100 1100 0000 000 - VBAR_EL3 = 0xf600, // 11 110 1100 0000 000 - RMR_EL1 = 0xc602, // 11 000 1100 0000 010 - RMR_EL2 = 0xe602, // 11 100 1100 0000 010 - RMR_EL3 = 0xf602, // 11 110 1100 0000 010 - CONTEXTIDR_EL1 = 0xc681, // 11 000 1101 0000 001 - TPIDR_EL0 = 0xde82, // 11 011 1101 0000 010 - TPIDR_EL2 = 0xe682, // 11 100 1101 0000 010 - TPIDR_EL3 = 0xf682, // 11 110 1101 0000 010 - TPIDRRO_EL0 = 0xde83, // 11 011 1101 0000 011 - TPIDR_EL1 = 0xc684, // 11 000 1101 0000 100 - CNTFRQ_EL0 = 0xdf00, // 11 011 1110 0000 000 - CNTVOFF_EL2 = 0xe703, // 11 100 1110 0000 011 - CNTKCTL_EL1 = 0xc708, // 11 000 1110 0001 000 - CNTHCTL_EL2 = 0xe708, // 11 100 1110 0001 000 - CNTP_TVAL_EL0 = 0xdf10, // 11 011 1110 0010 000 - CNTHP_TVAL_EL2 = 0xe710, // 11 100 1110 0010 000 - CNTPS_TVAL_EL1 = 0xff10, // 11 111 1110 0010 000 - CNTP_CTL_EL0 = 0xdf11, // 11 011 1110 0010 001 - CNTHP_CTL_EL2 = 0xe711, // 11 100 1110 0010 001 - CNTPS_CTL_EL1 = 0xff11, // 11 111 1110 0010 001 - CNTP_CVAL_EL0 = 0xdf12, // 11 011 1110 0010 010 - CNTHP_CVAL_EL2 = 0xe712, // 11 100 1110 0010 010 - CNTPS_CVAL_EL1 = 0xff12, // 11 111 1110 0010 010 - CNTV_TVAL_EL0 = 0xdf18, // 11 011 1110 0011 000 - CNTV_CTL_EL0 = 0xdf19, // 11 011 1110 0011 001 - CNTV_CVAL_EL0 = 0xdf1a, // 11 011 1110 0011 010 - PMEVCNTR0_EL0 = 0xdf40, // 11 011 1110 1000 000 - PMEVCNTR1_EL0 = 0xdf41, // 11 011 1110 1000 001 - PMEVCNTR2_EL0 = 0xdf42, // 11 011 1110 1000 010 - PMEVCNTR3_EL0 = 0xdf43, // 11 011 1110 1000 011 - PMEVCNTR4_EL0 = 0xdf44, // 11 011 1110 1000 100 - PMEVCNTR5_EL0 = 0xdf45, // 11 011 1110 1000 101 - PMEVCNTR6_EL0 = 0xdf46, // 11 011 1110 1000 110 - PMEVCNTR7_EL0 = 0xdf47, // 11 011 1110 1000 111 - PMEVCNTR8_EL0 = 0xdf48, // 11 011 1110 1001 000 - PMEVCNTR9_EL0 = 0xdf49, // 11 011 1110 1001 001 - PMEVCNTR10_EL0 = 0xdf4a, // 11 011 1110 1001 010 - PMEVCNTR11_EL0 = 0xdf4b, // 11 011 1110 1001 011 - PMEVCNTR12_EL0 = 0xdf4c, // 11 011 1110 1001 100 - PMEVCNTR13_EL0 = 0xdf4d, // 11 011 1110 1001 101 - PMEVCNTR14_EL0 = 0xdf4e, // 11 011 1110 1001 110 - PMEVCNTR15_EL0 = 0xdf4f, // 11 011 1110 1001 111 - PMEVCNTR16_EL0 = 0xdf50, // 11 011 1110 1010 000 - PMEVCNTR17_EL0 = 0xdf51, // 11 011 1110 1010 001 - PMEVCNTR18_EL0 = 0xdf52, // 11 011 1110 1010 010 - PMEVCNTR19_EL0 = 0xdf53, // 11 011 1110 1010 011 - PMEVCNTR20_EL0 = 0xdf54, // 11 011 1110 1010 100 - PMEVCNTR21_EL0 = 0xdf55, // 11 011 1110 1010 101 - PMEVCNTR22_EL0 = 0xdf56, // 11 011 1110 1010 110 - PMEVCNTR23_EL0 = 0xdf57, // 11 011 1110 1010 111 - PMEVCNTR24_EL0 = 0xdf58, // 11 011 1110 1011 000 - PMEVCNTR25_EL0 = 0xdf59, // 11 011 1110 1011 001 - PMEVCNTR26_EL0 = 0xdf5a, // 11 011 1110 1011 010 - PMEVCNTR27_EL0 = 0xdf5b, // 11 011 1110 1011 011 - PMEVCNTR28_EL0 = 0xdf5c, // 11 011 1110 1011 100 - PMEVCNTR29_EL0 = 0xdf5d, // 11 011 1110 1011 101 - PMEVCNTR30_EL0 = 0xdf5e, // 11 011 1110 1011 110 - PMCCFILTR_EL0 = 0xdf7f, // 11 011 1110 1111 111 - PMEVTYPER0_EL0 = 0xdf60, // 11 011 1110 1100 000 - PMEVTYPER1_EL0 = 0xdf61, // 11 011 1110 1100 001 - PMEVTYPER2_EL0 = 0xdf62, // 11 011 1110 1100 010 - PMEVTYPER3_EL0 = 0xdf63, // 11 011 1110 1100 011 - PMEVTYPER4_EL0 = 0xdf64, // 11 011 1110 1100 100 - 
PMEVTYPER5_EL0 = 0xdf65, // 11 011 1110 1100 101 - PMEVTYPER6_EL0 = 0xdf66, // 11 011 1110 1100 110 - PMEVTYPER7_EL0 = 0xdf67, // 11 011 1110 1100 111 - PMEVTYPER8_EL0 = 0xdf68, // 11 011 1110 1101 000 - PMEVTYPER9_EL0 = 0xdf69, // 11 011 1110 1101 001 - PMEVTYPER10_EL0 = 0xdf6a, // 11 011 1110 1101 010 - PMEVTYPER11_EL0 = 0xdf6b, // 11 011 1110 1101 011 - PMEVTYPER12_EL0 = 0xdf6c, // 11 011 1110 1101 100 - PMEVTYPER13_EL0 = 0xdf6d, // 11 011 1110 1101 101 - PMEVTYPER14_EL0 = 0xdf6e, // 11 011 1110 1101 110 - PMEVTYPER15_EL0 = 0xdf6f, // 11 011 1110 1101 111 - PMEVTYPER16_EL0 = 0xdf70, // 11 011 1110 1110 000 - PMEVTYPER17_EL0 = 0xdf71, // 11 011 1110 1110 001 - PMEVTYPER18_EL0 = 0xdf72, // 11 011 1110 1110 010 - PMEVTYPER19_EL0 = 0xdf73, // 11 011 1110 1110 011 - PMEVTYPER20_EL0 = 0xdf74, // 11 011 1110 1110 100 - PMEVTYPER21_EL0 = 0xdf75, // 11 011 1110 1110 101 - PMEVTYPER22_EL0 = 0xdf76, // 11 011 1110 1110 110 - PMEVTYPER23_EL0 = 0xdf77, // 11 011 1110 1110 111 - PMEVTYPER24_EL0 = 0xdf78, // 11 011 1110 1111 000 - PMEVTYPER25_EL0 = 0xdf79, // 11 011 1110 1111 001 - PMEVTYPER26_EL0 = 0xdf7a, // 11 011 1110 1111 010 - PMEVTYPER27_EL0 = 0xdf7b, // 11 011 1110 1111 011 - PMEVTYPER28_EL0 = 0xdf7c, // 11 011 1110 1111 100 - PMEVTYPER29_EL0 = 0xdf7d, // 11 011 1110 1111 101 - PMEVTYPER30_EL0 = 0xdf7e, // 11 011 1110 1111 110 - - // Trace registers - TRCPRGCTLR = 0x8808, // 10 001 0000 0001 000 - TRCPROCSELR = 0x8810, // 10 001 0000 0010 000 - TRCCONFIGR = 0x8820, // 10 001 0000 0100 000 - TRCAUXCTLR = 0x8830, // 10 001 0000 0110 000 - TRCEVENTCTL0R = 0x8840, // 10 001 0000 1000 000 - TRCEVENTCTL1R = 0x8848, // 10 001 0000 1001 000 - TRCSTALLCTLR = 0x8858, // 10 001 0000 1011 000 - TRCTSCTLR = 0x8860, // 10 001 0000 1100 000 - TRCSYNCPR = 0x8868, // 10 001 0000 1101 000 - TRCCCCTLR = 0x8870, // 10 001 0000 1110 000 - TRCBBCTLR = 0x8878, // 10 001 0000 1111 000 - TRCTRACEIDR = 0x8801, // 10 001 0000 0000 001 - TRCQCTLR = 0x8809, // 10 001 0000 0001 001 - TRCVICTLR = 0x8802, // 10 001 0000 0000 010 - TRCVIIECTLR = 0x880a, // 10 001 0000 0001 010 - TRCVISSCTLR = 0x8812, // 10 001 0000 0010 010 - TRCVIPCSSCTLR = 0x881a, // 10 001 0000 0011 010 - TRCVDCTLR = 0x8842, // 10 001 0000 1000 010 - TRCVDSACCTLR = 0x884a, // 10 001 0000 1001 010 - TRCVDARCCTLR = 0x8852, // 10 001 0000 1010 010 - TRCSEQEVR0 = 0x8804, // 10 001 0000 0000 100 - TRCSEQEVR1 = 0x880c, // 10 001 0000 0001 100 - TRCSEQEVR2 = 0x8814, // 10 001 0000 0010 100 - TRCSEQRSTEVR = 0x8834, // 10 001 0000 0110 100 - TRCSEQSTR = 0x883c, // 10 001 0000 0111 100 - TRCEXTINSELR = 0x8844, // 10 001 0000 1000 100 - TRCCNTRLDVR0 = 0x8805, // 10 001 0000 0000 101 - TRCCNTRLDVR1 = 0x880d, // 10 001 0000 0001 101 - TRCCNTRLDVR2 = 0x8815, // 10 001 0000 0010 101 - TRCCNTRLDVR3 = 0x881d, // 10 001 0000 0011 101 - TRCCNTCTLR0 = 0x8825, // 10 001 0000 0100 101 - TRCCNTCTLR1 = 0x882d, // 10 001 0000 0101 101 - TRCCNTCTLR2 = 0x8835, // 10 001 0000 0110 101 - TRCCNTCTLR3 = 0x883d, // 10 001 0000 0111 101 - TRCCNTVR0 = 0x8845, // 10 001 0000 1000 101 - TRCCNTVR1 = 0x884d, // 10 001 0000 1001 101 - TRCCNTVR2 = 0x8855, // 10 001 0000 1010 101 - TRCCNTVR3 = 0x885d, // 10 001 0000 1011 101 - TRCIMSPEC0 = 0x8807, // 10 001 0000 0000 111 - TRCIMSPEC1 = 0x880f, // 10 001 0000 0001 111 - TRCIMSPEC2 = 0x8817, // 10 001 0000 0010 111 - TRCIMSPEC3 = 0x881f, // 10 001 0000 0011 111 - TRCIMSPEC4 = 0x8827, // 10 001 0000 0100 111 - TRCIMSPEC5 = 0x882f, // 10 001 0000 0101 111 - TRCIMSPEC6 = 0x8837, // 10 001 0000 0110 111 - TRCIMSPEC7 = 0x883f, // 10 001 0000 0111 111 - 
TRCRSCTLR2 = 0x8890, // 10 001 0001 0010 000 - TRCRSCTLR3 = 0x8898, // 10 001 0001 0011 000 - TRCRSCTLR4 = 0x88a0, // 10 001 0001 0100 000 - TRCRSCTLR5 = 0x88a8, // 10 001 0001 0101 000 - TRCRSCTLR6 = 0x88b0, // 10 001 0001 0110 000 - TRCRSCTLR7 = 0x88b8, // 10 001 0001 0111 000 - TRCRSCTLR8 = 0x88c0, // 10 001 0001 1000 000 - TRCRSCTLR9 = 0x88c8, // 10 001 0001 1001 000 - TRCRSCTLR10 = 0x88d0, // 10 001 0001 1010 000 - TRCRSCTLR11 = 0x88d8, // 10 001 0001 1011 000 - TRCRSCTLR12 = 0x88e0, // 10 001 0001 1100 000 - TRCRSCTLR13 = 0x88e8, // 10 001 0001 1101 000 - TRCRSCTLR14 = 0x88f0, // 10 001 0001 1110 000 - TRCRSCTLR15 = 0x88f8, // 10 001 0001 1111 000 - TRCRSCTLR16 = 0x8881, // 10 001 0001 0000 001 - TRCRSCTLR17 = 0x8889, // 10 001 0001 0001 001 - TRCRSCTLR18 = 0x8891, // 10 001 0001 0010 001 - TRCRSCTLR19 = 0x8899, // 10 001 0001 0011 001 - TRCRSCTLR20 = 0x88a1, // 10 001 0001 0100 001 - TRCRSCTLR21 = 0x88a9, // 10 001 0001 0101 001 - TRCRSCTLR22 = 0x88b1, // 10 001 0001 0110 001 - TRCRSCTLR23 = 0x88b9, // 10 001 0001 0111 001 - TRCRSCTLR24 = 0x88c1, // 10 001 0001 1000 001 - TRCRSCTLR25 = 0x88c9, // 10 001 0001 1001 001 - TRCRSCTLR26 = 0x88d1, // 10 001 0001 1010 001 - TRCRSCTLR27 = 0x88d9, // 10 001 0001 1011 001 - TRCRSCTLR28 = 0x88e1, // 10 001 0001 1100 001 - TRCRSCTLR29 = 0x88e9, // 10 001 0001 1101 001 - TRCRSCTLR30 = 0x88f1, // 10 001 0001 1110 001 - TRCRSCTLR31 = 0x88f9, // 10 001 0001 1111 001 - TRCSSCCR0 = 0x8882, // 10 001 0001 0000 010 - TRCSSCCR1 = 0x888a, // 10 001 0001 0001 010 - TRCSSCCR2 = 0x8892, // 10 001 0001 0010 010 - TRCSSCCR3 = 0x889a, // 10 001 0001 0011 010 - TRCSSCCR4 = 0x88a2, // 10 001 0001 0100 010 - TRCSSCCR5 = 0x88aa, // 10 001 0001 0101 010 - TRCSSCCR6 = 0x88b2, // 10 001 0001 0110 010 - TRCSSCCR7 = 0x88ba, // 10 001 0001 0111 010 - TRCSSCSR0 = 0x88c2, // 10 001 0001 1000 010 - TRCSSCSR1 = 0x88ca, // 10 001 0001 1001 010 - TRCSSCSR2 = 0x88d2, // 10 001 0001 1010 010 - TRCSSCSR3 = 0x88da, // 10 001 0001 1011 010 - TRCSSCSR4 = 0x88e2, // 10 001 0001 1100 010 - TRCSSCSR5 = 0x88ea, // 10 001 0001 1101 010 - TRCSSCSR6 = 0x88f2, // 10 001 0001 1110 010 - TRCSSCSR7 = 0x88fa, // 10 001 0001 1111 010 - TRCSSPCICR0 = 0x8883, // 10 001 0001 0000 011 - TRCSSPCICR1 = 0x888b, // 10 001 0001 0001 011 - TRCSSPCICR2 = 0x8893, // 10 001 0001 0010 011 - TRCSSPCICR3 = 0x889b, // 10 001 0001 0011 011 - TRCSSPCICR4 = 0x88a3, // 10 001 0001 0100 011 - TRCSSPCICR5 = 0x88ab, // 10 001 0001 0101 011 - TRCSSPCICR6 = 0x88b3, // 10 001 0001 0110 011 - TRCSSPCICR7 = 0x88bb, // 10 001 0001 0111 011 - TRCPDCR = 0x88a4, // 10 001 0001 0100 100 - TRCACVR0 = 0x8900, // 10 001 0010 0000 000 - TRCACVR1 = 0x8910, // 10 001 0010 0010 000 - TRCACVR2 = 0x8920, // 10 001 0010 0100 000 - TRCACVR3 = 0x8930, // 10 001 0010 0110 000 - TRCACVR4 = 0x8940, // 10 001 0010 1000 000 - TRCACVR5 = 0x8950, // 10 001 0010 1010 000 - TRCACVR6 = 0x8960, // 10 001 0010 1100 000 - TRCACVR7 = 0x8970, // 10 001 0010 1110 000 - TRCACVR8 = 0x8901, // 10 001 0010 0000 001 - TRCACVR9 = 0x8911, // 10 001 0010 0010 001 - TRCACVR10 = 0x8921, // 10 001 0010 0100 001 - TRCACVR11 = 0x8931, // 10 001 0010 0110 001 - TRCACVR12 = 0x8941, // 10 001 0010 1000 001 - TRCACVR13 = 0x8951, // 10 001 0010 1010 001 - TRCACVR14 = 0x8961, // 10 001 0010 1100 001 - TRCACVR15 = 0x8971, // 10 001 0010 1110 001 - TRCACATR0 = 0x8902, // 10 001 0010 0000 010 - TRCACATR1 = 0x8912, // 10 001 0010 0010 010 - TRCACATR2 = 0x8922, // 10 001 0010 0100 010 - TRCACATR3 = 0x8932, // 10 001 0010 0110 010 - TRCACATR4 = 0x8942, // 10 001 0010 1000 010 - 
TRCACATR5 = 0x8952, // 10 001 0010 1010 010 - TRCACATR6 = 0x8962, // 10 001 0010 1100 010 - TRCACATR7 = 0x8972, // 10 001 0010 1110 010 - TRCACATR8 = 0x8903, // 10 001 0010 0000 011 - TRCACATR9 = 0x8913, // 10 001 0010 0010 011 - TRCACATR10 = 0x8923, // 10 001 0010 0100 011 - TRCACATR11 = 0x8933, // 10 001 0010 0110 011 - TRCACATR12 = 0x8943, // 10 001 0010 1000 011 - TRCACATR13 = 0x8953, // 10 001 0010 1010 011 - TRCACATR14 = 0x8963, // 10 001 0010 1100 011 - TRCACATR15 = 0x8973, // 10 001 0010 1110 011 - TRCDVCVR0 = 0x8904, // 10 001 0010 0000 100 - TRCDVCVR1 = 0x8924, // 10 001 0010 0100 100 - TRCDVCVR2 = 0x8944, // 10 001 0010 1000 100 - TRCDVCVR3 = 0x8964, // 10 001 0010 1100 100 - TRCDVCVR4 = 0x8905, // 10 001 0010 0000 101 - TRCDVCVR5 = 0x8925, // 10 001 0010 0100 101 - TRCDVCVR6 = 0x8945, // 10 001 0010 1000 101 - TRCDVCVR7 = 0x8965, // 10 001 0010 1100 101 - TRCDVCMR0 = 0x8906, // 10 001 0010 0000 110 - TRCDVCMR1 = 0x8926, // 10 001 0010 0100 110 - TRCDVCMR2 = 0x8946, // 10 001 0010 1000 110 - TRCDVCMR3 = 0x8966, // 10 001 0010 1100 110 - TRCDVCMR4 = 0x8907, // 10 001 0010 0000 111 - TRCDVCMR5 = 0x8927, // 10 001 0010 0100 111 - TRCDVCMR6 = 0x8947, // 10 001 0010 1000 111 - TRCDVCMR7 = 0x8967, // 10 001 0010 1100 111 - TRCCIDCVR0 = 0x8980, // 10 001 0011 0000 000 - TRCCIDCVR1 = 0x8990, // 10 001 0011 0010 000 - TRCCIDCVR2 = 0x89a0, // 10 001 0011 0100 000 - TRCCIDCVR3 = 0x89b0, // 10 001 0011 0110 000 - TRCCIDCVR4 = 0x89c0, // 10 001 0011 1000 000 - TRCCIDCVR5 = 0x89d0, // 10 001 0011 1010 000 - TRCCIDCVR6 = 0x89e0, // 10 001 0011 1100 000 - TRCCIDCVR7 = 0x89f0, // 10 001 0011 1110 000 - TRCVMIDCVR0 = 0x8981, // 10 001 0011 0000 001 - TRCVMIDCVR1 = 0x8991, // 10 001 0011 0010 001 - TRCVMIDCVR2 = 0x89a1, // 10 001 0011 0100 001 - TRCVMIDCVR3 = 0x89b1, // 10 001 0011 0110 001 - TRCVMIDCVR4 = 0x89c1, // 10 001 0011 1000 001 - TRCVMIDCVR5 = 0x89d1, // 10 001 0011 1010 001 - TRCVMIDCVR6 = 0x89e1, // 10 001 0011 1100 001 - TRCVMIDCVR7 = 0x89f1, // 10 001 0011 1110 001 - TRCCIDCCTLR0 = 0x8982, // 10 001 0011 0000 010 - TRCCIDCCTLR1 = 0x898a, // 10 001 0011 0001 010 - TRCVMIDCCTLR0 = 0x8992, // 10 001 0011 0010 010 - TRCVMIDCCTLR1 = 0x899a, // 10 001 0011 0011 010 - TRCITCTRL = 0x8b84, // 10 001 0111 0000 100 - TRCCLAIMSET = 0x8bc6, // 10 001 0111 1000 110 - TRCCLAIMCLR = 0x8bce, // 10 001 0111 1001 110 - - // GICv3 registers - ICC_BPR1_EL1 = 0xc663, // 11 000 1100 1100 011 - ICC_BPR0_EL1 = 0xc643, // 11 000 1100 1000 011 - ICC_PMR_EL1 = 0xc230, // 11 000 0100 0110 000 - ICC_CTLR_EL1 = 0xc664, // 11 000 1100 1100 100 - ICC_CTLR_EL3 = 0xf664, // 11 110 1100 1100 100 - ICC_SRE_EL1 = 0xc665, // 11 000 1100 1100 101 - ICC_SRE_EL2 = 0xe64d, // 11 100 1100 1001 101 - ICC_SRE_EL3 = 0xf665, // 11 110 1100 1100 101 - ICC_IGRPEN0_EL1 = 0xc666, // 11 000 1100 1100 110 - ICC_IGRPEN1_EL1 = 0xc667, // 11 000 1100 1100 111 - ICC_IGRPEN1_EL3 = 0xf667, // 11 110 1100 1100 111 - ICC_SEIEN_EL1 = 0xc668, // 11 000 1100 1101 000 - ICC_AP0R0_EL1 = 0xc644, // 11 000 1100 1000 100 - ICC_AP0R1_EL1 = 0xc645, // 11 000 1100 1000 101 - ICC_AP0R2_EL1 = 0xc646, // 11 000 1100 1000 110 - ICC_AP0R3_EL1 = 0xc647, // 11 000 1100 1000 111 - ICC_AP1R0_EL1 = 0xc648, // 11 000 1100 1001 000 - ICC_AP1R1_EL1 = 0xc649, // 11 000 1100 1001 001 - ICC_AP1R2_EL1 = 0xc64a, // 11 000 1100 1001 010 - ICC_AP1R3_EL1 = 0xc64b, // 11 000 1100 1001 011 - ICH_AP0R0_EL2 = 0xe640, // 11 100 1100 1000 000 - ICH_AP0R1_EL2 = 0xe641, // 11 100 1100 1000 001 - ICH_AP0R2_EL2 = 0xe642, // 11 100 1100 1000 010 - ICH_AP0R3_EL2 = 0xe643, // 11 100 1100 
1000 011 - ICH_AP1R0_EL2 = 0xe648, // 11 100 1100 1001 000 - ICH_AP1R1_EL2 = 0xe649, // 11 100 1100 1001 001 - ICH_AP1R2_EL2 = 0xe64a, // 11 100 1100 1001 010 - ICH_AP1R3_EL2 = 0xe64b, // 11 100 1100 1001 011 - ICH_HCR_EL2 = 0xe658, // 11 100 1100 1011 000 - ICH_MISR_EL2 = 0xe65a, // 11 100 1100 1011 010 - ICH_VMCR_EL2 = 0xe65f, // 11 100 1100 1011 111 - ICH_VSEIR_EL2 = 0xe64c, // 11 100 1100 1001 100 - ICH_LR0_EL2 = 0xe660, // 11 100 1100 1100 000 - ICH_LR1_EL2 = 0xe661, // 11 100 1100 1100 001 - ICH_LR2_EL2 = 0xe662, // 11 100 1100 1100 010 - ICH_LR3_EL2 = 0xe663, // 11 100 1100 1100 011 - ICH_LR4_EL2 = 0xe664, // 11 100 1100 1100 100 - ICH_LR5_EL2 = 0xe665, // 11 100 1100 1100 101 - ICH_LR6_EL2 = 0xe666, // 11 100 1100 1100 110 - ICH_LR7_EL2 = 0xe667, // 11 100 1100 1100 111 - ICH_LR8_EL2 = 0xe668, // 11 100 1100 1101 000 - ICH_LR9_EL2 = 0xe669, // 11 100 1100 1101 001 - ICH_LR10_EL2 = 0xe66a, // 11 100 1100 1101 010 - ICH_LR11_EL2 = 0xe66b, // 11 100 1100 1101 011 - ICH_LR12_EL2 = 0xe66c, // 11 100 1100 1101 100 - ICH_LR13_EL2 = 0xe66d, // 11 100 1100 1101 101 - ICH_LR14_EL2 = 0xe66e, // 11 100 1100 1101 110 - ICH_LR15_EL2 = 0xe66f, // 11 100 1100 1101 111 - }; - - // Cyclone specific system registers - enum CycloneSysRegValues { - CPM_IOACC_CTL_EL3 = 0xff90 - }; - - // Note that these do not inherit from ARM64NamedImmMapper. This class is - // sufficiently different in its behaviour that I don't believe it's worth - // burdening the common ARM64NamedImmMapper with abstractions only needed in - // this one case. - struct SysRegMapper { - static const ARM64NamedImmMapper::Mapping SysRegPairs[]; - static const ARM64NamedImmMapper::Mapping CycloneSysRegPairs[]; - - const ARM64NamedImmMapper::Mapping *InstPairs; - size_t NumInstPairs; - uint64_t FeatureBits; - - SysRegMapper(uint64_t FeatureBits) : FeatureBits(FeatureBits) { } - uint32_t fromString(StringRef Name, bool &Valid) const; - std::string toString(uint32_t Bits, bool &Valid) const; - }; - - struct MSRMapper : SysRegMapper { - static const ARM64NamedImmMapper::Mapping MSRPairs[]; - MSRMapper(uint64_t FeatureBits); - }; - - struct MRSMapper : SysRegMapper { - static const ARM64NamedImmMapper::Mapping MRSPairs[]; - MRSMapper(uint64_t FeatureBits); - }; - - uint32_t ParseGenericRegister(StringRef Name, bool &Valid); -} - -namespace ARM64TLBI { - enum TLBIValues { - Invalid = -1, // Op0 Op1 CRn CRm Op2 - IPAS2E1IS = 0x6401, // 01 100 1000 0000 001 - IPAS2LE1IS = 0x6405, // 01 100 1000 0000 101 - VMALLE1IS = 0x4418, // 01 000 1000 0011 000 - ALLE2IS = 0x6418, // 01 100 1000 0011 000 - ALLE3IS = 0x7418, // 01 110 1000 0011 000 - VAE1IS = 0x4419, // 01 000 1000 0011 001 - VAE2IS = 0x6419, // 01 100 1000 0011 001 - VAE3IS = 0x7419, // 01 110 1000 0011 001 - ASIDE1IS = 0x441a, // 01 000 1000 0011 010 - VAAE1IS = 0x441b, // 01 000 1000 0011 011 - ALLE1IS = 0x641c, // 01 100 1000 0011 100 - VALE1IS = 0x441d, // 01 000 1000 0011 101 - VALE2IS = 0x641d, // 01 100 1000 0011 101 - VALE3IS = 0x741d, // 01 110 1000 0011 101 - VMALLS12E1IS = 0x641e, // 01 100 1000 0011 110 - VAALE1IS = 0x441f, // 01 000 1000 0011 111 - IPAS2E1 = 0x6421, // 01 100 1000 0100 001 - IPAS2LE1 = 0x6425, // 01 100 1000 0100 101 - VMALLE1 = 0x4438, // 01 000 1000 0111 000 - ALLE2 = 0x6438, // 01 100 1000 0111 000 - ALLE3 = 0x7438, // 01 110 1000 0111 000 - VAE1 = 0x4439, // 01 000 1000 0111 001 - VAE2 = 0x6439, // 01 100 1000 0111 001 - VAE3 = 0x7439, // 01 110 1000 0111 001 - ASIDE1 = 0x443a, // 01 000 1000 0111 010 - VAAE1 = 0x443b, // 01 000 1000 0111 011 - ALLE1 = 
0x643c, // 01 100 1000 0111 100 - VALE1 = 0x443d, // 01 000 1000 0111 101 - VALE2 = 0x643d, // 01 100 1000 0111 101 - VALE3 = 0x743d, // 01 110 1000 0111 101 - VMALLS12E1 = 0x643e, // 01 100 1000 0111 110 - VAALE1 = 0x443f // 01 000 1000 0111 111 - }; - - struct TLBIMapper : ARM64NamedImmMapper { - const static Mapping TLBIPairs[]; - - TLBIMapper(); - }; - - static inline bool NeedsRegister(TLBIValues Val) { - switch (Val) { - case VMALLE1IS: - case ALLE2IS: - case ALLE3IS: - case ALLE1IS: - case VMALLS12E1IS: - case VMALLE1: - case ALLE2: - case ALLE3: - case ALLE1: - case VMALLS12E1: - return false; - default: - return true; - } - } -} - -namespace ARM64II { - /// Target Operand Flag enum. - enum TOF { - //===------------------------------------------------------------------===// - // ARM64 Specific MachineOperand flags. - - MO_NO_FLAG, - - MO_FRAGMENT = 0x7, - - /// MO_PAGE - A symbol operand with this flag represents the pc-relative - /// offset of the 4K page containing the symbol. This is used with the - /// ADRP instruction. - MO_PAGE = 1, - - /// MO_PAGEOFF - A symbol operand with this flag represents the offset of - /// that symbol within a 4K page. This offset is added to the page address - /// to produce the complete address. - MO_PAGEOFF = 2, - - /// MO_G3 - A symbol operand with this flag (granule 3) represents the high - /// 16-bits of a 64-bit address, used in a MOVZ or MOVK instruction - MO_G3 = 3, - - /// MO_G2 - A symbol operand with this flag (granule 2) represents the bits - /// 32-47 of a 64-bit address, used in a MOVZ or MOVK instruction - MO_G2 = 4, - - /// MO_G1 - A symbol operand with this flag (granule 1) represents the bits - /// 16-31 of a 64-bit address, used in a MOVZ or MOVK instruction - MO_G1 = 5, - - /// MO_G0 - A symbol operand with this flag (granule 0) represents the bits - /// 0-15 of a 64-bit address, used in a MOVZ or MOVK instruction - MO_G0 = 6, - - /// MO_GOT - This flag indicates that a symbol operand represents the - /// address of the GOT entry for the symbol, rather than the address of - /// the symbol itself. - MO_GOT = 8, - - /// MO_NC - Indicates whether the linker is expected to check the symbol - /// reference for overflow. For example in an ADRP/ADD pair of relocations - /// the ADRP usually does check, but not the ADD. - MO_NC = 0x10, - - /// MO_TLS - Indicates that the operand being accessed is some kind of - /// thread-local symbol. On Darwin, only one type of thread-local access - /// exists (pre linker-relaxation), but on ELF the TLSModel used for the - /// referee will affect interpretation. - MO_TLS = 0x20 - }; -} // end namespace ARM64II - -} // end namespace llvm - -#endif diff --git a/lib/Target/ARM64/Utils/CMakeLists.txt b/lib/Target/ARM64/Utils/CMakeLists.txt deleted file mode 100644 index f69076f4ef6..00000000000 --- a/lib/Target/ARM64/Utils/CMakeLists.txt +++ /dev/null @@ -1,3 +0,0 @@ -add_llvm_library(LLVMARM64Utils - ARM64BaseInfo.cpp - ) diff --git a/lib/Target/ARM64/Utils/LLVMBuild.txt b/lib/Target/ARM64/Utils/LLVMBuild.txt deleted file mode 100644 index 232dca29f40..00000000000 --- a/lib/Target/ARM64/Utils/LLVMBuild.txt +++ /dev/null @@ -1,23 +0,0 @@ -;===- ./lib/Target/ARM64/Utils/LLVMBuild.txt ----------------*- Conf -*--===; -; -; The LLVM Compiler Infrastructure -; -; This file is distributed under the University of Illinois Open Source -; License. See LICENSE.TXT for details. 
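(The MO_PAGE/MO_PAGEOFF and MO_G0..MO_G3 operand-flag comments above describe how a 64-bit address is split for materialisation: ADRP works in units of the 4 KiB page containing the symbol, the offset within that page is added separately, and MOVZ/MOVK view the address as four 16-bit granules. A small stand-alone sketch of that arithmetic follows; the address used is an arbitrary example, not anything from this patch.)

// Splits an address the way the operand-flag comments describe: the 4 KiB
// page containing the symbol (ADRP uses its PC-relative form, MO_PAGE),
// the offset within that page (MO_PAGEOFF), and the four 16-bit granules
// G0..G3 used by MOVZ/MOVK.
#include <cstdint>
#include <cstdio>

int main() {
  uint64_t Addr = 0x0000123456789abcULL; // hypothetical symbol address

  uint64_t Page    = Addr & ~UINT64_C(0xfff); // 4 KiB page containing Addr
  uint64_t PageOff = Addr &  UINT64_C(0xfff); // offset added to the page address

  // Granules: G0 = bits 0-15, G1 = bits 16-31, G2 = bits 32-47, G3 = bits 48-63.
  for (int G = 0; G < 4; ++G)
    std::printf("G%d = 0x%04llx\n", G,
                (unsigned long long)((Addr >> (16 * G)) & 0xffff));

  std::printf("page = 0x%llx, pageoff = 0x%llx\n",
              (unsigned long long)Page, (unsigned long long)PageOff);
  return 0;
}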
-; -;===------------------------------------------------------------------------===; -; -; This is an LLVMBuild description file for the components in this subdirectory. -; -; For more information on the LLVMBuild system, please see: -; -; http://llvm.org/docs/LLVMBuild.html -; -;===------------------------------------------------------------------------===; - -[component_0] -type = Library -name = ARM64Utils -parent = ARM64 -required_libraries = Support -add_to_library_groups = ARM64 diff --git a/lib/Target/ARM64/Utils/Makefile b/lib/Target/ARM64/Utils/Makefile deleted file mode 100644 index 6491ad9a07b..00000000000 --- a/lib/Target/ARM64/Utils/Makefile +++ /dev/null @@ -1,15 +0,0 @@ -##===- lib/Target/ARM64/Utils/Makefile -------------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## -LEVEL = ../../../.. -LIBRARYNAME = LLVMARM64Utils - -# Hack: we need to include 'main' ARM64 target directory to grab private headers -CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. - -include $(LEVEL)/Makefile.common diff --git a/lib/Target/LLVMBuild.txt b/lib/Target/LLVMBuild.txt index da2309ba0cb..1b0837cb3b5 100644 --- a/lib/Target/LLVMBuild.txt +++ b/lib/Target/LLVMBuild.txt @@ -16,7 +16,7 @@ ;===------------------------------------------------------------------------===; [common] -subdirectories = ARM ARM64 CppBackend Hexagon MSP430 NVPTX Mips PowerPC R600 Sparc SystemZ X86 XCore +subdirectories = ARM AArch64 CppBackend Hexagon MSP430 NVPTX Mips PowerPC R600 Sparc SystemZ X86 XCore ; This is a special group whose required libraries are extended (by llvm-build) ; with the best execution engine (the native JIT, if available, or the diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index 18a0f9c167a..be1b5aa50b1 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -836,8 +836,8 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { case Intrinsic::arm_neon_vmulls: case Intrinsic::arm_neon_vmullu: - case Intrinsic::arm64_neon_smull: - case Intrinsic::arm64_neon_umull: { + case Intrinsic::aarch64_neon_smull: + case Intrinsic::aarch64_neon_umull: { Value *Arg0 = II->getArgOperand(0); Value *Arg1 = II->getArgOperand(1); @@ -848,7 +848,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { // Check for constant LHS & RHS - in this case we just simplify. 
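(For context on the InstCombine hunk above: smull/umull are widening multiplies, so when both operands are constants the intrinsic call can simply be folded, with the Zext flag choosing zero- versus sign-extension. The sketch below models the per-lane semantics in plain C++; it is an illustration of the fold, not the InstCombine code itself.)

// Per-lane model of the NEON widening multiplies referenced above: umull
// zero-extends its 32-bit lanes, smull sign-extends them, and the product
// is 64 bits wide. With constant inputs this is what the simplification
// can compute directly.
#include <cstdint>
#include <cstdio>

static uint64_t umullLane(uint32_t A, uint32_t B) {
  return (uint64_t)A * (uint64_t)B;   // zero-extend, then multiply
}

static int64_t smullLane(int32_t A, int32_t B) {
  return (int64_t)A * (int64_t)B;     // sign-extend, then multiply
}

int main() {
  std::printf("umull: %llu\n", (unsigned long long)umullLane(0xffffffffu, 2));
  std::printf("smull: %lld\n", (long long)smullLane(-1, 2));
  return 0;
}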
bool Zext = (II->getIntrinsicID() == Intrinsic::arm_neon_vmullu || - II->getIntrinsicID() == Intrinsic::arm64_neon_umull); + II->getIntrinsicID() == Intrinsic::aarch64_neon_umull); VectorType *NewVT = cast(II->getType()); if (Constant *CV0 = dyn_cast(Arg0)) { if (Constant *CV1 = dyn_cast(Arg1)) { diff --git a/test/Analysis/CostModel/AArch64/lit.local.cfg b/test/Analysis/CostModel/AArch64/lit.local.cfg new file mode 100644 index 00000000000..c42034979fc --- /dev/null +++ b/test/Analysis/CostModel/AArch64/lit.local.cfg @@ -0,0 +1,3 @@ +targets = set(config.root.targets_to_build.split()) +if not 'AArch64' in targets: + config.unsupported = True diff --git a/test/Analysis/CostModel/AArch64/select.ll b/test/Analysis/CostModel/AArch64/select.ll new file mode 100644 index 00000000000..216dc5ddc48 --- /dev/null +++ b/test/Analysis/CostModel/AArch64/select.ll @@ -0,0 +1,38 @@ +; RUN: opt < %s -cost-model -analyze -mtriple=arm64-apple-ios -mcpu=cyclone | FileCheck %s +target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32" + +; CHECK-LABEL: select +define void @select() { + ; Scalar values + ; CHECK: cost of 1 {{.*}} select + %v1 = select i1 undef, i8 undef, i8 undef + ; CHECK: cost of 1 {{.*}} select + %v2 = select i1 undef, i16 undef, i16 undef + ; CHECK: cost of 1 {{.*}} select + %v3 = select i1 undef, i32 undef, i32 undef + ; CHECK: cost of 1 {{.*}} select + %v4 = select i1 undef, i64 undef, i64 undef + ; CHECK: cost of 1 {{.*}} select + %v5 = select i1 undef, float undef, float undef + ; CHECK: cost of 1 {{.*}} select + %v6 = select i1 undef, double undef, double undef + + ; Vector values - check for vectors that have a high cost because they end up + ; scalarized. + ; CHECK: cost of 320 {{.*}} select + %v13b = select <16 x i1> undef, <16 x i16> undef, <16 x i16> undef + + ; CHECK: cost of 160 {{.*}} select + %v15b = select <8 x i1> undef, <8 x i32> undef, <8 x i32> undef + ; CHECK: cost of 320 {{.*}} select + %v15c = select <16 x i1> undef, <16 x i32> undef, <16 x i32> undef + + ; CHECK: cost of 80 {{.*}} select + %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef + ; CHECK: cost of 160 {{.*}} select + %v16b = select <8 x i1> undef, <8 x i64> undef, <8 x i64> undef + ; CHECK: cost of 320 {{.*}} select + %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef + + ret void +} diff --git a/test/Analysis/CostModel/AArch64/store.ll b/test/Analysis/CostModel/AArch64/store.ll new file mode 100644 index 00000000000..0c9883cf2a2 --- /dev/null +++ b/test/Analysis/CostModel/AArch64/store.ll @@ -0,0 +1,22 @@ +; RUN: opt < %s -cost-model -analyze -mtriple=arm64-apple-ios -mcpu=cyclone | FileCheck %s +target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32" +; CHECK-LABEL: store +define void @store() { + ; Stores of <2 x i64> should be expensive because we don't split them and + ; and unaligned 16b stores have bad performance. + ; CHECK: cost of 12 {{.*}} store + store <2 x i64> undef, <2 x i64> * undef + + ; We scalarize the loads/stores because there is no vector register name for + ; these types (they get extended to v.4h/v.2s). 
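(The select costs checked in the test above scale with the number of lanes once the operation is scalarized. The quick sanity check below assumes a per-lane cost of 20, a figure inferred only from the CHECK lines themselves (80/4, 160/8, 320/16), not taken from the cost-model source.)

// Reproduces the scalarised select totals from the CHECK lines above under
// the assumption of 20 units per lane.
#include <cstdio>

int main() {
  const int PerLaneCost = 20;          // assumption derived from the test
  const int Lanes[] = {4, 8, 16};
  for (int N : Lanes)
    std::printf("<%d x ...> select: expected cost %d\n", N, N * PerLaneCost);
  return 0;                            // prints 80, 160, 320
}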
+ ; CHECK: cost of 16 {{.*}} store + store <2 x i8> undef, <2 x i8> * undef + ; CHECK: cost of 64 {{.*}} store + store <4 x i8> undef, <4 x i8> * undef + ; CHECK: cost of 16 {{.*}} load + load <2 x i8> * undef + ; CHECK: cost of 64 {{.*}} load + load <4 x i8> * undef + + ret void +} diff --git a/test/Analysis/CostModel/ARM64/lit.local.cfg b/test/Analysis/CostModel/ARM64/lit.local.cfg deleted file mode 100644 index 84ac9811f01..00000000000 --- a/test/Analysis/CostModel/ARM64/lit.local.cfg +++ /dev/null @@ -1,3 +0,0 @@ -targets = set(config.root.targets_to_build.split()) -if not 'ARM64' in targets: - config.unsupported = True diff --git a/test/Analysis/CostModel/ARM64/select.ll b/test/Analysis/CostModel/ARM64/select.ll deleted file mode 100644 index 216dc5ddc48..00000000000 --- a/test/Analysis/CostModel/ARM64/select.ll +++ /dev/null @@ -1,38 +0,0 @@ -; RUN: opt < %s -cost-model -analyze -mtriple=arm64-apple-ios -mcpu=cyclone | FileCheck %s -target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32" - -; CHECK-LABEL: select -define void @select() { - ; Scalar values - ; CHECK: cost of 1 {{.*}} select - %v1 = select i1 undef, i8 undef, i8 undef - ; CHECK: cost of 1 {{.*}} select - %v2 = select i1 undef, i16 undef, i16 undef - ; CHECK: cost of 1 {{.*}} select - %v3 = select i1 undef, i32 undef, i32 undef - ; CHECK: cost of 1 {{.*}} select - %v4 = select i1 undef, i64 undef, i64 undef - ; CHECK: cost of 1 {{.*}} select - %v5 = select i1 undef, float undef, float undef - ; CHECK: cost of 1 {{.*}} select - %v6 = select i1 undef, double undef, double undef - - ; Vector values - check for vectors that have a high cost because they end up - ; scalarized. - ; CHECK: cost of 320 {{.*}} select - %v13b = select <16 x i1> undef, <16 x i16> undef, <16 x i16> undef - - ; CHECK: cost of 160 {{.*}} select - %v15b = select <8 x i1> undef, <8 x i32> undef, <8 x i32> undef - ; CHECK: cost of 320 {{.*}} select - %v15c = select <16 x i1> undef, <16 x i32> undef, <16 x i32> undef - - ; CHECK: cost of 80 {{.*}} select - %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef - ; CHECK: cost of 160 {{.*}} select - %v16b = select <8 x i1> undef, <8 x i64> undef, <8 x i64> undef - ; CHECK: cost of 320 {{.*}} select - %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef - - ret void -} diff --git a/test/Analysis/CostModel/ARM64/store.ll b/test/Analysis/CostModel/ARM64/store.ll deleted file mode 100644 index 0c9883cf2a2..00000000000 --- a/test/Analysis/CostModel/ARM64/store.ll +++ /dev/null @@ -1,22 +0,0 @@ -; RUN: opt < %s -cost-model -analyze -mtriple=arm64-apple-ios -mcpu=cyclone | FileCheck %s -target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32" -; CHECK-LABEL: store -define void @store() { - ; Stores of <2 x i64> should be expensive because we don't split them and - ; and unaligned 16b stores have bad performance. - ; CHECK: cost of 12 {{.*}} store - store <2 x i64> undef, <2 x i64> * undef - - ; We scalarize the loads/stores because there is no vector register name for - ; these types (they get extended to v.4h/v.2s). 
- ; CHECK: cost of 16 {{.*}} store - store <2 x i8> undef, <2 x i8> * undef - ; CHECK: cost of 64 {{.*}} store - store <4 x i8> undef, <4 x i8> * undef - ; CHECK: cost of 16 {{.*}} load - load <2 x i8> * undef - ; CHECK: cost of 64 {{.*}} load - load <4 x i8> * undef - - ret void -} diff --git a/test/CodeGen/AArch64/128bit_load_store.ll b/test/CodeGen/AArch64/128bit_load_store.ll index 56f67873f84..a6f077698e4 100644 --- a/test/CodeGen/AArch64/128bit_load_store.ll +++ b/test/CodeGen/AArch64/128bit_load_store.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=neon | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-ARM64 +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=neon | FileCheck %s --check-prefix=CHECK define void @test_store_f128(fp128* %ptr, fp128 %val) #0 { ; CHECK-LABEL: test_store_f128 @@ -17,8 +17,8 @@ entry: } define void @test_vstrq_p128(i128* %ptr, i128 %val) #0 { -; CHECK-ARM64-LABEL: test_vstrq_p128 -; CHECK-ARM64: stp {{x[0-9]+}}, {{x[0-9]+}}, [{{x[0-9]+}}] +; CHECK-LABEL: test_vstrq_p128 +; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [{{x[0-9]+}}] entry: %0 = bitcast i128* %ptr to fp128* @@ -28,8 +28,8 @@ entry: } define i128 @test_vldrq_p128(i128* readonly %ptr) #2 { -; CHECK-ARM64-LABEL: test_vldrq_p128 -; CHECK-ARM64: ldp {{x[0-9]+}}, {{x[0-9]+}}, [{{x[0-9]+}}] +; CHECK-LABEL: test_vldrq_p128 +; CHECK: ldp {{x[0-9]+}}, {{x[0-9]+}}, [{{x[0-9]+}}] entry: %0 = bitcast i128* %ptr to fp128* diff --git a/test/CodeGen/AArch64/aarch64-neon-v1i1-setcc.ll b/test/CodeGen/AArch64/aarch64-neon-v1i1-setcc.ll new file mode 100644 index 00000000000..c932253049e --- /dev/null +++ b/test/CodeGen/AArch64/aarch64-neon-v1i1-setcc.ll @@ -0,0 +1,69 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s +; arm64 has a separate copy as aarch64-neon-v1i1-setcc.ll + +; This file test the DAG node like "v1i1 SETCC v1i64, v1i64". As the v1i1 type +; is illegal in AArch64 backend, the legalizer tries to scalarize this node. +; As the v1i64 operands of SETCC are legal types, they will not be scalarized. +; Currently the type legalizer will have an assertion failure as it assumes all +; operands of SETCC have been legalized. +; FIXME: If the algorithm of type scalarization is improved and can legaize +; "v1i1 SETCC" correctly, these test cases are not needed. 
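(The comment block above spells out what scalarising a "v1i1 SETCC" must produce: the single-lane vector compare becomes an ordinary scalar compare whose i1 result is then sign-extended, which is why the CHECK lines below expect a plain cmp or fcmp. The following C++ snippet is only a model of the first test's semantics, not of the type legalizer.)

// Scalar model of test_sext_extr_cmp_0 below: a <1 x i64> signed >= compare,
// extraction of the single i1 lane, and sign-extension to i64. After
// scalarisation this is "compare, then materialise all-ones or zero".
#include <cstdint>
#include <cstdio>

static int64_t sextExtrCmp(int64_t V1, int64_t V2) {
  bool Lane = (V1 >= V2);            // the scalarised SETCC
  return Lane ? INT64_C(-1) : 0;     // sext i1 -> i64
}

int main() {
  std::printf("%lld %lld\n", (long long)sextExtrCmp(3, 2),
              (long long)sextExtrCmp(1, 2)); // prints: -1 0
  return 0;
}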
+ +define i64 @test_sext_extr_cmp_0(<1 x i64> %v1, <1 x i64> %v2) { +; CHECK-LABEL: test_sext_extr_cmp_0: +; CHECK: cmp {{x[0-9]+}}, {{x[0-9]+}} + %1 = icmp sge <1 x i64> %v1, %v2 + %2 = extractelement <1 x i1> %1, i32 0 + %vget_lane = sext i1 %2 to i64 + ret i64 %vget_lane +} + +define i64 @test_sext_extr_cmp_1(<1 x double> %v1, <1 x double> %v2) { +; CHECK-LABEL: test_sext_extr_cmp_1: +; CHECK: fcmp {{d[0-9]+}}, {{d[0-9]+}} + %1 = fcmp oeq <1 x double> %v1, %v2 + %2 = extractelement <1 x i1> %1, i32 0 + %vget_lane = sext i1 %2 to i64 + ret i64 %vget_lane +} + +define <1 x i64> @test_select_v1i1_0(<1 x i64> %v1, <1 x i64> %v2, <1 x i64> %v3) { +; CHECK-LABEL: test_select_v1i1_0: +; CHECK: cmeq d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +; CHECK: bic v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b + %1 = icmp eq <1 x i64> %v1, %v2 + %res = select <1 x i1> %1, <1 x i64> zeroinitializer, <1 x i64> %v3 + ret <1 x i64> %res +} + +define <1 x i64> @test_select_v1i1_1(<1 x double> %v1, <1 x double> %v2, <1 x i64> %v3) { +; CHECK-LABEL: test_select_v1i1_1: +; CHECK: fcmeq d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +; CHECK: bic v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b + %1 = fcmp oeq <1 x double> %v1, %v2 + %res = select <1 x i1> %1, <1 x i64> zeroinitializer, <1 x i64> %v3 + ret <1 x i64> %res +} + +define <1 x double> @test_select_v1i1_2(<1 x i64> %v1, <1 x i64> %v2, <1 x double> %v3) { +; CHECK-LABEL: test_select_v1i1_2: +; CHECK: cmeq d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +; CHECK: bic v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b + %1 = icmp eq <1 x i64> %v1, %v2 + %res = select <1 x i1> %1, <1 x double> zeroinitializer, <1 x double> %v3 + ret <1 x double> %res +} + +define i32 @test_br_extr_cmp(<1 x i64> %v1, <1 x i64> %v2) { +; CHECK-LABEL: test_br_extr_cmp: +; CHECK: cmp x{{[0-9]+}}, x{{[0-9]+}} + %1 = icmp eq <1 x i64> %v1, %v2 + %2 = extractelement <1 x i1> %1, i32 0 + br i1 %2, label %if.end, label %if.then + +if.then: + ret i32 0; + +if.end: + ret i32 1; +} diff --git a/test/CodeGen/AArch64/addsub.ll b/test/CodeGen/AArch64/addsub.ll index 3aa427c352c..b85fdbb14ce 100644 --- a/test/CodeGen/AArch64/addsub.ll +++ b/test/CodeGen/AArch64/addsub.ll @@ -1,4 +1,4 @@ -; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-linux-gnu | FileCheck %s +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-linux-gnu | FileCheck %s ; Note that this should be refactored (for efficiency if nothing else) ; when the PCS is implemented so we don't have to worry about the diff --git a/test/CodeGen/AArch64/addsub_ext.ll b/test/CodeGen/AArch64/addsub_ext.ll index cd01f594dcd..a2266b1d36d 100644 --- a/test/CodeGen/AArch64/addsub_ext.ll +++ b/test/CodeGen/AArch64/addsub_ext.ll @@ -1,4 +1,4 @@ -; RUN: llc -verify-machineinstrs %s -o - -mtriple=arm64-linux-gnu | FileCheck %s +; RUN: llc -verify-machineinstrs %s -o - -mtriple=aarch64-linux-gnu | FileCheck %s @var8 = global i8 0 @var16 = global i16 0 diff --git a/test/CodeGen/AArch64/alloca.ll b/test/CodeGen/AArch64/alloca.ll index 7cab200b1ea..f93efbc42e6 100644 --- a/test/CodeGen/AArch64/alloca.ll +++ b/test/CodeGen/AArch64/alloca.ll @@ -1,5 +1,5 @@ -; RUN: llc -mtriple=arm64-linux-gnu -verify-machineinstrs -o - %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-ARM64 -; RUN: llc -mtriple=arm64-none-linux-gnu -mattr=-fp-armv8 -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK-NOFP-ARM64 %s +; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -o - %s | FileCheck %s --check-prefix=CHECK +; RUN: llc -mtriple=aarch64-none-linux-gnu 
-mattr=-fp-armv8 -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK-NOFP-ARM64 %s declare void @use_addr(i8*) @@ -53,7 +53,7 @@ define i64 @test_alloca_with_local(i64 %n) { %val = load i64* %loc -; CHECK-ARM64: ldur x0, [x29, #-[[LOC_FROM_FP]]] +; CHECK: ldur x0, [x29, #-[[LOC_FROM_FP]]] ret i64 %val ; Make sure epilogue restores sp from fp @@ -74,16 +74,16 @@ define void @test_variadic_alloca(i64 %n, ...) { ; CHECK-NOFP-AARCH64: add x8, [[TMP]], #0 -; CHECK-ARM64: stp x29, x30, [sp, #-16]! -; CHECK-ARM64: mov x29, sp -; CHECK-ARM64: sub sp, sp, #192 -; CHECK-ARM64: stp q6, q7, [x29, #-96] +; CHECK: stp x29, x30, [sp, #-16]! +; CHECK: mov x29, sp +; CHECK: sub sp, sp, #192 +; CHECK: stp q6, q7, [x29, #-96] ; [...] -; CHECK-ARM64: stp q0, q1, [x29, #-192] +; CHECK: stp q0, q1, [x29, #-192] -; CHECK-ARM64: stp x6, x7, [x29, #-16] +; CHECK: stp x6, x7, [x29, #-16] ; [...] -; CHECK-ARM64: stp x2, x3, [x29, #-48] +; CHECK: stp x2, x3, [x29, #-48] ; CHECK-NOFP-ARM64: stp x29, x30, [sp, #-16]! ; CHECK-NOFP-ARM64: mov x29, sp @@ -115,11 +115,11 @@ define void @test_alloca_large_frame(i64 %n) { ; CHECK-LABEL: test_alloca_large_frame: -; CHECK-ARM64: stp x20, x19, [sp, #-32]! -; CHECK-ARM64: stp x29, x30, [sp, #16] -; CHECK-ARM64: add x29, sp, #16 -; CHECK-ARM64: sub sp, sp, #1953, lsl #12 -; CHECK-ARM64: sub sp, sp, #512 +; CHECK: stp x20, x19, [sp, #-32]! +; CHECK: stp x29, x30, [sp, #16] +; CHECK: add x29, sp, #16 +; CHECK: sub sp, sp, #1953, lsl #12 +; CHECK: sub sp, sp, #512 %addr1 = alloca i8, i64 %n %addr2 = alloca i64, i64 1000000 @@ -128,9 +128,9 @@ define void @test_alloca_large_frame(i64 %n) { ret void -; CHECK-ARM64: sub sp, x29, #16 -; CHECK-ARM64: ldp x29, x30, [sp, #16] -; CHECK-ARM64: ldp x20, x19, [sp], #32 +; CHECK: sub sp, x29, #16 +; CHECK: ldp x29, x30, [sp, #16] +; CHECK: ldp x20, x19, [sp], #32 } declare i8* @llvm.stacksave() diff --git a/test/CodeGen/AArch64/analyze-branch.ll b/test/CodeGen/AArch64/analyze-branch.ll index 1d4daec5f43..6616b27c45b 100644 --- a/test/CodeGen/AArch64/analyze-branch.ll +++ b/test/CodeGen/AArch64/analyze-branch.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=arm64-none-linux-gnu < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s ; This test checks that LLVM can do basic stripping and reapplying of branches ; to basic blocks. diff --git a/test/CodeGen/AArch64/arm64-2011-03-09-CPSRSpill.ll b/test/CodeGen/AArch64/arm64-2011-03-09-CPSRSpill.ll new file mode 100644 index 00000000000..6fb7c3fb5e0 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-2011-03-09-CPSRSpill.ll @@ -0,0 +1,47 @@ +; RUN: llc < %s -mtriple=arm64-apple-darwin + +; Can't copy or spill / restore CPSR. 
+; rdar://9105206 + +define fastcc void @t() ssp align 2 { +entry: + br i1 undef, label %bb3.i, label %bb2.i + +bb2.i: ; preds = %entry + br label %bb3.i + +bb3.i: ; preds = %bb2.i, %entry + br i1 undef, label %_ZN12gjkepa2_impl3EPA6appendERNS0_5sListEPNS0_5sFaceE.exit71, label %bb.i69 + +bb.i69: ; preds = %bb3.i + br label %_ZN12gjkepa2_impl3EPA6appendERNS0_5sListEPNS0_5sFaceE.exit71 + +_ZN12gjkepa2_impl3EPA6appendERNS0_5sListEPNS0_5sFaceE.exit71: ; preds = %bb.i69, %bb3.i + %0 = select i1 undef, float 0.000000e+00, float undef + %1 = fdiv float %0, undef + %2 = fcmp ult float %1, 0xBF847AE140000000 + %storemerge9 = select i1 %2, float %1, float 0.000000e+00 + store float %storemerge9, float* undef, align 4 + br i1 undef, label %bb42, label %bb47 + +bb42: ; preds = %_ZN12gjkepa2_impl3EPA6appendERNS0_5sListEPNS0_5sFaceE.exit71 + br i1 undef, label %bb46, label %bb53 + +bb46: ; preds = %bb42 + br label %bb48 + +bb47: ; preds = %_ZN12gjkepa2_impl3EPA6appendERNS0_5sListEPNS0_5sFaceE.exit71 + br label %bb48 + +bb48: ; preds = %bb47, %bb46 + br i1 undef, label %bb1.i14, label %bb.i13 + +bb.i13: ; preds = %bb48 + br label %bb1.i14 + +bb1.i14: ; preds = %bb.i13, %bb48 + br label %bb53 + +bb53: ; preds = %bb1.i14, %bb42 + ret void +} diff --git a/test/CodeGen/AArch64/arm64-2011-03-17-AsmPrinterCrash.ll b/test/CodeGen/AArch64/arm64-2011-03-17-AsmPrinterCrash.ll new file mode 100644 index 00000000000..2b083d80491 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-2011-03-17-AsmPrinterCrash.ll @@ -0,0 +1,45 @@ +; RUN: llc < %s -mtriple=arm64-apple-darwin + +; rdar://9146594 + +define void @drt_vsprintf() nounwind ssp { +entry: + %do_tab_convert = alloca i32, align 4 + br i1 undef, label %if.then24, label %if.else295, !dbg !13 + +if.then24: ; preds = %entry + unreachable + +if.else295: ; preds = %entry + call void @llvm.dbg.declare(metadata !{i32* %do_tab_convert}, metadata !16), !dbg !18 + store i32 0, i32* %do_tab_convert, align 4, !dbg !19 + unreachable +} + +declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone + +!llvm.dbg.gv = !{!0} +!llvm.dbg.sp = !{!1, !7, !10, !11, !12} + +!0 = metadata !{i32 589876, i32 0, metadata !1, metadata !"vsplive", metadata !"vsplive", metadata !"", metadata !2, i32 617, metadata !6, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] +!1 = metadata !{i32 589870, metadata !20, metadata !2, metadata !"drt_vsprintf", metadata !"drt_vsprintf", metadata !"", i32 616, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ] +!2 = metadata !{i32 589865, metadata !20} ; [ DW_TAG_file_type ] +!3 = metadata !{i32 589841, metadata !20, i32 12, metadata !"clang version 3.0 (http://llvm.org/git/clang.git git:/git/puzzlebox/clang.git/ c4d1aea01c4444eb81bdbf391f1be309127c3cf1)", i1 true, metadata !"", i32 0, metadata !21, metadata !21, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ] +!4 = metadata !{i32 589845, metadata !20, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !5, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] +!5 = metadata !{metadata !6} +!6 = metadata !{i32 589860, null, metadata !3, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] +!7 = metadata !{i32 589870, metadata !20, metadata !2, metadata !"putc_mem", metadata !"putc_mem", metadata !"", i32 30, metadata !8, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ] +!8 = metadata !{i32 589845, metadata !20, 
metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !9, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] +!9 = metadata !{null} +!10 = metadata !{i32 589870, metadata !20, metadata !2, metadata !"print_double", metadata !"print_double", metadata !"", i32 203, metadata !4, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ] +!11 = metadata !{i32 589870, metadata !20, metadata !2, metadata !"print_number", metadata !"print_number", metadata !"", i32 75, metadata !4, i1 true, i1 true, i32 0, i32 0, i32 0, i32 256, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ] +!12 = metadata !{i32 589870, metadata !20, metadata !2, metadata !"get_flags", metadata !"get_flags", metadata !"", i32 508, metadata !8, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ] +!13 = metadata !{i32 653, i32 5, metadata !14, null} +!14 = metadata !{i32 589835, metadata !20, metadata !15, i32 652, i32 35, i32 2} ; [ DW_TAG_lexical_block ] +!15 = metadata !{i32 589835, metadata !20, metadata !1, i32 616, i32 1, i32 0} ; [ DW_TAG_lexical_block ] +!16 = metadata !{i32 590080, metadata !17, metadata !"do_tab_convert", metadata !2, i32 853, metadata !6, i32 0, null} ; [ DW_TAG_auto_variable ] +!17 = metadata !{i32 589835, metadata !20, metadata !14, i32 850, i32 12, i32 33} ; [ DW_TAG_lexical_block ] +!18 = metadata !{i32 853, i32 11, metadata !17, null} +!19 = metadata !{i32 853, i32 29, metadata !17, null} +!20 = metadata !{metadata !"print.i", metadata !"/Volumes/Ebi/echeng/radars/r9146594"} +!21 = metadata !{i32 0} diff --git a/test/CodeGen/AArch64/arm64-2011-03-21-Unaligned-Frame-Index.ll b/test/CodeGen/AArch64/arm64-2011-03-21-Unaligned-Frame-Index.ll new file mode 100644 index 00000000000..6f0ec34fc1d --- /dev/null +++ b/test/CodeGen/AArch64/arm64-2011-03-21-Unaligned-Frame-Index.ll @@ -0,0 +1,12 @@ +; RUN: llc < %s -march=arm64 | FileCheck %s +define void @foo(i64 %val) { +; CHECK: foo +; The stack frame store is not 64-bit aligned. Make sure we use an +; instruction that can handle that. +; CHECK: stur x0, [sp, #20] + %a = alloca [49 x i32], align 4 + %p32 = getelementptr inbounds [49 x i32]* %a, i64 0, i64 2 + %p = bitcast i32* %p32 to i64* + store i64 %val, i64* %p, align 8 + ret void +} diff --git a/test/CodeGen/AArch64/arm64-2011-04-21-CPSRBug.ll b/test/CodeGen/AArch64/arm64-2011-04-21-CPSRBug.ll new file mode 100644 index 00000000000..88232fcc0b4 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-2011-04-21-CPSRBug.ll @@ -0,0 +1,26 @@ +; RUN: llc < %s -mtriple=arm64-apple-iOS5.0 + +; CPSR is not allocatable so fast allocatable wouldn't mark them killed. 
+; rdar://9313272 + +define hidden void @t() nounwind { +entry: + %cmp = icmp eq i32* null, undef + %frombool = zext i1 %cmp to i8 + store i8 %frombool, i8* undef, align 1 + %tmp4 = load i8* undef, align 1 + %tobool = trunc i8 %tmp4 to i1 + br i1 %tobool, label %land.lhs.true, label %if.end + +land.lhs.true: ; preds = %entry + unreachable + +if.end: ; preds = %entry + br i1 undef, label %land.lhs.true14, label %if.end33 + +land.lhs.true14: ; preds = %if.end + unreachable + +if.end33: ; preds = %if.end + unreachable +} diff --git a/test/CodeGen/AArch64/arm64-2011-10-18-LdStOptBug.ll b/test/CodeGen/AArch64/arm64-2011-10-18-LdStOptBug.ll new file mode 100644 index 00000000000..8f99bc30a55 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-2011-10-18-LdStOptBug.ll @@ -0,0 +1,31 @@ +; RUN: llc < %s -mtriple=arm64-apple-ios | FileCheck %s + +; Can't fold the increment by 1<<12 into a post-increment load +; rdar://10301335 + +@test_data = common global i32 0, align 4 + +define void @t() nounwind ssp { +; CHECK-LABEL: t: +entry: + br label %for.body + +for.body: +; CHECK: for.body +; CHECK: ldr w{{[0-9]+}}, [x{{[0-9]+}}] +; CHECK: add x[[REG:[0-9]+]], +; CHECK: x[[REG]], #1, lsl #12 + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = shl nsw i64 %indvars.iv, 12 + %add = add nsw i64 %0, 34628173824 + %1 = inttoptr i64 %add to i32* + %2 = load volatile i32* %1, align 4096 + store volatile i32 %2, i32* @test_data, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 200 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} diff --git a/test/CodeGen/AArch64/arm64-2012-01-11-ComparisonDAGCrash.ll b/test/CodeGen/AArch64/arm64-2012-01-11-ComparisonDAGCrash.ll new file mode 100644 index 00000000000..d47dbb28164 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-2012-01-11-ComparisonDAGCrash.ll @@ -0,0 +1,40 @@ +; RUN: llc < %s -march=arm64 + +; The target lowering for integer comparisons was replacing some DAG nodes +; during operation legalization, which resulted in dangling pointers, +; cycles in DAGs, and eventually crashes. This is the testcase for +; one of those crashes. 
(rdar://10653656) + +define void @test(i1 zeroext %IsArrow) nounwind ssp align 2 { +entry: + br i1 undef, label %return, label %lor.lhs.false + +lor.lhs.false: + br i1 undef, label %return, label %if.end + +if.end: + %tmp.i = load i64* undef, align 8 + %and.i.i.i = and i64 %tmp.i, -16 + br i1 %IsArrow, label %if.else_crit_edge, label %if.end32 + +if.else_crit_edge: + br i1 undef, label %if.end32, label %return + +if.end32: + %0 = icmp ult i32 undef, 3 + %1 = zext i64 %tmp.i to i320 + %.pn.v = select i1 %0, i320 128, i320 64 + %.pn = shl i320 %1, %.pn.v + %ins346392 = or i320 %.pn, 0 + store i320 %ins346392, i320* undef, align 8 + br i1 undef, label %sw.bb.i.i, label %exit + +sw.bb.i.i: + unreachable + +exit: + unreachable + +return: + ret void +} diff --git a/test/CodeGen/AArch64/arm64-2012-05-07-DAGCombineVectorExtract.ll b/test/CodeGen/AArch64/arm64-2012-05-07-DAGCombineVectorExtract.ll new file mode 100644 index 00000000000..a4d37e48685 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-2012-05-07-DAGCombineVectorExtract.ll @@ -0,0 +1,20 @@ +; RUN: llc < %s -march=arm64 | FileCheck %s + +define i32 @foo(<4 x i32> %a, i32 %n) nounwind { +; CHECK-LABEL: foo: +; CHECK: fmov w0, s0 +; CHECK-NEXT: ret + %b = bitcast <4 x i32> %a to i128 + %c = trunc i128 %b to i32 + ret i32 %c +} + +define i64 @bar(<2 x i64> %a, i64 %n) nounwind { +; CHECK-LABEL: bar: +; CHECK: fmov x0, d0 +; CHECK-NEXT: ret + %b = bitcast <2 x i64> %a to i128 + %c = trunc i128 %b to i64 + ret i64 %c +} + diff --git a/test/CodeGen/AArch64/arm64-2012-05-07-MemcpyAlignBug.ll b/test/CodeGen/AArch64/arm64-2012-05-07-MemcpyAlignBug.ll new file mode 100644 index 00000000000..d59b0d00438 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-2012-05-07-MemcpyAlignBug.ll @@ -0,0 +1,21 @@ +; RUN: llc < %s -march arm64 -mcpu=cyclone | FileCheck %s +; + +@b = private unnamed_addr constant [3 x i32] [i32 1768775988, i32 1685481784, i32 1836253201], align 4 + +; The important thing for this test is that we need an unaligned load of `l_b' +; ("ldr w2, [x1, #8]" in this case). + +; CHECK: adrp x[[PAGE:[0-9]+]], {{l_b@PAGE|.Lb}} +; CHECK: add x[[ADDR:[0-9]+]], x[[PAGE]], {{l_b@PAGEOFF|:lo12:.Lb}} +; CHECK-NEXT: ldr [[VAL:w[0-9]+]], [x[[ADDR]], #8] +; CHECK-NEXT: str [[VAL]], [x0, #8] +; CHECK-NEXT: ldr [[VAL2:x[0-9]+]], [x[[ADDR]]] +; CHECK-NEXT: str [[VAL2]], [x0] + +define void @foo(i8* %a) { + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* bitcast ([3 x i32]* @b to i8*), i64 12, i32 4, i1 false) + ret void +} + +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind diff --git a/test/CodeGen/AArch64/arm64-2012-05-09-LOADgot-bug.ll b/test/CodeGen/AArch64/arm64-2012-05-09-LOADgot-bug.ll new file mode 100644 index 00000000000..d1840d35942 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-2012-05-09-LOADgot-bug.ll @@ -0,0 +1,22 @@ +; RUN: llc -mtriple=arm64-apple-ios < %s | FileCheck %s +; RUN: llc -mtriple=arm64-linux-gnu -relocation-model=pic < %s | FileCheck %s --check-prefix=CHECK-LINUX +; + +define hidden void @t() optsize ssp { +entry: + store i64 zext (i32 ptrtoint (i64 (i32)* @x to i32) to i64), i64* undef, align 8 +; CHECK: adrp x{{[0-9]+}}, _x@GOTPAGE +; CHECK: ldr x{{[0-9]+}}, [x{{[0-9]+}}, _x@GOTPAGEOFF] +; CHECK-NEXT: and x{{[0-9]+}}, x{{[0-9]+}}, #0xffffffff +; CHECK-NEXT: str x{{[0-9]+}}, [x{{[0-9]+}}] + unreachable +} + +declare i64 @x(i32) optsize + +; Worth checking the Linux code is sensible too: only way to access +; the GOT is via a 64-bit load. 
Just loading wN is unacceptable +; (there's no ELF relocation to do that). + +; CHECK-LINUX: adrp {{x[0-9]+}}, :got:x +; CHECK-LINUX: ldr {{x[0-9]+}}, [{{x[0-9]+}}, :got_lo12:x] diff --git a/test/CodeGen/AArch64/arm64-2012-05-22-LdStOptBug.ll b/test/CodeGen/AArch64/arm64-2012-05-22-LdStOptBug.ll new file mode 100644 index 00000000000..4b037db9c84 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-2012-05-22-LdStOptBug.ll @@ -0,0 +1,50 @@ +; RUN: llc < %s -mtriple=arm64-apple-ios -verify-machineinstrs | FileCheck %s + +; LdStOpt bug created illegal instruction: +; %D1, %D2 = LDPSi %X0, 1 +; rdar://11512047 + +%0 = type opaque +%struct.CGRect = type { %struct.CGPoint, %struct.CGSize } +%struct.CGPoint = type { double, double } +%struct.CGSize = type { double, double } + +@"OBJC_IVAR_$_UIScreen._bounds" = external hidden global i64, section "__DATA, __objc_ivar", align 8 + +define hidden %struct.CGRect @t(%0* nocapture %self, i8* nocapture %_cmd) nounwind readonly optsize ssp { +entry: +; CHECK-LABEL: t: +; CHECK: ldp d{{[0-9]+}}, d{{[0-9]+}} + %ivar = load i64* @"OBJC_IVAR_$_UIScreen._bounds", align 8, !invariant.load !4 + %0 = bitcast %0* %self to i8* + %add.ptr = getelementptr inbounds i8* %0, i64 %ivar + %add.ptr10.0 = bitcast i8* %add.ptr to double* + %tmp11 = load double* %add.ptr10.0, align 8 + %add.ptr.sum = add i64 %ivar, 8 + %add.ptr10.1 = getelementptr inbounds i8* %0, i64 %add.ptr.sum + %1 = bitcast i8* %add.ptr10.1 to double* + %tmp12 = load double* %1, align 8 + %add.ptr.sum17 = add i64 %ivar, 16 + %add.ptr4.1 = getelementptr inbounds i8* %0, i64 %add.ptr.sum17 + %add.ptr4.1.0 = bitcast i8* %add.ptr4.1 to double* + %tmp = load double* %add.ptr4.1.0, align 8 + %add.ptr4.1.sum = add i64 %ivar, 24 + %add.ptr4.1.1 = getelementptr inbounds i8* %0, i64 %add.ptr4.1.sum + %2 = bitcast i8* %add.ptr4.1.1 to double* + %tmp5 = load double* %2, align 8 + %insert14 = insertvalue %struct.CGPoint undef, double %tmp11, 0 + %insert16 = insertvalue %struct.CGPoint %insert14, double %tmp12, 1 + %insert = insertvalue %struct.CGRect undef, %struct.CGPoint %insert16, 0 + %insert7 = insertvalue %struct.CGSize undef, double %tmp, 0 + %insert9 = insertvalue %struct.CGSize %insert7, double %tmp5, 1 + %insert3 = insertvalue %struct.CGRect %insert, %struct.CGSize %insert9, 1 + ret %struct.CGRect %insert3 +} + +!llvm.module.flags = !{!0, !1, !2, !3} + +!0 = metadata !{i32 1, metadata !"Objective-C Version", i32 2} +!1 = metadata !{i32 1, metadata !"Objective-C Image Info Version", i32 0} +!2 = metadata !{i32 1, metadata !"Objective-C Image Info Section", metadata !"__DATA, __objc_imageinfo, regular, no_dead_strip"} +!3 = metadata !{i32 4, metadata !"Objective-C Garbage Collection", i32 0} +!4 = metadata !{} diff --git a/test/CodeGen/AArch64/arm64-2012-06-06-FPToUI.ll b/test/CodeGen/AArch64/arm64-2012-06-06-FPToUI.ll new file mode 100644 index 00000000000..168e921bcc0 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-2012-06-06-FPToUI.ll @@ -0,0 +1,67 @@ +; RUN: llc -march=arm64 -O0 < %s | FileCheck %s +; RUN: llc -march=arm64 -O3 < %s | FileCheck %s + +@.str = private unnamed_addr constant [9 x i8] c"%lf %lu\0A\00", align 1 +@.str1 = private unnamed_addr constant [8 x i8] c"%lf %u\0A\00", align 1 +@.str2 = private unnamed_addr constant [8 x i8] c"%f %lu\0A\00", align 1 +@.str3 = private unnamed_addr constant [7 x i8] c"%f %u\0A\00", align 1 + +define void @testDouble(double %d) ssp { +; CHECK-LABEL: testDouble: +; CHECK: fcvtzu x{{[0-9]+}}, d{{[0-9]+}} +; CHECK: fcvtzu w{{[0-9]+}}, d{{[0-9]+}} +entry: + %d.addr = 
alloca double, align 8 + store double %d, double* %d.addr, align 8 + %0 = load double* %d.addr, align 8 + %1 = load double* %d.addr, align 8 + %conv = fptoui double %1 to i64 + %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([9 x i8]* @.str, i32 0, i32 0), double %0, i64 %conv) + %2 = load double* %d.addr, align 8 + %3 = load double* %d.addr, align 8 + %conv1 = fptoui double %3 to i32 + %call2 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([8 x i8]* @.str1, i32 0, i32 0), double %2, i32 %conv1) + ret void +} + +declare i32 @printf(i8*, ...) + +define void @testFloat(float %f) ssp { +; CHECK-LABEL: testFloat: +; CHECK: fcvtzu x{{[0-9]+}}, s{{[0-9]+}} +; CHECK: fcvtzu w{{[0-9]+}}, s{{[0-9]+}} +entry: + %f.addr = alloca float, align 4 + store float %f, float* %f.addr, align 4 + %0 = load float* %f.addr, align 4 + %conv = fpext float %0 to double + %1 = load float* %f.addr, align 4 + %conv1 = fptoui float %1 to i64 + %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([8 x i8]* @.str2, i32 0, i32 0), double %conv, i64 %conv1) + %2 = load float* %f.addr, align 4 + %conv2 = fpext float %2 to double + %3 = load float* %f.addr, align 4 + %conv3 = fptoui float %3 to i32 + %call4 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([7 x i8]* @.str3, i32 0, i32 0), double %conv2, i32 %conv3) + ret void +} + +define i32 @main(i32 %argc, i8** %argv) ssp { +entry: + %retval = alloca i32, align 4 + %argc.addr = alloca i32, align 4 + %argv.addr = alloca i8**, align 8 + store i32 0, i32* %retval + store i32 %argc, i32* %argc.addr, align 4 + store i8** %argv, i8*** %argv.addr, align 8 + call void @testDouble(double 1.159198e+01) + call void @testFloat(float 0x40272F1800000000) + ret i32 0 +} + +!llvm.module.flags = !{!0, !1, !2, !3} + +!0 = metadata !{i32 1, metadata !"Objective-C Version", i32 2} +!1 = metadata !{i32 1, metadata !"Objective-C Image Info Version", i32 0} +!2 = metadata !{i32 1, metadata !"Objective-C Image Info Section", metadata !"__DATA, __objc_imageinfo, regular, no_dead_strip"} +!3 = metadata !{i32 4, metadata !"Objective-C Garbage Collection", i32 0} diff --git a/test/CodeGen/AArch64/arm64-2012-07-11-InstrEmitterBug.ll b/test/CodeGen/AArch64/arm64-2012-07-11-InstrEmitterBug.ll new file mode 100644 index 00000000000..55ecfb5d2bd --- /dev/null +++ b/test/CodeGen/AArch64/arm64-2012-07-11-InstrEmitterBug.ll @@ -0,0 +1,56 @@ +; RUN: llc < %s -mtriple=arm64-apple-ios +; rdar://11849816 + +@shlib_path_substitutions = external hidden unnamed_addr global i8**, align 8 + +declare i64 @llvm.objectsize.i64(i8*, i1) nounwind readnone + +declare noalias i8* @xmalloc(i64) optsize + +declare i64 @strlen(i8* nocapture) nounwind readonly optsize + +declare i8* @__strcpy_chk(i8*, i8*, i64) nounwind optsize + +declare i8* @__strcat_chk(i8*, i8*, i64) nounwind optsize + +declare noalias i8* @xstrdup(i8*) optsize + +define i8* @dyld_fix_path(i8* %path) nounwind optsize ssp { +entry: + br i1 undef, label %if.end56, label %for.cond + +for.cond: ; preds = %entry + br i1 undef, label %for.cond10, label %for.body + +for.body: ; preds = %for.cond + unreachable + +for.cond10: ; preds = %for.cond + br i1 undef, label %if.end56, label %for.body14 + +for.body14: ; preds = %for.cond10 + %call22 = tail call i64 @strlen(i8* undef) nounwind optsize + %sext = shl i64 %call22, 32 + %conv30 = ashr exact i64 %sext, 32 + %add29 = sub i64 0, %conv30 + %sub = add i64 %add29, 0 + %add31 = shl i64 %sub, 32 + %sext59 = add i64 %add31, 4294967296 + %conv33 = ashr exact i64 %sext59, 32 
+ %call34 = tail call noalias i8* @xmalloc(i64 %conv33) nounwind optsize + br i1 undef, label %cond.false45, label %cond.true43 + +cond.true43: ; preds = %for.body14 + unreachable + +cond.false45: ; preds = %for.body14 + %add.ptr = getelementptr inbounds i8* %path, i64 %conv30 + unreachable + +if.end56: ; preds = %for.cond10, %entry + ret i8* null +} + +declare i32 @strncmp(i8* nocapture, i8* nocapture, i64) nounwind readonly optsize + +declare i8* @strcpy(i8*, i8* nocapture) nounwind diff --git a/test/CodeGen/AArch64/arm64-2013-01-13-ffast-fcmp.ll b/test/CodeGen/AArch64/arm64-2013-01-13-ffast-fcmp.ll new file mode 100644 index 00000000000..e2c43d953bb --- /dev/null +++ b/test/CodeGen/AArch64/arm64-2013-01-13-ffast-fcmp.ll @@ -0,0 +1,19 @@ +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -fp-contract=fast | FileCheck %s --check-prefix=FAST + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128" +target triple = "arm64-apple-ios7.0.0" + +;FAST-LABEL: _Z9example25v: +;FAST: fcmgt.4s +;FAST: ret + +;CHECK-LABEL: _Z9example25v: +;CHECK: fcmgt.4s +;CHECK: ret + +define <4 x i32> @_Z9example25v( <4 x float> %N0, <4 x float> %N1) { + %A = fcmp olt <4 x float> %N0, %N1 + %B = zext <4 x i1> %A to <4 x i32> + ret <4 x i32> %B +} diff --git a/test/CodeGen/AArch64/arm64-2013-01-23-frem-crash.ll b/test/CodeGen/AArch64/arm64-2013-01-23-frem-crash.ll new file mode 100644 index 00000000000..94511243a49 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-2013-01-23-frem-crash.ll @@ -0,0 +1,15 @@ +; RUN: llc < %s -march=arm64 +; Make sure we are not crashing on this test. + +define void @autogen_SD13158() { +entry: + %B26 = frem float 0.000000e+00, undef + br i1 undef, label %CF, label %CF77 + +CF: ; preds = %CF, %CF76 + store float %B26, float* undef + br i1 undef, label %CF, label %CF77 + +CF77: ; preds = %CF + ret void +} diff --git a/test/CodeGen/AArch64/arm64-2013-01-23-sext-crash.ll b/test/CodeGen/AArch64/arm64-2013-01-23-sext-crash.ll new file mode 100644 index 00000000000..404027bfd5f --- /dev/null +++ b/test/CodeGen/AArch64/arm64-2013-01-23-sext-crash.ll @@ -0,0 +1,37 @@ +; RUN: llc < %s -march=arm64 + +; Make sure we are not crashing on this test. 
+
+define void @autogen_SD12881() {
+BB:
+ %B17 = ashr <4 x i32> zeroinitializer, zeroinitializer
+ br label %CF
+
+CF: ; preds = %CF83, %CF, %BB
+ br i1 undef, label %CF, label %CF83
+
+CF83: ; preds = %CF
+ %FC70 = sitofp <4 x i32> %B17 to <4 x double>
+ br label %CF
+}
+
+
+define void @autogen_SD12881_2() {
+BB:
+ %B17 = ashr <4 x i32> zeroinitializer, zeroinitializer
+ br label %CF
+
+CF: ; preds = %CF83, %CF, %BB
+ br i1 undef, label %CF, label %CF83
+
+CF83: ; preds = %CF
+ %FC70 = uitofp <4 x i32> %B17 to <4 x double>
+ br label %CF
+}
+
+define void @_Z12my_example2bv() nounwind noinline ssp {
+entry:
+ %0 = fptosi <2 x double> undef to <2 x i32>
+ store <2 x i32> %0, <2 x i32>* undef, align 8
+ ret void
+}
diff --git a/test/CodeGen/AArch64/arm64-2013-02-12-shufv8i8.ll b/test/CodeGen/AArch64/arm64-2013-02-12-shufv8i8.ll
new file mode 100644
index 00000000000..a350ba1472c
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-2013-02-12-shufv8i8.ll
@@ -0,0 +1,11 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple
+
+;CHECK-LABEL: Shuff:
+;CHECK: tbl.8b
+;CHECK: ret
+define <8 x i8 > @Shuff(<8 x i8> %in, <8 x i8>* %out) nounwind ssp {
+ %value = shufflevector <8 x i8> %in, <8 x i8> zeroinitializer, <8 x i32>
+ ret <8 x i8> %value
+}
+
+
diff --git a/test/CodeGen/AArch64/arm64-2014-04-16-AnInfiniteLoopInDAGCombine.ll b/test/CodeGen/AArch64/arm64-2014-04-16-AnInfiniteLoopInDAGCombine.ll
new file mode 100644
index 00000000000..a73b7071801
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-2014-04-16-AnInfiniteLoopInDAGCombine.ll
@@ -0,0 +1,23 @@
+; RUN: llc < %s -march=arm64
+
+; This test case tests an infinite loop bug in DAG combiner.
+; It just tries to do the following replacing endlessly:
+; (1) Replacing.3 0x2c509f0: v4i32 = any_extend 0x2c4cd08 [ORD=4]
+; With: 0x2c4d128: v4i32 = sign_extend 0x2c4cd08 [ORD=4]
+;
+; (2) Replacing.2 0x2c4d128: v4i32 = sign_extend 0x2c4cd08 [ORD=4]
+; With: 0x2c509f0: v4i32 = any_extend 0x2c4cd08 [ORD=4]
+; As we think the (2) optimization from SIGN_EXTEND to ANY_EXTEND is
+; an optimization to replace unused bits with undefined bits, we remove
+; the (1) optimization (It doesn't make sense to replace undefined bits
+; with signed bits).
+
+define <4 x i32> @infiniteLoop(<4 x i32> %in0, <4 x i16> %in1) {
+entry:
+ %cmp.i = icmp sge <4 x i16> %in1,
+ %sext.i = sext <4 x i1> %cmp.i to <4 x i32>
+ %mul.i = mul <4 x i32> %in0, %sext.i
+ %sext = shl <4 x i32> %mul.i,
+ %vmovl.i.i = ashr <4 x i32> %sext,
+ ret <4 x i32> %vmovl.i.i
+}
\ No newline at end of file
diff --git a/test/CodeGen/AArch64/arm64-2014-04-28-sqshl-uqshl-i64Contant.ll b/test/CodeGen/AArch64/arm64-2014-04-28-sqshl-uqshl-i64Contant.ll
new file mode 100644
index 00000000000..3949b85fbd3
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-2014-04-28-sqshl-uqshl-i64Contant.ll
@@ -0,0 +1,19 @@
+; RUN: llc < %s -verify-machineinstrs -march=arm64 | FileCheck %s
+
+; Check if sqshl/uqshl with constant shift amount can be selected.
+define i64 @test_vqshld_s64_i(i64 %a) { +; CHECK-LABEL: test_vqshld_s64_i: +; CHECK: sqshl {{d[0-9]+}}, {{d[0-9]+}}, #36 + %1 = tail call i64 @llvm.aarch64.neon.sqshl.i64(i64 %a, i64 36) + ret i64 %1 +} + +define i64 @test_vqshld_u64_i(i64 %a) { +; CHECK-LABEL: test_vqshld_u64_i: +; CHECK: uqshl {{d[0-9]+}}, {{d[0-9]+}}, #36 + %1 = tail call i64 @llvm.aarch64.neon.uqshl.i64(i64 %a, i64 36) + ret i64 %1 +} + +declare i64 @llvm.aarch64.neon.uqshl.i64(i64, i64) +declare i64 @llvm.aarch64.neon.sqshl.i64(i64, i64) diff --git a/test/CodeGen/AArch64/arm64-2014-04-29-EXT-undef-mask.ll b/test/CodeGen/AArch64/arm64-2014-04-29-EXT-undef-mask.ll new file mode 100644 index 00000000000..1b2d54317c2 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-2014-04-29-EXT-undef-mask.ll @@ -0,0 +1,23 @@ +; RUN: llc < %s -O0 -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s + +; The following 2 test cases test shufflevector with beginning UNDEF mask. +define <8 x i16> @test_vext_undef_traverse(<8 x i16> %in) { +;CHECK-LABEL: test_vext_undef_traverse: +;CHECK: {{ext.16b.*v0, #4}} + %vext = shufflevector <8 x i16> , <8 x i16> %in, <8 x i32> + ret <8 x i16> %vext +} + +define <8 x i16> @test_vext_undef_traverse2(<8 x i16> %in) { +;CHECK-LABEL: test_vext_undef_traverse2: +;CHECK: {{ext.16b.*v0, #6}} + %vext = shufflevector <8 x i16> %in, <8 x i16> , <8 x i32> + ret <8 x i16> %vext +} + +define <8 x i8> @test_vext_undef_traverse3(<8 x i8> %in) { +;CHECK-LABEL: test_vext_undef_traverse3: +;CHECK: {{ext.8b.*v0, #6}} + %vext = shufflevector <8 x i8> %in, <8 x i8> , <8 x i32> + ret <8 x i8> %vext +} diff --git a/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll b/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll new file mode 100644 index 00000000000..c4597d5a481 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll @@ -0,0 +1,67 @@ +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -aarch64-simd-scalar=true -asm-verbose=false | FileCheck %s +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=generic -aarch64-simd-scalar=true -asm-verbose=false | FileCheck %s -check-prefix=GENERIC + +define <2 x i64> @bar(<2 x i64> %a, <2 x i64> %b) nounwind readnone { +; CHECK-LABEL: bar: +; CHECK: add.2d v[[REG:[0-9]+]], v0, v1 +; CHECK: add d[[REG3:[0-9]+]], d[[REG]], d1 +; CHECK: sub d[[REG2:[0-9]+]], d[[REG]], d1 +; GENERIC-LABEL: bar: +; GENERIC: add v[[REG:[0-9]+]].2d, v0.2d, v1.2d +; GENERIC: add d[[REG3:[0-9]+]], d[[REG]], d1 +; GENERIC: sub d[[REG2:[0-9]+]], d[[REG]], d1 + %add = add <2 x i64> %a, %b + %vgetq_lane = extractelement <2 x i64> %add, i32 0 + %vgetq_lane2 = extractelement <2 x i64> %b, i32 0 + %add3 = add i64 %vgetq_lane, %vgetq_lane2 + %sub = sub i64 %vgetq_lane, %vgetq_lane2 + %vecinit = insertelement <2 x i64> undef, i64 %add3, i32 0 + %vecinit8 = insertelement <2 x i64> %vecinit, i64 %sub, i32 1 + ret <2 x i64> %vecinit8 +} + +define double @subdd_su64(<2 x i64> %a, <2 x i64> %b) nounwind readnone { +; CHECK-LABEL: subdd_su64: +; CHECK: sub d0, d1, d0 +; CHECK-NEXT: ret +; GENERIC-LABEL: subdd_su64: +; GENERIC: sub d0, d1, d0 +; GENERIC-NEXT: ret + %vecext = extractelement <2 x i64> %a, i32 0 + %vecext1 = extractelement <2 x i64> %b, i32 0 + %sub.i = sub nsw i64 %vecext1, %vecext + %retval = bitcast i64 %sub.i to double + ret double %retval +} + +define double @vaddd_su64(<2 x i64> %a, <2 x i64> %b) nounwind readnone { +; CHECK-LABEL: vaddd_su64: +; CHECK: add d0, d1, d0 +; CHECK-NEXT: ret +; GENERIC-LABEL: vaddd_su64: +; GENERIC: add d0, d1, d0 +; GENERIC-NEXT: ret + %vecext = extractelement <2 x i64> %a, 
i32 0 + %vecext1 = extractelement <2 x i64> %b, i32 0 + %add.i = add nsw i64 %vecext1, %vecext + %retval = bitcast i64 %add.i to double + ret double %retval +} + +; sub MI doesn't access dsub register. +define double @add_sub_su64(<2 x i64> %a, <2 x i64> %b) nounwind readnone { +; CHECK-LABEL: add_sub_su64: +; CHECK: add d0, d1, d0 +; CHECK: sub d0, {{d[0-9]+}}, d0 +; CHECK-NEXT: ret +; GENERIC-LABEL: add_sub_su64: +; GENERIC: add d0, d1, d0 +; GENERIC: sub d0, {{d[0-9]+}}, d0 +; GENERIC-NEXT: ret + %vecext = extractelement <2 x i64> %a, i32 0 + %vecext1 = extractelement <2 x i64> %b, i32 0 + %add.i = add i64 %vecext1, %vecext + %sub.i = sub i64 0, %add.i + %retval = bitcast i64 %sub.i to double + ret double %retval +} diff --git a/test/CodeGen/AArch64/arm64-aapcs.ll b/test/CodeGen/AArch64/arm64-aapcs.ll new file mode 100644 index 00000000000..b713f0d5a53 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-aapcs.ll @@ -0,0 +1,103 @@ +; RUN: llc -mtriple=arm64-linux-gnu -enable-misched=false < %s | FileCheck %s + +@var = global i32 0, align 4 + +define i128 @test_i128_align(i32, i128 %arg, i32 %after) { + store i32 %after, i32* @var, align 4 +; CHECK: str w4, [{{x[0-9]+}}, :lo12:var] + + ret i128 %arg +; CHECK: mov x0, x2 +; CHECK: mov x1, x3 +} + +@var64 = global i64 0, align 8 + + ; Check stack slots are 64-bit at all times. +define void @test_stack_slots([8 x i32], i1 %bool, i8 %char, i16 %short, + i32 %int, i64 %long) { + ; Part of last store. Blasted scheduler. +; CHECK: ldr [[LONG:x[0-9]+]], [sp, #32] + + %ext_bool = zext i1 %bool to i64 + store volatile i64 %ext_bool, i64* @var64, align 8 +; CHECK: ldrb w[[EXT:[0-9]+]], [sp] +; CHECK: and x[[EXTED:[0-9]+]], x[[EXT]], #0x1 +; CHECK: str x[[EXTED]], [{{x[0-9]+}}, :lo12:var64] + + %ext_char = zext i8 %char to i64 + store volatile i64 %ext_char, i64* @var64, align 8 +; CHECK: ldrb w[[EXT:[0-9]+]], [sp, #8] +; CHECK: str x[[EXT]], [{{x[0-9]+}}, :lo12:var64] + + %ext_short = zext i16 %short to i64 + store volatile i64 %ext_short, i64* @var64, align 8 +; CHECK: ldrh w[[EXT:[0-9]+]], [sp, #16] +; CHECK: str x[[EXT]], [{{x[0-9]+}}, :lo12:var64] + + %ext_int = zext i32 %int to i64 + store volatile i64 %ext_int, i64* @var64, align 8 +; CHECK: ldr{{b?}} w[[EXT:[0-9]+]], [sp, #24] +; CHECK: str x[[EXT]], [{{x[0-9]+}}, :lo12:var64] + + store volatile i64 %long, i64* @var64, align 8 +; CHECK: str [[LONG]], [{{x[0-9]+}}, :lo12:var64] + + ret void +} + +; Make sure the callee does extensions (in the absence of zext/sext +; keyword on args) while we're here. + +define void @test_extension(i1 %bool, i8 %char, i16 %short, i32 %int) { + %ext_bool = zext i1 %bool to i64 + store volatile i64 %ext_bool, i64* @var64 +; CHECK: and [[EXT:x[0-9]+]], x0, #0x1 +; CHECK: str [[EXT]], [{{x[0-9]+}}, :lo12:var64] + + %ext_char = sext i8 %char to i64 + store volatile i64 %ext_char, i64* @var64 +; CHECK: sxtb [[EXT:x[0-9]+]], w1 +; CHECK: str [[EXT]], [{{x[0-9]+}}, :lo12:var64] + + %ext_short = zext i16 %short to i64 + store volatile i64 %ext_short, i64* @var64 +; CHECK: and [[EXT:x[0-9]+]], x2, #0xffff +; CHECK: str [[EXT]], [{{x[0-9]+}}, :lo12:var64] + + %ext_int = zext i32 %int to i64 + store volatile i64 %ext_int, i64* @var64 +; CHECK: ubfx [[EXT:x[0-9]+]], x3, #0, #32 +; CHECK: str [[EXT]], [{{x[0-9]+}}, :lo12:var64] + + ret void +} + +declare void @variadic(i32 %a, ...) + + ; Under AAPCS variadic functions have the same calling convention as + ; others. The extra arguments should go in registers rather than on the stack. 
+define void @test_variadic() { + call void(i32, ...)* @variadic(i32 0, i64 1, double 2.0) +; CHECK: fmov d0, #2.0 +; CHECK: orr w1, wzr, #0x1 +; CHECK: bl variadic + ret void +} + +; We weren't marking x7 as used after deciding that the i128 didn't fit into +; registers and putting the first half on the stack, so the *second* half went +; into x7. Yuck! +define i128 @test_i128_shadow([7 x i64] %x0_x6, i128 %sp) { +; CHECK-LABEL: test_i128_shadow: +; CHECK: ldp x0, x1, [sp] + + ret i128 %sp +} + +; This test is to check if fp128 can be correctly handled on stack. +define fp128 @test_fp128([8 x float] %arg0, fp128 %arg1) { +; CHECK-LABEL: test_fp128: +; CHECK: ldr {{q[0-9]+}}, [sp] + ret fp128 %arg1 +} diff --git a/test/CodeGen/AArch64/arm64-abi-varargs.ll b/test/CodeGen/AArch64/arm64-abi-varargs.ll new file mode 100644 index 00000000000..92db392cd04 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-abi-varargs.ll @@ -0,0 +1,191 @@ +; RUN: llc < %s -march=arm64 -mcpu=cyclone -enable-misched=false | FileCheck %s +target triple = "arm64-apple-ios7.0.0" + +; rdar://13625505 +; Here we have 9 fixed integer arguments the 9th argument in on stack, the +; varargs start right after at 8-byte alignment. +define void @fn9(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8, i32 %a9, ...) nounwind noinline ssp { +; CHECK-LABEL: fn9: +; 9th fixed argument +; CHECK: ldr {{w[0-9]+}}, [sp, #64] +; CHECK: add [[ARGS:x[0-9]+]], sp, #72 +; CHECK: add {{x[0-9]+}}, [[ARGS]], #8 +; First vararg +; CHECK: ldr {{w[0-9]+}}, [sp, #72] +; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, #8 +; Second vararg +; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}] +; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, #8 +; Third vararg +; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}] + %1 = alloca i32, align 4 + %2 = alloca i32, align 4 + %3 = alloca i32, align 4 + %4 = alloca i32, align 4 + %5 = alloca i32, align 4 + %6 = alloca i32, align 4 + %7 = alloca i32, align 4 + %8 = alloca i32, align 4 + %9 = alloca i32, align 4 + %args = alloca i8*, align 8 + %a10 = alloca i32, align 4 + %a11 = alloca i32, align 4 + %a12 = alloca i32, align 4 + store i32 %a1, i32* %1, align 4 + store i32 %a2, i32* %2, align 4 + store i32 %a3, i32* %3, align 4 + store i32 %a4, i32* %4, align 4 + store i32 %a5, i32* %5, align 4 + store i32 %a6, i32* %6, align 4 + store i32 %a7, i32* %7, align 4 + store i32 %a8, i32* %8, align 4 + store i32 %a9, i32* %9, align 4 + %10 = bitcast i8** %args to i8* + call void @llvm.va_start(i8* %10) + %11 = va_arg i8** %args, i32 + store i32 %11, i32* %a10, align 4 + %12 = va_arg i8** %args, i32 + store i32 %12, i32* %a11, align 4 + %13 = va_arg i8** %args, i32 + store i32 %13, i32* %a12, align 4 + ret void +} + +declare void @llvm.va_start(i8*) nounwind + +define i32 @main() nounwind ssp { +; CHECK-LABEL: main: +; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16] +; CHECK: str {{x[0-9]+}}, [sp, #8] +; CHECK: str {{w[0-9]+}}, [sp] + %a1 = alloca i32, align 4 + %a2 = alloca i32, align 4 + %a3 = alloca i32, align 4 + %a4 = alloca i32, align 4 + %a5 = alloca i32, align 4 + %a6 = alloca i32, align 4 + %a7 = alloca i32, align 4 + %a8 = alloca i32, align 4 + %a9 = alloca i32, align 4 + %a10 = alloca i32, align 4 + %a11 = alloca i32, align 4 + %a12 = alloca i32, align 4 + store i32 1, i32* %a1, align 4 + store i32 2, i32* %a2, align 4 + store i32 3, i32* %a3, align 4 + store i32 4, i32* %a4, align 4 + store i32 5, i32* %a5, align 4 + store i32 6, i32* %a6, align 4 + store i32 7, i32* %a7, align 4 + store i32 8, i32* %a8, align 4 + store i32 9, i32* %a9, 
align 4 + store i32 10, i32* %a10, align 4 + store i32 11, i32* %a11, align 4 + store i32 12, i32* %a12, align 4 + %1 = load i32* %a1, align 4 + %2 = load i32* %a2, align 4 + %3 = load i32* %a3, align 4 + %4 = load i32* %a4, align 4 + %5 = load i32* %a5, align 4 + %6 = load i32* %a6, align 4 + %7 = load i32* %a7, align 4 + %8 = load i32* %a8, align 4 + %9 = load i32* %a9, align 4 + %10 = load i32* %a10, align 4 + %11 = load i32* %a11, align 4 + %12 = load i32* %a12, align 4 + call void (i32, i32, i32, i32, i32, i32, i32, i32, i32, ...)* @fn9(i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12) + ret i32 0 +} + +;rdar://13668483 +@.str = private unnamed_addr constant [4 x i8] c"fmt\00", align 1 +define void @foo(i8* %fmt, ...) nounwind { +entry: +; CHECK-LABEL: foo: +; CHECK: orr {{x[0-9]+}}, {{x[0-9]+}}, #0x8 +; CHECK: ldr {{w[0-9]+}}, [sp, #48] +; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, #15 +; CHECK: and x[[ADDR:[0-9]+]], {{x[0-9]+}}, #0xfffffffffffffff0 +; CHECK: ldr {{q[0-9]+}}, [x[[ADDR]]] + %fmt.addr = alloca i8*, align 8 + %args = alloca i8*, align 8 + %vc = alloca i32, align 4 + %vv = alloca <4 x i32>, align 16 + store i8* %fmt, i8** %fmt.addr, align 8 + %args1 = bitcast i8** %args to i8* + call void @llvm.va_start(i8* %args1) + %0 = va_arg i8** %args, i32 + store i32 %0, i32* %vc, align 4 + %1 = va_arg i8** %args, <4 x i32> + store <4 x i32> %1, <4 x i32>* %vv, align 16 + ret void +} + +define void @bar(i32 %x, <4 x i32> %y) nounwind { +entry: +; CHECK-LABEL: bar: +; CHECK: str {{q[0-9]+}}, [sp, #16] +; CHECK: str {{x[0-9]+}}, [sp] + %x.addr = alloca i32, align 4 + %y.addr = alloca <4 x i32>, align 16 + store i32 %x, i32* %x.addr, align 4 + store <4 x i32> %y, <4 x i32>* %y.addr, align 16 + %0 = load i32* %x.addr, align 4 + %1 = load <4 x i32>* %y.addr, align 16 + call void (i8*, ...)* @foo(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i32 %0, <4 x i32> %1) + ret void +} + +; rdar://13668927 +; When passing 16-byte aligned small structs as vararg, make sure the caller +; side is 16-byte aligned on stack. +%struct.s41 = type { i32, i16, i32, i16 } +define void @foo2(i8* %fmt, ...) 
nounwind { +entry: +; CHECK-LABEL: foo2: +; CHECK: orr {{x[0-9]+}}, {{x[0-9]+}}, #0x8 +; CHECK: ldr {{w[0-9]+}}, [sp, #48] +; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, #15 +; CHECK: and x[[ADDR:[0-9]+]], {{x[0-9]+}}, #0xfffffffffffffff0 +; CHECK: ldr {{q[0-9]+}}, [x[[ADDR]]] + %fmt.addr = alloca i8*, align 8 + %args = alloca i8*, align 8 + %vc = alloca i32, align 4 + %vs = alloca %struct.s41, align 16 + store i8* %fmt, i8** %fmt.addr, align 8 + %args1 = bitcast i8** %args to i8* + call void @llvm.va_start(i8* %args1) + %0 = va_arg i8** %args, i32 + store i32 %0, i32* %vc, align 4 + %ap.cur = load i8** %args + %1 = getelementptr i8* %ap.cur, i32 15 + %2 = ptrtoint i8* %1 to i64 + %3 = and i64 %2, -16 + %ap.align = inttoptr i64 %3 to i8* + %ap.next = getelementptr i8* %ap.align, i32 16 + store i8* %ap.next, i8** %args + %4 = bitcast i8* %ap.align to %struct.s41* + %5 = bitcast %struct.s41* %vs to i8* + %6 = bitcast %struct.s41* %4 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %5, i8* %6, i64 16, i32 16, i1 false) + ret void +} +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind + +define void @bar2(i32 %x, i128 %s41.coerce) nounwind { +entry: +; CHECK-LABEL: bar2: +; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16] +; CHECK: str {{x[0-9]+}}, [sp] + %x.addr = alloca i32, align 4 + %s41 = alloca %struct.s41, align 16 + store i32 %x, i32* %x.addr, align 4 + %0 = bitcast %struct.s41* %s41 to i128* + store i128 %s41.coerce, i128* %0, align 1 + %1 = load i32* %x.addr, align 4 + %2 = bitcast %struct.s41* %s41 to i128* + %3 = load i128* %2, align 1 + call void (i8*, ...)* @foo2(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i32 %1, i128 %3) + ret void +} diff --git a/test/CodeGen/AArch64/arm64-abi.ll b/test/CodeGen/AArch64/arm64-abi.ll new file mode 100644 index 00000000000..e2de434c7b0 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-abi.ll @@ -0,0 +1,238 @@ +; RUN: llc < %s -march=arm64 -mcpu=cyclone -enable-misched=false | FileCheck %s +; RUN: llc < %s -O0 | FileCheck -check-prefix=FAST %s +target triple = "arm64-apple-darwin" + +; rdar://9932559 +define i64 @i8i16callee(i64 %a1, i64 %a2, i64 %a3, i8 signext %a4, i16 signext %a5, i64 %a6, i64 %a7, i64 %a8, i8 signext %b1, i16 signext %b2, i8 signext %b3, i8 signext %b4) nounwind readnone noinline { +entry: +; CHECK-LABEL: i8i16callee: +; The 8th, 9th, 10th and 11th arguments are passed at sp, sp+2, sp+4, sp+5. +; They are i8, i16, i8 and i8. 
+; CHECK: ldrsb {{w[0-9]+}}, [sp, #5]
+; CHECK: ldrsh {{w[0-9]+}}, [sp, #2]
+; CHECK: ldrsb {{w[0-9]+}}, [sp]
+; CHECK: ldrsb {{w[0-9]+}}, [sp, #4]
+; FAST-LABEL: i8i16callee:
+; FAST: ldrb {{w[0-9]+}}, [sp, #5]
+; FAST: ldrb {{w[0-9]+}}, [sp, #4]
+; FAST: ldrh {{w[0-9]+}}, [sp, #2]
+; FAST: ldrb {{w[0-9]+}}, [sp]
+ %conv = sext i8 %a4 to i64
+ %conv3 = sext i16 %a5 to i64
+ %conv8 = sext i8 %b1 to i64
+ %conv9 = sext i16 %b2 to i64
+ %conv11 = sext i8 %b3 to i64
+ %conv13 = sext i8 %b4 to i64
+ %add10 = add i64 %a2, %a1
+ %add12 = add i64 %add10, %a3
+ %add14 = add i64 %add12, %conv
+ %add = add i64 %add14, %conv3
+ %add1 = add i64 %add, %a6
+ %add2 = add i64 %add1, %a7
+ %add4 = add i64 %add2, %a8
+ %add5 = add i64 %add4, %conv8
+ %add6 = add i64 %add5, %conv9
+ %add7 = add i64 %add6, %conv11
+ %add15 = add i64 %add7, %conv13
+ %sext = shl i64 %add15, 32
+ %conv17 = ashr exact i64 %sext, 32
+ ret i64 %conv17
+}
+
+define i32 @i8i16caller() nounwind readnone {
+entry:
+; CHECK: i8i16caller
+; The 8th, 9th, 10th and 11th arguments are passed at sp, sp+2, sp+4, sp+5.
+; They are i8, i16, i8 and i8.
+; CHECK: strb {{w[0-9]+}}, [sp, #5]
+; CHECK: strb {{w[0-9]+}}, [sp, #4]
+; CHECK: strh {{w[0-9]+}}, [sp, #2]
+; CHECK: strb {{w[0-9]+}}, [sp]
+; CHECK: bl
+; FAST: i8i16caller
+; FAST: strb {{w[0-9]+}}, [sp]
+; FAST: strh {{w[0-9]+}}, [sp, #2]
+; FAST: strb {{w[0-9]+}}, [sp, #4]
+; FAST: strb {{w[0-9]+}}, [sp, #5]
+; FAST: bl
+ %call = tail call i64 @i8i16callee(i64 0, i64 1, i64 2, i8 signext 3, i16 signext 4, i64 5, i64 6, i64 7, i8 signext 97, i16 signext 98, i8 signext 99, i8 signext 100)
+ %conv = trunc i64 %call to i32
+ ret i32 %conv
+}
+
+; rdar://12651543
+define double @circle_center([2 x float] %a) nounwind ssp {
+ %call = tail call double @ext([2 x float] %a) nounwind
+; CHECK: circle_center
+; CHECK: bl
+ ret double %call
+}
+declare double @ext([2 x float])
+
+; rdar://12656141
+; 16-byte vector should be aligned at 16-byte when passing on stack.
+; A double argument will be passed on stack, so vector should be at sp+16.
+define double @fixed_4i(<4 x i32>* nocapture %in) nounwind {
+entry:
+; CHECK: fixed_4i
+; CHECK: str [[REG_1:q[0-9]+]], [sp, #16]
+; FAST: fixed_4i
+; FAST: sub sp, sp, #64
+; FAST: mov x[[ADDR:[0-9]+]], sp
+; FAST: str [[REG_1:q[0-9]+]], [x[[ADDR]], #16]
+ %0 = load <4 x i32>* %in, align 16
+ %call = tail call double @args_vec_4i(double 3.000000e+00, <4 x i32> %0, <4 x i32> %0, <4 x i32> %0, <4 x i32> %0, <4 x i32> %0, <4 x i32> %0, <4 x i32> %0, double 3.000000e+00, <4 x i32> %0, i8 signext 3)
+ ret double %call
+}
+declare double @args_vec_4i(double, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, double, <4 x i32>, i8 signext)
+
+; rdar://12695237
+; d8 at sp, i in register w0.
+@g_d = common global double 0.000000e+00, align 8
+define void @test1(float %f1, double %d1, double %d2, double %d3, double %d4,
+ double %d5, double %d6, double %d7, double %d8, i32 %i) nounwind ssp {
+entry:
+; CHECK: test1
+; CHECK: ldr [[REG_1:d[0-9]+]], [sp]
+; CHECK: scvtf [[REG_2:s[0-9]+]], w0
+; CHECK: fadd s0, [[REG_2]], s0
+ %conv = sitofp i32 %i to float
+ %add = fadd float %conv, %f1
+ %conv1 = fpext float %add to double
+ %add2 = fadd double %conv1, %d7
+ %add3 = fadd double %add2, %d8
+ store double %add3, double* @g_d, align 8
+ ret void
+}
+
+; i9 at sp, d1 in register s0.
+define void @test2(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, + i32 %i7, i32 %i8, i32 %i9, float %d1) nounwind ssp { +entry: +; CHECK: test2 +; CHECK: scvtf [[REG_2:s[0-9]+]], w0 +; CHECK: fadd s0, [[REG_2]], s0 +; CHECK: ldr [[REG_1:s[0-9]+]], [sp] + %conv = sitofp i32 %i1 to float + %add = fadd float %conv, %d1 + %conv1 = fpext float %add to double + %conv2 = sitofp i32 %i8 to double + %add3 = fadd double %conv2, %conv1 + %conv4 = sitofp i32 %i9 to double + %add5 = fadd double %conv4, %add3 + store double %add5, double* @g_d, align 8 + ret void +} + +; rdar://12648441 +; Check alignment on stack for v64, f64, i64, f32, i32. +define double @test3(<2 x i32>* nocapture %in) nounwind { +entry: +; CHECK: test3 +; CHECK: str [[REG_1:d[0-9]+]], [sp, #8] +; FAST: test3 +; FAST: sub sp, sp, #32 +; FAST: mov x[[ADDR:[0-9]+]], sp +; FAST: str [[REG_1:d[0-9]+]], [x[[ADDR]], #8] + %0 = load <2 x i32>* %in, align 8 + %call = tail call double @args_vec_2i(double 3.000000e+00, <2 x i32> %0, + <2 x i32> %0, <2 x i32> %0, <2 x i32> %0, <2 x i32> %0, <2 x i32> %0, + <2 x i32> %0, float 3.000000e+00, <2 x i32> %0, i8 signext 3) + ret double %call +} +declare double @args_vec_2i(double, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, + <2 x i32>, <2 x i32>, <2 x i32>, float, <2 x i32>, i8 signext) + +define double @test4(double* nocapture %in) nounwind { +entry: +; CHECK: test4 +; CHECK: str [[REG_1:d[0-9]+]], [sp, #8] +; CHECK: str [[REG_2:w[0-9]+]], [sp] +; CHECK: orr w0, wzr, #0x3 + %0 = load double* %in, align 8 + %call = tail call double @args_f64(double 3.000000e+00, double %0, double %0, + double %0, double %0, double %0, double %0, double %0, + float 3.000000e+00, double %0, i8 signext 3) + ret double %call +} +declare double @args_f64(double, double, double, double, double, double, double, + double, float, double, i8 signext) + +define i64 @test5(i64* nocapture %in) nounwind { +entry: +; CHECK: test5 +; CHECK: strb [[REG_3:w[0-9]+]], [sp, #16] +; CHECK: str [[REG_1:x[0-9]+]], [sp, #8] +; CHECK: str [[REG_2:w[0-9]+]], [sp] + %0 = load i64* %in, align 8 + %call = tail call i64 @args_i64(i64 3, i64 %0, i64 %0, i64 %0, i64 %0, i64 %0, + i64 %0, i64 %0, i32 3, i64 %0, i8 signext 3) + ret i64 %call +} +declare i64 @args_i64(i64, i64, i64, i64, i64, i64, i64, i64, i32, i64, + i8 signext) + +define i32 @test6(float* nocapture %in) nounwind { +entry: +; CHECK: test6 +; CHECK: strb [[REG_2:w[0-9]+]], [sp, #8] +; CHECK: str [[REG_1:s[0-9]+]], [sp, #4] +; CHECK: strh [[REG_3:w[0-9]+]], [sp] + %0 = load float* %in, align 4 + %call = tail call i32 @args_f32(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, + i32 7, i32 8, float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, + float 6.0, float 7.0, float 8.0, i16 signext 3, float %0, + i8 signext 3) + ret i32 %call +} +declare i32 @args_f32(i32, i32, i32, i32, i32, i32, i32, i32, + float, float, float, float, float, float, float, float, + i16 signext, float, i8 signext) + +define i32 @test7(i32* nocapture %in) nounwind { +entry: +; CHECK: test7 +; CHECK: strb [[REG_2:w[0-9]+]], [sp, #8] +; CHECK: str [[REG_1:w[0-9]+]], [sp, #4] +; CHECK: strh [[REG_3:w[0-9]+]], [sp] + %0 = load i32* %in, align 4 + %call = tail call i32 @args_i32(i32 3, i32 %0, i32 %0, i32 %0, i32 %0, i32 %0, + i32 %0, i32 %0, i16 signext 3, i32 %0, i8 signext 4) + ret i32 %call +} +declare i32 @args_i32(i32, i32, i32, i32, i32, i32, i32, i32, i16 signext, i32, + i8 signext) + +define i32 @test8(i32 %argc, i8** nocapture %argv) nounwind { +entry: +; CHECK: test8 +; CHECK: strb {{w[0-9]+}}, [sp, 
#3] +; CHECK: strb wzr, [sp, #2] +; CHECK: strb {{w[0-9]+}}, [sp, #1] +; CHECK: strb wzr, [sp] +; CHECK: bl +; FAST: test8 +; FAST: strb {{w[0-9]+}}, [sp] +; FAST: strb {{w[0-9]+}}, [sp, #1] +; FAST: strb {{w[0-9]+}}, [sp, #2] +; FAST: strb {{w[0-9]+}}, [sp, #3] +; FAST: bl + tail call void @args_i1(i1 zeroext false, i1 zeroext true, i1 zeroext false, + i1 zeroext true, i1 zeroext false, i1 zeroext true, + i1 zeroext false, i1 zeroext true, i1 zeroext false, + i1 zeroext true, i1 zeroext false, i1 zeroext true) + ret i32 0 +} + +declare void @args_i1(i1 zeroext, i1 zeroext, i1 zeroext, i1 zeroext, + i1 zeroext, i1 zeroext, i1 zeroext, i1 zeroext, + i1 zeroext, i1 zeroext, i1 zeroext, i1 zeroext) + +define i32 @i1_stack_incoming(i64 %a, i64 %b, i64 %c, i64 %d, i64 %e, i64 %f, + i64 %g, i64 %h, i64 %i, i1 zeroext %j) { +; CHECK-LABEL: i1_stack_incoming: +; CHECK: ldrb w0, [sp, #8] +; CHECK: ret + %v = zext i1 %j to i32 + ret i32 %v +} diff --git a/test/CodeGen/AArch64/arm64-abi_align.ll b/test/CodeGen/AArch64/arm64-abi_align.ll new file mode 100644 index 00000000000..44c5a07ce39 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-abi_align.ll @@ -0,0 +1,532 @@ +; RUN: llc < %s -march=arm64 -mcpu=cyclone -enable-misched=false | FileCheck %s +; RUN: llc < %s -O0 | FileCheck -check-prefix=FAST %s +target triple = "arm64-apple-darwin" + +; rdar://12648441 +; Generated from arm64-arguments.c with -O2. +; Test passing structs with size < 8, < 16 and > 16 +; with alignment of 16 and without + +; Structs with size < 8 +%struct.s38 = type { i32, i16 } +; With alignment of 16, the size will be padded to multiple of 16 bytes. +%struct.s39 = type { i32, i16, [10 x i8] } +; Structs with size < 16 +%struct.s40 = type { i32, i16, i32, i16 } +%struct.s41 = type { i32, i16, i32, i16 } +; Structs with size > 16 +%struct.s42 = type { i32, i16, i32, i16, i32, i16 } +%struct.s43 = type { i32, i16, i32, i16, i32, i16, [10 x i8] } + +@g38 = common global %struct.s38 zeroinitializer, align 4 +@g38_2 = common global %struct.s38 zeroinitializer, align 4 +@g39 = common global %struct.s39 zeroinitializer, align 16 +@g39_2 = common global %struct.s39 zeroinitializer, align 16 +@g40 = common global %struct.s40 zeroinitializer, align 4 +@g40_2 = common global %struct.s40 zeroinitializer, align 4 +@g41 = common global %struct.s41 zeroinitializer, align 16 +@g41_2 = common global %struct.s41 zeroinitializer, align 16 +@g42 = common global %struct.s42 zeroinitializer, align 4 +@g42_2 = common global %struct.s42 zeroinitializer, align 4 +@g43 = common global %struct.s43 zeroinitializer, align 16 +@g43_2 = common global %struct.s43 zeroinitializer, align 16 + +; structs with size < 8 bytes, passed via i64 in x1 and x2 +define i32 @f38(i32 %i, i64 %s1.coerce, i64 %s2.coerce) #0 { +entry: +; CHECK: f38 +; CHECK: add w[[A:[0-9]+]], w1, w0 +; CHECK: add {{w[0-9]+}}, w[[A]], w2 + %s1.sroa.0.0.extract.trunc = trunc i64 %s1.coerce to i32 + %s1.sroa.1.4.extract.shift = lshr i64 %s1.coerce, 32 + %s2.sroa.0.0.extract.trunc = trunc i64 %s2.coerce to i32 + %s2.sroa.1.4.extract.shift = lshr i64 %s2.coerce, 32 + %sext8 = shl nuw nsw i64 %s1.sroa.1.4.extract.shift, 16 + %sext = trunc i64 %sext8 to i32 + %conv = ashr exact i32 %sext, 16 + %sext1011 = shl nuw nsw i64 %s2.sroa.1.4.extract.shift, 16 + %sext10 = trunc i64 %sext1011 to i32 + %conv6 = ashr exact i32 %sext10, 16 + %add = add i32 %s1.sroa.0.0.extract.trunc, %i + %add3 = add i32 %add, %s2.sroa.0.0.extract.trunc + %add4 = add i32 %add3, %conv + %add7 = add i32 %add4, %conv6 + ret i32 %add7 
+} + +define i32 @caller38() #1 { +entry: +; CHECK: caller38 +; CHECK: ldr x1, +; CHECK: ldr x2, + %0 = load i64* bitcast (%struct.s38* @g38 to i64*), align 4 + %1 = load i64* bitcast (%struct.s38* @g38_2 to i64*), align 4 + %call = tail call i32 @f38(i32 3, i64 %0, i64 %1) #5 + ret i32 %call +} + +declare i32 @f38_stack(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, + i32 %i7, i32 %i8, i32 %i9, i64 %s1.coerce, i64 %s2.coerce) #0 + +; structs with size < 8 bytes, passed on stack at [sp+8] and [sp+16] +; i9 at [sp] +define i32 @caller38_stack() #1 { +entry: +; CHECK: caller38_stack +; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #8] +; CHECK: movz w[[C:[0-9]+]], #0x9 +; CHECK: str w[[C]], [sp] + %0 = load i64* bitcast (%struct.s38* @g38 to i64*), align 4 + %1 = load i64* bitcast (%struct.s38* @g38_2 to i64*), align 4 + %call = tail call i32 @f38_stack(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, + i32 7, i32 8, i32 9, i64 %0, i64 %1) #5 + ret i32 %call +} + +; structs with size < 8 bytes, alignment of 16 +; passed via i128 in x1 and x3 +define i32 @f39(i32 %i, i128 %s1.coerce, i128 %s2.coerce) #0 { +entry: +; CHECK: f39 +; CHECK: add w[[A:[0-9]+]], w1, w0 +; CHECK: add {{w[0-9]+}}, w[[A]], w3 + %s1.sroa.0.0.extract.trunc = trunc i128 %s1.coerce to i32 + %s1.sroa.1.4.extract.shift = lshr i128 %s1.coerce, 32 + %s2.sroa.0.0.extract.trunc = trunc i128 %s2.coerce to i32 + %s2.sroa.1.4.extract.shift = lshr i128 %s2.coerce, 32 + %sext8 = shl nuw nsw i128 %s1.sroa.1.4.extract.shift, 16 + %sext = trunc i128 %sext8 to i32 + %conv = ashr exact i32 %sext, 16 + %sext1011 = shl nuw nsw i128 %s2.sroa.1.4.extract.shift, 16 + %sext10 = trunc i128 %sext1011 to i32 + %conv6 = ashr exact i32 %sext10, 16 + %add = add i32 %s1.sroa.0.0.extract.trunc, %i + %add3 = add i32 %add, %s2.sroa.0.0.extract.trunc + %add4 = add i32 %add3, %conv + %add7 = add i32 %add4, %conv6 + ret i32 %add7 +} + +define i32 @caller39() #1 { +entry: +; CHECK: caller39 +; CHECK: ldp x1, x2, +; CHECK: ldp x3, x4, + %0 = load i128* bitcast (%struct.s39* @g39 to i128*), align 16 + %1 = load i128* bitcast (%struct.s39* @g39_2 to i128*), align 16 + %call = tail call i32 @f39(i32 3, i128 %0, i128 %1) #5 + ret i32 %call +} + +declare i32 @f39_stack(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, + i32 %i7, i32 %i8, i32 %i9, i128 %s1.coerce, i128 %s2.coerce) #0 + +; structs with size < 8 bytes, alignment 16 +; passed on stack at [sp+16] and [sp+32] +define i32 @caller39_stack() #1 { +entry: +; CHECK: caller39_stack +; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #32] +; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16] +; CHECK: movz w[[C:[0-9]+]], #0x9 +; CHECK: str w[[C]], [sp] + %0 = load i128* bitcast (%struct.s39* @g39 to i128*), align 16 + %1 = load i128* bitcast (%struct.s39* @g39_2 to i128*), align 16 + %call = tail call i32 @f39_stack(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, + i32 7, i32 8, i32 9, i128 %0, i128 %1) #5 + ret i32 %call +} + +; structs with size < 16 bytes +; passed via i128 in x1 and x3 +define i32 @f40(i32 %i, [2 x i64] %s1.coerce, [2 x i64] %s2.coerce) #0 { +entry: +; CHECK: f40 +; CHECK: add w[[A:[0-9]+]], w1, w0 +; CHECK: add {{w[0-9]+}}, w[[A]], w3 + %s1.coerce.fca.0.extract = extractvalue [2 x i64] %s1.coerce, 0 + %s2.coerce.fca.0.extract = extractvalue [2 x i64] %s2.coerce, 0 + %s1.sroa.0.0.extract.trunc = trunc i64 %s1.coerce.fca.0.extract to i32 + %s2.sroa.0.0.extract.trunc = trunc i64 %s2.coerce.fca.0.extract to i32 + %s1.sroa.0.4.extract.shift = lshr i64 %s1.coerce.fca.0.extract, 32 + %sext8 = shl nuw nsw i64 
%s1.sroa.0.4.extract.shift, 16 + %sext = trunc i64 %sext8 to i32 + %conv = ashr exact i32 %sext, 16 + %s2.sroa.0.4.extract.shift = lshr i64 %s2.coerce.fca.0.extract, 32 + %sext1011 = shl nuw nsw i64 %s2.sroa.0.4.extract.shift, 16 + %sext10 = trunc i64 %sext1011 to i32 + %conv6 = ashr exact i32 %sext10, 16 + %add = add i32 %s1.sroa.0.0.extract.trunc, %i + %add3 = add i32 %add, %s2.sroa.0.0.extract.trunc + %add4 = add i32 %add3, %conv + %add7 = add i32 %add4, %conv6 + ret i32 %add7 +} + +define i32 @caller40() #1 { +entry: +; CHECK: caller40 +; CHECK: ldp x1, x2, +; CHECK: ldp x3, x4, + %0 = load [2 x i64]* bitcast (%struct.s40* @g40 to [2 x i64]*), align 4 + %1 = load [2 x i64]* bitcast (%struct.s40* @g40_2 to [2 x i64]*), align 4 + %call = tail call i32 @f40(i32 3, [2 x i64] %0, [2 x i64] %1) #5 + ret i32 %call +} + +declare i32 @f40_stack(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, + i32 %i7, i32 %i8, i32 %i9, [2 x i64] %s1.coerce, [2 x i64] %s2.coerce) #0 + +; structs with size < 16 bytes +; passed on stack at [sp+8] and [sp+24] +define i32 @caller40_stack() #1 { +entry: +; CHECK: caller40_stack +; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #24] +; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #8] +; CHECK: movz w[[C:[0-9]+]], #0x9 +; CHECK: str w[[C]], [sp] + %0 = load [2 x i64]* bitcast (%struct.s40* @g40 to [2 x i64]*), align 4 + %1 = load [2 x i64]* bitcast (%struct.s40* @g40_2 to [2 x i64]*), align 4 + %call = tail call i32 @f40_stack(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, + i32 7, i32 8, i32 9, [2 x i64] %0, [2 x i64] %1) #5 + ret i32 %call +} + +; structs with size < 16 bytes, alignment of 16 +; passed via i128 in x1 and x3 +define i32 @f41(i32 %i, i128 %s1.coerce, i128 %s2.coerce) #0 { +entry: +; CHECK: f41 +; CHECK: add w[[A:[0-9]+]], w1, w0 +; CHECK: add {{w[0-9]+}}, w[[A]], w3 + %s1.sroa.0.0.extract.trunc = trunc i128 %s1.coerce to i32 + %s1.sroa.1.4.extract.shift = lshr i128 %s1.coerce, 32 + %s2.sroa.0.0.extract.trunc = trunc i128 %s2.coerce to i32 + %s2.sroa.1.4.extract.shift = lshr i128 %s2.coerce, 32 + %sext8 = shl nuw nsw i128 %s1.sroa.1.4.extract.shift, 16 + %sext = trunc i128 %sext8 to i32 + %conv = ashr exact i32 %sext, 16 + %sext1011 = shl nuw nsw i128 %s2.sroa.1.4.extract.shift, 16 + %sext10 = trunc i128 %sext1011 to i32 + %conv6 = ashr exact i32 %sext10, 16 + %add = add i32 %s1.sroa.0.0.extract.trunc, %i + %add3 = add i32 %add, %s2.sroa.0.0.extract.trunc + %add4 = add i32 %add3, %conv + %add7 = add i32 %add4, %conv6 + ret i32 %add7 +} + +define i32 @caller41() #1 { +entry: +; CHECK: caller41 +; CHECK: ldp x1, x2, +; CHECK: ldp x3, x4, + %0 = load i128* bitcast (%struct.s41* @g41 to i128*), align 16 + %1 = load i128* bitcast (%struct.s41* @g41_2 to i128*), align 16 + %call = tail call i32 @f41(i32 3, i128 %0, i128 %1) #5 + ret i32 %call +} + +declare i32 @f41_stack(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, + i32 %i7, i32 %i8, i32 %i9, i128 %s1.coerce, i128 %s2.coerce) #0 + +; structs with size < 16 bytes, alignment of 16 +; passed on stack at [sp+16] and [sp+32] +define i32 @caller41_stack() #1 { +entry: +; CHECK: caller41_stack +; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #32] +; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16] +; CHECK: movz w[[C:[0-9]+]], #0x9 +; CHECK: str w[[C]], [sp] + %0 = load i128* bitcast (%struct.s41* @g41 to i128*), align 16 + %1 = load i128* bitcast (%struct.s41* @g41_2 to i128*), align 16 + %call = tail call i32 @f41_stack(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, + i32 7, i32 8, i32 9, i128 %0, i128 %1) #5 + ret i32 
%call +} + +; structs with size of 22 bytes, passed indirectly in x1 and x2 +define i32 @f42(i32 %i, %struct.s42* nocapture %s1, %struct.s42* nocapture %s2) #2 { +entry: +; CHECK: f42 +; CHECK: ldr w[[A:[0-9]+]], [x1] +; CHECK: ldr w[[B:[0-9]+]], [x2] +; CHECK: add w[[C:[0-9]+]], w[[A]], w0 +; CHECK: add {{w[0-9]+}}, w[[C]], w[[B]] +; FAST: f42 +; FAST: ldr w[[A:[0-9]+]], [x1] +; FAST: ldr w[[B:[0-9]+]], [x2] +; FAST: add w[[C:[0-9]+]], w[[A]], w0 +; FAST: add {{w[0-9]+}}, w[[C]], w[[B]] + %i1 = getelementptr inbounds %struct.s42* %s1, i64 0, i32 0 + %0 = load i32* %i1, align 4, !tbaa !0 + %i2 = getelementptr inbounds %struct.s42* %s2, i64 0, i32 0 + %1 = load i32* %i2, align 4, !tbaa !0 + %s = getelementptr inbounds %struct.s42* %s1, i64 0, i32 1 + %2 = load i16* %s, align 2, !tbaa !3 + %conv = sext i16 %2 to i32 + %s5 = getelementptr inbounds %struct.s42* %s2, i64 0, i32 1 + %3 = load i16* %s5, align 2, !tbaa !3 + %conv6 = sext i16 %3 to i32 + %add = add i32 %0, %i + %add3 = add i32 %add, %1 + %add4 = add i32 %add3, %conv + %add7 = add i32 %add4, %conv6 + ret i32 %add7 +} + +; For s1, we allocate a 22-byte space, pass its address via x1 +define i32 @caller42() #3 { +entry: +; CHECK: caller42 +; CHECK: str {{x[0-9]+}}, [sp, #48] +; CHECK: str {{q[0-9]+}}, [sp, #32] +; CHECK: str {{x[0-9]+}}, [sp, #16] +; CHECK: str {{q[0-9]+}}, [sp] +; CHECK: add x1, sp, #32 +; CHECK: mov x2, sp +; Space for s1 is allocated at sp+32 +; Space for s2 is allocated at sp + +; FAST: caller42 +; FAST: sub sp, sp, #96 +; Space for s1 is allocated at fp-24 = sp+72 +; Space for s2 is allocated at sp+48 +; FAST: sub x[[A:[0-9]+]], x29, #24 +; FAST: add x[[A:[0-9]+]], sp, #48 +; Call memcpy with size = 24 (0x18) +; FAST: orr {{x[0-9]+}}, xzr, #0x18 + %tmp = alloca %struct.s42, align 4 + %tmp1 = alloca %struct.s42, align 4 + %0 = bitcast %struct.s42* %tmp to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast (%struct.s42* @g42 to i8*), i64 24, i32 4, i1 false), !tbaa.struct !4 + %1 = bitcast %struct.s42* %tmp1 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* bitcast (%struct.s42* @g42_2 to i8*), i64 24, i32 4, i1 false), !tbaa.struct !4 + %call = call i32 @f42(i32 3, %struct.s42* %tmp, %struct.s42* %tmp1) #5 + ret i32 %call +} + +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) #4 + +declare i32 @f42_stack(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, + i32 %i7, i32 %i8, i32 %i9, %struct.s42* nocapture %s1, + %struct.s42* nocapture %s2) #2 + +define i32 @caller42_stack() #3 { +entry: +; CHECK: caller42_stack +; CHECK: mov x29, sp +; CHECK: sub sp, sp, #96 +; CHECK: stur {{x[0-9]+}}, [x29, #-16] +; CHECK: stur {{q[0-9]+}}, [x29, #-32] +; CHECK: str {{x[0-9]+}}, [sp, #48] +; CHECK: str {{q[0-9]+}}, [sp, #32] +; Space for s1 is allocated at x29-32 = sp+64 +; Space for s2 is allocated at sp+32 +; CHECK: add x[[B:[0-9]+]], sp, #32 +; CHECK: str x[[B]], [sp, #16] +; CHECK: sub x[[A:[0-9]+]], x29, #32 +; Address of s1 is passed on stack at sp+8 +; CHECK: str x[[A]], [sp, #8] +; CHECK: movz w[[C:[0-9]+]], #0x9 +; CHECK: str w[[C]], [sp] + +; FAST: caller42_stack +; Space for s1 is allocated at fp-24 +; Space for s2 is allocated at fp-48 +; FAST: sub x[[A:[0-9]+]], x29, #24 +; FAST: sub x[[B:[0-9]+]], x29, #48 +; Call memcpy with size = 24 (0x18) +; FAST: orr {{x[0-9]+}}, xzr, #0x18 +; FAST: str {{w[0-9]+}}, [sp] +; Address of s1 is passed on stack at sp+8 +; FAST: str {{x[0-9]+}}, [sp, #8] +; FAST: str {{x[0-9]+}}, [sp, #16] + %tmp = alloca %struct.s42, align 4 + 
%tmp1 = alloca %struct.s42, align 4 + %0 = bitcast %struct.s42* %tmp to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast (%struct.s42* @g42 to i8*), i64 24, i32 4, i1 false), !tbaa.struct !4 + %1 = bitcast %struct.s42* %tmp1 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* bitcast (%struct.s42* @g42_2 to i8*), i64 24, i32 4, i1 false), !tbaa.struct !4 + %call = call i32 @f42_stack(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, + i32 8, i32 9, %struct.s42* %tmp, %struct.s42* %tmp1) #5 + ret i32 %call +} + +; structs with size of 22 bytes, alignment of 16 +; passed indirectly in x1 and x2 +define i32 @f43(i32 %i, %struct.s43* nocapture %s1, %struct.s43* nocapture %s2) #2 { +entry: +; CHECK: f43 +; CHECK: ldr w[[A:[0-9]+]], [x1] +; CHECK: ldr w[[B:[0-9]+]], [x2] +; CHECK: add w[[C:[0-9]+]], w[[A]], w0 +; CHECK: add {{w[0-9]+}}, w[[C]], w[[B]] +; FAST: f43 +; FAST: ldr w[[A:[0-9]+]], [x1] +; FAST: ldr w[[B:[0-9]+]], [x2] +; FAST: add w[[C:[0-9]+]], w[[A]], w0 +; FAST: add {{w[0-9]+}}, w[[C]], w[[B]] + %i1 = getelementptr inbounds %struct.s43* %s1, i64 0, i32 0 + %0 = load i32* %i1, align 4, !tbaa !0 + %i2 = getelementptr inbounds %struct.s43* %s2, i64 0, i32 0 + %1 = load i32* %i2, align 4, !tbaa !0 + %s = getelementptr inbounds %struct.s43* %s1, i64 0, i32 1 + %2 = load i16* %s, align 2, !tbaa !3 + %conv = sext i16 %2 to i32 + %s5 = getelementptr inbounds %struct.s43* %s2, i64 0, i32 1 + %3 = load i16* %s5, align 2, !tbaa !3 + %conv6 = sext i16 %3 to i32 + %add = add i32 %0, %i + %add3 = add i32 %add, %1 + %add4 = add i32 %add3, %conv + %add7 = add i32 %add4, %conv6 + ret i32 %add7 +} + +define i32 @caller43() #3 { +entry: +; CHECK: caller43 +; CHECK: str {{q[0-9]+}}, [sp, #48] +; CHECK: str {{q[0-9]+}}, [sp, #32] +; CHECK: str {{q[0-9]+}}, [sp, #16] +; CHECK: str {{q[0-9]+}}, [sp] +; CHECK: add x1, sp, #32 +; CHECK: mov x2, sp +; Space for s1 is allocated at sp+32 +; Space for s2 is allocated at sp + +; FAST: caller43 +; FAST: mov x29, sp +; Space for s1 is allocated at sp+32 +; Space for s2 is allocated at sp +; FAST: add x1, sp, #32 +; FAST: mov x2, sp +; FAST: str {{x[0-9]+}}, [sp, #32] +; FAST: str {{x[0-9]+}}, [sp, #40] +; FAST: str {{x[0-9]+}}, [sp, #48] +; FAST: str {{x[0-9]+}}, [sp, #56] +; FAST: str {{x[0-9]+}}, [sp] +; FAST: str {{x[0-9]+}}, [sp, #8] +; FAST: str {{x[0-9]+}}, [sp, #16] +; FAST: str {{x[0-9]+}}, [sp, #24] + %tmp = alloca %struct.s43, align 16 + %tmp1 = alloca %struct.s43, align 16 + %0 = bitcast %struct.s43* %tmp to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast (%struct.s43* @g43 to i8*), i64 32, i32 16, i1 false), !tbaa.struct !4 + %1 = bitcast %struct.s43* %tmp1 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* bitcast (%struct.s43* @g43_2 to i8*), i64 32, i32 16, i1 false), !tbaa.struct !4 + %call = call i32 @f43(i32 3, %struct.s43* %tmp, %struct.s43* %tmp1) #5 + ret i32 %call +} + +declare i32 @f43_stack(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, + i32 %i7, i32 %i8, i32 %i9, %struct.s43* nocapture %s1, + %struct.s43* nocapture %s2) #2 + +define i32 @caller43_stack() #3 { +entry: +; CHECK: caller43_stack +; CHECK: mov x29, sp +; CHECK: sub sp, sp, #96 +; CHECK: stur {{q[0-9]+}}, [x29, #-16] +; CHECK: stur {{q[0-9]+}}, [x29, #-32] +; CHECK: str {{q[0-9]+}}, [sp, #48] +; CHECK: str {{q[0-9]+}}, [sp, #32] +; Space for s1 is allocated at x29-32 = sp+64 +; Space for s2 is allocated at sp+32 +; CHECK: add x[[B:[0-9]+]], sp, #32 +; CHECK: str x[[B]], [sp, #16] +; CHECK: sub x[[A:[0-9]+]], x29, #32 +; Address of 
s1 is passed on stack at sp+8 +; CHECK: str x[[A]], [sp, #8] +; CHECK: movz w[[C:[0-9]+]], #0x9 +; CHECK: str w[[C]], [sp] + +; FAST: caller43_stack +; FAST: sub sp, sp, #96 +; Space for s1 is allocated at fp-32 = sp+64 +; Space for s2 is allocated at sp+32 +; FAST: sub x[[A:[0-9]+]], x29, #32 +; FAST: add x[[B:[0-9]+]], sp, #32 +; FAST: stur {{x[0-9]+}}, [x29, #-32] +; FAST: stur {{x[0-9]+}}, [x29, #-24] +; FAST: stur {{x[0-9]+}}, [x29, #-16] +; FAST: stur {{x[0-9]+}}, [x29, #-8] +; FAST: str {{x[0-9]+}}, [sp, #32] +; FAST: str {{x[0-9]+}}, [sp, #40] +; FAST: str {{x[0-9]+}}, [sp, #48] +; FAST: str {{x[0-9]+}}, [sp, #56] +; FAST: str {{w[0-9]+}}, [sp] +; Address of s1 is passed on stack at sp+8 +; FAST: str {{x[0-9]+}}, [sp, #8] +; FAST: str {{x[0-9]+}}, [sp, #16] + %tmp = alloca %struct.s43, align 16 + %tmp1 = alloca %struct.s43, align 16 + %0 = bitcast %struct.s43* %tmp to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast (%struct.s43* @g43 to i8*), i64 32, i32 16, i1 false), !tbaa.struct !4 + %1 = bitcast %struct.s43* %tmp1 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* bitcast (%struct.s43* @g43_2 to i8*), i64 32, i32 16, i1 false), !tbaa.struct !4 + %call = call i32 @f43_stack(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, + i32 8, i32 9, %struct.s43* %tmp, %struct.s43* %tmp1) #5 + ret i32 %call +} + +; rdar://13668927 +; Check that we don't split an i128. +declare i32 @callee_i128_split(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, + i32 %i6, i32 %i7, i128 %s1, i32 %i8) + +define i32 @i128_split() { +entry: +; CHECK: i128_split +; "i128 %0" should be on stack at [sp]. +; "i32 8" should be on stack at [sp, #16]. +; CHECK: str {{w[0-9]+}}, [sp, #16] +; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp] +; FAST: i128_split +; FAST: sub sp, sp, #48 +; FAST: mov x[[ADDR:[0-9]+]], sp +; FAST: str {{w[0-9]+}}, [x[[ADDR]], #16] +; Load/Store opt is disabled with -O0, so the i128 is split. +; FAST: str {{x[0-9]+}}, [x[[ADDR]], #8] +; FAST: str {{x[0-9]+}}, [x[[ADDR]]] + %0 = load i128* bitcast (%struct.s41* @g41 to i128*), align 16 + %call = tail call i32 @callee_i128_split(i32 1, i32 2, i32 3, i32 4, i32 5, + i32 6, i32 7, i128 %0, i32 8) #5 + ret i32 %call +} + +declare i32 @callee_i64(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, + i32 %i6, i32 %i7, i64 %s1, i32 %i8) + +define i32 @i64_split() { +entry: +; CHECK: i64_split +; "i64 %0" should be in register x7. +; "i32 8" should be on stack at [sp]. 
+; CHECK: ldr x7, [{{x[0-9]+}}] +; CHECK: str {{w[0-9]+}}, [sp] +; FAST: i64_split +; FAST: ldr x7, [{{x[0-9]+}}] +; FAST: str {{w[0-9]+}}, [sp] + %0 = load i64* bitcast (%struct.s41* @g41 to i64*), align 16 + %call = tail call i32 @callee_i64(i32 1, i32 2, i32 3, i32 4, i32 5, + i32 6, i32 7, i64 %0, i32 8) #5 + ret i32 %call +} + +attributes #0 = { noinline nounwind readnone "fp-contract-model"="standard" "relocation-model"="pic" "ssp-buffers-size"="8" } +attributes #1 = { nounwind readonly "fp-contract-model"="standard" "relocation-model"="pic" "ssp-buffers-size"="8" } +attributes #2 = { noinline nounwind readonly "fp-contract-model"="standard" "relocation-model"="pic" "ssp-buffers-size"="8" } +attributes #3 = { nounwind "fp-contract-model"="standard" "relocation-model"="pic" "ssp-buffers-size"="8" } +attributes #4 = { nounwind } +attributes #5 = { nobuiltin } + +!0 = metadata !{metadata !"int", metadata !1} +!1 = metadata !{metadata !"omnipotent char", metadata !2} +!2 = metadata !{metadata !"Simple C/C++ TBAA"} +!3 = metadata !{metadata !"short", metadata !1} +!4 = metadata !{i64 0, i64 4, metadata !0, i64 4, i64 2, metadata !3, i64 8, i64 4, metadata !0, i64 12, i64 2, metadata !3, i64 16, i64 4, metadata !0, i64 20, i64 2, metadata !3} diff --git a/test/CodeGen/AArch64/arm64-addp.ll b/test/CodeGen/AArch64/arm64-addp.ll new file mode 100644 index 00000000000..3f1e5c5d44e --- /dev/null +++ b/test/CodeGen/AArch64/arm64-addp.ll @@ -0,0 +1,32 @@ +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -mcpu=cyclone | FileCheck %s + +define double @foo(<2 x double> %a) nounwind { +; CHECK-LABEL: foo: +; CHECK: faddp.2d d0, v0 +; CHECK-NEXT: ret + %lane0.i = extractelement <2 x double> %a, i32 0 + %lane1.i = extractelement <2 x double> %a, i32 1 + %vpaddd.i = fadd double %lane0.i, %lane1.i + ret double %vpaddd.i +} + +define i64 @foo0(<2 x i64> %a) nounwind { +; CHECK-LABEL: foo0: +; CHECK: addp.2d d0, v0 +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret + %lane0.i = extractelement <2 x i64> %a, i32 0 + %lane1.i = extractelement <2 x i64> %a, i32 1 + %vpaddd.i = add i64 %lane0.i, %lane1.i + ret i64 %vpaddd.i +} + +define float @foo1(<2 x float> %a) nounwind { +; CHECK-LABEL: foo1: +; CHECK: faddp.2s +; CHECK-NEXT: ret + %lane0.i = extractelement <2 x float> %a, i32 0 + %lane1.i = extractelement <2 x float> %a, i32 1 + %vpaddd.i = fadd float %lane0.i, %lane1.i + ret float %vpaddd.i +} diff --git a/test/CodeGen/AArch64/arm64-addr-mode-folding.ll b/test/CodeGen/AArch64/arm64-addr-mode-folding.ll new file mode 100644 index 00000000000..08fb8c90c48 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-addr-mode-folding.ll @@ -0,0 +1,171 @@ +; RUN: llc -O3 -mtriple arm64-apple-ios3 %s -o - | FileCheck %s +; + +@block = common global i8* null, align 8 + +define i32 @fct(i32 %i1, i32 %i2) { +; CHECK: @fct +; Sign extension is used more than once, thus it should not be folded. +; CodeGenPrepare is not sharing sext across uses, thus this is folded because +; of that. 
+; _CHECK-NOT_: , sxtw] +entry: + %idxprom = sext i32 %i1 to i64 + %0 = load i8** @block, align 8 + %arrayidx = getelementptr inbounds i8* %0, i64 %idxprom + %1 = load i8* %arrayidx, align 1 + %idxprom1 = sext i32 %i2 to i64 + %arrayidx2 = getelementptr inbounds i8* %0, i64 %idxprom1 + %2 = load i8* %arrayidx2, align 1 + %cmp = icmp eq i8 %1, %2 + br i1 %cmp, label %if.end, label %if.then + +if.then: ; preds = %entry + %cmp7 = icmp ugt i8 %1, %2 + %conv8 = zext i1 %cmp7 to i32 + br label %return + +if.end: ; preds = %entry + %inc = add nsw i32 %i1, 1 + %inc9 = add nsw i32 %i2, 1 + %idxprom10 = sext i32 %inc to i64 + %arrayidx11 = getelementptr inbounds i8* %0, i64 %idxprom10 + %3 = load i8* %arrayidx11, align 1 + %idxprom12 = sext i32 %inc9 to i64 + %arrayidx13 = getelementptr inbounds i8* %0, i64 %idxprom12 + %4 = load i8* %arrayidx13, align 1 + %cmp16 = icmp eq i8 %3, %4 + br i1 %cmp16, label %if.end23, label %if.then18 + +if.then18: ; preds = %if.end + %cmp21 = icmp ugt i8 %3, %4 + %conv22 = zext i1 %cmp21 to i32 + br label %return + +if.end23: ; preds = %if.end + %inc24 = add nsw i32 %i1, 2 + %inc25 = add nsw i32 %i2, 2 + %idxprom26 = sext i32 %inc24 to i64 + %arrayidx27 = getelementptr inbounds i8* %0, i64 %idxprom26 + %5 = load i8* %arrayidx27, align 1 + %idxprom28 = sext i32 %inc25 to i64 + %arrayidx29 = getelementptr inbounds i8* %0, i64 %idxprom28 + %6 = load i8* %arrayidx29, align 1 + %cmp32 = icmp eq i8 %5, %6 + br i1 %cmp32, label %return, label %if.then34 + +if.then34: ; preds = %if.end23 + %cmp37 = icmp ugt i8 %5, %6 + %conv38 = zext i1 %cmp37 to i32 + br label %return + +return: ; preds = %if.end23, %if.then34, %if.then18, %if.then + %retval.0 = phi i32 [ %conv8, %if.then ], [ %conv22, %if.then18 ], [ %conv38, %if.then34 ], [ 1, %if.end23 ] + ret i32 %retval.0 +} + +define i32 @fct1(i32 %i1, i32 %i2) optsize { +; CHECK: @fct1 +; Addressing are folded when optimizing for code size. 
+; CHECK: , sxtw] +; CHECK: , sxtw] +entry: + %idxprom = sext i32 %i1 to i64 + %0 = load i8** @block, align 8 + %arrayidx = getelementptr inbounds i8* %0, i64 %idxprom + %1 = load i8* %arrayidx, align 1 + %idxprom1 = sext i32 %i2 to i64 + %arrayidx2 = getelementptr inbounds i8* %0, i64 %idxprom1 + %2 = load i8* %arrayidx2, align 1 + %cmp = icmp eq i8 %1, %2 + br i1 %cmp, label %if.end, label %if.then + +if.then: ; preds = %entry + %cmp7 = icmp ugt i8 %1, %2 + %conv8 = zext i1 %cmp7 to i32 + br label %return + +if.end: ; preds = %entry + %inc = add nsw i32 %i1, 1 + %inc9 = add nsw i32 %i2, 1 + %idxprom10 = sext i32 %inc to i64 + %arrayidx11 = getelementptr inbounds i8* %0, i64 %idxprom10 + %3 = load i8* %arrayidx11, align 1 + %idxprom12 = sext i32 %inc9 to i64 + %arrayidx13 = getelementptr inbounds i8* %0, i64 %idxprom12 + %4 = load i8* %arrayidx13, align 1 + %cmp16 = icmp eq i8 %3, %4 + br i1 %cmp16, label %if.end23, label %if.then18 + +if.then18: ; preds = %if.end + %cmp21 = icmp ugt i8 %3, %4 + %conv22 = zext i1 %cmp21 to i32 + br label %return + +if.end23: ; preds = %if.end + %inc24 = add nsw i32 %i1, 2 + %inc25 = add nsw i32 %i2, 2 + %idxprom26 = sext i32 %inc24 to i64 + %arrayidx27 = getelementptr inbounds i8* %0, i64 %idxprom26 + %5 = load i8* %arrayidx27, align 1 + %idxprom28 = sext i32 %inc25 to i64 + %arrayidx29 = getelementptr inbounds i8* %0, i64 %idxprom28 + %6 = load i8* %arrayidx29, align 1 + %cmp32 = icmp eq i8 %5, %6 + br i1 %cmp32, label %return, label %if.then34 + +if.then34: ; preds = %if.end23 + %cmp37 = icmp ugt i8 %5, %6 + %conv38 = zext i1 %cmp37 to i32 + br label %return + +return: ; preds = %if.end23, %if.then34, %if.then18, %if.then + %retval.0 = phi i32 [ %conv8, %if.then ], [ %conv22, %if.then18 ], [ %conv38, %if.then34 ], [ 1, %if.end23 ] + ret i32 %retval.0 +} + +; CHECK: @test +; CHECK-NOT: , uxtw #2] +define i32 @test(i32* %array, i8 zeroext %c, i32 %arg) { +entry: + %conv = zext i8 %c to i32 + %add = sub i32 0, %arg + %tobool = icmp eq i32 %conv, %add + br i1 %tobool, label %if.end, label %if.then + +if.then: ; preds = %entry + %idxprom = zext i8 %c to i64 + %arrayidx = getelementptr inbounds i32* %array, i64 %idxprom + %0 = load volatile i32* %arrayidx, align 4 + %1 = load volatile i32* %arrayidx, align 4 + %add3 = add nsw i32 %1, %0 + br label %if.end + +if.end: ; preds = %entry, %if.then + %res.0 = phi i32 [ %add3, %if.then ], [ 0, %entry ] + ret i32 %res.0 +} + + +; CHECK: @test2 +; CHECK: , uxtw #2] +; CHECK: , uxtw #2] +define i32 @test2(i32* %array, i8 zeroext %c, i32 %arg) optsize { +entry: + %conv = zext i8 %c to i32 + %add = sub i32 0, %arg + %tobool = icmp eq i32 %conv, %add + br i1 %tobool, label %if.end, label %if.then + +if.then: ; preds = %entry + %idxprom = zext i8 %c to i64 + %arrayidx = getelementptr inbounds i32* %array, i64 %idxprom + %0 = load volatile i32* %arrayidx, align 4 + %1 = load volatile i32* %arrayidx, align 4 + %add3 = add nsw i32 %1, %0 + br label %if.end + +if.end: ; preds = %entry, %if.then + %res.0 = phi i32 [ %add3, %if.then ], [ 0, %entry ] + ret i32 %res.0 +} diff --git a/test/CodeGen/AArch64/arm64-addr-type-promotion.ll b/test/CodeGen/AArch64/arm64-addr-type-promotion.ll new file mode 100644 index 00000000000..1a3ca8bd5b8 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-addr-type-promotion.ll @@ -0,0 +1,82 @@ +; RUN: llc -march arm64 < %s | FileCheck %s +; rdar://13452552 +; ModuleID = 'reduced_test.ll' +target datalayout = 
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128" +target triple = "arm64-apple-ios3.0.0" + +@block = common global i8* null, align 8 + +define zeroext i8 @fullGtU(i32 %i1, i32 %i2) { +; CHECK: fullGtU +; CHECK: adrp [[PAGE:x[0-9]+]], _block@GOTPAGE +; CHECK: ldr [[ADDR:x[0-9]+]], {{\[}}[[PAGE]], _block@GOTPAGEOFF] +; CHECK-NEXT: ldr [[BLOCKBASE:x[0-9]+]], {{\[}}[[ADDR]]] +; CHECK-NEXT: ldrb [[BLOCKVAL1:w[0-9]+]], {{\[}}[[BLOCKBASE]], w0, sxtw] +; CHECK-NEXT: ldrb [[BLOCKVAL2:w[0-9]+]], {{\[}}[[BLOCKBASE]], w1, sxtw] +; CHECK-NEXT cmp [[BLOCKVAL1]], [[BLOCKVAL2]] +; CHECK-NEXT b.ne +; Next BB +; CHECK: add [[BLOCKBASE2:x[0-9]+]], [[BLOCKBASE]], w1, sxtw +; CHECK-NEXT: add [[BLOCKBASE1:x[0-9]+]], [[BLOCKBASE]], w0, sxtw +; CHECK-NEXT: ldrb [[LOADEDVAL1:w[0-9]+]], {{\[}}[[BLOCKBASE1]], #1] +; CHECK-NEXT: ldrb [[LOADEDVAL2:w[0-9]+]], {{\[}}[[BLOCKBASE2]], #1] +; CHECK-NEXT: cmp [[LOADEDVAL1]], [[LOADEDVAL2]] +; CHECK-NEXT: b.ne +; Next BB +; CHECK: ldrb [[LOADEDVAL3:w[0-9]+]], {{\[}}[[BLOCKBASE1]], #2] +; CHECK-NEXT: ldrb [[LOADEDVAL4:w[0-9]+]], {{\[}}[[BLOCKBASE2]], #2] +; CHECK-NEXT: cmp [[LOADEDVAL3]], [[LOADEDVAL4]] +entry: + %idxprom = sext i32 %i1 to i64 + %tmp = load i8** @block, align 8 + %arrayidx = getelementptr inbounds i8* %tmp, i64 %idxprom + %tmp1 = load i8* %arrayidx, align 1 + %idxprom1 = sext i32 %i2 to i64 + %arrayidx2 = getelementptr inbounds i8* %tmp, i64 %idxprom1 + %tmp2 = load i8* %arrayidx2, align 1 + %cmp = icmp eq i8 %tmp1, %tmp2 + br i1 %cmp, label %if.end, label %if.then + +if.then: ; preds = %entry + %cmp7 = icmp ugt i8 %tmp1, %tmp2 + %conv9 = zext i1 %cmp7 to i8 + br label %return + +if.end: ; preds = %entry + %inc = add nsw i32 %i1, 1 + %inc10 = add nsw i32 %i2, 1 + %idxprom11 = sext i32 %inc to i64 + %arrayidx12 = getelementptr inbounds i8* %tmp, i64 %idxprom11 + %tmp3 = load i8* %arrayidx12, align 1 + %idxprom13 = sext i32 %inc10 to i64 + %arrayidx14 = getelementptr inbounds i8* %tmp, i64 %idxprom13 + %tmp4 = load i8* %arrayidx14, align 1 + %cmp17 = icmp eq i8 %tmp3, %tmp4 + br i1 %cmp17, label %if.end25, label %if.then19 + +if.then19: ; preds = %if.end + %cmp22 = icmp ugt i8 %tmp3, %tmp4 + %conv24 = zext i1 %cmp22 to i8 + br label %return + +if.end25: ; preds = %if.end + %inc26 = add nsw i32 %i1, 2 + %inc27 = add nsw i32 %i2, 2 + %idxprom28 = sext i32 %inc26 to i64 + %arrayidx29 = getelementptr inbounds i8* %tmp, i64 %idxprom28 + %tmp5 = load i8* %arrayidx29, align 1 + %idxprom30 = sext i32 %inc27 to i64 + %arrayidx31 = getelementptr inbounds i8* %tmp, i64 %idxprom30 + %tmp6 = load i8* %arrayidx31, align 1 + %cmp34 = icmp eq i8 %tmp5, %tmp6 + br i1 %cmp34, label %return, label %if.then36 + +if.then36: ; preds = %if.end25 + %cmp39 = icmp ugt i8 %tmp5, %tmp6 + %conv41 = zext i1 %cmp39 to i8 + br label %return + +return: ; preds = %if.then36, %if.end25, %if.then19, %if.then + %retval.0 = phi i8 [ %conv9, %if.then ], [ %conv24, %if.then19 ], [ %conv41, %if.then36 ], [ 0, %if.end25 ] + ret i8 %retval.0 +} diff --git a/test/CodeGen/AArch64/arm64-addrmode.ll b/test/CodeGen/AArch64/arm64-addrmode.ll new file mode 100644 index 00000000000..700fba80149 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-addrmode.ll @@ -0,0 +1,72 @@ +; RUN: llc -march=arm64 < %s | FileCheck %s +; rdar://10232252 + +@object = external hidden global i64, section "__DATA, __objc_ivar", align 8 + +; base + offset (imm9) +; CHECK: @t1 +; CHECK: ldr xzr, [x{{[0-9]+}}, #8] +; CHECK: ret +define void @t1() { + %incdec.ptr = 
getelementptr inbounds i64* @object, i64 1 + %tmp = load volatile i64* %incdec.ptr, align 8 + ret void +} + +; base + offset (> imm9) +; CHECK: @t2 +; CHECK: sub [[ADDREG:x[0-9]+]], x{{[0-9]+}}, #264 +; CHECK: ldr xzr, [ +; CHECK: [[ADDREG]]] +; CHECK: ret +define void @t2() { + %incdec.ptr = getelementptr inbounds i64* @object, i64 -33 + %tmp = load volatile i64* %incdec.ptr, align 8 + ret void +} + +; base + unsigned offset (> imm9 and <= imm12 * size of type in bytes) +; CHECK: @t3 +; CHECK: ldr xzr, [x{{[0-9]+}}, #32760] +; CHECK: ret +define void @t3() { + %incdec.ptr = getelementptr inbounds i64* @object, i64 4095 + %tmp = load volatile i64* %incdec.ptr, align 8 + ret void +} + +; base + unsigned offset (> imm12 * size of type in bytes) +; CHECK: @t4 +; CHECK: add [[ADDREG:x[0-9]+]], x{{[0-9]+}}, #8, lsl #12 +; CHECK: ldr xzr, [ +; CHECK: [[ADDREG]]] +; CHECK: ret +define void @t4() { + %incdec.ptr = getelementptr inbounds i64* @object, i64 4096 + %tmp = load volatile i64* %incdec.ptr, align 8 + ret void +} + +; base + reg +; CHECK: @t5 +; CHECK: ldr xzr, [x{{[0-9]+}}, x{{[0-9]+}}, lsl #3] +; CHECK: ret +define void @t5(i64 %a) { + %incdec.ptr = getelementptr inbounds i64* @object, i64 %a + %tmp = load volatile i64* %incdec.ptr, align 8 + ret void +} + +; base + reg + imm +; CHECK: @t6 +; CHECK: add [[ADDREG:x[0-9]+]], x{{[0-9]+}}, x{{[0-9]+}}, lsl #3 +; CHECK-NEXT: add [[ADDREG]], [[ADDREG]], #8, lsl #12 +; CHECK: ldr xzr, [ +; CHECK: [[ADDREG]]] +; CHECK: ret +define void @t6(i64 %a) { + %tmp1 = getelementptr inbounds i64* @object, i64 %a + %incdec.ptr = getelementptr inbounds i64* %tmp1, i64 4096 + %tmp = load volatile i64* %incdec.ptr, align 8 + ret void +} diff --git a/test/CodeGen/AArch64/arm64-alloc-no-stack-realign.ll b/test/CodeGen/AArch64/arm64-alloc-no-stack-realign.ll new file mode 100644 index 00000000000..f396bc99170 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-alloc-no-stack-realign.ll @@ -0,0 +1,21 @@ +; RUN: llc < %s -mtriple=arm64-apple-darwin -enable-misched=false | FileCheck %s + +; rdar://12713765 +; Make sure we are not creating stack objects that are assumed to be 64-byte +; aligned. 
+@T3_retval = common global <16 x float> zeroinitializer, align 16 + +define void @test(<16 x float>* noalias sret %agg.result) nounwind ssp { +entry: +; CHECK: test +; CHECK: stp [[Q1:q[0-9]+]], [[Q2:q[0-9]+]], [sp, #32] +; CHECK: stp [[Q1:q[0-9]+]], [[Q2:q[0-9]+]], [sp] +; CHECK: stp [[Q1:q[0-9]+]], [[Q2:q[0-9]+]], {{\[}}[[BASE:x[0-9]+]], #32] +; CHECK: stp [[Q1:q[0-9]+]], [[Q2:q[0-9]+]], {{\[}}[[BASE]]] + %retval = alloca <16 x float>, align 16 + %0 = load <16 x float>* @T3_retval, align 16 + store <16 x float> %0, <16 x float>* %retval + %1 = load <16 x float>* %retval + store <16 x float> %1, <16 x float>* %agg.result, align 16 + ret void +} diff --git a/test/CodeGen/AArch64/arm64-alloca-frame-pointer-offset.ll b/test/CodeGen/AArch64/arm64-alloca-frame-pointer-offset.ll new file mode 100644 index 00000000000..3750f31b373 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-alloca-frame-pointer-offset.ll @@ -0,0 +1,29 @@ +; RUN: llc -march=arm64 -mcpu=cyclone < %s | FileCheck %s + +; CHECK: foo +; CHECK: ldr w[[REG:[0-9]+]], [x19, #264] +; CHECK: str w[[REG]], [x19, #132] +; CHECK: ldr w{{[0-9]+}}, [x19, #264] + +define i32 @foo(i32 %a) nounwind { + %retval = alloca i32, align 4 + %a.addr = alloca i32, align 4 + %arr = alloca [32 x i32], align 4 + %i = alloca i32, align 4 + %arr2 = alloca [32 x i32], align 4 + %j = alloca i32, align 4 + store i32 %a, i32* %a.addr, align 4 + %tmp = load i32* %a.addr, align 4 + %tmp1 = zext i32 %tmp to i64 + %v = mul i64 4, %tmp1 + %vla = alloca i8, i64 %v, align 4 + %tmp2 = bitcast i8* %vla to i32* + %tmp3 = load i32* %a.addr, align 4 + store i32 %tmp3, i32* %i, align 4 + %tmp4 = load i32* %a.addr, align 4 + store i32 %tmp4, i32* %j, align 4 + %tmp5 = load i32* %j, align 4 + store i32 %tmp5, i32* %retval + %x = load i32* %retval + ret i32 %x +} diff --git a/test/CodeGen/AArch64/arm64-andCmpBrToTBZ.ll b/test/CodeGen/AArch64/arm64-andCmpBrToTBZ.ll new file mode 100644 index 00000000000..419497722f4 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-andCmpBrToTBZ.ll @@ -0,0 +1,72 @@ +; RUN: llc -O1 -march=arm64 -enable-andcmp-sinking=true < %s | FileCheck %s +; ModuleID = 'and-cbz-extr-mr.bc' +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128" +target triple = "arm64-apple-ios7.0.0" + +define zeroext i1 @foo(i1 %IsEditable, i1 %isTextField, i8* %str1, i8* %str2, i8* %str3, i8* %str4, i8* %str5, i8* %str6, i8* %str7, i8* %str8, i8* %str9, i8* %str10, i8* %str11, i8* %str12, i8* %str13, i32 %int1, i8* %str14) unnamed_addr #0 align 2 { +; CHECK: _foo: +entry: + %tobool = icmp eq i8* %str14, null + br i1 %tobool, label %return, label %if.end + +; CHECK: %if.end +; CHECK: tbz +if.end: ; preds = %entry + %and.i.i.i = and i32 %int1, 4 + %tobool.i.i.i = icmp eq i32 %and.i.i.i, 0 + br i1 %tobool.i.i.i, label %if.end12, label %land.rhs.i + +land.rhs.i: ; preds = %if.end + %cmp.i.i.i = icmp eq i8* %str12, %str13 + br i1 %cmp.i.i.i, label %if.then3, label %lor.rhs.i.i.i + +lor.rhs.i.i.i: ; preds = %land.rhs.i + %cmp.i13.i.i.i = icmp eq i8* %str10, %str11 + br i1 %cmp.i13.i.i.i, label %_ZNK7WebCore4Node10hasTagNameERKNS_13QualifiedNameE.exit, label %if.end5 + +_ZNK7WebCore4Node10hasTagNameERKNS_13QualifiedNameE.exit: ; preds = %lor.rhs.i.i.i + %cmp.i.i.i.i = icmp eq i8* %str8, %str9 + br i1 %cmp.i.i.i.i, label %if.then3, label %if.end5 + +if.then3: ; preds = %_ZNK7WebCore4Node10hasTagNameERKNS_13QualifiedNameE.exit, %land.rhs.i + %tmp11 = load i8* %str14, align 8 + %tmp12 = and i8 %tmp11, 
2 + %tmp13 = icmp ne i8 %tmp12, 0 + br label %return + +if.end5: ; preds = %_ZNK7WebCore4Node10hasTagNameERKNS_13QualifiedNameE.exit, %lor.rhs.i.i.i +; CHECK: %if.end5 +; CHECK: tbz + br i1 %tobool.i.i.i, label %if.end12, label %land.rhs.i19 + +land.rhs.i19: ; preds = %if.end5 + %cmp.i.i.i18 = icmp eq i8* %str6, %str7 + br i1 %cmp.i.i.i18, label %if.then7, label %lor.rhs.i.i.i23 + +lor.rhs.i.i.i23: ; preds = %land.rhs.i19 + %cmp.i13.i.i.i22 = icmp eq i8* %str3, %str4 + br i1 %cmp.i13.i.i.i22, label %_ZNK7WebCore4Node10hasTagNameERKNS_13QualifiedNameE.exit28, label %if.end12 + +_ZNK7WebCore4Node10hasTagNameERKNS_13QualifiedNameE.exit28: ; preds = %lor.rhs.i.i.i23 + %cmp.i.i.i.i26 = icmp eq i8* %str1, %str2 + br i1 %cmp.i.i.i.i26, label %if.then7, label %if.end12 + +if.then7: ; preds = %_ZNK7WebCore4Node10hasTagNameERKNS_13QualifiedNameE.exit28, %land.rhs.i19 + br i1 %isTextField, label %if.then9, label %if.end12 + +if.then9: ; preds = %if.then7 + %tmp23 = load i8* %str5, align 8 + %tmp24 = and i8 %tmp23, 2 + %tmp25 = icmp ne i8 %tmp24, 0 + br label %return + +if.end12: ; preds = %if.then7, %_ZNK7WebCore4Node10hasTagNameERKNS_13QualifiedNameE.exit28, %lor.rhs.i.i.i23, %if.end5, %if.end + %lnot = xor i1 %IsEditable, true + br label %return + +return: ; preds = %if.end12, %if.then9, %if.then3, %entry + %retval.0 = phi i1 [ %tmp13, %if.then3 ], [ %tmp25, %if.then9 ], [ %lnot, %if.end12 ], [ true, %entry ] + ret i1 %retval.0 +} + +attributes #0 = { nounwind ssp } diff --git a/test/CodeGen/AArch64/arm64-ands-bad-peephole.ll b/test/CodeGen/AArch64/arm64-ands-bad-peephole.ll new file mode 100644 index 00000000000..34d6287b8b4 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-ands-bad-peephole.ll @@ -0,0 +1,31 @@ +; RUN: llc %s -o - | FileCheck %s +; Check that ANDS (tst) is not merged with ADD when the immediate +; is not 0. +; +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "arm64-apple-ios" + +; CHECK-LABEL: tst1: +; CHECK: add [[REG:w[0-9]+]], w{{[0-9]+}}, #1 +; CHECK: tst [[REG]], #0x1 +define void @tst1() { +entry: + br i1 undef, label %for.end, label %for.body + +for.body: ; preds = %for.body, %entry + %result.09 = phi i32 [ %add2.result.0, %for.body ], [ 1, %entry ] + %i.08 = phi i32 [ %inc, %for.body ], [ 2, %entry ] + %and = and i32 %i.08, 1 + %cmp1 = icmp eq i32 %and, 0 + %add2.result.0 = select i1 %cmp1, i32 undef, i32 %result.09 + %inc = add nsw i32 %i.08, 1 + %cmp = icmp slt i32 %i.08, undef + br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge + +for.cond.for.end_crit_edge: ; preds = %for.body + %add2.result.0.lcssa = phi i32 [ %add2.result.0, %for.body ] + br label %for.end + +for.end: ; preds = %for.cond.for.end_crit_edge, %entry + ret void +} diff --git a/test/CodeGen/AArch64/arm64-anyregcc-crash.ll b/test/CodeGen/AArch64/arm64-anyregcc-crash.ll new file mode 100644 index 00000000000..241cf974c05 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-anyregcc-crash.ll @@ -0,0 +1,19 @@ +; RUN: not llc < %s -mtriple=arm64-apple-darwin 2>&1 | FileCheck %s +; +; Check that misuse of anyregcc results in a compile time error. 
+ +; CHECK: LLVM ERROR: ran out of registers during register allocation +define i64 @anyreglimit(i64 %v1, i64 %v2, i64 %v3, i64 %v4, i64 %v5, i64 %v6, i64 %v7, i64 %v8, + i64 %v9, i64 %v10, i64 %v11, i64 %v12, i64 %v13, i64 %v14, i64 %v15, i64 %v16, + i64 %v17, i64 %v18, i64 %v19, i64 %v20, i64 %v21, i64 %v22, i64 %v23, i64 %v24, + i64 %v25, i64 %v26, i64 %v27, i64 %v28, i64 %v29, i64 %v30, i64 %v31, i64 %v32) { +entry: + %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 12, i32 15, i8* inttoptr (i64 0 to i8*), i32 32, + i64 %v1, i64 %v2, i64 %v3, i64 %v4, i64 %v5, i64 %v6, i64 %v7, i64 %v8, + i64 %v9, i64 %v10, i64 %v11, i64 %v12, i64 %v13, i64 %v14, i64 %v15, i64 %v16, + i64 %v17, i64 %v18, i64 %v19, i64 %v20, i64 %v21, i64 %v22, i64 %v23, i64 %v24, + i64 %v25, i64 %v26, i64 %v27, i64 %v28, i64 %v29, i64 %v30, i64 %v31, i64 %v32) + ret i64 %result +} + +declare i64 @llvm.experimental.patchpoint.i64(i64, i32, i8*, i32, ...) diff --git a/test/CodeGen/AArch64/arm64-anyregcc.ll b/test/CodeGen/AArch64/arm64-anyregcc.ll new file mode 100644 index 00000000000..e26875d52f9 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-anyregcc.ll @@ -0,0 +1,363 @@ +; RUN: llc < %s -mtriple=arm64-apple-darwin | FileCheck %s + +; Stackmap Header: no constants - 6 callsites +; CHECK-LABEL: .section __LLVM_STACKMAPS,__llvm_stackmaps +; CHECK-NEXT: __LLVM_StackMaps: +; Header +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 0 +; CHECK-NEXT: .short 0 +; Num Functions +; CHECK-NEXT: .long 8 +; Num LargeConstants +; CHECK-NEXT: .long 0 +; Num Callsites +; CHECK-NEXT: .long 8 + +; Functions and stack size +; CHECK-NEXT: .quad _test +; CHECK-NEXT: .quad 16 +; CHECK-NEXT: .quad _property_access1 +; CHECK-NEXT: .quad 16 +; CHECK-NEXT: .quad _property_access2 +; CHECK-NEXT: .quad 32 +; CHECK-NEXT: .quad _property_access3 +; CHECK-NEXT: .quad 32 +; CHECK-NEXT: .quad _anyreg_test1 +; CHECK-NEXT: .quad 16 +; CHECK-NEXT: .quad _anyreg_test2 +; CHECK-NEXT: .quad 16 +; CHECK-NEXT: .quad _patchpoint_spilldef +; CHECK-NEXT: .quad 112 +; CHECK-NEXT: .quad _patchpoint_spillargs +; CHECK-NEXT: .quad 128 + + +; test +; CHECK-LABEL: .long L{{.*}}-_test +; CHECK-NEXT: .short 0 +; 3 locations +; CHECK-NEXT: .short 3 +; Loc 0: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 4 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 1: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 4 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 2: Constant 3 +; CHECK-NEXT: .byte 4 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short 0 +; CHECK-NEXT: .long 3 +define i64 @test() nounwind ssp uwtable { +entry: + call anyregcc void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 0, i32 16, i8* null, i32 2, i32 1, i32 2, i64 3) + ret i64 0 +} + +; property access 1 - %obj is an anyreg call argument and should therefore be in a register +; CHECK-LABEL: .long L{{.*}}-_property_access1 +; CHECK-NEXT: .short 0 +; 2 locations +; CHECK-NEXT: .short 2 +; Loc 0: Register <-- this is the return register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 1: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +define i64 @property_access1(i8* %obj) nounwind ssp uwtable { +entry: + %f = inttoptr i64 281474417671919 to i8* + %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 1, i32 20, i8* %f, i32 1, i8* %obj) + ret i64 %ret +} + +; 
property access 2 - %obj is an anyreg call argument and should therefore be in a register +; CHECK-LABEL: .long L{{.*}}-_property_access2 +; CHECK-NEXT: .short 0 +; 2 locations +; CHECK-NEXT: .short 2 +; Loc 0: Register <-- this is the return register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 1: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +define i64 @property_access2() nounwind ssp uwtable { +entry: + %obj = alloca i64, align 8 + %f = inttoptr i64 281474417671919 to i8* + %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 2, i32 20, i8* %f, i32 1, i64* %obj) + ret i64 %ret +} + +; property access 3 - %obj is a frame index +; CHECK-LABEL: .long L{{.*}}-_property_access3 +; CHECK-NEXT: .short 0 +; 2 locations +; CHECK-NEXT: .short 2 +; Loc 0: Register <-- this is the return register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 1: Direct FP - 8 +; CHECK-NEXT: .byte 2 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short 29 +; CHECK-NEXT: .long -8 +define i64 @property_access3() nounwind ssp uwtable { +entry: + %obj = alloca i64, align 8 + %f = inttoptr i64 281474417671919 to i8* + %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 3, i32 20, i8* %f, i32 0, i64* %obj) + ret i64 %ret +} + +; anyreg_test1 +; CHECK-LABEL: .long L{{.*}}-_anyreg_test1 +; CHECK-NEXT: .short 0 +; 14 locations +; CHECK-NEXT: .short 14 +; Loc 0: Register <-- this is the return register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 1: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 2: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 3: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 4: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 5: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 6: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 7: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 8: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 9: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 10: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 11: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 12: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 13: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +define i64 @anyreg_test1(i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13) nounwind ssp uwtable { +entry: + %f = inttoptr i64 281474417671919 to i8* + %ret 
= call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 4, i32 20, i8* %f, i32 13, i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13) + ret i64 %ret +} + +; anyreg_test2 +; CHECK-LABEL: .long L{{.*}}-_anyreg_test2 +; CHECK-NEXT: .short 0 +; 14 locations +; CHECK-NEXT: .short 14 +; Loc 0: Register <-- this is the return register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 1: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 2: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 3: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 4: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 5: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 6: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 7: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 8: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 9: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 10: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 11: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 12: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 13: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +define i64 @anyreg_test2(i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13) nounwind ssp uwtable { +entry: + %f = inttoptr i64 281474417671919 to i8* + %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 20, i8* %f, i32 8, i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13) + ret i64 %ret +} + +; Test spilling the return value of an anyregcc call. +; +; [JS] Assertion: "Folded a def to a non-store!" 
+; +; CHECK-LABEL: .long L{{.*}}-_patchpoint_spilldef +; CHECK-NEXT: .short 0 +; CHECK-NEXT: .short 3 +; Loc 0: Register (some register that will be spilled to the stack) +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 1: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 1: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +define i64 @patchpoint_spilldef(i64 %p1, i64 %p2, i64 %p3, i64 %p4) { +entry: + %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 12, i32 16, i8* inttoptr (i64 0 to i8*), i32 2, i64 %p1, i64 %p2) + tail call void asm sideeffect "nop", "~{x0},~{x1},~{x2},~{x3},~{x4},~{x5},~{x6},~{x7},~{x8},~{x9},~{x10},~{x11},~{x12},~{x13},~{x14},~{x15},~{x16},~{x17},~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28},~{x29},~{x30},~{x31}"() nounwind + ret i64 %result +} + +; Test spilling the arguments of an anyregcc call. +; +; [JS] AnyRegCC argument ends up being spilled +; +; CHECK-LABEL: .long L{{.*}}-_patchpoint_spillargs +; CHECK-NEXT: .short 0 +; CHECK-NEXT: .short 5 +; Loc 0: Return a register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 1: Arg0 in a Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 2: Arg1 in a Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 3: Arg2 spilled to FP -96 +; CHECK-NEXT: .byte 3 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short 29 +; CHECK-NEXT: .long -96 +; Loc 4: Arg3 spilled to FP - 88 +; CHECK-NEXT: .byte 3 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short 29 +; CHECK-NEXT: .long -88 +define i64 @patchpoint_spillargs(i64 %p1, i64 %p2, i64 %p3, i64 %p4) { +entry: + tail call void asm sideeffect "nop", "~{x0},~{x1},~{x2},~{x3},~{x4},~{x5},~{x6},~{x7},~{x8},~{x9},~{x10},~{x11},~{x12},~{x13},~{x14},~{x15},~{x16},~{x17},~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28},~{x29},~{x30},~{x31}"() nounwind + %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 13, i32 16, i8* inttoptr (i64 0 to i8*), i32 2, i64 %p1, i64 %p2, i64 %p3, i64 %p4) + ret i64 %result +} + +declare void @llvm.experimental.patchpoint.void(i64, i32, i8*, i32, ...) +declare i64 @llvm.experimental.patchpoint.i64(i64, i32, i8*, i32, ...) 
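(For reference: each moved test above is self-contained and carries its own RUN line, so it can be exercised outside the lit driver as well. A minimal sketch, assuming an in-tree build with llc and FileCheck on PATH; the test's own RUN line is authoritative:

    llc < test/CodeGen/AArch64/arm64-anyregcc.ll -mtriple=arm64-apple-darwin \
      | FileCheck test/CodeGen/AArch64/arm64-anyregcc.ll

Equivalently, "llvm-lit test/CodeGen/AArch64/arm64-anyregcc.ll" from the build tree runs the RUN line verbatim.)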
diff --git a/test/CodeGen/AArch64/arm64-arith-saturating.ll b/test/CodeGen/AArch64/arm64-arith-saturating.ll new file mode 100644 index 00000000000..78cd1fcb1a2 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-arith-saturating.ll @@ -0,0 +1,153 @@ +; RUN: llc < %s -march=arm64 -mcpu=cyclone | FileCheck %s + +define i32 @qadds(<4 x i32> %b, <4 x i32> %c) nounwind readnone optsize ssp { +; CHECK-LABEL: qadds: +; CHECK: sqadd s0, s0, s1 + %vecext = extractelement <4 x i32> %b, i32 0 + %vecext1 = extractelement <4 x i32> %c, i32 0 + %vqadd.i = tail call i32 @llvm.aarch64.neon.sqadd.i32(i32 %vecext, i32 %vecext1) nounwind + ret i32 %vqadd.i +} + +define i64 @qaddd(<2 x i64> %b, <2 x i64> %c) nounwind readnone optsize ssp { +; CHECK-LABEL: qaddd: +; CHECK: sqadd d0, d0, d1 + %vecext = extractelement <2 x i64> %b, i32 0 + %vecext1 = extractelement <2 x i64> %c, i32 0 + %vqadd.i = tail call i64 @llvm.aarch64.neon.sqadd.i64(i64 %vecext, i64 %vecext1) nounwind + ret i64 %vqadd.i +} + +define i32 @uqadds(<4 x i32> %b, <4 x i32> %c) nounwind readnone optsize ssp { +; CHECK-LABEL: uqadds: +; CHECK: uqadd s0, s0, s1 + %vecext = extractelement <4 x i32> %b, i32 0 + %vecext1 = extractelement <4 x i32> %c, i32 0 + %vqadd.i = tail call i32 @llvm.aarch64.neon.uqadd.i32(i32 %vecext, i32 %vecext1) nounwind + ret i32 %vqadd.i +} + +define i64 @uqaddd(<2 x i64> %b, <2 x i64> %c) nounwind readnone optsize ssp { +; CHECK-LABEL: uqaddd: +; CHECK: uqadd d0, d0, d1 + %vecext = extractelement <2 x i64> %b, i32 0 + %vecext1 = extractelement <2 x i64> %c, i32 0 + %vqadd.i = tail call i64 @llvm.aarch64.neon.uqadd.i64(i64 %vecext, i64 %vecext1) nounwind + ret i64 %vqadd.i +} + +declare i64 @llvm.aarch64.neon.uqadd.i64(i64, i64) nounwind readnone +declare i32 @llvm.aarch64.neon.uqadd.i32(i32, i32) nounwind readnone +declare i64 @llvm.aarch64.neon.sqadd.i64(i64, i64) nounwind readnone +declare i32 @llvm.aarch64.neon.sqadd.i32(i32, i32) nounwind readnone + +define i32 @qsubs(<4 x i32> %b, <4 x i32> %c) nounwind readnone optsize ssp { +; CHECK-LABEL: qsubs: +; CHECK: sqsub s0, s0, s1 + %vecext = extractelement <4 x i32> %b, i32 0 + %vecext1 = extractelement <4 x i32> %c, i32 0 + %vqsub.i = tail call i32 @llvm.aarch64.neon.sqsub.i32(i32 %vecext, i32 %vecext1) nounwind + ret i32 %vqsub.i +} + +define i64 @qsubd(<2 x i64> %b, <2 x i64> %c) nounwind readnone optsize ssp { +; CHECK-LABEL: qsubd: +; CHECK: sqsub d0, d0, d1 + %vecext = extractelement <2 x i64> %b, i32 0 + %vecext1 = extractelement <2 x i64> %c, i32 0 + %vqsub.i = tail call i64 @llvm.aarch64.neon.sqsub.i64(i64 %vecext, i64 %vecext1) nounwind + ret i64 %vqsub.i +} + +define i32 @uqsubs(<4 x i32> %b, <4 x i32> %c) nounwind readnone optsize ssp { +; CHECK-LABEL: uqsubs: +; CHECK: uqsub s0, s0, s1 + %vecext = extractelement <4 x i32> %b, i32 0 + %vecext1 = extractelement <4 x i32> %c, i32 0 + %vqsub.i = tail call i32 @llvm.aarch64.neon.uqsub.i32(i32 %vecext, i32 %vecext1) nounwind + ret i32 %vqsub.i +} + +define i64 @uqsubd(<2 x i64> %b, <2 x i64> %c) nounwind readnone optsize ssp { +; CHECK-LABEL: uqsubd: +; CHECK: uqsub d0, d0, d1 + %vecext = extractelement <2 x i64> %b, i32 0 + %vecext1 = extractelement <2 x i64> %c, i32 0 + %vqsub.i = tail call i64 @llvm.aarch64.neon.uqsub.i64(i64 %vecext, i64 %vecext1) nounwind + ret i64 %vqsub.i +} + +declare i64 @llvm.aarch64.neon.uqsub.i64(i64, i64) nounwind readnone +declare i32 @llvm.aarch64.neon.uqsub.i32(i32, i32) nounwind readnone +declare i64 @llvm.aarch64.neon.sqsub.i64(i64, i64) nounwind readnone +declare i32 
@llvm.aarch64.neon.sqsub.i32(i32, i32) nounwind readnone + +define i32 @qabss(<4 x i32> %b, <4 x i32> %c) nounwind readnone { +; CHECK-LABEL: qabss: +; CHECK: sqabs s0, s0 +; CHECK: ret + %vecext = extractelement <4 x i32> %b, i32 0 + %vqabs.i = tail call i32 @llvm.aarch64.neon.sqabs.i32(i32 %vecext) nounwind + ret i32 %vqabs.i +} + +define i64 @qabsd(<2 x i64> %b, <2 x i64> %c) nounwind readnone { +; CHECK-LABEL: qabsd: +; CHECK: sqabs d0, d0 +; CHECK: ret + %vecext = extractelement <2 x i64> %b, i32 0 + %vqabs.i = tail call i64 @llvm.aarch64.neon.sqabs.i64(i64 %vecext) nounwind + ret i64 %vqabs.i +} + +define i32 @qnegs(<4 x i32> %b, <4 x i32> %c) nounwind readnone { +; CHECK-LABEL: qnegs: +; CHECK: sqneg s0, s0 +; CHECK: ret + %vecext = extractelement <4 x i32> %b, i32 0 + %vqneg.i = tail call i32 @llvm.aarch64.neon.sqneg.i32(i32 %vecext) nounwind + ret i32 %vqneg.i +} + +define i64 @qnegd(<2 x i64> %b, <2 x i64> %c) nounwind readnone { +; CHECK-LABEL: qnegd: +; CHECK: sqneg d0, d0 +; CHECK: ret + %vecext = extractelement <2 x i64> %b, i32 0 + %vqneg.i = tail call i64 @llvm.aarch64.neon.sqneg.i64(i64 %vecext) nounwind + ret i64 %vqneg.i +} + +declare i64 @llvm.aarch64.neon.sqneg.i64(i64) nounwind readnone +declare i32 @llvm.aarch64.neon.sqneg.i32(i32) nounwind readnone +declare i64 @llvm.aarch64.neon.sqabs.i64(i64) nounwind readnone +declare i32 @llvm.aarch64.neon.sqabs.i32(i32) nounwind readnone + + +define i32 @vqmovund(<2 x i64> %b) nounwind readnone { +; CHECK-LABEL: vqmovund: +; CHECK: sqxtun s0, d0 + %vecext = extractelement <2 x i64> %b, i32 0 + %vqmovun.i = tail call i32 @llvm.aarch64.neon.scalar.sqxtun.i32.i64(i64 %vecext) nounwind + ret i32 %vqmovun.i +} + +define i32 @vqmovnd_s(<2 x i64> %b) nounwind readnone { +; CHECK-LABEL: vqmovnd_s: +; CHECK: sqxtn s0, d0 + %vecext = extractelement <2 x i64> %b, i32 0 + %vqmovn.i = tail call i32 @llvm.aarch64.neon.scalar.sqxtn.i32.i64(i64 %vecext) nounwind + ret i32 %vqmovn.i +} + +define i32 @vqmovnd_u(<2 x i64> %b) nounwind readnone { +; CHECK-LABEL: vqmovnd_u: +; CHECK: uqxtn s0, d0 + %vecext = extractelement <2 x i64> %b, i32 0 + %vqmovn.i = tail call i32 @llvm.aarch64.neon.scalar.uqxtn.i32.i64(i64 %vecext) nounwind + ret i32 %vqmovn.i +} + +declare i32 @llvm.aarch64.neon.scalar.uqxtn.i32.i64(i64) nounwind readnone +declare i32 @llvm.aarch64.neon.scalar.sqxtn.i32.i64(i64) nounwind readnone +declare i32 @llvm.aarch64.neon.scalar.sqxtun.i32.i64(i64) nounwind readnone diff --git a/test/CodeGen/AArch64/arm64-arith.ll b/test/CodeGen/AArch64/arm64-arith.ll new file mode 100644 index 00000000000..ed9b569e218 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-arith.ll @@ -0,0 +1,262 @@ +; RUN: llc < %s -march=arm64 -asm-verbose=false | FileCheck %s + +define i32 @t1(i32 %a, i32 %b) nounwind readnone ssp { +entry: +; CHECK-LABEL: t1: +; CHECK: add w0, w1, w0 +; CHECK: ret + %add = add i32 %b, %a + ret i32 %add +} + +define i32 @t2(i32 %a, i32 %b) nounwind readnone ssp { +entry: +; CHECK-LABEL: t2: +; CHECK: udiv w0, w0, w1 +; CHECK: ret + %udiv = udiv i32 %a, %b + ret i32 %udiv +} + +define i64 @t3(i64 %a, i64 %b) nounwind readnone ssp { +entry: +; CHECK-LABEL: t3: +; CHECK: udiv x0, x0, x1 +; CHECK: ret + %udiv = udiv i64 %a, %b + ret i64 %udiv +} + +define i32 @t4(i32 %a, i32 %b) nounwind readnone ssp { +entry: +; CHECK-LABEL: t4: +; CHECK: sdiv w0, w0, w1 +; CHECK: ret + %sdiv = sdiv i32 %a, %b + ret i32 %sdiv +} + +define i64 @t5(i64 %a, i64 %b) nounwind readnone ssp { +entry: +; CHECK-LABEL: t5: +; CHECK: sdiv x0, x0, x1 +; CHECK: ret + 
%sdiv = sdiv i64 %a, %b + ret i64 %sdiv +} + +define i32 @t6(i32 %a, i32 %b) nounwind readnone ssp { +entry: +; CHECK-LABEL: t6: +; CHECK: lsl w0, w0, w1 +; CHECK: ret + %shl = shl i32 %a, %b + ret i32 %shl +} + +define i64 @t7(i64 %a, i64 %b) nounwind readnone ssp { +entry: +; CHECK-LABEL: t7: +; CHECK: lsl x0, x0, x1 +; CHECK: ret + %shl = shl i64 %a, %b + ret i64 %shl +} + +define i32 @t8(i32 %a, i32 %b) nounwind readnone ssp { +entry: +; CHECK-LABEL: t8: +; CHECK: lsr w0, w0, w1 +; CHECK: ret + %lshr = lshr i32 %a, %b + ret i32 %lshr +} + +define i64 @t9(i64 %a, i64 %b) nounwind readnone ssp { +entry: +; CHECK-LABEL: t9: +; CHECK: lsr x0, x0, x1 +; CHECK: ret + %lshr = lshr i64 %a, %b + ret i64 %lshr +} + +define i32 @t10(i32 %a, i32 %b) nounwind readnone ssp { +entry: +; CHECK-LABEL: t10: +; CHECK: asr w0, w0, w1 +; CHECK: ret + %ashr = ashr i32 %a, %b + ret i32 %ashr +} + +define i64 @t11(i64 %a, i64 %b) nounwind readnone ssp { +entry: +; CHECK-LABEL: t11: +; CHECK: asr x0, x0, x1 +; CHECK: ret + %ashr = ashr i64 %a, %b + ret i64 %ashr +} + +define i32 @t12(i16 %a, i32 %x) nounwind ssp { +entry: +; CHECK-LABEL: t12: +; CHECK: add w0, w1, w0, sxth +; CHECK: ret + %c = sext i16 %a to i32 + %e = add i32 %x, %c + ret i32 %e +} + +define i32 @t13(i16 %a, i32 %x) nounwind ssp { +entry: +; CHECK-LABEL: t13: +; CHECK: add w0, w1, w0, sxth #2 +; CHECK: ret + %c = sext i16 %a to i32 + %d = shl i32 %c, 2 + %e = add i32 %x, %d + ret i32 %e +} + +define i64 @t14(i16 %a, i64 %x) nounwind ssp { +entry: +; CHECK-LABEL: t14: +; CHECK: add x0, x1, w0, uxth #3 +; CHECK: ret + %c = zext i16 %a to i64 + %d = shl i64 %c, 3 + %e = add i64 %x, %d + ret i64 %e +} + +; rdar://9160598 +define i64 @t15(i64 %a, i64 %x) nounwind ssp { +entry: +; CHECK-LABEL: t15: +; CHECK: add x0, x1, w0, uxtw +; CHECK: ret + %b = and i64 %a, 4294967295 + %c = add i64 %x, %b + ret i64 %c +} + +define i64 @t16(i64 %x) nounwind ssp { +entry: +; CHECK-LABEL: t16: +; CHECK: lsl x0, x0, #1 +; CHECK: ret + %a = shl i64 %x, 1 + ret i64 %a +} + +; rdar://9166974 +define i64 @t17(i16 %a, i64 %x) nounwind ssp { +entry: +; CHECK-LABEL: t17: +; CHECK: sxth [[REG:x[0-9]+]], w0 +; CHECK: neg x0, [[REG]], lsl #32 +; CHECK: ret + %tmp16 = sext i16 %a to i64 + %tmp17 = mul i64 %tmp16, -4294967296 + ret i64 %tmp17 +} + +define i32 @t18(i32 %a, i32 %b) nounwind readnone ssp { +entry: +; CHECK-LABEL: t18: +; CHECK: sdiv w0, w0, w1 +; CHECK: ret + %sdiv = call i32 @llvm.aarch64.sdiv.i32(i32 %a, i32 %b) + ret i32 %sdiv +} + +define i64 @t19(i64 %a, i64 %b) nounwind readnone ssp { +entry: +; CHECK-LABEL: t19: +; CHECK: sdiv x0, x0, x1 +; CHECK: ret + %sdiv = call i64 @llvm.aarch64.sdiv.i64(i64 %a, i64 %b) + ret i64 %sdiv +} + +define i32 @t20(i32 %a, i32 %b) nounwind readnone ssp { +entry: +; CHECK-LABEL: t20: +; CHECK: udiv w0, w0, w1 +; CHECK: ret + %udiv = call i32 @llvm.aarch64.udiv.i32(i32 %a, i32 %b) + ret i32 %udiv +} + +define i64 @t21(i64 %a, i64 %b) nounwind readnone ssp { +entry: +; CHECK-LABEL: t21: +; CHECK: udiv x0, x0, x1 +; CHECK: ret + %udiv = call i64 @llvm.aarch64.udiv.i64(i64 %a, i64 %b) + ret i64 %udiv +} + +declare i32 @llvm.aarch64.sdiv.i32(i32, i32) nounwind readnone +declare i64 @llvm.aarch64.sdiv.i64(i64, i64) nounwind readnone +declare i32 @llvm.aarch64.udiv.i32(i32, i32) nounwind readnone +declare i64 @llvm.aarch64.udiv.i64(i64, i64) nounwind readnone + +; 32-bit not. 
+define i32 @inv_32(i32 %x) nounwind ssp { +entry: +; CHECK: inv_32 +; CHECK: mvn w0, w0 +; CHECK: ret + %inv = xor i32 %x, -1 + ret i32 %inv +} + +; 64-bit not. +define i64 @inv_64(i64 %x) nounwind ssp { +entry: +; CHECK: inv_64 +; CHECK: mvn x0, x0 +; CHECK: ret + %inv = xor i64 %x, -1 + ret i64 %inv +} + +; Multiplying by a power of two plus or minus one is better done via shift +; and add/sub rather than the madd/msub instructions. The latter are 4+ cycles, +; and the former are two (total for the two instruction sequence for subtract). +define i32 @f0(i32 %a) nounwind readnone ssp { +; CHECK-LABEL: f0: +; CHECK-NEXT: add w0, w0, w0, lsl #3 +; CHECK-NEXT: ret + %res = mul i32 %a, 9 + ret i32 %res +} + +define i64 @f1(i64 %a) nounwind readnone ssp { +; CHECK-LABEL: f1: +; CHECK-NEXT: lsl x8, x0, #4 +; CHECK-NEXT: sub x0, x8, x0 +; CHECK-NEXT: ret + %res = mul i64 %a, 15 + ret i64 %res +} + +define i32 @f2(i32 %a) nounwind readnone ssp { +; CHECK-LABEL: f2: +; CHECK-NEXT: lsl w8, w0, #3 +; CHECK-NEXT: sub w0, w8, w0 +; CHECK-NEXT: ret + %res = mul nsw i32 %a, 7 + ret i32 %res +} + +define i64 @f3(i64 %a) nounwind readnone ssp { +; CHECK-LABEL: f3: +; CHECK-NEXT: add x0, x0, x0, lsl #4 +; CHECK-NEXT: ret + %res = mul nsw i64 %a, 17 + ret i64 %res +} diff --git a/test/CodeGen/AArch64/arm64-arm64-dead-def-elimination-flag.ll b/test/CodeGen/AArch64/arm64-arm64-dead-def-elimination-flag.ll new file mode 100644 index 00000000000..0904b62c403 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-arm64-dead-def-elimination-flag.ll @@ -0,0 +1,16 @@ +; RUN: llc -march=arm64 -aarch64-dead-def-elimination=false < %s | FileCheck %s + +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "arm64-apple-ios7.0.0" + +; Function Attrs: nounwind ssp uwtable +define i32 @test1() #0 { + %tmp1 = alloca i8 + %tmp2 = icmp eq i8* %tmp1, null + %tmp3 = zext i1 %tmp2 to i32 + + ret i32 %tmp3 + + ; CHECK-LABEL: test1 + ; CHECK: adds {{x[0-9]+}}, sp, #15 +} diff --git a/test/CodeGen/AArch64/arm64-atomic-128.ll b/test/CodeGen/AArch64/arm64-atomic-128.ll new file mode 100644 index 00000000000..3b43aa16d2b --- /dev/null +++ b/test/CodeGen/AArch64/arm64-atomic-128.ll @@ -0,0 +1,225 @@ +; RUN: llc < %s -march=arm64 -mtriple=arm64-linux-gnu -verify-machineinstrs -mcpu=cyclone | FileCheck %s + +@var = global i128 0 + +define i128 @val_compare_and_swap(i128* %p, i128 %oldval, i128 %newval) { +; CHECK-LABEL: val_compare_and_swap: +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldaxp [[RESULTLO:x[0-9]+]], [[RESULTHI:x[0-9]+]], [x[[ADDR:[0-9]+]]] +; CHECK-DAG: eor [[MISMATCH_LO:x[0-9]+]], [[RESULTLO]], x2 +; CHECK-DAG: eor [[MISMATCH_HI:x[0-9]+]], [[RESULTHI]], x3 +; CHECK: orr [[MISMATCH:x[0-9]+]], [[MISMATCH_LO]], [[MISMATCH_HI]] +; CHECK: cbnz [[MISMATCH]], [[DONE:.LBB[0-9]+_[0-9]+]] +; CHECK: stxp [[SCRATCH_RES:w[0-9]+]], x4, x5, [x[[ADDR]]] +; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]] +; CHECK: [[DONE]]: + %val = cmpxchg i128* %p, i128 %oldval, i128 %newval acquire acquire + ret i128 %val +} + +define void @fetch_and_nand(i128* %p, i128 %bits) { +; CHECK-LABEL: fetch_and_nand: +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldxp [[DEST_REGLO:x[0-9]+]], [[DEST_REGHI:x[0-9]+]], [x0] +; CHECK-DAG: bic [[SCRATCH_REGLO:x[0-9]+]], [[DEST_REGLO]], x2 +; CHECK-DAG: bic [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3 +; CHECK: stlxp [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0] +; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]] + +; CHECK-DAG: str [[DEST_REGHI]] +; CHECK-DAG: str [[DEST_REGLO]] + %val = 
atomicrmw nand i128* %p, i128 %bits release + store i128 %val, i128* @var, align 16 + ret void +} + +define void @fetch_and_or(i128* %p, i128 %bits) { +; CHECK-LABEL: fetch_and_or: +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldaxp [[DEST_REGLO:x[0-9]+]], [[DEST_REGHI:x[0-9]+]], [x0] +; CHECK-DAG: orr [[SCRATCH_REGLO:x[0-9]+]], [[DEST_REGLO]], x2 +; CHECK-DAG: orr [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3 +; CHECK: stlxp [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0] +; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]] + +; CHECK-DAG: str [[DEST_REGHI]] +; CHECK-DAG: str [[DEST_REGLO]] + %val = atomicrmw or i128* %p, i128 %bits seq_cst + store i128 %val, i128* @var, align 16 + ret void +} + +define void @fetch_and_add(i128* %p, i128 %bits) { +; CHECK-LABEL: fetch_and_add: +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldaxp [[DEST_REGLO:x[0-9]+]], [[DEST_REGHI:x[0-9]+]], [x0] +; CHECK: adds [[SCRATCH_REGLO:x[0-9]+]], [[DEST_REGLO]], x2 +; CHECK: adcs [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3 +; CHECK: stlxp [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0] +; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]] + +; CHECK-DAG: str [[DEST_REGHI]] +; CHECK-DAG: str [[DEST_REGLO]] + %val = atomicrmw add i128* %p, i128 %bits seq_cst + store i128 %val, i128* @var, align 16 + ret void +} + +define void @fetch_and_sub(i128* %p, i128 %bits) { +; CHECK-LABEL: fetch_and_sub: +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldaxp [[DEST_REGLO:x[0-9]+]], [[DEST_REGHI:x[0-9]+]], [x0] +; CHECK: subs [[SCRATCH_REGLO:x[0-9]+]], [[DEST_REGLO]], x2 +; CHECK: sbcs [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3 +; CHECK: stlxp [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0] +; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]] + +; CHECK-DAG: str [[DEST_REGHI]] +; CHECK-DAG: str [[DEST_REGLO]] + %val = atomicrmw sub i128* %p, i128 %bits seq_cst + store i128 %val, i128* @var, align 16 + ret void +} + +define void @fetch_and_min(i128* %p, i128 %bits) { +; CHECK-LABEL: fetch_and_min: +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldaxp [[DEST_REGLO:x[0-9]+]], [[DEST_REGHI:x[0-9]+]], [x0] +; CHECK: cmp [[DEST_REGLO]], x2 +; CHECK: cset [[LOCMP:w[0-9]+]], ls +; CHECK: cmp [[DEST_REGHI:x[0-9]+]], x3 +; CHECK: cset [[HICMP:w[0-9]+]], le +; CHECK: csel [[CMP:w[0-9]+]], [[LOCMP]], [[HICMP]], eq +; CHECK: cmp [[CMP]], #0 +; CHECK-DAG: csel [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3, ne +; CHECK-DAG: csel [[SCRATCH_REGLO:x[0-9]+]], [[DEST_REGLO]], x2, ne +; CHECK: stlxp [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0] +; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]] + +; CHECK-DAG: str [[DEST_REGHI]] +; CHECK-DAG: str [[DEST_REGLO]] + %val = atomicrmw min i128* %p, i128 %bits seq_cst + store i128 %val, i128* @var, align 16 + ret void +} + +define void @fetch_and_max(i128* %p, i128 %bits) { +; CHECK-LABEL: fetch_and_max: +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldaxp [[DEST_REGLO:x[0-9]+]], [[DEST_REGHI:x[0-9]+]], [x0] +; CHECK: cmp [[DEST_REGLO]], x2 +; CHECK: cset [[LOCMP:w[0-9]+]], hi +; CHECK: cmp [[DEST_REGHI:x[0-9]+]], x3 +; CHECK: cset [[HICMP:w[0-9]+]], gt +; CHECK: csel [[CMP:w[0-9]+]], [[LOCMP]], [[HICMP]], eq +; CHECK: cmp [[CMP]], #0 +; CHECK-DAG: csel [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3, ne +; CHECK-DAG: csel [[SCRATCH_REGLO:x[0-9]+]], [[DEST_REGLO]], x2, ne +; CHECK: stlxp [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0] +; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]] + +; CHECK-DAG: str [[DEST_REGHI]] +; CHECK-DAG: str 
[[DEST_REGLO]] + %val = atomicrmw max i128* %p, i128 %bits seq_cst + store i128 %val, i128* @var, align 16 + ret void +} + +define void @fetch_and_umin(i128* %p, i128 %bits) { +; CHECK-LABEL: fetch_and_umin: +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldaxp [[DEST_REGLO:x[0-9]+]], [[DEST_REGHI:x[0-9]+]], [x0] +; CHECK: cmp [[DEST_REGLO]], x2 +; CHECK: cset [[LOCMP:w[0-9]+]], ls +; CHECK: cmp [[DEST_REGHI:x[0-9]+]], x3 +; CHECK: cset [[HICMP:w[0-9]+]], ls +; CHECK: csel [[CMP:w[0-9]+]], [[LOCMP]], [[HICMP]], eq +; CHECK: cmp [[CMP]], #0 +; CHECK-DAG: csel [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3, ne +; CHECK-DAG: csel [[SCRATCH_REGLO:x[0-9]+]], [[DEST_REGLO]], x2, ne +; CHECK: stlxp [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0] +; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]] + +; CHECK-DAG: str [[DEST_REGHI]] +; CHECK-DAG: str [[DEST_REGLO]] + %val = atomicrmw umin i128* %p, i128 %bits seq_cst + store i128 %val, i128* @var, align 16 + ret void +} + +define void @fetch_and_umax(i128* %p, i128 %bits) { +; CHECK-LABEL: fetch_and_umax: +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldaxp [[DEST_REGLO:x[0-9]+]], [[DEST_REGHI:x[0-9]+]], [x0] +; CHECK: cmp [[DEST_REGLO]], x2 +; CHECK: cset [[LOCMP:w[0-9]+]], hi +; CHECK: cmp [[DEST_REGHI:x[0-9]+]], x3 +; CHECK: cset [[HICMP:w[0-9]+]], hi +; CHECK: csel [[CMP:w[0-9]+]], [[LOCMP]], [[HICMP]], eq +; CHECK: cmp [[CMP]], #0 +; CHECK-DAG: csel [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3, ne +; CHECK-DAG: csel [[SCRATCH_REGLO:x[0-9]+]], [[DEST_REGLO]], x2, ne +; CHECK: stlxp [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0] +; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]] + +; CHECK-DAG: str [[DEST_REGHI]] +; CHECK-DAG: str [[DEST_REGLO]] + %val = atomicrmw umax i128* %p, i128 %bits seq_cst + store i128 %val, i128* @var, align 16 + ret void +} + +define i128 @atomic_load_seq_cst(i128* %p) { +; CHECK-LABEL: atomic_load_seq_cst: +; CHECK-NOT: dmb +; CHECK-LABEL: ldaxp +; CHECK-NOT: dmb + %r = load atomic i128* %p seq_cst, align 16 + ret i128 %r +} + +define i128 @atomic_load_relaxed(i128* %p) { +; CHECK-LABEL: atomic_load_relaxed: +; CHECK-NOT: dmb +; CHECK: ldxp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0] +; CHECK-NOT: dmb + %r = load atomic i128* %p monotonic, align 16 + ret i128 %r +} + + +define void @atomic_store_seq_cst(i128 %in, i128* %p) { +; CHECK-LABEL: atomic_store_seq_cst: +; CHECK-NOT: dmb +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldaxp xzr, xzr, [x2] +; CHECK: stlxp [[SUCCESS:w[0-9]+]], x0, x1, [x2] +; CHECK: cbnz [[SUCCESS]], [[LABEL]] +; CHECK-NOT: dmb + store atomic i128 %in, i128* %p seq_cst, align 16 + ret void +} + +define void @atomic_store_release(i128 %in, i128* %p) { +; CHECK-LABEL: atomic_store_release: +; CHECK-NOT: dmb +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldxp xzr, xzr, [x2] +; CHECK: stlxp [[SUCCESS:w[0-9]+]], x0, x1, [x2] +; CHECK: cbnz [[SUCCESS]], [[LABEL]] +; CHECK-NOT: dmb + store atomic i128 %in, i128* %p release, align 16 + ret void +} + +define void @atomic_store_relaxed(i128 %in, i128* %p) { +; CHECK-LABEL: atomic_store_relaxed: +; CHECK-NOT: dmb +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldxp xzr, xzr, [x2] +; CHECK: stxp [[SUCCESS:w[0-9]+]], x0, x1, [x2] +; CHECK: cbnz [[SUCCESS]], [[LABEL]] +; CHECK-NOT: dmb + store atomic i128 %in, i128* %p unordered, align 16 + ret void +} diff --git a/test/CodeGen/AArch64/arm64-atomic.ll b/test/CodeGen/AArch64/arm64-atomic.ll new file mode 100644 index 00000000000..aa9b284410b --- /dev/null +++ 
b/test/CodeGen/AArch64/arm64-atomic.ll @@ -0,0 +1,331 @@ +; RUN: llc < %s -march=arm64 -verify-machineinstrs -mcpu=cyclone | FileCheck %s + +define i32 @val_compare_and_swap(i32* %p) { +; CHECK-LABEL: val_compare_and_swap: +; CHECK: orr [[NEWVAL_REG:w[0-9]+]], wzr, #0x4 +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldaxr [[RESULT:w[0-9]+]], [x0] +; CHECK: cmp [[RESULT]], #7 +; CHECK: b.ne [[LABEL2:.?LBB[0-9]+_[0-9]+]] +; CHECK: stxr [[SCRATCH_REG:w[0-9]+]], [[NEWVAL_REG]], [x0] +; CHECK: cbnz [[SCRATCH_REG]], [[LABEL]] +; CHECK: [[LABEL2]]: + %val = cmpxchg i32* %p, i32 7, i32 4 acquire acquire + ret i32 %val +} + +define i64 @val_compare_and_swap_64(i64* %p) { +; CHECK-LABEL: val_compare_and_swap_64: +; CHECK: orr w[[NEWVAL_REG:[0-9]+]], wzr, #0x4 +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldxr [[RESULT:x[0-9]+]], [x0] +; CHECK: cmp [[RESULT]], #7 +; CHECK: b.ne [[LABEL2:.?LBB[0-9]+_[0-9]+]] +; CHECK-NOT: stxr x[[NEWVAL_REG]], x[[NEWVAL_REG]] +; CHECK: stxr [[SCRATCH_REG:w[0-9]+]], x[[NEWVAL_REG]], [x0] +; CHECK: cbnz [[SCRATCH_REG]], [[LABEL]] +; CHECK: [[LABEL2]]: + %val = cmpxchg i64* %p, i64 7, i64 4 monotonic monotonic + ret i64 %val +} + +define i32 @fetch_and_nand(i32* %p) { +; CHECK-LABEL: fetch_and_nand: +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldxr w[[DEST_REG:[0-9]+]], [x0] +; CHECK: and [[SCRATCH2_REG:w[0-9]+]], w[[DEST_REG]], #0xfffffff8 +; CHECK-NOT: stlxr [[SCRATCH2_REG]], [[SCRATCH2_REG]] +; CHECK: stlxr [[SCRATCH_REG:w[0-9]+]], [[SCRATCH2_REG]], [x0] +; CHECK: cbnz [[SCRATCH_REG]], [[LABEL]] +; CHECK: mov x0, x[[DEST_REG]] + %val = atomicrmw nand i32* %p, i32 7 release + ret i32 %val +} + +define i64 @fetch_and_nand_64(i64* %p) { +; CHECK-LABEL: fetch_and_nand_64: +; CHECK: mov x[[ADDR:[0-9]+]], x0 +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldaxr [[DEST_REG:x[0-9]+]], [x[[ADDR]]] +; CHECK: and [[SCRATCH2_REG:x[0-9]+]], [[DEST_REG]], #0xfffffffffffffff8 +; CHECK: stlxr [[SCRATCH_REG:w[0-9]+]], [[SCRATCH2_REG]], [x[[ADDR]]] +; CHECK: cbnz [[SCRATCH_REG]], [[LABEL]] + + %val = atomicrmw nand i64* %p, i64 7 acq_rel + ret i64 %val +} + +define i32 @fetch_and_or(i32* %p) { +; CHECK-LABEL: fetch_and_or: +; CHECK: movz [[OLDVAL_REG:w[0-9]+]], #0x5 +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldaxr w[[DEST_REG:[0-9]+]], [x0] +; CHECK: orr [[SCRATCH2_REG:w[0-9]+]], w[[DEST_REG]], [[OLDVAL_REG]] +; CHECK-NOT: stlxr [[SCRATCH2_REG]], [[SCRATCH2_REG]] +; CHECK: stlxr [[SCRATCH_REG:w[0-9]+]], [[SCRATCH2_REG]], [x0] +; CHECK: cbnz [[SCRATCH_REG]], [[LABEL]] +; CHECK: mov x0, x[[DEST_REG]] + %val = atomicrmw or i32* %p, i32 5 seq_cst + ret i32 %val +} + +define i64 @fetch_and_or_64(i64* %p) { +; CHECK: fetch_and_or_64: +; CHECK: mov x[[ADDR:[0-9]+]], x0 +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldxr [[DEST_REG:x[0-9]+]], [x[[ADDR]]] +; CHECK: orr [[SCRATCH2_REG:x[0-9]+]], [[DEST_REG]], #0x7 +; CHECK: stxr [[SCRATCH_REG:w[0-9]+]], [[SCRATCH2_REG]], [x[[ADDR]]] +; CHECK: cbnz [[SCRATCH_REG]], [[LABEL]] + %val = atomicrmw or i64* %p, i64 7 monotonic + ret i64 %val +} + +define void @acquire_fence() { + fence acquire + ret void + ; CHECK-LABEL: acquire_fence: + ; CHECK: dmb ishld +} + +define void @release_fence() { + fence release + ret void + ; CHECK-LABEL: release_fence: + ; CHECK: dmb ish{{$}} +} + +define void @seq_cst_fence() { + fence seq_cst + ret void + ; CHECK-LABEL: seq_cst_fence: + ; CHECK: dmb ish{{$}} +} + +define i32 @atomic_load(i32* %p) { + %r = load atomic i32* %p seq_cst, align 4 + ret i32 %r + ; CHECK-LABEL: atomic_load: + ; CHECK: 
ldar +} + +define i8 @atomic_load_relaxed_8(i8* %p, i32 %off32) { +; CHECK-LABEL: atomic_load_relaxed_8: + %ptr_unsigned = getelementptr i8* %p, i32 4095 + %val_unsigned = load atomic i8* %ptr_unsigned monotonic, align 1 +; CHECK: ldrb {{w[0-9]+}}, [x0, #4095] + + %ptr_regoff = getelementptr i8* %p, i32 %off32 + %val_regoff = load atomic i8* %ptr_regoff unordered, align 1 + %tot1 = add i8 %val_unsigned, %val_regoff +; CHECK: ldrb {{w[0-9]+}}, [x0, w1, sxtw] + + %ptr_unscaled = getelementptr i8* %p, i32 -256 + %val_unscaled = load atomic i8* %ptr_unscaled monotonic, align 1 + %tot2 = add i8 %tot1, %val_unscaled +; CHECK: ldurb {{w[0-9]+}}, [x0, #-256] + + %ptr_random = getelementptr i8* %p, i32 1191936 ; 0x123000 (i.e. ADD imm) + %val_random = load atomic i8* %ptr_random unordered, align 1 + %tot3 = add i8 %tot2, %val_random +; CHECK: add x[[ADDR:[0-9]+]], x0, #291, lsl #12 +; CHECK: ldrb {{w[0-9]+}}, [x[[ADDR]]] + + ret i8 %tot3 +} + +define i16 @atomic_load_relaxed_16(i16* %p, i32 %off32) { +; CHECK-LABEL: atomic_load_relaxed_16: + %ptr_unsigned = getelementptr i16* %p, i32 4095 + %val_unsigned = load atomic i16* %ptr_unsigned monotonic, align 2 +; CHECK: ldrh {{w[0-9]+}}, [x0, #8190] + + %ptr_regoff = getelementptr i16* %p, i32 %off32 + %val_regoff = load atomic i16* %ptr_regoff unordered, align 2 + %tot1 = add i16 %val_unsigned, %val_regoff +; CHECK: ldrh {{w[0-9]+}}, [x0, w1, sxtw #1] + + %ptr_unscaled = getelementptr i16* %p, i32 -128 + %val_unscaled = load atomic i16* %ptr_unscaled monotonic, align 2 + %tot2 = add i16 %tot1, %val_unscaled +; CHECK: ldurh {{w[0-9]+}}, [x0, #-256] + + %ptr_random = getelementptr i16* %p, i32 595968 ; 0x123000/2 (i.e. ADD imm) + %val_random = load atomic i16* %ptr_random unordered, align 2 + %tot3 = add i16 %tot2, %val_random +; CHECK: add x[[ADDR:[0-9]+]], x0, #291, lsl #12 +; CHECK: ldrh {{w[0-9]+}}, [x[[ADDR]]] + + ret i16 %tot3 +} + +define i32 @atomic_load_relaxed_32(i32* %p, i32 %off32) { +; CHECK-LABEL: atomic_load_relaxed_32: + %ptr_unsigned = getelementptr i32* %p, i32 4095 + %val_unsigned = load atomic i32* %ptr_unsigned monotonic, align 4 +; CHECK: ldr {{w[0-9]+}}, [x0, #16380] + + %ptr_regoff = getelementptr i32* %p, i32 %off32 + %val_regoff = load atomic i32* %ptr_regoff unordered, align 4 + %tot1 = add i32 %val_unsigned, %val_regoff +; CHECK: ldr {{w[0-9]+}}, [x0, w1, sxtw #2] + + %ptr_unscaled = getelementptr i32* %p, i32 -64 + %val_unscaled = load atomic i32* %ptr_unscaled monotonic, align 4 + %tot2 = add i32 %tot1, %val_unscaled +; CHECK: ldur {{w[0-9]+}}, [x0, #-256] + + %ptr_random = getelementptr i32* %p, i32 297984 ; 0x123000/4 (i.e. 
ADD imm) + %val_random = load atomic i32* %ptr_random unordered, align 4 + %tot3 = add i32 %tot2, %val_random +; CHECK: add x[[ADDR:[0-9]+]], x0, #291, lsl #12 +; CHECK: ldr {{w[0-9]+}}, [x[[ADDR]]] + + ret i32 %tot3 +} + +define i64 @atomic_load_relaxed_64(i64* %p, i32 %off32) { +; CHECK-LABEL: atomic_load_relaxed_64: + %ptr_unsigned = getelementptr i64* %p, i32 4095 + %val_unsigned = load atomic i64* %ptr_unsigned monotonic, align 8 +; CHECK: ldr {{x[0-9]+}}, [x0, #32760] + + %ptr_regoff = getelementptr i64* %p, i32 %off32 + %val_regoff = load atomic i64* %ptr_regoff unordered, align 8 + %tot1 = add i64 %val_unsigned, %val_regoff +; CHECK: ldr {{x[0-9]+}}, [x0, w1, sxtw #3] + + %ptr_unscaled = getelementptr i64* %p, i32 -32 + %val_unscaled = load atomic i64* %ptr_unscaled monotonic, align 8 + %tot2 = add i64 %tot1, %val_unscaled +; CHECK: ldur {{x[0-9]+}}, [x0, #-256] + + %ptr_random = getelementptr i64* %p, i32 148992 ; 0x123000/8 (i.e. ADD imm) + %val_random = load atomic i64* %ptr_random unordered, align 8 + %tot3 = add i64 %tot2, %val_random +; CHECK: add x[[ADDR:[0-9]+]], x0, #291, lsl #12 +; CHECK: ldr {{x[0-9]+}}, [x[[ADDR]]] + + ret i64 %tot3 +} + + +define void @atomc_store(i32* %p) { + store atomic i32 4, i32* %p seq_cst, align 4 + ret void + ; CHECK-LABEL: atomc_store: + ; CHECK: stlr +} + +define void @atomic_store_relaxed_8(i8* %p, i32 %off32, i8 %val) { +; CHECK-LABEL: atomic_store_relaxed_8: + %ptr_unsigned = getelementptr i8* %p, i32 4095 + store atomic i8 %val, i8* %ptr_unsigned monotonic, align 1 +; CHECK: strb {{w[0-9]+}}, [x0, #4095] + + %ptr_regoff = getelementptr i8* %p, i32 %off32 + store atomic i8 %val, i8* %ptr_regoff unordered, align 1 +; CHECK: strb {{w[0-9]+}}, [x0, w1, sxtw] + + %ptr_unscaled = getelementptr i8* %p, i32 -256 + store atomic i8 %val, i8* %ptr_unscaled monotonic, align 1 +; CHECK: sturb {{w[0-9]+}}, [x0, #-256] + + %ptr_random = getelementptr i8* %p, i32 1191936 ; 0x123000 (i.e. ADD imm) + store atomic i8 %val, i8* %ptr_random unordered, align 1 +; CHECK: add x[[ADDR:[0-9]+]], x0, #291, lsl #12 +; CHECK: strb {{w[0-9]+}}, [x[[ADDR]]] + + ret void +} + +define void @atomic_store_relaxed_16(i16* %p, i32 %off32, i16 %val) { +; CHECK-LABEL: atomic_store_relaxed_16: + %ptr_unsigned = getelementptr i16* %p, i32 4095 + store atomic i16 %val, i16* %ptr_unsigned monotonic, align 2 +; CHECK: strh {{w[0-9]+}}, [x0, #8190] + + %ptr_regoff = getelementptr i16* %p, i32 %off32 + store atomic i16 %val, i16* %ptr_regoff unordered, align 2 +; CHECK: strh {{w[0-9]+}}, [x0, w1, sxtw #1] + + %ptr_unscaled = getelementptr i16* %p, i32 -128 + store atomic i16 %val, i16* %ptr_unscaled monotonic, align 2 +; CHECK: sturh {{w[0-9]+}}, [x0, #-256] + + %ptr_random = getelementptr i16* %p, i32 595968 ; 0x123000/2 (i.e. 
ADD imm) + store atomic i16 %val, i16* %ptr_random unordered, align 2 +; CHECK: add x[[ADDR:[0-9]+]], x0, #291, lsl #12 +; CHECK: strh {{w[0-9]+}}, [x[[ADDR]]] + + ret void +} + +define void @atomic_store_relaxed_32(i32* %p, i32 %off32, i32 %val) { +; CHECK-LABEL: atomic_store_relaxed_32: + %ptr_unsigned = getelementptr i32* %p, i32 4095 + store atomic i32 %val, i32* %ptr_unsigned monotonic, align 4 +; CHECK: str {{w[0-9]+}}, [x0, #16380] + + %ptr_regoff = getelementptr i32* %p, i32 %off32 + store atomic i32 %val, i32* %ptr_regoff unordered, align 4 +; CHECK: str {{w[0-9]+}}, [x0, w1, sxtw #2] + + %ptr_unscaled = getelementptr i32* %p, i32 -64 + store atomic i32 %val, i32* %ptr_unscaled monotonic, align 4 +; CHECK: stur {{w[0-9]+}}, [x0, #-256] + + %ptr_random = getelementptr i32* %p, i32 297984 ; 0x123000/4 (i.e. ADD imm) + store atomic i32 %val, i32* %ptr_random unordered, align 4 +; CHECK: add x[[ADDR:[0-9]+]], x0, #291, lsl #12 +; CHECK: str {{w[0-9]+}}, [x[[ADDR]]] + + ret void +} + +define void @atomic_store_relaxed_64(i64* %p, i32 %off32, i64 %val) { +; CHECK-LABEL: atomic_store_relaxed_64: + %ptr_unsigned = getelementptr i64* %p, i32 4095 + store atomic i64 %val, i64* %ptr_unsigned monotonic, align 8 +; CHECK: str {{x[0-9]+}}, [x0, #32760] + + %ptr_regoff = getelementptr i64* %p, i32 %off32 + store atomic i64 %val, i64* %ptr_regoff unordered, align 8 +; CHECK: str {{x[0-9]+}}, [x0, w1, sxtw #3] + + %ptr_unscaled = getelementptr i64* %p, i32 -32 + store atomic i64 %val, i64* %ptr_unscaled monotonic, align 8 +; CHECK: stur {{x[0-9]+}}, [x0, #-256] + + %ptr_random = getelementptr i64* %p, i32 148992 ; 0x123000/8 (i.e. ADD imm) + store atomic i64 %val, i64* %ptr_random unordered, align 8 +; CHECK: add x[[ADDR:[0-9]+]], x0, #291, lsl #12 +; CHECK: str {{x[0-9]+}}, [x[[ADDR]]] + + ret void +} + +; rdar://11531169 +; rdar://11531308 + +%"class.X::Atomic" = type { %struct.x_atomic_t } +%struct.x_atomic_t = type { i32 } + +@counter = external hidden global %"class.X::Atomic", align 4 + +define i32 @next_id() nounwind optsize ssp align 2 { +entry: + %0 = atomicrmw add i32* getelementptr inbounds (%"class.X::Atomic"* @counter, i64 0, i32 0, i32 0), i32 1 seq_cst + %add.i = add i32 %0, 1 + %tobool = icmp eq i32 %add.i, 0 + br i1 %tobool, label %if.else, label %return + +if.else: ; preds = %entry + %1 = atomicrmw add i32* getelementptr inbounds (%"class.X::Atomic"* @counter, i64 0, i32 0, i32 0), i32 1 seq_cst + %add.i2 = add i32 %1, 1 + br label %return + +return: ; preds = %if.else, %entry + %retval.0 = phi i32 [ %add.i2, %if.else ], [ %add.i, %entry ] + ret i32 %retval.0 +} diff --git a/test/CodeGen/AArch64/arm64-basic-pic.ll b/test/CodeGen/AArch64/arm64-basic-pic.ll new file mode 100644 index 00000000000..9fdb1e91385 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-basic-pic.ll @@ -0,0 +1,54 @@ +; RUN: llc -mtriple=arm64-none-linux-gnu -verify-machineinstrs -relocation-model=pic %s -o - | FileCheck %s + +@var = global i32 0 + +define i32 @get_globalvar() { +; CHECK-LABEL: get_globalvar: + + %val = load i32* @var +; CHECK: adrp x[[GOTHI:[0-9]+]], :got:var +; CHECK: ldr x[[GOTLOC:[0-9]+]], [x[[GOTHI]], :got_lo12:var] +; CHECK: ldr w0, [x[[GOTLOC]]] + + ret i32 %val +} + +define i32* @get_globalvaraddr() { +; CHECK-LABEL: get_globalvaraddr: + + %val = load i32* @var +; CHECK: adrp x[[GOTHI:[0-9]+]], :got:var +; CHECK: ldr x0, [x[[GOTHI]], :got_lo12:var] + + ret i32* @var +} + +@hiddenvar = hidden global i32 0 + +define i32 @get_hiddenvar() { +; CHECK-LABEL: get_hiddenvar: + + %val = load i32* 
@hiddenvar +; CHECK: adrp x[[HI:[0-9]+]], hiddenvar +; CHECK: ldr w0, [x[[HI]], :lo12:hiddenvar] + + ret i32 %val +} + +define i32* @get_hiddenvaraddr() { +; CHECK-LABEL: get_hiddenvaraddr: + + %val = load i32* @hiddenvar +; CHECK: adrp [[HI:x[0-9]+]], hiddenvar +; CHECK: add x0, [[HI]], :lo12:hiddenvar + + ret i32* @hiddenvar +} + +define void()* @get_func() { +; CHECK-LABEL: get_func: + + ret void()* bitcast(void()*()* @get_func to void()*) +; CHECK: adrp x[[GOTHI:[0-9]+]], :got:get_func +; CHECK: ldr x0, [x[[GOTHI]], :got_lo12:get_func] +} diff --git a/test/CodeGen/AArch64/arm64-big-endian-bitconverts.ll b/test/CodeGen/AArch64/arm64-big-endian-bitconverts.ll new file mode 100644 index 00000000000..f0e968b2c17 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-big-endian-bitconverts.ll @@ -0,0 +1,1101 @@ +; RUN: llc -mtriple arm64_be < %s -aarch64-load-store-opt=false -O1 -o - | FileCheck %s +; RUN: llc -mtriple arm64_be < %s -aarch64-load-store-opt=false -O0 -fast-isel=true -o - | FileCheck %s + +; CHECK-LABEL: test_i64_f64: +define void @test_i64_f64(double* %p, i64* %q) { +; CHECK: ldr +; CHECK: str + %1 = load double* %p + %2 = fadd double %1, %1 + %3 = bitcast double %2 to i64 + %4 = add i64 %3, %3 + store i64 %4, i64* %q + ret void +} + +; CHECK-LABEL: test_i64_v1i64: +define void @test_i64_v1i64(<1 x i64>* %p, i64* %q) { +; CHECK: ldr +; CHECK: str + %1 = load <1 x i64>* %p + %2 = add <1 x i64> %1, %1 + %3 = bitcast <1 x i64> %2 to i64 + %4 = add i64 %3, %3 + store i64 %4, i64* %q + ret void +} + +; CHECK-LABEL: test_i64_v2f32: +define void @test_i64_v2f32(<2 x float>* %p, i64* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2s } +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: str + %1 = load <2 x float>* %p + %2 = fadd <2 x float> %1, %1 + %3 = bitcast <2 x float> %2 to i64 + %4 = add i64 %3, %3 + store i64 %4, i64* %q + ret void +} + +; CHECK-LABEL: test_i64_v2i32: +define void @test_i64_v2i32(<2 x i32>* %p, i64* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2s } +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: str + %1 = load <2 x i32>* %p + %2 = add <2 x i32> %1, %1 + %3 = bitcast <2 x i32> %2 to i64 + %4 = add i64 %3, %3 + store i64 %4, i64* %q + ret void +} + +; CHECK-LABEL: test_i64_v4i16: +define void @test_i64_v4i16(<4 x i16>* %p, i64* %q) { +; CHECK: ld1 { v{{[0-9]+}}.4h } +; CHECK: rev64 v{{[0-9]+}}.4h +; CHECK: str + %1 = load <4 x i16>* %p + %2 = add <4 x i16> %1, %1 + %3 = bitcast <4 x i16> %2 to i64 + %4 = add i64 %3, %3 + store i64 %4, i64* %q + ret void +} + +; CHECK-LABEL: test_i64_v8i8: +define void @test_i64_v8i8(<8 x i8>* %p, i64* %q) { +; CHECK: ld1 { v{{[0-9]+}}.8b } +; CHECK: rev64 v{{[0-9]+}}.8b +; CHECK: str + %1 = load <8 x i8>* %p + %2 = add <8 x i8> %1, %1 + %3 = bitcast <8 x i8> %2 to i64 + %4 = add i64 %3, %3 + store i64 %4, i64* %q + ret void +} + +; CHECK-LABEL: test_f64_i64: +define void @test_f64_i64(i64* %p, double* %q) { +; CHECK: ldr +; CHECK: str + %1 = load i64* %p + %2 = add i64 %1, %1 + %3 = bitcast i64 %2 to double + %4 = fadd double %3, %3 + store double %4, double* %q + ret void +} + +; CHECK-LABEL: test_f64_v1i64: +define void @test_f64_v1i64(<1 x i64>* %p, double* %q) { +; CHECK: ldr +; CHECK: str + %1 = load <1 x i64>* %p + %2 = add <1 x i64> %1, %1 + %3 = bitcast <1 x i64> %2 to double + %4 = fadd double %3, %3 + store double %4, double* %q + ret void +} + +; CHECK-LABEL: test_f64_v2f32: +define void @test_f64_v2f32(<2 x float>* %p, double* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2s } +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: str + %1 = load <2 x float>* %p + %2 = fadd <2 x float> 
%1, %1 + %3 = bitcast <2 x float> %2 to double + %4 = fadd double %3, %3 + store double %4, double* %q + ret void +} + +; CHECK-LABEL: test_f64_v2i32: +define void @test_f64_v2i32(<2 x i32>* %p, double* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2s } +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: str + %1 = load <2 x i32>* %p + %2 = add <2 x i32> %1, %1 + %3 = bitcast <2 x i32> %2 to double + %4 = fadd double %3, %3 + store double %4, double* %q + ret void +} + +; CHECK-LABEL: test_f64_v4i16: +define void @test_f64_v4i16(<4 x i16>* %p, double* %q) { +; CHECK: ld1 { v{{[0-9]+}}.4h } +; CHECK: rev64 v{{[0-9]+}}.4h +; CHECK: str + %1 = load <4 x i16>* %p + %2 = add <4 x i16> %1, %1 + %3 = bitcast <4 x i16> %2 to double + %4 = fadd double %3, %3 + store double %4, double* %q + ret void +} + +; CHECK-LABEL: test_f64_v8i8: +define void @test_f64_v8i8(<8 x i8>* %p, double* %q) { +; CHECK: ld1 { v{{[0-9]+}}.8b } +; CHECK: rev64 v{{[0-9]+}}.8b +; CHECK: str + %1 = load <8 x i8>* %p + %2 = add <8 x i8> %1, %1 + %3 = bitcast <8 x i8> %2 to double + %4 = fadd double %3, %3 + store double %4, double* %q + ret void +} + +; CHECK-LABEL: test_v1i64_i64: +define void @test_v1i64_i64(i64* %p, <1 x i64>* %q) { +; CHECK: ldr +; CHECK: str + %1 = load i64* %p + %2 = add i64 %1, %1 + %3 = bitcast i64 %2 to <1 x i64> + %4 = add <1 x i64> %3, %3 + store <1 x i64> %4, <1 x i64>* %q + ret void +} + +; CHECK-LABEL: test_v1i64_f64: +define void @test_v1i64_f64(double* %p, <1 x i64>* %q) { +; CHECK: ldr +; CHECK: str + %1 = load double* %p + %2 = fadd double %1, %1 + %3 = bitcast double %2 to <1 x i64> + %4 = add <1 x i64> %3, %3 + store <1 x i64> %4, <1 x i64>* %q + ret void +} + +; CHECK-LABEL: test_v1i64_v2f32: +define void @test_v1i64_v2f32(<2 x float>* %p, <1 x i64>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2s } +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: str + %1 = load <2 x float>* %p + %2 = fadd <2 x float> %1, %1 + %3 = bitcast <2 x float> %2 to <1 x i64> + %4 = add <1 x i64> %3, %3 + store <1 x i64> %4, <1 x i64>* %q + ret void +} + +; CHECK-LABEL: test_v1i64_v2i32: +define void @test_v1i64_v2i32(<2 x i32>* %p, <1 x i64>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2s } +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: str + %1 = load <2 x i32>* %p + %2 = add <2 x i32> %1, %1 + %3 = bitcast <2 x i32> %2 to <1 x i64> + %4 = add <1 x i64> %3, %3 + store <1 x i64> %4, <1 x i64>* %q + ret void +} + +; CHECK-LABEL: test_v1i64_v4i16: +define void @test_v1i64_v4i16(<4 x i16>* %p, <1 x i64>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.4h } +; CHECK: rev64 v{{[0-9]+}}.4h +; CHECK: str + %1 = load <4 x i16>* %p + %2 = add <4 x i16> %1, %1 + %3 = bitcast <4 x i16> %2 to <1 x i64> + %4 = add <1 x i64> %3, %3 + store <1 x i64> %4, <1 x i64>* %q + ret void +} + +; CHECK-LABEL: test_v1i64_v8i8: +define void @test_v1i64_v8i8(<8 x i8>* %p, <1 x i64>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.8b } +; CHECK: rev64 v{{[0-9]+}}.8b +; CHECK: str + %1 = load <8 x i8>* %p + %2 = add <8 x i8> %1, %1 + %3 = bitcast <8 x i8> %2 to <1 x i64> + %4 = add <1 x i64> %3, %3 + store <1 x i64> %4, <1 x i64>* %q + ret void +} + +; CHECK-LABEL: test_v2f32_i64: +define void @test_v2f32_i64(i64* %p, <2 x float>* %q) { +; CHECK: ldr +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: st1 { v{{[0-9]+}}.2s } + %1 = load i64* %p + %2 = add i64 %1, %1 + %3 = bitcast i64 %2 to <2 x float> + %4 = fadd <2 x float> %3, %3 + store <2 x float> %4, <2 x float>* %q + ret void +} + +; CHECK-LABEL: test_v2f32_f64: +define void @test_v2f32_f64(double* %p, <2 x float>* %q) { +; CHECK: ldr +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: st1 { 
v{{[0-9]+}}.2s } + %1 = load double* %p + %2 = fadd double %1, %1 + %3 = bitcast double %2 to <2 x float> + %4 = fadd <2 x float> %3, %3 + store <2 x float> %4, <2 x float>* %q + ret void +} + +; CHECK-LABEL: test_v2f32_v1i64: +define void @test_v2f32_v1i64(<1 x i64>* %p, <2 x float>* %q) { +; CHECK: ldr +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: st1 { v{{[0-9]+}}.2s } + %1 = load <1 x i64>* %p + %2 = add <1 x i64> %1, %1 + %3 = bitcast <1 x i64> %2 to <2 x float> + %4 = fadd <2 x float> %3, %3 + store <2 x float> %4, <2 x float>* %q + ret void +} + +; CHECK-LABEL: test_v2f32_v2i32: +define void @test_v2f32_v2i32(<2 x i32>* %p, <2 x float>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2s } +; CHECK: st1 { v{{[0-9]+}}.2s } + %1 = load <2 x i32>* %p + %2 = add <2 x i32> %1, %1 + %3 = bitcast <2 x i32> %2 to <2 x float> + %4 = fadd <2 x float> %3, %3 + store <2 x float> %4, <2 x float>* %q + ret void +} + +; CHECK-LABEL: test_v2f32_v4i16: +define void @test_v2f32_v4i16(<4 x i16>* %p, <2 x float>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.4h } +; CHECK: rev32 v{{[0-9]+}}.4h +; CHECK: st1 { v{{[0-9]+}}.2s } + %1 = load <4 x i16>* %p + %2 = add <4 x i16> %1, %1 + %3 = bitcast <4 x i16> %2 to <2 x float> + %4 = fadd <2 x float> %3, %3 + store <2 x float> %4, <2 x float>* %q + ret void +} + +; CHECK-LABEL: test_v2f32_v8i8: +define void @test_v2f32_v8i8(<8 x i8>* %p, <2 x float>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.8b } +; CHECK: rev32 v{{[0-9]+}}.8b +; CHECK: st1 { v{{[0-9]+}}.2s } + %1 = load <8 x i8>* %p + %2 = add <8 x i8> %1, %1 + %3 = bitcast <8 x i8> %2 to <2 x float> + %4 = fadd <2 x float> %3, %3 + store <2 x float> %4, <2 x float>* %q + ret void +} + +; CHECK-LABEL: test_v2i32_i64: +define void @test_v2i32_i64(i64* %p, <2 x i32>* %q) { +; CHECK: ldr +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: st1 { v{{[0-9]+}}.2s } + %1 = load i64* %p + %2 = add i64 %1, %1 + %3 = bitcast i64 %2 to <2 x i32> + %4 = add <2 x i32> %3, %3 + store <2 x i32> %4, <2 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v2i32_f64: +define void @test_v2i32_f64(double* %p, <2 x i32>* %q) { +; CHECK: ldr +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: st1 { v{{[0-9]+}}.2s } + %1 = load double* %p + %2 = fadd double %1, %1 + %3 = bitcast double %2 to <2 x i32> + %4 = add <2 x i32> %3, %3 + store <2 x i32> %4, <2 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v2i32_v1i64: +define void @test_v2i32_v1i64(<1 x i64>* %p, <2 x i32>* %q) { +; CHECK: ldr +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: st1 { v{{[0-9]+}}.2s } + %1 = load <1 x i64>* %p + %2 = add <1 x i64> %1, %1 + %3 = bitcast <1 x i64> %2 to <2 x i32> + %4 = add <2 x i32> %3, %3 + store <2 x i32> %4, <2 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v2i32_v2f32: +define void @test_v2i32_v2f32(<2 x float>* %p, <2 x i32>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2s } +; CHECK: st1 { v{{[0-9]+}}.2s } + %1 = load <2 x float>* %p + %2 = fadd <2 x float> %1, %1 + %3 = bitcast <2 x float> %2 to <2 x i32> + %4 = add <2 x i32> %3, %3 + store <2 x i32> %4, <2 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v2i32_v4i16: +define void @test_v2i32_v4i16(<4 x i16>* %p, <2 x i32>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.4h } +; CHECK: rev32 v{{[0-9]+}}.4h +; CHECK: st1 { v{{[0-9]+}}.2s } + %1 = load <4 x i16>* %p + %2 = add <4 x i16> %1, %1 + %3 = bitcast <4 x i16> %2 to <2 x i32> + %4 = add <2 x i32> %3, %3 + store <2 x i32> %4, <2 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v2i32_v8i8: +define void @test_v2i32_v8i8(<8 x i8>* %p, <2 x i32>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.8b } +; CHECK: rev32 v{{[0-9]+}}.8b +; CHECK: st1 { 
v{{[0-9]+}}.2s } + %1 = load <8 x i8>* %p + %2 = add <8 x i8> %1, %1 + %3 = bitcast <8 x i8> %2 to <2 x i32> + %4 = add <2 x i32> %3, %3 + store <2 x i32> %4, <2 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v4i16_i64: +define void @test_v4i16_i64(i64* %p, <4 x i16>* %q) { +; CHECK: ldr +; CHECK: rev64 v{{[0-9]+}}.4h +; CHECK: st1 { v{{[0-9]+}}.4h } + %1 = load i64* %p + %2 = add i64 %1, %1 + %3 = bitcast i64 %2 to <4 x i16> + %4 = add <4 x i16> %3, %3 + store <4 x i16> %4, <4 x i16>* %q + ret void +} + +; CHECK-LABEL: test_v4i16_f64: +define void @test_v4i16_f64(double* %p, <4 x i16>* %q) { +; CHECK: ldr +; CHECK: rev64 v{{[0-9]+}}.4h +; CHECK: st1 { v{{[0-9]+}}.4h } + %1 = load double* %p + %2 = fadd double %1, %1 + %3 = bitcast double %2 to <4 x i16> + %4 = add <4 x i16> %3, %3 + store <4 x i16> %4, <4 x i16>* %q + ret void +} + +; CHECK-LABEL: test_v4i16_v1i64: +define void @test_v4i16_v1i64(<1 x i64>* %p, <4 x i16>* %q) { +; CHECK: ldr +; CHECK: rev64 v{{[0-9]+}}.4h +; CHECK: st1 { v{{[0-9]+}}.4h } + %1 = load <1 x i64>* %p + %2 = add <1 x i64> %1, %1 + %3 = bitcast <1 x i64> %2 to <4 x i16> + %4 = add <4 x i16> %3, %3 + store <4 x i16> %4, <4 x i16>* %q + ret void +} + +; CHECK-LABEL: test_v4i16_v2f32: +define void @test_v4i16_v2f32(<2 x float>* %p, <4 x i16>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2s } +; CHECK: rev32 v{{[0-9]+}}.4h +; CHECK: st1 { v{{[0-9]+}}.4h } + %1 = load <2 x float>* %p + %2 = fadd <2 x float> %1, %1 + %3 = bitcast <2 x float> %2 to <4 x i16> + %4 = add <4 x i16> %3, %3 + store <4 x i16> %4, <4 x i16>* %q + ret void +} + +; CHECK-LABEL: test_v4i16_v2i32: +define void @test_v4i16_v2i32(<2 x i32>* %p, <4 x i16>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2s } +; CHECK: rev32 v{{[0-9]+}}.4h +; CHECK: st1 { v{{[0-9]+}}.4h } + %1 = load <2 x i32>* %p + %2 = add <2 x i32> %1, %1 + %3 = bitcast <2 x i32> %2 to <4 x i16> + %4 = add <4 x i16> %3, %3 + store <4 x i16> %4, <4 x i16>* %q + ret void +} + +; CHECK-LABEL: test_v4i16_v8i8: +define void @test_v4i16_v8i8(<8 x i8>* %p, <4 x i16>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.8b } +; CHECK: rev16 v{{[0-9]+}}.8b +; CHECK: st1 { v{{[0-9]+}}.4h } + %1 = load <8 x i8>* %p + %2 = add <8 x i8> %1, %1 + %3 = bitcast <8 x i8> %2 to <4 x i16> + %4 = add <4 x i16> %3, %3 + store <4 x i16> %4, <4 x i16>* %q + ret void +} + +; CHECK-LABEL: test_v8i8_i64: +define void @test_v8i8_i64(i64* %p, <8 x i8>* %q) { +; CHECK: ldr +; CHECK: rev64 v{{[0-9]+}}.8b +; CHECK: st1 { v{{[0-9]+}}.8b } + %1 = load i64* %p + %2 = add i64 %1, %1 + %3 = bitcast i64 %2 to <8 x i8> + %4 = add <8 x i8> %3, %3 + store <8 x i8> %4, <8 x i8>* %q + ret void +} + +; CHECK-LABEL: test_v8i8_f64: +define void @test_v8i8_f64(double* %p, <8 x i8>* %q) { +; CHECK: ldr +; CHECK: rev64 v{{[0-9]+}}.8b +; CHECK: st1 { v{{[0-9]+}}.8b } + %1 = load double* %p + %2 = fadd double %1, %1 + %3 = bitcast double %2 to <8 x i8> + %4 = add <8 x i8> %3, %3 + store <8 x i8> %4, <8 x i8>* %q + ret void +} + +; CHECK-LABEL: test_v8i8_v1i64: +define void @test_v8i8_v1i64(<1 x i64>* %p, <8 x i8>* %q) { +; CHECK: ldr +; CHECK: rev64 v{{[0-9]+}}.8b +; CHECK: st1 { v{{[0-9]+}}.8b } + %1 = load <1 x i64>* %p + %2 = add <1 x i64> %1, %1 + %3 = bitcast <1 x i64> %2 to <8 x i8> + %4 = add <8 x i8> %3, %3 + store <8 x i8> %4, <8 x i8>* %q + ret void +} + +; CHECK-LABEL: test_v8i8_v2f32: +define void @test_v8i8_v2f32(<2 x float>* %p, <8 x i8>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2s } +; CHECK: rev32 v{{[0-9]+}}.8b +; CHECK: st1 { v{{[0-9]+}}.8b } + %1 = load <2 x float>* %p + %2 = fadd <2 x float> %1, %1 + %3 = 
bitcast <2 x float> %2 to <8 x i8> + %4 = add <8 x i8> %3, %3 + store <8 x i8> %4, <8 x i8>* %q + ret void +} + +; CHECK-LABEL: test_v8i8_v2i32: +define void @test_v8i8_v2i32(<2 x i32>* %p, <8 x i8>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2s } +; CHECK: rev32 v{{[0-9]+}}.8b +; CHECK: st1 { v{{[0-9]+}}.8b } + %1 = load <2 x i32>* %p + %2 = add <2 x i32> %1, %1 + %3 = bitcast <2 x i32> %2 to <8 x i8> + %4 = add <8 x i8> %3, %3 + store <8 x i8> %4, <8 x i8>* %q + ret void +} + +; CHECK-LABEL: test_v8i8_v4i16: +define void @test_v8i8_v4i16(<4 x i16>* %p, <8 x i8>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.4h } +; CHECK: rev16 v{{[0-9]+}}.8b +; CHECK: st1 { v{{[0-9]+}}.8b } + %1 = load <4 x i16>* %p + %2 = add <4 x i16> %1, %1 + %3 = bitcast <4 x i16> %2 to <8 x i8> + %4 = add <8 x i8> %3, %3 + store <8 x i8> %4, <8 x i8>* %q + ret void +} + +; CHECK-LABEL: test_f128_v2f64: +define void @test_f128_v2f64(<2 x double>* %p, fp128* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2d } +; CHECK: ext +; CHECK: str + %1 = load <2 x double>* %p + %2 = fadd <2 x double> %1, %1 + %3 = bitcast <2 x double> %2 to fp128 + %4 = fadd fp128 %3, %3 + store fp128 %4, fp128* %q + ret void +} + +; CHECK-LABEL: test_f128_v2i64: +define void @test_f128_v2i64(<2 x i64>* %p, fp128* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2d } +; CHECK: ext +; CHECK: str + %1 = load <2 x i64>* %p + %2 = add <2 x i64> %1, %1 + %3 = bitcast <2 x i64> %2 to fp128 + %4 = fadd fp128 %3, %3 + store fp128 %4, fp128* %q + ret void +} + +; CHECK-LABEL: test_f128_v4f32: +define void @test_f128_v4f32(<4 x float>* %p, fp128* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2d } +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: str q + %1 = load <4 x float>* %p + %2 = fadd <4 x float> %1, %1 + %3 = bitcast <4 x float> %2 to fp128 + %4 = fadd fp128 %3, %3 + store fp128 %4, fp128* %q + ret void +} + +; CHECK-LABEL: test_f128_v4i32: +define void @test_f128_v4i32(<4 x i32>* %p, fp128* %q) { +; CHECK: ld1 { v{{[0-9]+}}.4s } +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: str + %1 = load <4 x i32>* %p + %2 = add <4 x i32> %1, %1 + %3 = bitcast <4 x i32> %2 to fp128 + %4 = fadd fp128 %3, %3 + store fp128 %4, fp128* %q + ret void +} + +; CHECK-LABEL: test_f128_v8i16: +define void @test_f128_v8i16(<8 x i16>* %p, fp128* %q) { +; CHECK: ld1 { v{{[0-9]+}}.8h } +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext +; CHECK: str + %1 = load <8 x i16>* %p + %2 = add <8 x i16> %1, %1 + %3 = bitcast <8 x i16> %2 to fp128 + %4 = fadd fp128 %3, %3 + store fp128 %4, fp128* %q + ret void +} + +; CHECK-LABEL: test_f128_v16i8: +define void @test_f128_v16i8(<16 x i8>* %p, fp128* %q) { +; CHECK: ld1 { v{{[0-9]+}}.16b } +; CHECK: ext +; CHECK: str q + %1 = load <16 x i8>* %p + %2 = add <16 x i8> %1, %1 + %3 = bitcast <16 x i8> %2 to fp128 + %4 = fadd fp128 %3, %3 + store fp128 %4, fp128* %q + ret void +} + +; CHECK-LABEL: test_v2f64_f128: +define void @test_v2f64_f128(fp128* %p, <2 x double>* %q) { +; CHECK: ldr +; CHECK: ext +; CHECK: st1 { v{{[0-9]+}}.2d } + %1 = load fp128* %p + %2 = fadd fp128 %1, %1 + %3 = bitcast fp128 %2 to <2 x double> + %4 = fadd <2 x double> %3, %3 + store <2 x double> %4, <2 x double>* %q + ret void +} + +; CHECK-LABEL: test_v2f64_v2i64: +define void @test_v2f64_v2i64(<2 x i64>* %p, <2 x double>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2d } +; CHECK: st1 { v{{[0-9]+}}.2d } + %1 = load <2 x i64>* %p + %2 = add <2 x i64> %1, %1 + %3 = bitcast <2 x i64> %2 to <2 x double> + %4 = fadd <2 x double> %3, %3 + store <2 x double> %4, <2 x double>* %q + ret void +} + +; 
CHECK-LABEL: test_v2f64_v4f32: +define void @test_v2f64_v4f32(<4 x float>* %p, <2 x double>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2d } +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: st1 { v{{[0-9]+}}.2d } + %1 = load <4 x float>* %p + %2 = fadd <4 x float> %1, %1 + %3 = bitcast <4 x float> %2 to <2 x double> + %4 = fadd <2 x double> %3, %3 + store <2 x double> %4, <2 x double>* %q + ret void +} + +; CHECK-LABEL: test_v2f64_v4i32: +define void @test_v2f64_v4i32(<4 x i32>* %p, <2 x double>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.4s } +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: st1 { v{{[0-9]+}}.2d } + %1 = load <4 x i32>* %p + %2 = add <4 x i32> %1, %1 + %3 = bitcast <4 x i32> %2 to <2 x double> + %4 = fadd <2 x double> %3, %3 + store <2 x double> %4, <2 x double>* %q + ret void +} + +; CHECK-LABEL: test_v2f64_v8i16: +define void @test_v2f64_v8i16(<8 x i16>* %p, <2 x double>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.8h } +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: st1 { v{{[0-9]+}}.2d } + %1 = load <8 x i16>* %p + %2 = add <8 x i16> %1, %1 + %3 = bitcast <8 x i16> %2 to <2 x double> + %4 = fadd <2 x double> %3, %3 + store <2 x double> %4, <2 x double>* %q + ret void +} + +; CHECK-LABEL: test_v2f64_v16i8: +define void @test_v2f64_v16i8(<16 x i8>* %p, <2 x double>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.16b } +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: st1 { v{{[0-9]+}}.2d } + %1 = load <16 x i8>* %p + %2 = add <16 x i8> %1, %1 + %3 = bitcast <16 x i8> %2 to <2 x double> + %4 = fadd <2 x double> %3, %3 + store <2 x double> %4, <2 x double>* %q + ret void +} + +; CHECK-LABEL: test_v2i64_f128: +define void @test_v2i64_f128(fp128* %p, <2 x i64>* %q) { +; CHECK: ldr +; CHECK: ext +; CHECK: st1 { v{{[0-9]+}}.2d } + %1 = load fp128* %p + %2 = fadd fp128 %1, %1 + %3 = bitcast fp128 %2 to <2 x i64> + %4 = add <2 x i64> %3, %3 + store <2 x i64> %4, <2 x i64>* %q + ret void +} + +; CHECK-LABEL: test_v2i64_v2f64: +define void @test_v2i64_v2f64(<2 x double>* %p, <2 x i64>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2d } +; CHECK: st1 { v{{[0-9]+}}.2d } + %1 = load <2 x double>* %p + %2 = fadd <2 x double> %1, %1 + %3 = bitcast <2 x double> %2 to <2 x i64> + %4 = add <2 x i64> %3, %3 + store <2 x i64> %4, <2 x i64>* %q + ret void +} + +; CHECK-LABEL: test_v2i64_v4f32: +define void @test_v2i64_v4f32(<4 x float>* %p, <2 x i64>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2d } +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: st1 { v{{[0-9]+}}.2d } + %1 = load <4 x float>* %p + %2 = fadd <4 x float> %1, %1 + %3 = bitcast <4 x float> %2 to <2 x i64> + %4 = add <2 x i64> %3, %3 + store <2 x i64> %4, <2 x i64>* %q + ret void +} + +; CHECK-LABEL: test_v2i64_v4i32: +define void @test_v2i64_v4i32(<4 x i32>* %p, <2 x i64>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.4s } +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: st1 { v{{[0-9]+}}.2d } + %1 = load <4 x i32>* %p + %2 = add <4 x i32> %1, %1 + %3 = bitcast <4 x i32> %2 to <2 x i64> + %4 = add <2 x i64> %3, %3 + store <2 x i64> %4, <2 x i64>* %q + ret void +} + +; CHECK-LABEL: test_v2i64_v8i16: +define void @test_v2i64_v8i16(<8 x i16>* %p, <2 x i64>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.8h } +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: st1 { v{{[0-9]+}}.2d } + %1 = load <8 x i16>* %p + %2 = add <8 x i16> %1, %1 + %3 = bitcast <8 x i16> %2 to <2 x i64> + %4 = add <2 x i64> %3, %3 + store <2 x i64> %4, <2 x i64>* %q + ret void +} + +; CHECK-LABEL: test_v2i64_v16i8: +define void @test_v2i64_v16i8(<16 x i8>* %p, <2 x i64>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.16b } +; CHECK: rev64 
v{{[0-9]+}}.16b +; CHECK: st1 { v{{[0-9]+}}.2d } + %1 = load <16 x i8>* %p + %2 = add <16 x i8> %1, %1 + %3 = bitcast <16 x i8> %2 to <2 x i64> + %4 = add <2 x i64> %3, %3 + store <2 x i64> %4, <2 x i64>* %q + ret void +} + +; CHECK-LABEL: test_v4f32_f128: +define void @test_v4f32_f128(fp128* %p, <4 x float>* %q) { +; CHECK: ldr q +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: st1 { v{{[0-9]+}}.2d } + %1 = load fp128* %p + %2 = fadd fp128 %1, %1 + %3 = bitcast fp128 %2 to <4 x float> + %4 = fadd <4 x float> %3, %3 + store <4 x float> %4, <4 x float>* %q + ret void +} + +; CHECK-LABEL: test_v4f32_v2f64: +define void @test_v4f32_v2f64(<2 x double>* %p, <4 x float>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2d } +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: st1 { v{{[0-9]+}}.2d } + %1 = load <2 x double>* %p + %2 = fadd <2 x double> %1, %1 + %3 = bitcast <2 x double> %2 to <4 x float> + %4 = fadd <4 x float> %3, %3 + store <4 x float> %4, <4 x float>* %q + ret void +} + +; CHECK-LABEL: test_v4f32_v2i64: +define void @test_v4f32_v2i64(<2 x i64>* %p, <4 x float>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2d } +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: st1 { v{{[0-9]+}}.2d } + %1 = load <2 x i64>* %p + %2 = add <2 x i64> %1, %1 + %3 = bitcast <2 x i64> %2 to <4 x float> + %4 = fadd <4 x float> %3, %3 + store <4 x float> %4, <4 x float>* %q + ret void +} + +; CHECK-LABEL: test_v4f32_v4i32: +define void @test_v4f32_v4i32(<4 x i32>* %p, <4 x float>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.4s } +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: st1 { v{{[0-9]+}}.2d } + %1 = load <4 x i32>* %p + %2 = add <4 x i32> %1, %1 + %3 = bitcast <4 x i32> %2 to <4 x float> + %4 = fadd <4 x float> %3, %3 + store <4 x float> %4, <4 x float>* %q + ret void +} + +; CHECK-LABEL: test_v4f32_v8i16: +define void @test_v4f32_v8i16(<8 x i16>* %p, <4 x float>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.8h } +; CHECK: rev32 v{{[0-9]+}}.8h +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: st1 { v{{[0-9]+}}.2d } + %1 = load <8 x i16>* %p + %2 = add <8 x i16> %1, %1 + %3 = bitcast <8 x i16> %2 to <4 x float> + %4 = fadd <4 x float> %3, %3 + store <4 x float> %4, <4 x float>* %q + ret void +} + +; CHECK-LABEL: test_v4f32_v16i8: +define void @test_v4f32_v16i8(<16 x i8>* %p, <4 x float>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.16b } +; CHECK: rev32 v{{[0-9]+}}.16b +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: st1 { v{{[0-9]+}}.2d } + %1 = load <16 x i8>* %p + %2 = add <16 x i8> %1, %1 + %3 = bitcast <16 x i8> %2 to <4 x float> + %4 = fadd <4 x float> %3, %3 + store <4 x float> %4, <4 x float>* %q + ret void +} + +; CHECK-LABEL: test_v4i32_f128: +define void @test_v4i32_f128(fp128* %p, <4 x i32>* %q) { +; CHECK: ldr +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: st1 { v{{[0-9]+}}.4s } + %1 = load fp128* %p + %2 = fadd fp128 %1, %1 + %3 = bitcast fp128 %2 to <4 x i32> + %4 = add <4 x i32> %3, %3 + store <4 x i32> %4, <4 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v4i32_v2f64: +define void @test_v4i32_v2f64(<2 x double>* %p, <4 x i32>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2d } +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: st1 { v{{[0-9]+}}.4s } + %1 = load <2 x double>* %p + %2 = fadd <2 x double> %1, %1 + %3 = bitcast <2 x double> %2 to <4 x i32> + %4 = add <4 x i32> %3, %3 + store <4 x i32> %4, <4 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v4i32_v2i64: +define void @test_v4i32_v2i64(<2 x i64>* %p, <4 x i32>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2d } +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: 
st1 { v{{[0-9]+}}.4s } + %1 = load <2 x i64>* %p + %2 = add <2 x i64> %1, %1 + %3 = bitcast <2 x i64> %2 to <4 x i32> + %4 = add <4 x i32> %3, %3 + store <4 x i32> %4, <4 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v4i32_v4f32: +define void @test_v4i32_v4f32(<4 x float>* %p, <4 x i32>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2d } +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: st1 { v{{[0-9]+}}.4s } + %1 = load <4 x float>* %p + %2 = fadd <4 x float> %1, %1 + %3 = bitcast <4 x float> %2 to <4 x i32> + %4 = add <4 x i32> %3, %3 + store <4 x i32> %4, <4 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v4i32_v8i16: +define void @test_v4i32_v8i16(<8 x i16>* %p, <4 x i32>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.8h } +; CHECK: rev32 v{{[0-9]+}}.8h +; CHECK: st1 { v{{[0-9]+}}.4s } + %1 = load <8 x i16>* %p + %2 = add <8 x i16> %1, %1 + %3 = bitcast <8 x i16> %2 to <4 x i32> + %4 = add <4 x i32> %3, %3 + store <4 x i32> %4, <4 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v4i32_v16i8: +define void @test_v4i32_v16i8(<16 x i8>* %p, <4 x i32>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.16b } +; CHECK: rev32 v{{[0-9]+}}.16b +; CHECK: st1 { v{{[0-9]+}}.4s } + %1 = load <16 x i8>* %p + %2 = add <16 x i8> %1, %1 + %3 = bitcast <16 x i8> %2 to <4 x i32> + %4 = add <4 x i32> %3, %3 + store <4 x i32> %4, <4 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v8i16_f128: +define void @test_v8i16_f128(fp128* %p, <8 x i16>* %q) { +; CHECK: ldr +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext +; CHECK: st1 { v{{[0-9]+}}.8h } + %1 = load fp128* %p + %2 = fadd fp128 %1, %1 + %3 = bitcast fp128 %2 to <8 x i16> + %4 = add <8 x i16> %3, %3 + store <8 x i16> %4, <8 x i16>* %q + ret void +} + +; CHECK-LABEL: test_v8i16_v2f64: +define void @test_v8i16_v2f64(<2 x double>* %p, <8 x i16>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2d } +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: st1 { v{{[0-9]+}}.8h } + %1 = load <2 x double>* %p + %2 = fadd <2 x double> %1, %1 + %3 = bitcast <2 x double> %2 to <8 x i16> + %4 = add <8 x i16> %3, %3 + store <8 x i16> %4, <8 x i16>* %q + ret void +} + +; CHECK-LABEL: test_v8i16_v2i64: +define void @test_v8i16_v2i64(<2 x i64>* %p, <8 x i16>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2d } +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: st1 { v{{[0-9]+}}.8h } + %1 = load <2 x i64>* %p + %2 = add <2 x i64> %1, %1 + %3 = bitcast <2 x i64> %2 to <8 x i16> + %4 = add <8 x i16> %3, %3 + store <8 x i16> %4, <8 x i16>* %q + ret void +} + +; CHECK-LABEL: test_v8i16_v4f32: +define void @test_v8i16_v4f32(<4 x float>* %p, <8 x i16>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2d } +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: rev32 v{{[0-9]+}}.8h +; CHECK: st1 { v{{[0-9]+}}.8h } + %1 = load <4 x float>* %p + %2 = fadd <4 x float> %1, %1 + %3 = bitcast <4 x float> %2 to <8 x i16> + %4 = add <8 x i16> %3, %3 + store <8 x i16> %4, <8 x i16>* %q + ret void +} + +; CHECK-LABEL: test_v8i16_v4i32: +define void @test_v8i16_v4i32(<4 x i32>* %p, <8 x i16>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.4s } +; CHECK: rev32 v{{[0-9]+}}.8h +; CHECK: st1 { v{{[0-9]+}}.8h } + %1 = load <4 x i32>* %p + %2 = add <4 x i32> %1, %1 + %3 = bitcast <4 x i32> %2 to <8 x i16> + %4 = add <8 x i16> %3, %3 + store <8 x i16> %4, <8 x i16>* %q + ret void +} + +; CHECK-LABEL: test_v8i16_v16i8: +define void @test_v8i16_v16i8(<16 x i8>* %p, <8 x i16>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.16b } +; CHECK: rev16 v{{[0-9]+}}.16b +; CHECK: st1 { v{{[0-9]+}}.8h } + %1 = load <16 x i8>* %p + %2 = add <16 x i8> %1, %1 + %3 = bitcast <16 x i8> %2 to <8 x i16> + %4 = add <8 x i16> %3, %3 + store <8 x i16> %4, <8 x i16>* %q + ret 
void +} + +; CHECK-LABEL: test_v16i8_f128: +define void @test_v16i8_f128(fp128* %p, <16 x i8>* %q) { +; CHECK: ldr q +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext +; CHECK: st1 { v{{[0-9]+}}.16b } + %1 = load fp128* %p + %2 = fadd fp128 %1, %1 + %3 = bitcast fp128 %2 to <16 x i8> + %4 = add <16 x i8> %3, %3 + store <16 x i8> %4, <16 x i8>* %q + ret void +} + +; CHECK-LABEL: test_v16i8_v2f64: +define void @test_v16i8_v2f64(<2 x double>* %p, <16 x i8>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2d } +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: st1 { v{{[0-9]+}}.16b } + %1 = load <2 x double>* %p + %2 = fadd <2 x double> %1, %1 + %3 = bitcast <2 x double> %2 to <16 x i8> + %4 = add <16 x i8> %3, %3 + store <16 x i8> %4, <16 x i8>* %q + ret void +} + +; CHECK-LABEL: test_v16i8_v2i64: +define void @test_v16i8_v2i64(<2 x i64>* %p, <16 x i8>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2d } +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: st1 { v{{[0-9]+}}.16b } + %1 = load <2 x i64>* %p + %2 = add <2 x i64> %1, %1 + %3 = bitcast <2 x i64> %2 to <16 x i8> + %4 = add <16 x i8> %3, %3 + store <16 x i8> %4, <16 x i8>* %q + ret void +} + +; CHECK-LABEL: test_v16i8_v4f32: +define void @test_v16i8_v4f32(<4 x float>* %p, <16 x i8>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2d } +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: rev32 v{{[0-9]+}}.16b +; CHECK: st1 { v{{[0-9]+}}.16b } + %1 = load <4 x float>* %p + %2 = fadd <4 x float> %1, %1 + %3 = bitcast <4 x float> %2 to <16 x i8> + %4 = add <16 x i8> %3, %3 + store <16 x i8> %4, <16 x i8>* %q + ret void +} + +; CHECK-LABEL: test_v16i8_v4i32: +define void @test_v16i8_v4i32(<4 x i32>* %p, <16 x i8>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.4s } +; CHECK: rev32 v{{[0-9]+}}.16b +; CHECK: st1 { v{{[0-9]+}}.16b } + %1 = load <4 x i32>* %p + %2 = add <4 x i32> %1, %1 + %3 = bitcast <4 x i32> %2 to <16 x i8> + %4 = add <16 x i8> %3, %3 + store <16 x i8> %4, <16 x i8>* %q + ret void +} + +; CHECK-LABEL: test_v16i8_v8i16: +define void @test_v16i8_v8i16(<8 x i16>* %p, <16 x i8>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.8h } +; CHECK: rev16 v{{[0-9]+}}.16b +; CHECK: st1 { v{{[0-9]+}}.16b } + %1 = load <8 x i16>* %p + %2 = add <8 x i16> %1, %1 + %3 = bitcast <8 x i16> %2 to <16 x i8> + %4 = add <16 x i8> %3, %3 + store <16 x i8> %4, <16 x i8>* %q + ret void +} diff --git a/test/CodeGen/AArch64/arm64-big-endian-eh.ll b/test/CodeGen/AArch64/arm64-big-endian-eh.ll new file mode 100644 index 00000000000..93e7da98de2 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-big-endian-eh.ll @@ -0,0 +1,73 @@ +; RUN: llc -mtriple arm64_be-linux-gnu -filetype obj < %s | llvm-objdump -s - | FileCheck %s + +; ARM EHABI for big endian +; This test case checks whether CIE length record is laid out in big endian format. +; +; This is the LLVM assembly generated from following C++ code: +; +; extern void foo(int); +; void test(int a, int b) { +; try { +; foo(a); +; } catch (...) 
{ +; foo(b); +; } +;} + +define void @_Z4testii(i32 %a, i32 %b) #0 { +entry: + invoke void @_Z3fooi(i32 %a) + to label %try.cont unwind label %lpad + +lpad: ; preds = %entry + %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) + catch i8* null + %1 = extractvalue { i8*, i32 } %0, 0 + %2 = tail call i8* @__cxa_begin_catch(i8* %1) #2 + invoke void @_Z3fooi(i32 %b) + to label %invoke.cont2 unwind label %lpad1 + +invoke.cont2: ; preds = %lpad + tail call void @__cxa_end_catch() + br label %try.cont + +try.cont: ; preds = %entry, %invoke.cont2 + ret void + +lpad1: ; preds = %lpad + %3 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) + cleanup + invoke void @__cxa_end_catch() + to label %eh.resume unwind label %terminate.lpad + +eh.resume: ; preds = %lpad1 + resume { i8*, i32 } %3 + +terminate.lpad: ; preds = %lpad1 + %4 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) + catch i8* null + %5 = extractvalue { i8*, i32 } %4, 0 + tail call void @__clang_call_terminate(i8* %5) #3 + unreachable +} + +declare void @_Z3fooi(i32) #0 + +declare i32 @__gxx_personality_v0(...) + +declare i8* @__cxa_begin_catch(i8*) + +declare void @__cxa_end_catch() + +; Function Attrs: noinline noreturn nounwind +define linkonce_odr hidden void @__clang_call_terminate(i8*) #1 { + %2 = tail call i8* @__cxa_begin_catch(i8* %0) #2 + tail call void @_ZSt9terminatev() #3 + unreachable +} + +declare void @_ZSt9terminatev() + +; CHECK-LABEL: Contents of section .eh_frame: +; CHECK-NEXT: 0000 0000001c + diff --git a/test/CodeGen/AArch64/arm64-big-endian-varargs.ll b/test/CodeGen/AArch64/arm64-big-endian-varargs.ll new file mode 100644 index 00000000000..d7b26b97523 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-big-endian-varargs.ll @@ -0,0 +1,58 @@ +; RUN: llc < %s | FileCheck %s + +; Vararg saving must save Q registers using the equivalent of STR/STP. + +target datalayout = "E-m:e-i64:64-i128:128-n32:64-S128" +target triple = "arm64_be-arm-none-eabi" + +%struct.__va_list = type { i8*, i8*, i8*, i32, i32 } + +declare void @llvm.va_start(i8*) nounwind +declare void @llvm.va_end(i8*) nounwind + +define double @callee(i32 %a, ...) 
{ +; CHECK: stp +; CHECK: stp +; CHECK: stp +; CHECK: stp +; CHECK: stp +; CHECK: stp +entry: + %vl = alloca %struct.__va_list, align 8 + %vl1 = bitcast %struct.__va_list* %vl to i8* + call void @llvm.va_start(i8* %vl1) + %vr_offs_p = getelementptr inbounds %struct.__va_list* %vl, i64 0, i32 4 + %vr_offs = load i32* %vr_offs_p, align 4 + %0 = icmp sgt i32 %vr_offs, -1 + br i1 %0, label %vaarg.on_stack, label %vaarg.maybe_reg + +vaarg.maybe_reg: ; preds = %entry + %new_reg_offs = add i32 %vr_offs, 16 + store i32 %new_reg_offs, i32* %vr_offs_p, align 4 + %inreg = icmp slt i32 %new_reg_offs, 1 + br i1 %inreg, label %vaarg.in_reg, label %vaarg.on_stack + +vaarg.in_reg: ; preds = %vaarg.maybe_reg + %reg_top_p = getelementptr inbounds %struct.__va_list* %vl, i64 0, i32 2 + %reg_top = load i8** %reg_top_p, align 8 + %1 = sext i32 %vr_offs to i64 + %2 = getelementptr i8* %reg_top, i64 %1 + %3 = ptrtoint i8* %2 to i64 + %align_be = add i64 %3, 8 + %4 = inttoptr i64 %align_be to i8* + br label %vaarg.end + +vaarg.on_stack: ; preds = %vaarg.maybe_reg, %entry + %stack_p = getelementptr inbounds %struct.__va_list* %vl, i64 0, i32 0 + %stack = load i8** %stack_p, align 8 + %new_stack = getelementptr i8* %stack, i64 8 + store i8* %new_stack, i8** %stack_p, align 8 + br label %vaarg.end + +vaarg.end: ; preds = %vaarg.on_stack, %vaarg.in_reg + %.sink = phi i8* [ %4, %vaarg.in_reg ], [ %stack, %vaarg.on_stack ] + %5 = bitcast i8* %.sink to double* + %6 = load double* %5, align 8 + call void @llvm.va_end(i8* %vl1) + ret double %6 +} diff --git a/test/CodeGen/AArch64/arm64-big-endian-vector-callee.ll b/test/CodeGen/AArch64/arm64-big-endian-vector-callee.ll new file mode 100644 index 00000000000..1dcccf106a2 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-big-endian-vector-callee.ll @@ -0,0 +1,848 @@ +; RUN: llc -mtriple arm64_be < %s -aarch64-load-store-opt=false -o - | FileCheck %s +; RUN: llc -mtriple arm64_be < %s -fast-isel=true -aarch64-load-store-opt=false -o - | FileCheck %s + +; CHECK-LABEL: test_i64_f64: +define i64 @test_i64_f64(double %p) { +; CHECK-NOT: rev + %1 = fadd double %p, %p + %2 = bitcast double %1 to i64 + %3 = add i64 %2, %2 + ret i64 %3 +} + +; CHECK-LABEL: test_i64_v1i64: +define i64 @test_i64_v1i64(<1 x i64> %p) { +; CHECK-NOT: rev + %1 = add <1 x i64> %p, %p + %2 = bitcast <1 x i64> %1 to i64 + %3 = add i64 %2, %2 + ret i64 %3 +} + +; CHECK-LABEL: test_i64_v2f32: +define i64 @test_i64_v2f32(<2 x float> %p) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = fadd <2 x float> %p, %p + %2 = bitcast <2 x float> %1 to i64 + %3 = add i64 %2, %2 + ret i64 %3 +} + +; CHECK-LABEL: test_i64_v2i32: +define i64 @test_i64_v2i32(<2 x i32> %p) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = add <2 x i32> %p, %p + %2 = bitcast <2 x i32> %1 to i64 + %3 = add i64 %2, %2 + ret i64 %3 +} + +; CHECK-LABEL: test_i64_v4i16: +define i64 @test_i64_v4i16(<4 x i16> %p) { +; CHECK: rev64 v{{[0-9]+}}.4h + %1 = add <4 x i16> %p, %p + %2 = bitcast <4 x i16> %1 to i64 + %3 = add i64 %2, %2 + ret i64 %3 +} + +; CHECK-LABEL: test_i64_v8i8: +define i64 @test_i64_v8i8(<8 x i8> %p) { +; CHECK: rev64 v{{[0-9]+}}.8b + %1 = add <8 x i8> %p, %p + %2 = bitcast <8 x i8> %1 to i64 + %3 = add i64 %2, %2 + ret i64 %3 +} + +; CHECK-LABEL: test_f64_i64: +define double @test_f64_i64(i64 %p) { +; CHECK-NOT: rev + %1 = add i64 %p, %p + %2 = bitcast i64 %1 to double + %3 = fadd double %2, %2 + ret double %3 +} + +; CHECK-LABEL: test_f64_v1i64: +define double @test_f64_v1i64(<1 x i64> %p) { +; CHECK-NOT: rev + %1 = add <1 x i64> %p, %p + %2 = bitcast <1 x 
i64> %1 to double + %3 = fadd double %2, %2 + ret double %3 +} + +; CHECK-LABEL: test_f64_v2f32: +define double @test_f64_v2f32(<2 x float> %p) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = fadd <2 x float> %p, %p + %2 = bitcast <2 x float> %1 to double + %3 = fadd double %2, %2 + ret double %3 +} + +; CHECK-LABEL: test_f64_v2i32: +define double @test_f64_v2i32(<2 x i32> %p) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = add <2 x i32> %p, %p + %2 = bitcast <2 x i32> %1 to double + %3 = fadd double %2, %2 + ret double %3 +} + +; CHECK-LABEL: test_f64_v4i16: +define double @test_f64_v4i16(<4 x i16> %p) { +; CHECK: rev64 v{{[0-9]+}}.4h + %1 = add <4 x i16> %p, %p + %2 = bitcast <4 x i16> %1 to double + %3 = fadd double %2, %2 + ret double %3 +} + +; CHECK-LABEL: test_f64_v8i8: +define double @test_f64_v8i8(<8 x i8> %p) { +; CHECK: rev64 v{{[0-9]+}}.8b + %1 = add <8 x i8> %p, %p + %2 = bitcast <8 x i8> %1 to double + %3 = fadd double %2, %2 + ret double %3 +} + +; CHECK-LABEL: test_v1i64_i64: +define <1 x i64> @test_v1i64_i64(i64 %p) { +; CHECK-NOT: rev + %1 = add i64 %p, %p + %2 = bitcast i64 %1 to <1 x i64> + %3 = add <1 x i64> %2, %2 + ret <1 x i64> %3 +} + +; CHECK-LABEL: test_v1i64_f64: +define <1 x i64> @test_v1i64_f64(double %p) { +; CHECK-NOT: rev + %1 = fadd double %p, %p + %2 = bitcast double %1 to <1 x i64> + %3 = add <1 x i64> %2, %2 + ret <1 x i64> %3 +} + +; CHECK-LABEL: test_v1i64_v2f32: +define <1 x i64> @test_v1i64_v2f32(<2 x float> %p) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = fadd <2 x float> %p, %p + %2 = bitcast <2 x float> %1 to <1 x i64> + %3 = add <1 x i64> %2, %2 + ret <1 x i64> %3 +} + +; CHECK-LABEL: test_v1i64_v2i32: +define <1 x i64> @test_v1i64_v2i32(<2 x i32> %p) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = add <2 x i32> %p, %p + %2 = bitcast <2 x i32> %1 to <1 x i64> + %3 = add <1 x i64> %2, %2 + ret <1 x i64> %3 +} + +; CHECK-LABEL: test_v1i64_v4i16: +define <1 x i64> @test_v1i64_v4i16(<4 x i16> %p) { +; CHECK: rev64 v{{[0-9]+}}.4h + %1 = add <4 x i16> %p, %p + %2 = bitcast <4 x i16> %1 to <1 x i64> + %3 = add <1 x i64> %2, %2 + ret <1 x i64> %3 +} + +; CHECK-LABEL: test_v1i64_v8i8: +define <1 x i64> @test_v1i64_v8i8(<8 x i8> %p) { +; CHECK: rev64 v{{[0-9]+}}.8b + %1 = add <8 x i8> %p, %p + %2 = bitcast <8 x i8> %1 to <1 x i64> + %3 = add <1 x i64> %2, %2 + ret <1 x i64> %3 +} + +; CHECK-LABEL: test_v2f32_i64: +define <2 x float> @test_v2f32_i64(i64 %p) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = add i64 %p, %p + %2 = bitcast i64 %1 to <2 x float> + %3 = fadd <2 x float> %2, %2 + ret <2 x float> %3 +} + +; CHECK-LABEL: test_v2f32_f64: +define <2 x float> @test_v2f32_f64(double %p) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = fadd double %p, %p + %2 = bitcast double %1 to <2 x float> + %3 = fadd <2 x float> %2, %2 + ret <2 x float> %3 +} + +; CHECK-LABEL: test_v2f32_v1i64: +define <2 x float> @test_v2f32_v1i64(<1 x i64> %p) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = add <1 x i64> %p, %p + %2 = bitcast <1 x i64> %1 to <2 x float> + %3 = fadd <2 x float> %2, %2 + ret <2 x float> %3 +} + +; CHECK-LABEL: test_v2f32_v2i32: +define <2 x float> @test_v2f32_v2i32(<2 x i32> %p) { +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = add <2 x i32> %p, %p + %2 = bitcast <2 x i32> %1 to <2 x float> + %3 = fadd <2 x float> %2, %2 + ret <2 x float> %3 +} + +; CHECK-LABEL: test_v2f32_v4i16: +define <2 x float> @test_v2f32_v4i16(<4 x i16> %p) { +; CHECK: rev64 v{{[0-9]+}}.4h +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = add <4 x i16> %p, %p + %2 = bitcast <4 x i16> %1 to <2 x float> + %3 = fadd <2 x 
float> %2, %2 + ret <2 x float> %3 +} + +; CHECK-LABEL: test_v2f32_v8i8: +define <2 x float> @test_v2f32_v8i8(<8 x i8> %p) { +; CHECK: rev64 v{{[0-9]+}}.8b +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = add <8 x i8> %p, %p + %2 = bitcast <8 x i8> %1 to <2 x float> + %3 = fadd <2 x float> %2, %2 + ret <2 x float> %3 +} + +; CHECK-LABEL: test_v2i32_i64: +define <2 x i32> @test_v2i32_i64(i64 %p) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = add i64 %p, %p + %2 = bitcast i64 %1 to <2 x i32> + %3 = add <2 x i32> %2, %2 + ret <2 x i32> %3 +} + +; CHECK-LABEL: test_v2i32_f64: +define <2 x i32> @test_v2i32_f64(double %p) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = fadd double %p, %p + %2 = bitcast double %1 to <2 x i32> + %3 = add <2 x i32> %2, %2 + ret <2 x i32> %3 +} + +; CHECK-LABEL: test_v2i32_v1i64: +define <2 x i32> @test_v2i32_v1i64(<1 x i64> %p) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = add <1 x i64> %p, %p + %2 = bitcast <1 x i64> %1 to <2 x i32> + %3 = add <2 x i32> %2, %2 + ret <2 x i32> %3 +} + +; CHECK-LABEL: test_v2i32_v2f32: +define <2 x i32> @test_v2i32_v2f32(<2 x float> %p) { +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = fadd <2 x float> %p, %p + %2 = bitcast <2 x float> %1 to <2 x i32> + %3 = add <2 x i32> %2, %2 + ret <2 x i32> %3 +} + +; CHECK-LABEL: test_v2i32_v4i16: +define <2 x i32> @test_v2i32_v4i16(<4 x i16> %p) { +; CHECK: rev64 v{{[0-9]+}}.4h +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = add <4 x i16> %p, %p + %2 = bitcast <4 x i16> %1 to <2 x i32> + %3 = add <2 x i32> %2, %2 + ret <2 x i32> %3 +} + +; CHECK-LABEL: test_v2i32_v8i8: +define <2 x i32> @test_v2i32_v8i8(<8 x i8> %p) { +; CHECK: rev64 v{{[0-9]+}}.8b +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = add <8 x i8> %p, %p + %2 = bitcast <8 x i8> %1 to <2 x i32> + %3 = add <2 x i32> %2, %2 + ret <2 x i32> %3 +} + +; CHECK-LABEL: test_v4i16_i64: +define <4 x i16> @test_v4i16_i64(i64 %p) { +; CHECK: rev64 v{{[0-9]+}}.4h + %1 = add i64 %p, %p + %2 = bitcast i64 %1 to <4 x i16> + %3 = add <4 x i16> %2, %2 + ret <4 x i16> %3 +} + +; CHECK-LABEL: test_v4i16_f64: +define <4 x i16> @test_v4i16_f64(double %p) { +; CHECK: rev64 v{{[0-9]+}}.4h + %1 = fadd double %p, %p + %2 = bitcast double %1 to <4 x i16> + %3 = add <4 x i16> %2, %2 + ret <4 x i16> %3 +} + +; CHECK-LABEL: test_v4i16_v1i64: +define <4 x i16> @test_v4i16_v1i64(<1 x i64> %p) { +; CHECK: rev64 v{{[0-9]+}}.4h + %1 = add <1 x i64> %p, %p + %2 = bitcast <1 x i64> %1 to <4 x i16> + %3 = add <4 x i16> %2, %2 + ret <4 x i16> %3 +} + +; CHECK-LABEL: test_v4i16_v2f32: +define <4 x i16> @test_v4i16_v2f32(<2 x float> %p) { +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: rev64 v{{[0-9]+}}.4h + %1 = fadd <2 x float> %p, %p + %2 = bitcast <2 x float> %1 to <4 x i16> + %3 = add <4 x i16> %2, %2 + ret <4 x i16> %3 +} + +; CHECK-LABEL: test_v4i16_v2i32: +define <4 x i16> @test_v4i16_v2i32(<2 x i32> %p) { +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: rev64 v{{[0-9]+}}.4h + %1 = add <2 x i32> %p, %p + %2 = bitcast <2 x i32> %1 to <4 x i16> + %3 = add <4 x i16> %2, %2 + ret <4 x i16> %3 +} + +; CHECK-LABEL: test_v4i16_v8i8: +define <4 x i16> @test_v4i16_v8i8(<8 x i8> %p) { +; CHECK: rev64 v{{[0-9]+}}.8b +; CHECK: rev64 v{{[0-9]+}}.4h + %1 = add <8 x i8> %p, %p + %2 = bitcast <8 x i8> %1 to <4 x i16> + %3 = add <4 x i16> %2, %2 + ret <4 x i16> %3 +} + +; CHECK-LABEL: test_v8i8_i64: +define <8 x i8> @test_v8i8_i64(i64 %p) { +; CHECK: rev64 v{{[0-9]+}}.8b + %1 = add i64 %p, %p + %2 = bitcast i64 %1 to <8 x i8> + %3 = add <8 x i8> %2, %2 + ret <8 x i8> %3 +} + +; CHECK-LABEL: test_v8i8_f64: +define <8 x i8> 
@test_v8i8_f64(double %p) { +; CHECK: rev64 v{{[0-9]+}}.8b + %1 = fadd double %p, %p + %2 = bitcast double %1 to <8 x i8> + %3 = add <8 x i8> %2, %2 + ret <8 x i8> %3 +} + +; CHECK-LABEL: test_v8i8_v1i64: +define <8 x i8> @test_v8i8_v1i64(<1 x i64> %p) { +; CHECK: rev64 v{{[0-9]+}}.8b + %1 = add <1 x i64> %p, %p + %2 = bitcast <1 x i64> %1 to <8 x i8> + %3 = add <8 x i8> %2, %2 + ret <8 x i8> %3 +} + +; CHECK-LABEL: test_v8i8_v2f32: +define <8 x i8> @test_v8i8_v2f32(<2 x float> %p) { +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: rev64 v{{[0-9]+}}.8b + %1 = fadd <2 x float> %p, %p + %2 = bitcast <2 x float> %1 to <8 x i8> + %3 = add <8 x i8> %2, %2 + ret <8 x i8> %3 +} + +; CHECK-LABEL: test_v8i8_v2i32: +define <8 x i8> @test_v8i8_v2i32(<2 x i32> %p) { +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: rev64 v{{[0-9]+}}.8b + %1 = add <2 x i32> %p, %p + %2 = bitcast <2 x i32> %1 to <8 x i8> + %3 = add <8 x i8> %2, %2 + ret <8 x i8> %3 +} + +; CHECK-LABEL: test_v8i8_v4i16: +define <8 x i8> @test_v8i8_v4i16(<4 x i16> %p) { +; CHECK: rev64 v{{[0-9]+}}.4h +; CHECK: rev64 v{{[0-9]+}}.8b + %1 = add <4 x i16> %p, %p + %2 = bitcast <4 x i16> %1 to <8 x i8> + %3 = add <8 x i8> %2, %2 + ret <8 x i8> %3 +} + +; CHECK-LABEL: test_f128_v2f64: +define fp128 @test_f128_v2f64(<2 x double> %p) { +; CHECK: ext + %1 = fadd <2 x double> %p, %p + %2 = bitcast <2 x double> %1 to fp128 + %3 = fadd fp128 %2, %2 + ret fp128 %3 +} + +; CHECK-LABEL: test_f128_v2i64: +define fp128 @test_f128_v2i64(<2 x i64> %p) { +; CHECK: ext + %1 = add <2 x i64> %p, %p + %2 = bitcast <2 x i64> %1 to fp128 + %3 = fadd fp128 %2, %2 + ret fp128 %3 +} + +; CHECK-LABEL: test_f128_v4f32: +define fp128 @test_f128_v4f32(<4 x float> %p) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = fadd <4 x float> %p, %p + %2 = bitcast <4 x float> %1 to fp128 + %3 = fadd fp128 %2, %2 + ret fp128 %3 +} + +; CHECK-LABEL: test_f128_v4i32: +define fp128 @test_f128_v4i32(<4 x i32> %p) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = add <4 x i32> %p, %p + %2 = bitcast <4 x i32> %1 to fp128 + %3 = fadd fp128 %2, %2 + ret fp128 %3 +} + +; CHECK-LABEL: test_f128_v8i16: +define fp128 @test_f128_v8i16(<8 x i16> %p) { +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext + %1 = add <8 x i16> %p, %p + %2 = bitcast <8 x i16> %1 to fp128 + %3 = fadd fp128 %2, %2 + ret fp128 %3 +} + +; CHECK-LABEL: test_f128_v16i8: +define fp128 @test_f128_v16i8(<16 x i8> %p) { +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext + %1 = add <16 x i8> %p, %p + %2 = bitcast <16 x i8> %1 to fp128 + %3 = fadd fp128 %2, %2 + ret fp128 %3 +} + +; CHECK-LABEL: test_v2f64_f128: +define <2 x double> @test_v2f64_f128(fp128 %p) { +; CHECK: ext + %1 = fadd fp128 %p, %p + %2 = bitcast fp128 %1 to <2 x double> + %3 = fadd <2 x double> %2, %2 + ret <2 x double> %3 +} + +; CHECK-LABEL: test_v2f64_v2i64: +define <2 x double> @test_v2f64_v2i64(<2 x i64> %p) { +; CHECK: ext +; CHECK: ext + %1 = add <2 x i64> %p, %p + %2 = bitcast <2 x i64> %1 to <2 x double> + %3 = fadd <2 x double> %2, %2 + ret <2 x double> %3 +} + +; CHECK-LABEL: test_v2f64_v4f32: +define <2 x double> @test_v2f64_v4f32(<4 x float> %p) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: ext + %1 = fadd <4 x float> %p, %p + %2 = bitcast <4 x float> %1 to <2 x double> + %3 = fadd <2 x double> %2, %2 + ret <2 x double> %3 +} + +; CHECK-LABEL: test_v2f64_v4i32: +define <2 x double> @test_v2f64_v4i32(<4 x i32> %p) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: ext + %1 = add <4 x i32> %p, %p + %2 = bitcast <4 x i32> %1 to <2 x double> + %3 = fadd 
<2 x double> %2, %2 + ret <2 x double> %3 +} + +; CHECK-LABEL: test_v2f64_v8i16: +define <2 x double> @test_v2f64_v8i16(<8 x i16> %p) { +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext +; CHECK: ext + %1 = add <8 x i16> %p, %p + %2 = bitcast <8 x i16> %1 to <2 x double> + %3 = fadd <2 x double> %2, %2 + ret <2 x double> %3 +} + +; CHECK-LABEL: test_v2f64_v16i8: +define <2 x double> @test_v2f64_v16i8(<16 x i8> %p) { +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext +; CHECK: ext + %1 = add <16 x i8> %p, %p + %2 = bitcast <16 x i8> %1 to <2 x double> + %3 = fadd <2 x double> %2, %2 + ret <2 x double> %3 +} + +; CHECK-LABEL: test_v2i64_f128: +define <2 x i64> @test_v2i64_f128(fp128 %p) { +; CHECK: ext + %1 = fadd fp128 %p, %p + %2 = bitcast fp128 %1 to <2 x i64> + %3 = add <2 x i64> %2, %2 + ret <2 x i64> %3 +} + +; CHECK-LABEL: test_v2i64_v2f64: +define <2 x i64> @test_v2i64_v2f64(<2 x double> %p) { +; CHECK: ext +; CHECK: ext + %1 = fadd <2 x double> %p, %p + %2 = bitcast <2 x double> %1 to <2 x i64> + %3 = add <2 x i64> %2, %2 + ret <2 x i64> %3 +} + +; CHECK-LABEL: test_v2i64_v4f32: +define <2 x i64> @test_v2i64_v4f32(<4 x float> %p) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: ext + %1 = fadd <4 x float> %p, %p + %2 = bitcast <4 x float> %1 to <2 x i64> + %3 = add <2 x i64> %2, %2 + ret <2 x i64> %3 +} + +; CHECK-LABEL: test_v2i64_v4i32: +define <2 x i64> @test_v2i64_v4i32(<4 x i32> %p) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: ext + %1 = add <4 x i32> %p, %p + %2 = bitcast <4 x i32> %1 to <2 x i64> + %3 = add <2 x i64> %2, %2 + ret <2 x i64> %3 +} + +; CHECK-LABEL: test_v2i64_v8i16: +define <2 x i64> @test_v2i64_v8i16(<8 x i16> %p) { +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext +; CHECK: ext + %1 = add <8 x i16> %p, %p + %2 = bitcast <8 x i16> %1 to <2 x i64> + %3 = add <2 x i64> %2, %2 + ret <2 x i64> %3 +} + +; CHECK-LABEL: test_v2i64_v16i8: +define <2 x i64> @test_v2i64_v16i8(<16 x i8> %p) { +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext +; CHECK: ext + %1 = add <16 x i8> %p, %p + %2 = bitcast <16 x i8> %1 to <2 x i64> + %3 = add <2 x i64> %2, %2 + ret <2 x i64> %3 +} + +; CHECK-LABEL: test_v4f32_f128: +define <4 x float> @test_v4f32_f128(fp128 %p) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = fadd fp128 %p, %p + %2 = bitcast fp128 %1 to <4 x float> + %3 = fadd <4 x float> %2, %2 + ret <4 x float> %3 +} + +; CHECK-LABEL: test_v4f32_v2f64: +define <4 x float> @test_v4f32_v2f64(<2 x double> %p) { +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = fadd <2 x double> %p, %p + %2 = bitcast <2 x double> %1 to <4 x float> + %3 = fadd <4 x float> %2, %2 + ret <4 x float> %3 +} + +; CHECK-LABEL: test_v4f32_v2i64: +define <4 x float> @test_v4f32_v2i64(<2 x i64> %p) { +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = add <2 x i64> %p, %p + %2 = bitcast <2 x i64> %1 to <4 x float> + %3 = fadd <4 x float> %2, %2 + ret <4 x float> %3 +} + +; CHECK-LABEL: test_v4f32_v4i32: +define <4 x float> @test_v4f32_v4i32(<4 x i32> %p) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = add <4 x i32> %p, %p + %2 = bitcast <4 x i32> %1 to <4 x float> + %3 = fadd <4 x float> %2, %2 + ret <4 x float> %3 +} + +; CHECK-LABEL: test_v4f32_v8i16: +define <4 x float> @test_v4f32_v8i16(<8 x i16> %p) { +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = add <8 x i16> %p, %p + %2 = bitcast <8 x i16> %1 to <4 x float> + %3 = fadd <4 x float> %2, %2 + ret <4 x float> %3 +} + +; 
CHECK-LABEL: test_v4f32_v16i8: +define <4 x float> @test_v4f32_v16i8(<16 x i8> %p) { +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = add <16 x i8> %p, %p + %2 = bitcast <16 x i8> %1 to <4 x float> + %3 = fadd <4 x float> %2, %2 + ret <4 x float> %3 +} + +; CHECK-LABEL: test_v4i32_f128: +define <4 x i32> @test_v4i32_f128(fp128 %p) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = fadd fp128 %p, %p + %2 = bitcast fp128 %1 to <4 x i32> + %3 = add <4 x i32> %2, %2 + ret <4 x i32> %3 +} + +; CHECK-LABEL: test_v4i32_v2f64: +define <4 x i32> @test_v4i32_v2f64(<2 x double> %p) { +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = fadd <2 x double> %p, %p + %2 = bitcast <2 x double> %1 to <4 x i32> + %3 = add <4 x i32> %2, %2 + ret <4 x i32> %3 +} + +; CHECK-LABEL: test_v4i32_v2i64: +define <4 x i32> @test_v4i32_v2i64(<2 x i64> %p) { +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = add <2 x i64> %p, %p + %2 = bitcast <2 x i64> %1 to <4 x i32> + %3 = add <4 x i32> %2, %2 + ret <4 x i32> %3 +} + +; CHECK-LABEL: test_v4i32_v4f32: +define <4 x i32> @test_v4i32_v4f32(<4 x float> %p) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = fadd <4 x float> %p, %p + %2 = bitcast <4 x float> %1 to <4 x i32> + %3 = add <4 x i32> %2, %2 + ret <4 x i32> %3 +} + +; CHECK-LABEL: test_v4i32_v8i16: +define <4 x i32> @test_v4i32_v8i16(<8 x i16> %p) { +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = add <8 x i16> %p, %p + %2 = bitcast <8 x i16> %1 to <4 x i32> + %3 = add <4 x i32> %2, %2 + ret <4 x i32> %3 +} + +; CHECK-LABEL: test_v4i32_v16i8: +define <4 x i32> @test_v4i32_v16i8(<16 x i8> %p) { +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = add <16 x i8> %p, %p + %2 = bitcast <16 x i8> %1 to <4 x i32> + %3 = add <4 x i32> %2, %2 + ret <4 x i32> %3 +} + +; CHECK-LABEL: test_v8i16_f128: +define <8 x i16> @test_v8i16_f128(fp128 %p) { +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext + %1 = fadd fp128 %p, %p + %2 = bitcast fp128 %1 to <8 x i16> + %3 = add <8 x i16> %2, %2 + ret <8 x i16> %3 +} + +; CHECK-LABEL: test_v8i16_v2f64: +define <8 x i16> @test_v8i16_v2f64(<2 x double> %p) { +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext + %1 = fadd <2 x double> %p, %p + %2 = bitcast <2 x double> %1 to <8 x i16> + %3 = add <8 x i16> %2, %2 + ret <8 x i16> %3 +} + +; CHECK-LABEL: test_v8i16_v2i64: +define <8 x i16> @test_v8i16_v2i64(<2 x i64> %p) { +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext + %1 = add <2 x i64> %p, %p + %2 = bitcast <2 x i64> %1 to <8 x i16> + %3 = add <8 x i16> %2, %2 + ret <8 x i16> %3 +} + +; CHECK-LABEL: test_v8i16_v4f32: +define <8 x i16> @test_v8i16_v4f32(<4 x float> %p) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext + %1 = fadd <4 x float> %p, %p + %2 = bitcast <4 x float> %1 to <8 x i16> + %3 = add <8 x i16> %2, %2 + ret <8 x i16> %3 +} + +; CHECK-LABEL: test_v8i16_v4i32: +define <8 x i16> @test_v8i16_v4i32(<4 x i32> %p) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext + %1 = add <4 x i32> %p, %p + %2 = bitcast <4 x i32> %1 to <8 x i16> + %3 = add <8 x i16> %2, %2 + ret <8 x i16> %3 +} + +; CHECK-LABEL: test_v8i16_v16i8: +define <8 x i16> @test_v8i16_v16i8(<16 x i8> %p) { +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext + %1 = add 
<16 x i8> %p, %p + %2 = bitcast <16 x i8> %1 to <8 x i16> + %3 = add <8 x i16> %2, %2 + ret <8 x i16> %3 +} + +; CHECK-LABEL: test_v16i8_f128: +define <16 x i8> @test_v16i8_f128(fp128 %p) { +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext + %1 = fadd fp128 %p, %p + %2 = bitcast fp128 %1 to <16 x i8> + %3 = add <16 x i8> %2, %2 + ret <16 x i8> %3 +} + +; CHECK-LABEL: test_v16i8_v2f64: +define <16 x i8> @test_v16i8_v2f64(<2 x double> %p) { +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext + %1 = fadd <2 x double> %p, %p + %2 = bitcast <2 x double> %1 to <16 x i8> + %3 = add <16 x i8> %2, %2 + ret <16 x i8> %3 +} + +; CHECK-LABEL: test_v16i8_v2i64: +define <16 x i8> @test_v16i8_v2i64(<2 x i64> %p) { +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext + %1 = add <2 x i64> %p, %p + %2 = bitcast <2 x i64> %1 to <16 x i8> + %3 = add <16 x i8> %2, %2 + ret <16 x i8> %3 +} + +; CHECK-LABEL: test_v16i8_v4f32: +define <16 x i8> @test_v16i8_v4f32(<4 x float> %p) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext + %1 = fadd <4 x float> %p, %p + %2 = bitcast <4 x float> %1 to <16 x i8> + %3 = add <16 x i8> %2, %2 + ret <16 x i8> %3 +} + +; CHECK-LABEL: test_v16i8_v4i32: +define <16 x i8> @test_v16i8_v4i32(<4 x i32> %p) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext + %1 = add <4 x i32> %p, %p + %2 = bitcast <4 x i32> %1 to <16 x i8> + %3 = add <16 x i8> %2, %2 + ret <16 x i8> %3 +} + +; CHECK-LABEL: test_v16i8_v8i16: +define <16 x i8> @test_v16i8_v8i16(<8 x i16> %p) { +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext + %1 = add <8 x i16> %p, %p + %2 = bitcast <8 x i16> %1 to <16 x i8> + %3 = add <16 x i8> %2, %2 + ret <16 x i8> %3 +} diff --git a/test/CodeGen/AArch64/arm64-big-endian-vector-caller.ll b/test/CodeGen/AArch64/arm64-big-endian-vector-caller.ll new file mode 100644 index 00000000000..9a12b7a0115 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-big-endian-vector-caller.ll @@ -0,0 +1,1100 @@ +; RUN: llc -mtriple arm64_be < %s -aarch64-load-store-opt=false -o - | FileCheck %s +; RUN: llc -mtriple arm64_be < %s -aarch64-load-store-opt=false -fast-isel=true -O0 -o - | FileCheck %s + +; CHECK-LABEL: test_i64_f64: +declare i64 @test_i64_f64_helper(double %p) +define void @test_i64_f64(double* %p, i64* %q) { +; CHECK-NOT: rev + %1 = load double* %p + %2 = fadd double %1, %1 + %3 = call i64 @test_i64_f64_helper(double %2) + %4 = add i64 %3, %3 + store i64 %4, i64* %q + ret void +} + +; CHECK-LABEL: test_i64_v1i64: +declare i64 @test_i64_v1i64_helper(<1 x i64> %p) +define void @test_i64_v1i64(<1 x i64>* %p, i64* %q) { +; CHECK-NOT: rev + %1 = load <1 x i64>* %p + %2 = add <1 x i64> %1, %1 + %3 = call i64 @test_i64_v1i64_helper(<1 x i64> %2) + %4 = add i64 %3, %3 + store i64 %4, i64* %q + ret void +} + +; CHECK-LABEL: test_i64_v2f32: +declare i64 @test_i64_v2f32_helper(<2 x float> %p) +define void @test_i64_v2f32(<2 x float>* %p, i64* %q) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = load <2 x float>* %p + %2 = fadd <2 x float> %1, %1 + %3 = call i64 @test_i64_v2f32_helper(<2 x float> %2) + %4 = add i64 %3, %3 + store i64 %4, i64* %q + ret void +} + +; CHECK-LABEL: test_i64_v2i32: +declare i64 @test_i64_v2i32_helper(<2 x i32> %p) +define void @test_i64_v2i32(<2 x i32>* %p, i64* %q) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = load <2 x i32>* %p + %2 = add <2 x i32> %1, %1 + %3 = call i64 @test_i64_v2i32_helper(<2 x i32> %2) + %4 = add i64 %3, %3 + store i64 %4, i64* %q + 
ret void +} + +; CHECK-LABEL: test_i64_v4i16: +declare i64 @test_i64_v4i16_helper(<4 x i16> %p) +define void @test_i64_v4i16(<4 x i16>* %p, i64* %q) { +; CHECK: rev64 v{{[0-9]+}}.4h + %1 = load <4 x i16>* %p + %2 = add <4 x i16> %1, %1 + %3 = call i64 @test_i64_v4i16_helper(<4 x i16> %2) + %4 = add i64 %3, %3 + store i64 %4, i64* %q + ret void +} + +; CHECK-LABEL: test_i64_v8i8: +declare i64 @test_i64_v8i8_helper(<8 x i8> %p) +define void @test_i64_v8i8(<8 x i8>* %p, i64* %q) { +; CHECK: rev64 v{{[0-9]+}}.8b + %1 = load <8 x i8>* %p + %2 = add <8 x i8> %1, %1 + %3 = call i64 @test_i64_v8i8_helper(<8 x i8> %2) + %4 = add i64 %3, %3 + store i64 %4, i64* %q + ret void +} + +; CHECK-LABEL: test_f64_i64: +declare double @test_f64_i64_helper(i64 %p) +define void @test_f64_i64(i64* %p, double* %q) { +; CHECK-NOT: rev + %1 = load i64* %p + %2 = add i64 %1, %1 + %3 = call double @test_f64_i64_helper(i64 %2) + %4 = fadd double %3, %3 + store double %4, double* %q + ret void +} + +; CHECK-LABEL: test_f64_v1i64: +declare double @test_f64_v1i64_helper(<1 x i64> %p) +define void @test_f64_v1i64(<1 x i64>* %p, double* %q) { +; CHECK-NOT: rev + %1 = load <1 x i64>* %p + %2 = add <1 x i64> %1, %1 + %3 = call double @test_f64_v1i64_helper(<1 x i64> %2) + %4 = fadd double %3, %3 + store double %4, double* %q + ret void +} + +; CHECK-LABEL: test_f64_v2f32: +declare double @test_f64_v2f32_helper(<2 x float> %p) +define void @test_f64_v2f32(<2 x float>* %p, double* %q) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = load <2 x float>* %p + %2 = fadd <2 x float> %1, %1 + %3 = call double @test_f64_v2f32_helper(<2 x float> %2) + %4 = fadd double %3, %3 + store double %4, double* %q + ret void +} + +; CHECK-LABEL: test_f64_v2i32: +declare double @test_f64_v2i32_helper(<2 x i32> %p) +define void @test_f64_v2i32(<2 x i32>* %p, double* %q) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = load <2 x i32>* %p + %2 = add <2 x i32> %1, %1 + %3 = call double @test_f64_v2i32_helper(<2 x i32> %2) + %4 = fadd double %3, %3 + store double %4, double* %q + ret void +} + +; CHECK-LABEL: test_f64_v4i16: +declare double @test_f64_v4i16_helper(<4 x i16> %p) +define void @test_f64_v4i16(<4 x i16>* %p, double* %q) { +; CHECK: rev64 v{{[0-9]+}}.4h + %1 = load <4 x i16>* %p + %2 = add <4 x i16> %1, %1 + %3 = call double @test_f64_v4i16_helper(<4 x i16> %2) + %4 = fadd double %3, %3 + store double %4, double* %q + ret void +} + +; CHECK-LABEL: test_f64_v8i8: +declare double @test_f64_v8i8_helper(<8 x i8> %p) +define void @test_f64_v8i8(<8 x i8>* %p, double* %q) { +; CHECK: rev64 v{{[0-9]+}}.8b + %1 = load <8 x i8>* %p + %2 = add <8 x i8> %1, %1 + %3 = call double @test_f64_v8i8_helper(<8 x i8> %2) + %4 = fadd double %3, %3 + store double %4, double* %q + ret void +} + +; CHECK-LABEL: test_v1i64_i64: +declare <1 x i64> @test_v1i64_i64_helper(i64 %p) +define void @test_v1i64_i64(i64* %p, <1 x i64>* %q) { +; CHECK-NOT: rev + %1 = load i64* %p + %2 = add i64 %1, %1 + %3 = call <1 x i64> @test_v1i64_i64_helper(i64 %2) + %4 = add <1 x i64> %3, %3 + store <1 x i64> %4, <1 x i64>* %q + ret void +} + +; CHECK-LABEL: test_v1i64_f64: +declare <1 x i64> @test_v1i64_f64_helper(double %p) +define void @test_v1i64_f64(double* %p, <1 x i64>* %q) { +; CHECK-NOT: rev + %1 = load double* %p + %2 = fadd double %1, %1 + %3 = call <1 x i64> @test_v1i64_f64_helper(double %2) + %4 = add <1 x i64> %3, %3 + store <1 x i64> %4, <1 x i64>* %q + ret void +} + +; CHECK-LABEL: test_v1i64_v2f32: +declare <1 x i64> @test_v1i64_v2f32_helper(<2 x float> %p) +define void 
@test_v1i64_v2f32(<2 x float>* %p, <1 x i64>* %q) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = load <2 x float>* %p + %2 = fadd <2 x float> %1, %1 + %3 = call <1 x i64> @test_v1i64_v2f32_helper(<2 x float> %2) + %4 = add <1 x i64> %3, %3 + store <1 x i64> %4, <1 x i64>* %q + ret void +} + +; CHECK-LABEL: test_v1i64_v2i32: +declare <1 x i64> @test_v1i64_v2i32_helper(<2 x i32> %p) +define void @test_v1i64_v2i32(<2 x i32>* %p, <1 x i64>* %q) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = load <2 x i32>* %p + %2 = add <2 x i32> %1, %1 + %3 = call <1 x i64> @test_v1i64_v2i32_helper(<2 x i32> %2) + %4 = add <1 x i64> %3, %3 + store <1 x i64> %4, <1 x i64>* %q + ret void +} + +; CHECK-LABEL: test_v1i64_v4i16: +declare <1 x i64> @test_v1i64_v4i16_helper(<4 x i16> %p) +define void @test_v1i64_v4i16(<4 x i16>* %p, <1 x i64>* %q) { +; CHECK: rev64 v{{[0-9]+}}.4h + %1 = load <4 x i16>* %p + %2 = add <4 x i16> %1, %1 + %3 = call <1 x i64> @test_v1i64_v4i16_helper(<4 x i16> %2) + %4 = add <1 x i64> %3, %3 + store <1 x i64> %4, <1 x i64>* %q + ret void +} + +; CHECK-LABEL: test_v1i64_v8i8: +declare <1 x i64> @test_v1i64_v8i8_helper(<8 x i8> %p) +define void @test_v1i64_v8i8(<8 x i8>* %p, <1 x i64>* %q) { +; CHECK: rev64 v{{[0-9]+}}.8b + %1 = load <8 x i8>* %p + %2 = add <8 x i8> %1, %1 + %3 = call <1 x i64> @test_v1i64_v8i8_helper(<8 x i8> %2) + %4 = add <1 x i64> %3, %3 + store <1 x i64> %4, <1 x i64>* %q + ret void +} + +; CHECK-LABEL: test_v2f32_i64: +declare <2 x float> @test_v2f32_i64_helper(i64 %p) +define void @test_v2f32_i64(i64* %p, <2 x float>* %q) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = load i64* %p + %2 = add i64 %1, %1 + %3 = call <2 x float> @test_v2f32_i64_helper(i64 %2) + %4 = fadd <2 x float> %3, %3 + store <2 x float> %4, <2 x float>* %q + ret void +} + +; CHECK-LABEL: test_v2f32_f64: +declare <2 x float> @test_v2f32_f64_helper(double %p) +define void @test_v2f32_f64(double* %p, <2 x float>* %q) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = load double* %p + %2 = fadd double %1, %1 + %3 = call <2 x float> @test_v2f32_f64_helper(double %2) + %4 = fadd <2 x float> %3, %3 + store <2 x float> %4, <2 x float>* %q + ret void +} + +; CHECK-LABEL: test_v2f32_v1i64: +declare <2 x float> @test_v2f32_v1i64_helper(<1 x i64> %p) +define void @test_v2f32_v1i64(<1 x i64>* %p, <2 x float>* %q) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = load <1 x i64>* %p + %2 = add <1 x i64> %1, %1 + %3 = call <2 x float> @test_v2f32_v1i64_helper(<1 x i64> %2) + %4 = fadd <2 x float> %3, %3 + store <2 x float> %4, <2 x float>* %q + ret void +} + +; CHECK-LABEL: test_v2f32_v2i32: +declare <2 x float> @test_v2f32_v2i32_helper(<2 x i32> %p) +define void @test_v2f32_v2i32(<2 x i32>* %p, <2 x float>* %q) { +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = load <2 x i32>* %p + %2 = add <2 x i32> %1, %1 + %3 = call <2 x float> @test_v2f32_v2i32_helper(<2 x i32> %2) + %4 = fadd <2 x float> %3, %3 + store <2 x float> %4, <2 x float>* %q + ret void +} + +; CHECK-LABEL: test_v2f32_v4i16: +declare <2 x float> @test_v2f32_v4i16_helper(<4 x i16> %p) +define void @test_v2f32_v4i16(<4 x i16>* %p, <2 x float>* %q) { +; CHECK: rev64 v{{[0-9]+}}.4h +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = load <4 x i16>* %p + %2 = add <4 x i16> %1, %1 + %3 = call <2 x float> @test_v2f32_v4i16_helper(<4 x i16> %2) + %4 = fadd <2 x float> %3, %3 + store <2 x float> %4, <2 x float>* %q + ret void +} + +; CHECK-LABEL: test_v2f32_v8i8: +declare <2 x float> @test_v2f32_v8i8_helper(<8 x i8> %p) +define void @test_v2f32_v8i8(<8 x i8>* %p, <2 x float>* %q) { +; CHECK: 
rev64 v{{[0-9]+}}.8b +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = load <8 x i8>* %p + %2 = add <8 x i8> %1, %1 + %3 = call <2 x float> @test_v2f32_v8i8_helper(<8 x i8> %2) + %4 = fadd <2 x float> %3, %3 + store <2 x float> %4, <2 x float>* %q + ret void +} + +; CHECK-LABEL: test_v2i32_i64: +declare <2 x i32> @test_v2i32_i64_helper(i64 %p) +define void @test_v2i32_i64(i64* %p, <2 x i32>* %q) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = load i64* %p + %2 = add i64 %1, %1 + %3 = call <2 x i32> @test_v2i32_i64_helper(i64 %2) + %4 = add <2 x i32> %3, %3 + store <2 x i32> %4, <2 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v2i32_f64: +declare <2 x i32> @test_v2i32_f64_helper(double %p) +define void @test_v2i32_f64(double* %p, <2 x i32>* %q) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = load double* %p + %2 = fadd double %1, %1 + %3 = call <2 x i32> @test_v2i32_f64_helper(double %2) + %4 = add <2 x i32> %3, %3 + store <2 x i32> %4, <2 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v2i32_v1i64: +declare <2 x i32> @test_v2i32_v1i64_helper(<1 x i64> %p) +define void @test_v2i32_v1i64(<1 x i64>* %p, <2 x i32>* %q) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = load <1 x i64>* %p + %2 = add <1 x i64> %1, %1 + %3 = call <2 x i32> @test_v2i32_v1i64_helper(<1 x i64> %2) + %4 = add <2 x i32> %3, %3 + store <2 x i32> %4, <2 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v2i32_v2f32: +declare <2 x i32> @test_v2i32_v2f32_helper(<2 x float> %p) +define void @test_v2i32_v2f32(<2 x float>* %p, <2 x i32>* %q) { +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = load <2 x float>* %p + %2 = fadd <2 x float> %1, %1 + %3 = call <2 x i32> @test_v2i32_v2f32_helper(<2 x float> %2) + %4 = add <2 x i32> %3, %3 + store <2 x i32> %4, <2 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v2i32_v4i16: +declare <2 x i32> @test_v2i32_v4i16_helper(<4 x i16> %p) +define void @test_v2i32_v4i16(<4 x i16>* %p, <2 x i32>* %q) { +; CHECK: rev64 v{{[0-9]+}}.4h +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = load <4 x i16>* %p + %2 = add <4 x i16> %1, %1 + %3 = call <2 x i32> @test_v2i32_v4i16_helper(<4 x i16> %2) + %4 = add <2 x i32> %3, %3 + store <2 x i32> %4, <2 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v2i32_v8i8: +declare <2 x i32> @test_v2i32_v8i8_helper(<8 x i8> %p) +define void @test_v2i32_v8i8(<8 x i8>* %p, <2 x i32>* %q) { +; CHECK: rev64 v{{[0-9]+}}.8b +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = load <8 x i8>* %p + %2 = add <8 x i8> %1, %1 + %3 = call <2 x i32> @test_v2i32_v8i8_helper(<8 x i8> %2) + %4 = add <2 x i32> %3, %3 + store <2 x i32> %4, <2 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v4i16_i64: +declare <4 x i16> @test_v4i16_i64_helper(i64 %p) +define void @test_v4i16_i64(i64* %p, <4 x i16>* %q) { +; CHECK: rev64 v{{[0-9]+}}.4h + %1 = load i64* %p + %2 = add i64 %1, %1 + %3 = call <4 x i16> @test_v4i16_i64_helper(i64 %2) + %4 = add <4 x i16> %3, %3 + store <4 x i16> %4, <4 x i16>* %q + ret void +} + +; CHECK-LABEL: test_v4i16_f64: +declare <4 x i16> @test_v4i16_f64_helper(double %p) +define void @test_v4i16_f64(double* %p, <4 x i16>* %q) { +; CHECK: rev64 v{{[0-9]+}}.4h + %1 = load double* %p + %2 = fadd double %1, %1 + %3 = call <4 x i16> @test_v4i16_f64_helper(double %2) + %4 = add <4 x i16> %3, %3 + store <4 x i16> %4, <4 x i16>* %q + ret void +} + +; CHECK-LABEL: test_v4i16_v1i64: +declare <4 x i16> @test_v4i16_v1i64_helper(<1 x i64> %p) +define void @test_v4i16_v1i64(<1 x i64>* %p, <4 x i16>* %q) { +; CHECK: rev64 v{{[0-9]+}}.4h + %1 = load <1 x i64>* %p + %2 = add <1 x i64> %1, %1 + %3 = call <4 x i16> 
@test_v4i16_v1i64_helper(<1 x i64> %2) + %4 = add <4 x i16> %3, %3 + store <4 x i16> %4, <4 x i16>* %q + ret void +} + +; CHECK-LABEL: test_v4i16_v2f32: +declare <4 x i16> @test_v4i16_v2f32_helper(<2 x float> %p) +define void @test_v4i16_v2f32(<2 x float>* %p, <4 x i16>* %q) { +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: rev64 v{{[0-9]+}}.4h + %1 = load <2 x float>* %p + %2 = fadd <2 x float> %1, %1 + %3 = call <4 x i16> @test_v4i16_v2f32_helper(<2 x float> %2) + %4 = add <4 x i16> %3, %3 + store <4 x i16> %4, <4 x i16>* %q + ret void +} + +; CHECK-LABEL: test_v4i16_v2i32: +declare <4 x i16> @test_v4i16_v2i32_helper(<2 x i32> %p) +define void @test_v4i16_v2i32(<2 x i32>* %p, <4 x i16>* %q) { +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: rev64 v{{[0-9]+}}.4h + %1 = load <2 x i32>* %p + %2 = add <2 x i32> %1, %1 + %3 = call <4 x i16> @test_v4i16_v2i32_helper(<2 x i32> %2) + %4 = add <4 x i16> %3, %3 + store <4 x i16> %4, <4 x i16>* %q + ret void +} + +; CHECK-LABEL: test_v4i16_v8i8: +declare <4 x i16> @test_v4i16_v8i8_helper(<8 x i8> %p) +define void @test_v4i16_v8i8(<8 x i8>* %p, <4 x i16>* %q) { +; CHECK: rev64 v{{[0-9]+}}.8b +; CHECK: rev64 v{{[0-9]+}}.4h + %1 = load <8 x i8>* %p + %2 = add <8 x i8> %1, %1 + %3 = call <4 x i16> @test_v4i16_v8i8_helper(<8 x i8> %2) + %4 = add <4 x i16> %3, %3 + store <4 x i16> %4, <4 x i16>* %q + ret void +} + +; CHECK-LABEL: test_v8i8_i64: +declare <8 x i8> @test_v8i8_i64_helper(i64 %p) +define void @test_v8i8_i64(i64* %p, <8 x i8>* %q) { +; CHECK: rev64 v{{[0-9]+}}.8b + %1 = load i64* %p + %2 = add i64 %1, %1 + %3 = call <8 x i8> @test_v8i8_i64_helper(i64 %2) + %4 = add <8 x i8> %3, %3 + store <8 x i8> %4, <8 x i8>* %q + ret void +} + +; CHECK-LABEL: test_v8i8_f64: +declare <8 x i8> @test_v8i8_f64_helper(double %p) +define void @test_v8i8_f64(double* %p, <8 x i8>* %q) { +; CHECK: rev64 v{{[0-9]+}}.8b + %1 = load double* %p + %2 = fadd double %1, %1 + %3 = call <8 x i8> @test_v8i8_f64_helper(double %2) + %4 = add <8 x i8> %3, %3 + store <8 x i8> %4, <8 x i8>* %q + ret void +} + +; CHECK-LABEL: test_v8i8_v1i64: +declare <8 x i8> @test_v8i8_v1i64_helper(<1 x i64> %p) +define void @test_v8i8_v1i64(<1 x i64>* %p, <8 x i8>* %q) { +; CHECK: rev64 v{{[0-9]+}}.8b + %1 = load <1 x i64>* %p + %2 = add <1 x i64> %1, %1 + %3 = call <8 x i8> @test_v8i8_v1i64_helper(<1 x i64> %2) + %4 = add <8 x i8> %3, %3 + store <8 x i8> %4, <8 x i8>* %q + ret void +} + +; CHECK-LABEL: test_v8i8_v2f32: +declare <8 x i8> @test_v8i8_v2f32_helper(<2 x float> %p) +define void @test_v8i8_v2f32(<2 x float>* %p, <8 x i8>* %q) { +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: rev64 v{{[0-9]+}}.8b + %1 = load <2 x float>* %p + %2 = fadd <2 x float> %1, %1 + %3 = call <8 x i8> @test_v8i8_v2f32_helper(<2 x float> %2) + %4 = add <8 x i8> %3, %3 + store <8 x i8> %4, <8 x i8>* %q + ret void +} + +; CHECK-LABEL: test_v8i8_v2i32: +declare <8 x i8> @test_v8i8_v2i32_helper(<2 x i32> %p) +define void @test_v8i8_v2i32(<2 x i32>* %p, <8 x i8>* %q) { +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: rev64 v{{[0-9]+}}.8b + %1 = load <2 x i32>* %p + %2 = add <2 x i32> %1, %1 + %3 = call <8 x i8> @test_v8i8_v2i32_helper(<2 x i32> %2) + %4 = add <8 x i8> %3, %3 + store <8 x i8> %4, <8 x i8>* %q + ret void +} + +; CHECK-LABEL: test_v8i8_v4i16: +declare <8 x i8> @test_v8i8_v4i16_helper(<4 x i16> %p) +define void @test_v8i8_v4i16(<4 x i16>* %p, <8 x i8>* %q) { +; CHECK: rev64 v{{[0-9]+}}.4h +; CHECK: rev64 v{{[0-9]+}}.8b + %1 = load <4 x i16>* %p + %2 = add <4 x i16> %1, %1 + %3 = call <8 x i8> @test_v8i8_v4i16_helper(<4 x i16> %2) + %4 
= add <8 x i8> %3, %3 + store <8 x i8> %4, <8 x i8>* %q + ret void +} + +; CHECK-LABEL: test_f128_v2f64: +declare fp128 @test_f128_v2f64_helper(<2 x double> %p) +define void @test_f128_v2f64(<2 x double>* %p, fp128* %q) { +; CHECK: ext + %1 = load <2 x double>* %p + %2 = fadd <2 x double> %1, %1 + %3 = call fp128 @test_f128_v2f64_helper(<2 x double> %2) + %4 = fadd fp128 %3, %3 + store fp128 %4, fp128* %q + ret void +} + +; CHECK-LABEL: test_f128_v2i64: +declare fp128 @test_f128_v2i64_helper(<2 x i64> %p) +define void @test_f128_v2i64(<2 x i64>* %p, fp128* %q) { +; CHECK: ext + %1 = load <2 x i64>* %p + %2 = add <2 x i64> %1, %1 + %3 = call fp128 @test_f128_v2i64_helper(<2 x i64> %2) + %4 = fadd fp128 %3, %3 + store fp128 %4, fp128* %q + ret void +} + +; CHECK-LABEL: test_f128_v4f32: +declare fp128 @test_f128_v4f32_helper(<4 x float> %p) +define void @test_f128_v4f32(<4 x float>* %p, fp128* %q) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = load <4 x float>* %p + %2 = fadd <4 x float> %1, %1 + %3 = call fp128 @test_f128_v4f32_helper(<4 x float> %2) + %4 = fadd fp128 %3, %3 + store fp128 %4, fp128* %q + ret void +} + +; CHECK-LABEL: test_f128_v4i32: +declare fp128 @test_f128_v4i32_helper(<4 x i32> %p) +define void @test_f128_v4i32(<4 x i32>* %p, fp128* %q) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = load <4 x i32>* %p + %2 = add <4 x i32> %1, %1 + %3 = call fp128 @test_f128_v4i32_helper(<4 x i32> %2) + %4 = fadd fp128 %3, %3 + store fp128 %4, fp128* %q + ret void +} + +; CHECK-LABEL: test_f128_v8i16: +declare fp128 @test_f128_v8i16_helper(<8 x i16> %p) +define void @test_f128_v8i16(<8 x i16>* %p, fp128* %q) { +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext + %1 = load <8 x i16>* %p + %2 = add <8 x i16> %1, %1 + %3 = call fp128 @test_f128_v8i16_helper(<8 x i16> %2) + %4 = fadd fp128 %3, %3 + store fp128 %4, fp128* %q + ret void +} + +; CHECK-LABEL: test_f128_v16i8: +declare fp128 @test_f128_v16i8_helper(<16 x i8> %p) +define void @test_f128_v16i8(<16 x i8>* %p, fp128* %q) { +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext + %1 = load <16 x i8>* %p + %2 = add <16 x i8> %1, %1 + %3 = call fp128 @test_f128_v16i8_helper(<16 x i8> %2) + %4 = fadd fp128 %3, %3 + store fp128 %4, fp128* %q + ret void +} + +; CHECK-LABEL: test_v2f64_f128: +declare <2 x double> @test_v2f64_f128_helper(fp128 %p) +define void @test_v2f64_f128(fp128* %p, <2 x double>* %q) { +; CHECK: ext + %1 = load fp128* %p + %2 = fadd fp128 %1, %1 + %3 = call <2 x double> @test_v2f64_f128_helper(fp128 %2) + %4 = fadd <2 x double> %3, %3 + store <2 x double> %4, <2 x double>* %q + ret void +} + +; CHECK-LABEL: test_v2f64_v2i64: +declare <2 x double> @test_v2f64_v2i64_helper(<2 x i64> %p) +define void @test_v2f64_v2i64(<2 x i64>* %p, <2 x double>* %q) { +; CHECK: ext +; CHECK: ext + %1 = load <2 x i64>* %p + %2 = add <2 x i64> %1, %1 + %3 = call <2 x double> @test_v2f64_v2i64_helper(<2 x i64> %2) + %4 = fadd <2 x double> %3, %3 + store <2 x double> %4, <2 x double>* %q + ret void +} + +; CHECK-LABEL: test_v2f64_v4f32: +declare <2 x double> @test_v2f64_v4f32_helper(<4 x float> %p) +define void @test_v2f64_v4f32(<4 x float>* %p, <2 x double>* %q) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: ext + %1 = load <4 x float>* %p + %2 = fadd <4 x float> %1, %1 + %3 = call <2 x double> @test_v2f64_v4f32_helper(<4 x float> %2) + %4 = fadd <2 x double> %3, %3 + store <2 x double> %4, <2 x double>* %q + ret void +} + +; CHECK-LABEL: test_v2f64_v4i32: +declare <2 x double> @test_v2f64_v4i32_helper(<4 x i32> %p) +define void 
@test_v2f64_v4i32(<4 x i32>* %p, <2 x double>* %q) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: ext + %1 = load <4 x i32>* %p + %2 = add <4 x i32> %1, %1 + %3 = call <2 x double> @test_v2f64_v4i32_helper(<4 x i32> %2) + %4 = fadd <2 x double> %3, %3 + store <2 x double> %4, <2 x double>* %q + ret void +} + +; CHECK-LABEL: test_v2f64_v8i16: +declare <2 x double> @test_v2f64_v8i16_helper(<8 x i16> %p) +define void @test_v2f64_v8i16(<8 x i16>* %p, <2 x double>* %q) { +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext +; CHECK: ext + %1 = load <8 x i16>* %p + %2 = add <8 x i16> %1, %1 + %3 = call <2 x double> @test_v2f64_v8i16_helper(<8 x i16> %2) + %4 = fadd <2 x double> %3, %3 + store <2 x double> %4, <2 x double>* %q + ret void +} + +; CHECK-LABEL: test_v2f64_v16i8: +declare <2 x double> @test_v2f64_v16i8_helper(<16 x i8> %p) +define void @test_v2f64_v16i8(<16 x i8>* %p, <2 x double>* %q) { +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext +; CHECK: ext + %1 = load <16 x i8>* %p + %2 = add <16 x i8> %1, %1 + %3 = call <2 x double> @test_v2f64_v16i8_helper(<16 x i8> %2) + %4 = fadd <2 x double> %3, %3 + store <2 x double> %4, <2 x double>* %q + ret void +} + +; CHECK-LABEL: test_v2i64_f128: +declare <2 x i64> @test_v2i64_f128_helper(fp128 %p) +define void @test_v2i64_f128(fp128* %p, <2 x i64>* %q) { +; CHECK: ext + %1 = load fp128* %p + %2 = fadd fp128 %1, %1 + %3 = call <2 x i64> @test_v2i64_f128_helper(fp128 %2) + %4 = add <2 x i64> %3, %3 + store <2 x i64> %4, <2 x i64>* %q + ret void +} + +; CHECK-LABEL: test_v2i64_v2f64: +declare <2 x i64> @test_v2i64_v2f64_helper(<2 x double> %p) +define void @test_v2i64_v2f64(<2 x double>* %p, <2 x i64>* %q) { +; CHECK: ext +; CHECK: ext + %1 = load <2 x double>* %p + %2 = fadd <2 x double> %1, %1 + %3 = call <2 x i64> @test_v2i64_v2f64_helper(<2 x double> %2) + %4 = add <2 x i64> %3, %3 + store <2 x i64> %4, <2 x i64>* %q + ret void +} + +; CHECK-LABEL: test_v2i64_v4f32: +declare <2 x i64> @test_v2i64_v4f32_helper(<4 x float> %p) +define void @test_v2i64_v4f32(<4 x float>* %p, <2 x i64>* %q) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: ext + %1 = load <4 x float>* %p + %2 = fadd <4 x float> %1, %1 + %3 = call <2 x i64> @test_v2i64_v4f32_helper(<4 x float> %2) + %4 = add <2 x i64> %3, %3 + store <2 x i64> %4, <2 x i64>* %q + ret void +} + +; CHECK-LABEL: test_v2i64_v4i32: +declare <2 x i64> @test_v2i64_v4i32_helper(<4 x i32> %p) +define void @test_v2i64_v4i32(<4 x i32>* %p, <2 x i64>* %q) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: ext + %1 = load <4 x i32>* %p + %2 = add <4 x i32> %1, %1 + %3 = call <2 x i64> @test_v2i64_v4i32_helper(<4 x i32> %2) + %4 = add <2 x i64> %3, %3 + store <2 x i64> %4, <2 x i64>* %q + ret void +} + +; CHECK-LABEL: test_v2i64_v8i16: +declare <2 x i64> @test_v2i64_v8i16_helper(<8 x i16> %p) +define void @test_v2i64_v8i16(<8 x i16>* %p, <2 x i64>* %q) { +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext +; CHECK: ext + %1 = load <8 x i16>* %p + %2 = add <8 x i16> %1, %1 + %3 = call <2 x i64> @test_v2i64_v8i16_helper(<8 x i16> %2) + %4 = add <2 x i64> %3, %3 + store <2 x i64> %4, <2 x i64>* %q + ret void +} + +; CHECK-LABEL: test_v2i64_v16i8: +declare <2 x i64> @test_v2i64_v16i8_helper(<16 x i8> %p) +define void @test_v2i64_v16i8(<16 x i8>* %p, <2 x i64>* %q) { +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext +; CHECK: ext + %1 = load <16 x i8>* %p + %2 = add <16 x i8> %1, %1 + %3 = call <2 x i64> @test_v2i64_v16i8_helper(<16 x i8> %2) + %4 = add <2 x i64> %3, %3 + store <2 x i64> %4, <2 x i64>* %q + ret void 
+} + +; CHECK-LABEL: test_v4f32_f128: +declare <4 x float> @test_v4f32_f128_helper(fp128 %p) +define void @test_v4f32_f128(fp128* %p, <4 x float>* %q) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = load fp128* %p + %2 = fadd fp128 %1, %1 + %3 = call <4 x float> @test_v4f32_f128_helper(fp128 %2) + %4 = fadd <4 x float> %3, %3 + store <4 x float> %4, <4 x float>* %q + ret void +} + +; CHECK-LABEL: test_v4f32_v2f64: +declare <4 x float> @test_v4f32_v2f64_helper(<2 x double> %p) +define void @test_v4f32_v2f64(<2 x double>* %p, <4 x float>* %q) { +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = load <2 x double>* %p + %2 = fadd <2 x double> %1, %1 + %3 = call <4 x float> @test_v4f32_v2f64_helper(<2 x double> %2) + %4 = fadd <4 x float> %3, %3 + store <4 x float> %4, <4 x float>* %q + ret void +} + +; CHECK-LABEL: test_v4f32_v2i64: +declare <4 x float> @test_v4f32_v2i64_helper(<2 x i64> %p) +define void @test_v4f32_v2i64(<2 x i64>* %p, <4 x float>* %q) { +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = load <2 x i64>* %p + %2 = add <2 x i64> %1, %1 + %3 = call <4 x float> @test_v4f32_v2i64_helper(<2 x i64> %2) + %4 = fadd <4 x float> %3, %3 + store <4 x float> %4, <4 x float>* %q + ret void +} + +; CHECK-LABEL: test_v4f32_v4i32: +declare <4 x float> @test_v4f32_v4i32_helper(<4 x i32> %p) +define void @test_v4f32_v4i32(<4 x i32>* %p, <4 x float>* %q) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = load <4 x i32>* %p + %2 = add <4 x i32> %1, %1 + %3 = call <4 x float> @test_v4f32_v4i32_helper(<4 x i32> %2) + %4 = fadd <4 x float> %3, %3 + store <4 x float> %4, <4 x float>* %q + ret void +} + +; CHECK-LABEL: test_v4f32_v8i16: +declare <4 x float> @test_v4f32_v8i16_helper(<8 x i16> %p) +define void @test_v4f32_v8i16(<8 x i16>* %p, <4 x float>* %q) { +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = load <8 x i16>* %p + %2 = add <8 x i16> %1, %1 + %3 = call <4 x float> @test_v4f32_v8i16_helper(<8 x i16> %2) + %4 = fadd <4 x float> %3, %3 + store <4 x float> %4, <4 x float>* %q + ret void +} + +; CHECK-LABEL: test_v4f32_v16i8: +declare <4 x float> @test_v4f32_v16i8_helper(<16 x i8> %p) +define void @test_v4f32_v16i8(<16 x i8>* %p, <4 x float>* %q) { +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = load <16 x i8>* %p + %2 = add <16 x i8> %1, %1 + %3 = call <4 x float> @test_v4f32_v16i8_helper(<16 x i8> %2) + %4 = fadd <4 x float> %3, %3 + store <4 x float> %4, <4 x float>* %q + ret void +} + +; CHECK-LABEL: test_v4i32_f128: +declare <4 x i32> @test_v4i32_f128_helper(fp128 %p) +define void @test_v4i32_f128(fp128* %p, <4 x i32>* %q) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = load fp128* %p + %2 = fadd fp128 %1, %1 + %3 = call <4 x i32> @test_v4i32_f128_helper(fp128 %2) + %4 = add <4 x i32> %3, %3 + store <4 x i32> %4, <4 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v4i32_v2f64: +declare <4 x i32> @test_v4i32_v2f64_helper(<2 x double> %p) +define void @test_v4i32_v2f64(<2 x double>* %p, <4 x i32>* %q) { +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = load <2 x double>* %p + %2 = fadd <2 x double> %1, %1 + %3 = call <4 x i32> @test_v4i32_v2f64_helper(<2 x double> %2) + %4 = add <4 x i32> %3, %3 + store <4 x i32> %4, <4 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v4i32_v2i64: +declare <4 x i32> @test_v4i32_v2i64_helper(<2 x i64> %p) +define void @test_v4i32_v2i64(<2 x i64>* %p, <4 x 
i32>* %q) { +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = load <2 x i64>* %p + %2 = add <2 x i64> %1, %1 + %3 = call <4 x i32> @test_v4i32_v2i64_helper(<2 x i64> %2) + %4 = add <4 x i32> %3, %3 + store <4 x i32> %4, <4 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v4i32_v4f32: +declare <4 x i32> @test_v4i32_v4f32_helper(<4 x float> %p) +define void @test_v4i32_v4f32(<4 x float>* %p, <4 x i32>* %q) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = load <4 x float>* %p + %2 = fadd <4 x float> %1, %1 + %3 = call <4 x i32> @test_v4i32_v4f32_helper(<4 x float> %2) + %4 = add <4 x i32> %3, %3 + store <4 x i32> %4, <4 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v4i32_v8i16: +declare <4 x i32> @test_v4i32_v8i16_helper(<8 x i16> %p) +define void @test_v4i32_v8i16(<8 x i16>* %p, <4 x i32>* %q) { +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = load <8 x i16>* %p + %2 = add <8 x i16> %1, %1 + %3 = call <4 x i32> @test_v4i32_v8i16_helper(<8 x i16> %2) + %4 = add <4 x i32> %3, %3 + store <4 x i32> %4, <4 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v4i32_v16i8: +declare <4 x i32> @test_v4i32_v16i8_helper(<16 x i8> %p) +define void @test_v4i32_v16i8(<16 x i8>* %p, <4 x i32>* %q) { +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = load <16 x i8>* %p + %2 = add <16 x i8> %1, %1 + %3 = call <4 x i32> @test_v4i32_v16i8_helper(<16 x i8> %2) + %4 = add <4 x i32> %3, %3 + store <4 x i32> %4, <4 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v8i16_f128: +declare <8 x i16> @test_v8i16_f128_helper(fp128 %p) +define void @test_v8i16_f128(fp128* %p, <8 x i16>* %q) { +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext + %1 = load fp128* %p + %2 = fadd fp128 %1, %1 + %3 = call <8 x i16> @test_v8i16_f128_helper(fp128 %2) + %4 = add <8 x i16> %3, %3 + store <8 x i16> %4, <8 x i16>* %q + ret void +} + +; CHECK-LABEL: test_v8i16_v2f64: +declare <8 x i16> @test_v8i16_v2f64_helper(<2 x double> %p) +define void @test_v8i16_v2f64(<2 x double>* %p, <8 x i16>* %q) { +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext + %1 = load <2 x double>* %p + %2 = fadd <2 x double> %1, %1 + %3 = call <8 x i16> @test_v8i16_v2f64_helper(<2 x double> %2) + %4 = add <8 x i16> %3, %3 + store <8 x i16> %4, <8 x i16>* %q + ret void +} + +; CHECK-LABEL: test_v8i16_v2i64: +declare <8 x i16> @test_v8i16_v2i64_helper(<2 x i64> %p) +define void @test_v8i16_v2i64(<2 x i64>* %p, <8 x i16>* %q) { +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext + %1 = load <2 x i64>* %p + %2 = add <2 x i64> %1, %1 + %3 = call <8 x i16> @test_v8i16_v2i64_helper(<2 x i64> %2) + %4 = add <8 x i16> %3, %3 + store <8 x i16> %4, <8 x i16>* %q + ret void +} + +; CHECK-LABEL: test_v8i16_v4f32: +declare <8 x i16> @test_v8i16_v4f32_helper(<4 x float> %p) +define void @test_v8i16_v4f32(<4 x float>* %p, <8 x i16>* %q) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext + %1 = load <4 x float>* %p + %2 = fadd <4 x float> %1, %1 + %3 = call <8 x i16> @test_v8i16_v4f32_helper(<4 x float> %2) + %4 = add <8 x i16> %3, %3 + store <8 x i16> %4, <8 x i16>* %q + ret void +} + +; CHECK-LABEL: test_v8i16_v4i32: +declare <8 x i16> @test_v8i16_v4i32_helper(<4 x i32> %p) +define void @test_v8i16_v4i32(<4 x i32>* %p, <8 x i16>* %q) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext + %1 = load <4 x i32>* %p + %2 = add <4 x i32> %1, %1 
+ %3 = call <8 x i16> @test_v8i16_v4i32_helper(<4 x i32> %2) + %4 = add <8 x i16> %3, %3 + store <8 x i16> %4, <8 x i16>* %q + ret void +} + +; CHECK-LABEL: test_v8i16_v16i8: +declare <8 x i16> @test_v8i16_v16i8_helper(<16 x i8> %p) +define void @test_v8i16_v16i8(<16 x i8>* %p, <8 x i16>* %q) { +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext + %1 = load <16 x i8>* %p + %2 = add <16 x i8> %1, %1 + %3 = call <8 x i16> @test_v8i16_v16i8_helper(<16 x i8> %2) + %4 = add <8 x i16> %3, %3 + store <8 x i16> %4, <8 x i16>* %q + ret void +} + +; CHECK-LABEL: test_v16i8_f128: +declare <16 x i8> @test_v16i8_f128_helper(fp128 %p) +define void @test_v16i8_f128(fp128* %p, <16 x i8>* %q) { +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext + %1 = load fp128* %p + %2 = fadd fp128 %1, %1 + %3 = call <16 x i8> @test_v16i8_f128_helper(fp128 %2) + %4 = add <16 x i8> %3, %3 + store <16 x i8> %4, <16 x i8>* %q + ret void +} + +; CHECK-LABEL: test_v16i8_v2f64: +declare <16 x i8> @test_v16i8_v2f64_helper(<2 x double> %p) +define void @test_v16i8_v2f64(<2 x double>* %p, <16 x i8>* %q) { +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext + %1 = load <2 x double>* %p + %2 = fadd <2 x double> %1, %1 + %3 = call <16 x i8> @test_v16i8_v2f64_helper(<2 x double> %2) + %4 = add <16 x i8> %3, %3 + store <16 x i8> %4, <16 x i8>* %q + ret void +} + +; CHECK-LABEL: test_v16i8_v2i64: +declare <16 x i8> @test_v16i8_v2i64_helper(<2 x i64> %p) +define void @test_v16i8_v2i64(<2 x i64>* %p, <16 x i8>* %q) { +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext + %1 = load <2 x i64>* %p + %2 = add <2 x i64> %1, %1 + %3 = call <16 x i8> @test_v16i8_v2i64_helper(<2 x i64> %2) + %4 = add <16 x i8> %3, %3 + store <16 x i8> %4, <16 x i8>* %q + ret void +} + +; CHECK-LABEL: test_v16i8_v4f32: +declare <16 x i8> @test_v16i8_v4f32_helper(<4 x float> %p) +define void @test_v16i8_v4f32(<4 x float>* %p, <16 x i8>* %q) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext + %1 = load <4 x float>* %p + %2 = fadd <4 x float> %1, %1 + %3 = call <16 x i8> @test_v16i8_v4f32_helper(<4 x float> %2) + %4 = add <16 x i8> %3, %3 + store <16 x i8> %4, <16 x i8>* %q + ret void +} + +; CHECK-LABEL: test_v16i8_v4i32: +declare <16 x i8> @test_v16i8_v4i32_helper(<4 x i32> %p) +define void @test_v16i8_v4i32(<4 x i32>* %p, <16 x i8>* %q) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext + %1 = load <4 x i32>* %p + %2 = add <4 x i32> %1, %1 + %3 = call <16 x i8> @test_v16i8_v4i32_helper(<4 x i32> %2) + %4 = add <16 x i8> %3, %3 + store <16 x i8> %4, <16 x i8>* %q + ret void +} + +; CHECK-LABEL: test_v16i8_v8i16: +declare <16 x i8> @test_v16i8_v8i16_helper(<8 x i16> %p) +define void @test_v16i8_v8i16(<8 x i16>* %p, <16 x i8>* %q) { +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext + %1 = load <8 x i16>* %p + %2 = add <8 x i16> %1, %1 + %3 = call <16 x i8> @test_v16i8_v8i16_helper(<8 x i16> %2) + %4 = add <16 x i8> %3, %3 + store <16 x i8> %4, <16 x i8>* %q + ret void +} diff --git a/test/CodeGen/AArch64/arm64-big-imm-offsets.ll b/test/CodeGen/AArch64/arm64-big-imm-offsets.ll new file mode 100644 index 00000000000..a56df07a49a --- /dev/null +++ b/test/CodeGen/AArch64/arm64-big-imm-offsets.ll @@ -0,0 +1,14 @@ +; RUN: llc -march=arm64 < %s + + +; Make sure large offsets aren't mistaken for valid immediate offsets. 
+; +define void @f(i32* nocapture %p) { +entry: + %a = ptrtoint i32* %p to i64 + %ao = add i64 %a, 25769803792 + %b = inttoptr i64 %ao to i32* + store volatile i32 0, i32* %b, align 4 + store volatile i32 0, i32* %b, align 4 + ret void +} diff --git a/test/CodeGen/AArch64/arm64-big-stack.ll b/test/CodeGen/AArch64/arm64-big-stack.ll new file mode 100644 index 00000000000..3f91bb3c248 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-big-stack.ll @@ -0,0 +1,21 @@ +; RUN: llc < %s | FileCheck %s +target triple = "arm64-apple-macosx10" + +; Check that big stacks are generated correctly. +; Currently, this is done by a sequence of sub instructions, +; which can encode immediate with a 12 bits mask an optionally +; shift left (up to 12). I.e., 16773120 is the biggest value. +; +; CHECK-LABEL: foo: +; CHECK: sub sp, sp, #4095, lsl #12 +; CHECK: sub sp, sp, #4095, lsl #12 +; CHECK: sub sp, sp, #2, lsl #12 +define void @foo() nounwind ssp { +entry: + %buffer = alloca [33554432 x i8], align 1 + %arraydecay = getelementptr inbounds [33554432 x i8]* %buffer, i64 0, i64 0 + call void @doit(i8* %arraydecay) nounwind + ret void +} + +declare void @doit(i8*) diff --git a/test/CodeGen/AArch64/arm64-bitfield-extract.ll b/test/CodeGen/AArch64/arm64-bitfield-extract.ll new file mode 100644 index 00000000000..112efddd4fa --- /dev/null +++ b/test/CodeGen/AArch64/arm64-bitfield-extract.ll @@ -0,0 +1,532 @@ +; RUN: opt -codegenprepare -mtriple=arm64-apple=ios -S -o - %s | FileCheck --check-prefix=OPT %s +; RUN: llc < %s -march=arm64 | FileCheck %s +%struct.X = type { i8, i8, [2 x i8] } +%struct.Y = type { i32, i8 } +%struct.Z = type { i8, i8, [2 x i8], i16 } +%struct.A = type { i64, i8 } + +define void @foo(%struct.X* nocapture %x, %struct.Y* nocapture %y) nounwind optsize ssp { +; CHECK-LABEL: foo: +; CHECK: ubfx +; CHECK-NOT: and +; CHECK: ret + + %tmp = bitcast %struct.X* %x to i32* + %tmp1 = load i32* %tmp, align 4 + %b = getelementptr inbounds %struct.Y* %y, i64 0, i32 1 + %bf.clear = lshr i32 %tmp1, 3 + %bf.clear.lobit = and i32 %bf.clear, 1 + %frombool = trunc i32 %bf.clear.lobit to i8 + store i8 %frombool, i8* %b, align 1 + ret void +} + +define i32 @baz(i64 %cav1.coerce) nounwind { +; CHECK-LABEL: baz: +; CHECK: sbfx w0, w0, #0, #4 + %tmp = trunc i64 %cav1.coerce to i32 + %tmp1 = shl i32 %tmp, 28 + %bf.val.sext = ashr exact i32 %tmp1, 28 + ret i32 %bf.val.sext +} + +define i32 @bar(i64 %cav1.coerce) nounwind { +; CHECK-LABEL: bar: +; CHECK: sbfx w0, w0, #4, #6 + %tmp = trunc i64 %cav1.coerce to i32 + %cav1.sroa.0.1.insert = shl i32 %tmp, 22 + %tmp1 = ashr i32 %cav1.sroa.0.1.insert, 26 + ret i32 %tmp1 +} + +define void @fct1(%struct.Z* nocapture %x, %struct.A* nocapture %y) nounwind optsize ssp { +; CHECK-LABEL: fct1: +; CHECK: ubfx +; CHECK-NOT: and +; CHECK: ret + + %tmp = bitcast %struct.Z* %x to i64* + %tmp1 = load i64* %tmp, align 4 + %b = getelementptr inbounds %struct.A* %y, i64 0, i32 0 + %bf.clear = lshr i64 %tmp1, 3 + %bf.clear.lobit = and i64 %bf.clear, 1 + store i64 %bf.clear.lobit, i64* %b, align 8 + ret void +} + +define i64 @fct2(i64 %cav1.coerce) nounwind { +; CHECK-LABEL: fct2: +; CHECK: sbfx x0, x0, #0, #36 + %tmp = shl i64 %cav1.coerce, 28 + %bf.val.sext = ashr exact i64 %tmp, 28 + ret i64 %bf.val.sext +} + +define i64 @fct3(i64 %cav1.coerce) nounwind { +; CHECK-LABEL: fct3: +; CHECK: sbfx x0, x0, #4, #38 + %cav1.sroa.0.1.insert = shl i64 %cav1.coerce, 22 + %tmp1 = ashr i64 %cav1.sroa.0.1.insert, 26 + ret i64 %tmp1 +} + +define void @fct4(i64* nocapture %y, i64 %x) nounwind optsize 
inlinehint ssp { +entry: +; CHECK-LABEL: fct4: +; CHECK: ldr [[REG1:x[0-9]+]], +; CHECK-NEXT: bfxil [[REG1]], x1, #16, #24 +; CHECK-NEXT: str [[REG1]], +; CHECK-NEXT: ret + %0 = load i64* %y, align 8 + %and = and i64 %0, -16777216 + %shr = lshr i64 %x, 16 + %and1 = and i64 %shr, 16777215 + %or = or i64 %and, %and1 + store i64 %or, i64* %y, align 8 + ret void +} + +define void @fct5(i32* nocapture %y, i32 %x) nounwind optsize inlinehint ssp { +entry: +; CHECK-LABEL: fct5: +; CHECK: ldr [[REG1:w[0-9]+]], +; CHECK-NEXT: bfxil [[REG1]], w1, #16, #3 +; CHECK-NEXT: str [[REG1]], +; CHECK-NEXT: ret + %0 = load i32* %y, align 8 + %and = and i32 %0, -8 + %shr = lshr i32 %x, 16 + %and1 = and i32 %shr, 7 + %or = or i32 %and, %and1 + store i32 %or, i32* %y, align 8 + ret void +} + +; Check if we can still catch bfm instruction when we drop some low bits +define void @fct6(i32* nocapture %y, i32 %x) nounwind optsize inlinehint ssp { +entry: +; CHECK-LABEL: fct6: +; CHECK: ldr [[REG1:w[0-9]+]], +; CHECK-NEXT: bfxil [[REG1]], w1, #16, #3 +; lsr is an alias of ubfm +; CHECK-NEXT: lsr [[REG2:w[0-9]+]], [[REG1]], #2 +; CHECK-NEXT: str [[REG2]], +; CHECK-NEXT: ret + %0 = load i32* %y, align 8 + %and = and i32 %0, -8 + %shr = lshr i32 %x, 16 + %and1 = and i32 %shr, 7 + %or = or i32 %and, %and1 + %shr1 = lshr i32 %or, 2 + store i32 %shr1, i32* %y, align 8 + ret void +} + + +; Check if we can still catch bfm instruction when we drop some high bits +define void @fct7(i32* nocapture %y, i32 %x) nounwind optsize inlinehint ssp { +entry: +; CHECK-LABEL: fct7: +; CHECK: ldr [[REG1:w[0-9]+]], +; CHECK-NEXT: bfxil [[REG1]], w1, #16, #3 +; lsl is an alias of ubfm +; CHECK-NEXT: lsl [[REG2:w[0-9]+]], [[REG1]], #2 +; CHECK-NEXT: str [[REG2]], +; CHECK-NEXT: ret + %0 = load i32* %y, align 8 + %and = and i32 %0, -8 + %shr = lshr i32 %x, 16 + %and1 = and i32 %shr, 7 + %or = or i32 %and, %and1 + %shl = shl i32 %or, 2 + store i32 %shl, i32* %y, align 8 + ret void +} + + +; Check if we can still catch bfm instruction when we drop some low bits +; (i64 version) +define void @fct8(i64* nocapture %y, i64 %x) nounwind optsize inlinehint ssp { +entry: +; CHECK-LABEL: fct8: +; CHECK: ldr [[REG1:x[0-9]+]], +; CHECK-NEXT: bfxil [[REG1]], x1, #16, #3 +; lsr is an alias of ubfm +; CHECK-NEXT: lsr [[REG2:x[0-9]+]], [[REG1]], #2 +; CHECK-NEXT: str [[REG2]], +; CHECK-NEXT: ret + %0 = load i64* %y, align 8 + %and = and i64 %0, -8 + %shr = lshr i64 %x, 16 + %and1 = and i64 %shr, 7 + %or = or i64 %and, %and1 + %shr1 = lshr i64 %or, 2 + store i64 %shr1, i64* %y, align 8 + ret void +} + + +; Check if we can still catch bfm instruction when we drop some high bits +; (i64 version) +define void @fct9(i64* nocapture %y, i64 %x) nounwind optsize inlinehint ssp { +entry: +; CHECK-LABEL: fct9: +; CHECK: ldr [[REG1:x[0-9]+]], +; CHECK-NEXT: bfxil [[REG1]], x1, #16, #3 +; lsr is an alias of ubfm +; CHECK-NEXT: lsl [[REG2:x[0-9]+]], [[REG1]], #2 +; CHECK-NEXT: str [[REG2]], +; CHECK-NEXT: ret + %0 = load i64* %y, align 8 + %and = and i64 %0, -8 + %shr = lshr i64 %x, 16 + %and1 = and i64 %shr, 7 + %or = or i64 %and, %and1 + %shl = shl i64 %or, 2 + store i64 %shl, i64* %y, align 8 + ret void +} + +; Check if we can catch bfm instruction when lsb is 0 (i.e., no lshr) +; (i32 version) +define void @fct10(i32* nocapture %y, i32 %x) nounwind optsize inlinehint ssp { +entry: +; CHECK-LABEL: fct10: +; CHECK: ldr [[REG1:w[0-9]+]], +; CHECK-NEXT: bfxil [[REG1]], w1, #0, #3 +; lsl is an alias of ubfm +; CHECK-NEXT: lsl [[REG2:w[0-9]+]], [[REG1]], #2 +; CHECK-NEXT: 
str [[REG2]], +; CHECK-NEXT: ret + %0 = load i32* %y, align 8 + %and = and i32 %0, -8 + %and1 = and i32 %x, 7 + %or = or i32 %and, %and1 + %shl = shl i32 %or, 2 + store i32 %shl, i32* %y, align 8 + ret void +} + +; Check if we can catch bfm instruction when lsb is 0 (i.e., no lshr) +; (i64 version) +define void @fct11(i64* nocapture %y, i64 %x) nounwind optsize inlinehint ssp { +entry: +; CHECK-LABEL: fct11: +; CHECK: ldr [[REG1:x[0-9]+]], +; CHECK-NEXT: bfxil [[REG1]], x1, #0, #3 +; lsl is an alias of ubfm +; CHECK-NEXT: lsl [[REG2:x[0-9]+]], [[REG1]], #2 +; CHECK-NEXT: str [[REG2]], +; CHECK-NEXT: ret + %0 = load i64* %y, align 8 + %and = and i64 %0, -8 + %and1 = and i64 %x, 7 + %or = or i64 %and, %and1 + %shl = shl i64 %or, 2 + store i64 %shl, i64* %y, align 8 + ret void +} + +define zeroext i1 @fct12bis(i32 %tmp2) unnamed_addr nounwind ssp align 2 { +; CHECK-LABEL: fct12bis: +; CHECK-NOT: and +; CHECK: ubfx w0, w0, #11, #1 + %and.i.i = and i32 %tmp2, 2048 + %tobool.i.i = icmp ne i32 %and.i.i, 0 + ret i1 %tobool.i.i +} + +; Check if we can still catch bfm instruction when we drop some high bits +; and some low bits +define void @fct12(i32* nocapture %y, i32 %x) nounwind optsize inlinehint ssp { +entry: +; CHECK-LABEL: fct12: +; CHECK: ldr [[REG1:w[0-9]+]], +; CHECK-NEXT: bfxil [[REG1]], w1, #16, #3 +; lsr is an alias of ubfm +; CHECK-NEXT: ubfx [[REG2:w[0-9]+]], [[REG1]], #2, #28 +; CHECK-NEXT: str [[REG2]], +; CHECK-NEXT: ret + %0 = load i32* %y, align 8 + %and = and i32 %0, -8 + %shr = lshr i32 %x, 16 + %and1 = and i32 %shr, 7 + %or = or i32 %and, %and1 + %shl = shl i32 %or, 2 + %shr2 = lshr i32 %shl, 4 + store i32 %shr2, i32* %y, align 8 + ret void +} + +; Check if we can still catch bfm instruction when we drop some high bits +; and some low bits +; (i64 version) +define void @fct13(i64* nocapture %y, i64 %x) nounwind optsize inlinehint ssp { +entry: +; CHECK-LABEL: fct13: +; CHECK: ldr [[REG1:x[0-9]+]], +; CHECK-NEXT: bfxil [[REG1]], x1, #16, #3 +; lsr is an alias of ubfm +; CHECK-NEXT: ubfx [[REG2:x[0-9]+]], [[REG1]], #2, #60 +; CHECK-NEXT: str [[REG2]], +; CHECK-NEXT: ret + %0 = load i64* %y, align 8 + %and = and i64 %0, -8 + %shr = lshr i64 %x, 16 + %and1 = and i64 %shr, 7 + %or = or i64 %and, %and1 + %shl = shl i64 %or, 2 + %shr2 = lshr i64 %shl, 4 + store i64 %shr2, i64* %y, align 8 + ret void +} + + +; Check if we can still catch bfm instruction when we drop some high bits +; and some low bits +define void @fct14(i32* nocapture %y, i32 %x, i32 %x1) nounwind optsize inlinehint ssp { +entry: +; CHECK-LABEL: fct14: +; CHECK: ldr [[REG1:w[0-9]+]], +; CHECK-NEXT: bfxil [[REG1]], w1, #16, #8 +; lsr is an alias of ubfm +; CHECK-NEXT: lsr [[REG2:w[0-9]+]], [[REG1]], #4 +; CHECK-NEXT: bfxil [[REG2]], w2, #5, #3 +; lsl is an alias of ubfm +; CHECK-NEXT: lsl [[REG3:w[0-9]+]], [[REG2]], #2 +; CHECK-NEXT: str [[REG3]], +; CHECK-NEXT: ret + %0 = load i32* %y, align 8 + %and = and i32 %0, -256 + %shr = lshr i32 %x, 16 + %and1 = and i32 %shr, 255 + %or = or i32 %and, %and1 + %shl = lshr i32 %or, 4 + %and2 = and i32 %shl, -8 + %shr1 = lshr i32 %x1, 5 + %and3 = and i32 %shr1, 7 + %or1 = or i32 %and2, %and3 + %shl1 = shl i32 %or1, 2 + store i32 %shl1, i32* %y, align 8 + ret void +} + +; Check if we can still catch bfm instruction when we drop some high bits +; and some low bits +; (i64 version) +define void @fct15(i64* nocapture %y, i64 %x, i64 %x1) nounwind optsize inlinehint ssp { +entry: +; CHECK-LABEL: fct15: +; CHECK: ldr [[REG1:x[0-9]+]], +; CHECK-NEXT: bfxil [[REG1]], x1, #16, #8 +; lsr 
is an alias of ubfm +; CHECK-NEXT: lsr [[REG2:x[0-9]+]], [[REG1]], #4 +; CHECK-NEXT: bfxil [[REG2]], x2, #5, #3 +; lsl is an alias of ubfm +; CHECK-NEXT: lsl [[REG3:x[0-9]+]], [[REG2]], #2 +; CHECK-NEXT: str [[REG3]], +; CHECK-NEXT: ret + %0 = load i64* %y, align 8 + %and = and i64 %0, -256 + %shr = lshr i64 %x, 16 + %and1 = and i64 %shr, 255 + %or = or i64 %and, %and1 + %shl = lshr i64 %or, 4 + %and2 = and i64 %shl, -8 + %shr1 = lshr i64 %x1, 5 + %and3 = and i64 %shr1, 7 + %or1 = or i64 %and2, %and3 + %shl1 = shl i64 %or1, 2 + store i64 %shl1, i64* %y, align 8 + ret void +} + +; Check if we can still catch bfm instruction when we drop some high bits +; and some low bits and a masking operation has to be kept +define void @fct16(i32* nocapture %y, i32 %x) nounwind optsize inlinehint ssp { +entry: +; CHECK-LABEL: fct16: +; CHECK: ldr [[REG1:w[0-9]+]], +; Create the constant +; CHECK: movz [[REGCST:w[0-9]+]], #0x1a, lsl #16 +; CHECK: movk [[REGCST]], #0x8160 +; Do the masking +; CHECK: and [[REG2:w[0-9]+]], [[REG1]], [[REGCST]] +; CHECK-NEXT: bfxil [[REG2]], w1, #16, #3 +; lsr is an alias of ubfm +; CHECK-NEXT: ubfx [[REG3:w[0-9]+]], [[REG2]], #2, #28 +; CHECK-NEXT: str [[REG3]], +; CHECK-NEXT: ret + %0 = load i32* %y, align 8 + %and = and i32 %0, 1737056 + %shr = lshr i32 %x, 16 + %and1 = and i32 %shr, 7 + %or = or i32 %and, %and1 + %shl = shl i32 %or, 2 + %shr2 = lshr i32 %shl, 4 + store i32 %shr2, i32* %y, align 8 + ret void +} + + +; Check if we can still catch bfm instruction when we drop some high bits +; and some low bits and a masking operation has to be kept +; (i64 version) +define void @fct17(i64* nocapture %y, i64 %x) nounwind optsize inlinehint ssp { +entry: +; CHECK-LABEL: fct17: +; CHECK: ldr [[REG1:x[0-9]+]], +; Create the constant +; CHECK: movz w[[REGCST:[0-9]+]], #0x1a, lsl #16 +; CHECK: movk w[[REGCST]], #0x8160 +; Do the masking +; CHECK: and [[REG2:x[0-9]+]], [[REG1]], x[[REGCST]] +; CHECK-NEXT: bfxil [[REG2]], x1, #16, #3 +; lsr is an alias of ubfm +; CHECK-NEXT: ubfx [[REG3:x[0-9]+]], [[REG2]], #2, #60 +; CHECK-NEXT: str [[REG3]], +; CHECK-NEXT: ret + %0 = load i64* %y, align 8 + %and = and i64 %0, 1737056 + %shr = lshr i64 %x, 16 + %and1 = and i64 %shr, 7 + %or = or i64 %and, %and1 + %shl = shl i64 %or, 2 + %shr2 = lshr i64 %shl, 4 + store i64 %shr2, i64* %y, align 8 + ret void +} + +define i64 @fct18(i32 %xor72) nounwind ssp { +; CHECK-LABEL: fct18: +; CHECK: ubfx x0, x0, #9, #8 + %shr81 = lshr i32 %xor72, 9 + %conv82 = zext i32 %shr81 to i64 + %result = and i64 %conv82, 255 + ret i64 %result +} + +; Using the access to the global array to keep the instruction and control flow. 
+@first_ones = external global [65536 x i8] + +; Function Attrs: nounwind readonly ssp +define i32 @fct19(i64 %arg1) nounwind readonly ssp { +; CHECK-LABEL: fct19: +entry: + %x.sroa.1.0.extract.shift = lshr i64 %arg1, 16 + %x.sroa.1.0.extract.trunc = trunc i64 %x.sroa.1.0.extract.shift to i16 + %x.sroa.3.0.extract.shift = lshr i64 %arg1, 32 + %x.sroa.5.0.extract.shift = lshr i64 %arg1, 48 + %tobool = icmp eq i64 %x.sroa.5.0.extract.shift, 0 + br i1 %tobool, label %if.end, label %if.then + +if.then: ; preds = %entry + %arrayidx3 = getelementptr inbounds [65536 x i8]* @first_ones, i64 0, i64 %x.sroa.5.0.extract.shift + %0 = load i8* %arrayidx3, align 1 + %conv = zext i8 %0 to i32 + br label %return + +; OPT-LABEL: if.end +if.end: ; preds = %entry +; OPT: lshr +; CHECK: ubfx [[REG1:x[0-9]+]], [[REG2:x[0-9]+]], #32, #16 + %x.sroa.3.0.extract.trunc = trunc i64 %x.sroa.3.0.extract.shift to i16 + %tobool6 = icmp eq i16 %x.sroa.3.0.extract.trunc, 0 +; CHECK: cbz + br i1 %tobool6, label %if.end13, label %if.then7 + +; OPT-LABEL: if.then7 +if.then7: ; preds = %if.end +; OPT: lshr +; "and" should be combined to "ubfm" while "ubfm" should be removed by cse. +; So neither of them should be in the assemble code. +; CHECK-NOT: and +; CHECK-NOT: ubfm + %idxprom10 = and i64 %x.sroa.3.0.extract.shift, 65535 + %arrayidx11 = getelementptr inbounds [65536 x i8]* @first_ones, i64 0, i64 %idxprom10 + %1 = load i8* %arrayidx11, align 1 + %conv12 = zext i8 %1 to i32 + %add = add nsw i32 %conv12, 16 + br label %return + +; OPT-LABEL: if.end13 +if.end13: ; preds = %if.end +; OPT: lshr +; OPT: trunc +; CHECK: ubfx [[REG3:x[0-9]+]], [[REG4:x[0-9]+]], #16, #16 + %tobool16 = icmp eq i16 %x.sroa.1.0.extract.trunc, 0 +; CHECK: cbz + br i1 %tobool16, label %return, label %if.then17 + +; OPT-LABEL: if.then17 +if.then17: ; preds = %if.end13 +; OPT: lshr +; "and" should be combined to "ubfm" while "ubfm" should be removed by cse. +; So neither of them should be in the assemble code. +; CHECK-NOT: and +; CHECK-NOT: ubfm + %idxprom20 = and i64 %x.sroa.1.0.extract.shift, 65535 + %arrayidx21 = getelementptr inbounds [65536 x i8]* @first_ones, i64 0, i64 %idxprom20 + %2 = load i8* %arrayidx21, align 1 + %conv22 = zext i8 %2 to i32 + %add23 = add nsw i32 %conv22, 32 + br label %return + +return: ; preds = %if.end13, %if.then17, %if.then7, %if.then +; CHECK: ret + %retval.0 = phi i32 [ %conv, %if.then ], [ %add, %if.then7 ], [ %add23, %if.then17 ], [ 64, %if.end13 ] + ret i32 %retval.0 +} + +; Make sure we do not assert if the immediate in and is bigger than i64. +; PR19503. +; OPT-LABEL: @fct20 +; OPT: lshr +; OPT-NOT: lshr +; OPT: ret +; CHECK-LABEL: fct20: +; CHECK: ret +define i80 @fct20(i128 %a, i128 %b) { +entry: + %shr = lshr i128 %a, 18 + %conv = trunc i128 %shr to i80 + %tobool = icmp eq i128 %b, 0 + br i1 %tobool, label %then, label %end +then: + %and = and i128 %shr, 483673642326615442599424 + %conv2 = trunc i128 %and to i80 + br label %end +end: + %conv3 = phi i80 [%conv, %entry], [%conv2, %then] + ret i80 %conv3 +} + +; Check if we can still catch UBFX when "AND" is used by SHL. 
+; CHECK-LABEL: fct21: +; CHECK: ubfx +@arr = external global [8 x [64 x i64]] +define i64 @fct21(i64 %x) { +entry: + %shr = lshr i64 %x, 4 + %and = and i64 %shr, 15 + %arrayidx = getelementptr inbounds [8 x [64 x i64]]* @arr, i64 0, i64 0, i64 %and + %0 = load i64* %arrayidx, align 8 + ret i64 %0 +} + +define i16 @test_ignored_rightbits(i32 %dst, i32 %in) { +; CHECK-LABEL: test_ignored_rightbits: + + %positioned_field = shl i32 %in, 3 + %positioned_masked_field = and i32 %positioned_field, 120 + %masked_dst = and i32 %dst, 7 + %insertion = or i32 %masked_dst, %positioned_masked_field +; CHECK: {{bfm|bfi|bfxil}} + + %shl16 = shl i32 %insertion, 8 + %or18 = or i32 %shl16, %insertion + %conv19 = trunc i32 %or18 to i16 +; CHECK: bfi {{w[0-9]+}}, {{w[0-9]+}}, #8, #7 + + ret i16 %conv19 +} diff --git a/test/CodeGen/AArch64/arm64-blockaddress.ll b/test/CodeGen/AArch64/arm64-blockaddress.ll new file mode 100644 index 00000000000..ac4f19e65df --- /dev/null +++ b/test/CodeGen/AArch64/arm64-blockaddress.ll @@ -0,0 +1,30 @@ +; RUN: llc < %s -mtriple=arm64-apple-ios | FileCheck %s +; RUN: llc < %s -mtriple=arm64-linux-gnu | FileCheck %s --check-prefix=CHECK-LINUX +; RUN: llc < %s -mtriple=arm64-linux-gnu -code-model=large| FileCheck %s --check-prefix=CHECK-LARGE + +; rdar://9188695 + +define i64 @t() nounwind ssp { +entry: +; CHECK-LABEL: t: +; CHECK: adrp [[REG:x[0-9]+]], Ltmp1@PAGE +; CHECK: add {{x[0-9]+}}, [[REG]], Ltmp1@PAGEOFF + +; CHECK-LINUX-LABEL: t: +; CHECK-LINUX: adrp [[REG:x[0-9]+]], .Ltmp1 +; CHECK-LINUX: add {{x[0-9]+}}, [[REG]], :lo12:.Ltmp1 + +; CHECK-LARGE-LABEL: t: +; CHECK-LARGE: movz [[ADDR_REG:x[0-9]+]], #:abs_g3:[[DEST_LBL:.Ltmp[0-9]+]] +; CHECK-LARGE: movk [[ADDR_REG]], #:abs_g2_nc:[[DEST_LBL]] +; CHECK-LARGE: movk [[ADDR_REG]], #:abs_g1_nc:[[DEST_LBL]] +; CHECK-LARGE: movk [[ADDR_REG]], #:abs_g0_nc:[[DEST_LBL]] + + %recover = alloca i64, align 8 + store volatile i64 ptrtoint (i8* blockaddress(@t, %mylabel) to i64), i64* %recover, align 8 + br label %mylabel + +mylabel: + %tmp = load volatile i64* %recover, align 8 + ret i64 %tmp +} diff --git a/test/CodeGen/AArch64/arm64-build-vector.ll b/test/CodeGen/AArch64/arm64-build-vector.ll new file mode 100644 index 00000000000..c109263cedb --- /dev/null +++ b/test/CodeGen/AArch64/arm64-build-vector.ll @@ -0,0 +1,35 @@ +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s + +; Check that building up a vector w/ only one non-zero lane initializes +; intelligently. +define void @one_lane(i32* nocapture %out_int, i32 %skip0) nounwind { +; CHECK-LABEL: one_lane: +; CHECK: dup.16b v[[REG:[0-9]+]], wzr +; CHECK-NEXT: ins.b v[[REG]][0], w1 +; v and q are aliases, and str is preferred against st.16b when possible +; rdar://11246289 +; CHECK: str q[[REG]], [x0] +; CHECK: ret + %conv = trunc i32 %skip0 to i8 + %vset_lane = insertelement <16 x i8> , i8 %conv, i32 0 + %tmp = bitcast i32* %out_int to <4 x i32>* + %tmp1 = bitcast <16 x i8> %vset_lane to <4 x i32> + store <4 x i32> %tmp1, <4 x i32>* %tmp, align 16 + ret void +} + +; Check that building a vector from floats doesn't insert an unnecessary +; copy for lane zero. 
+define <4 x float> @foo(float %a, float %b, float %c, float %d) nounwind { +; CHECK-LABEL: foo: +; CHECK-NOT: ins.s v0[0], v0[0] +; CHECK: ins.s v0[1], v1[0] +; CHECK: ins.s v0[2], v2[0] +; CHECK: ins.s v0[3], v3[0] +; CHECK: ret + %1 = insertelement <4 x float> undef, float %a, i32 0 + %2 = insertelement <4 x float> %1, float %b, i32 1 + %3 = insertelement <4 x float> %2, float %c, i32 2 + %4 = insertelement <4 x float> %3, float %d, i32 3 + ret <4 x float> %4 +} diff --git a/test/CodeGen/AArch64/arm64-call-tailcalls.ll b/test/CodeGen/AArch64/arm64-call-tailcalls.ll new file mode 100644 index 00000000000..487c1d9bec3 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-call-tailcalls.ll @@ -0,0 +1,91 @@ +; RUN: llc < %s -mtriple=arm64-apple-ios7.0 | FileCheck %s + +@t = weak global i32 ()* null +@x = external global i32, align 4 + +define void @t2() { +; CHECK-LABEL: t2: +; CHECK: adrp x[[GOTADDR:[0-9]+]], _t@GOTPAGE +; CHECK: ldr x[[ADDR:[0-9]+]], [x[[GOTADDR]], _t@GOTPAGEOFF] +; CHECK: ldr x[[DEST:[0-9]+]], [x[[ADDR]]] +; CHECK: br x[[DEST]] + %tmp = load i32 ()** @t + %tmp.upgrd.2 = tail call i32 %tmp() + ret void +} + +define void @t3() { +; CHECK-LABEL: t3: +; CHECK: b _t2 + tail call void @t2() + ret void +} + +define double @t4(double %a) nounwind readonly ssp { +; CHECK-LABEL: t4: +; CHECK: b _sin + %tmp = tail call double @sin(double %a) nounwind readonly + ret double %tmp +} + +define float @t5(float %a) nounwind readonly ssp { +; CHECK-LABEL: t5: +; CHECK: b _sinf + %tmp = tail call float @sinf(float %a) nounwind readonly + ret float %tmp +} + +define void @t7() nounwind { +; CHECK-LABEL: t7: +; CHECK: b _foo +; CHECK: b _bar + + br i1 undef, label %bb, label %bb1.lr.ph + +bb1.lr.ph: ; preds = %entry + tail call void @bar() nounwind + ret void + +bb: ; preds = %entry + tail call void @foo() nounwind + ret void +} + +define i32 @t8(i32 %x) nounwind ssp { +; CHECK-LABEL: t8: +; CHECK: b _a +; CHECK: b _b +; CHECK: b _c + %and = and i32 %x, 1 + %tobool = icmp eq i32 %and, 0 + br i1 %tobool, label %if.end, label %if.then + +if.then: ; preds = %entry + %call = tail call i32 @a(i32 %x) nounwind + br label %return + +if.end: ; preds = %entry + %and1 = and i32 %x, 2 + %tobool2 = icmp eq i32 %and1, 0 + br i1 %tobool2, label %if.end5, label %if.then3 + +if.then3: ; preds = %if.end + %call4 = tail call i32 @b(i32 %x) nounwind + br label %return + +if.end5: ; preds = %if.end + %call6 = tail call i32 @c(i32 %x) nounwind + br label %return + +return: ; preds = %if.end5, %if.then3, %if.then + %retval.0 = phi i32 [ %call, %if.then ], [ %call4, %if.then3 ], [ %call6, %if.end5 ] + ret i32 %retval.0 +} + +declare float @sinf(float) nounwind readonly +declare double @sin(double) nounwind readonly +declare void @bar() nounwind +declare void @foo() nounwind +declare i32 @a(i32) +declare i32 @b(i32) +declare i32 @c(i32) diff --git a/test/CodeGen/AArch64/arm64-cast-opt.ll b/test/CodeGen/AArch64/arm64-cast-opt.ll new file mode 100644 index 00000000000..65a871d4368 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-cast-opt.ll @@ -0,0 +1,31 @@ +; RUN: llc -O3 -march=arm64 -mtriple arm64-apple-ios5.0.0 < %s | FileCheck %s +; +; Zero truncation is not necessary when the values are extended properly +; already. 
+ +@block = common global i8* null, align 8 + +define zeroext i8 @foo(i32 %i1, i32 %i2) { +; CHECK-LABEL: foo: +; CHECK: cset +; CHECK-NOT: and +entry: + %idxprom = sext i32 %i1 to i64 + %0 = load i8** @block, align 8 + %arrayidx = getelementptr inbounds i8* %0, i64 %idxprom + %1 = load i8* %arrayidx, align 1 + %idxprom1 = sext i32 %i2 to i64 + %arrayidx2 = getelementptr inbounds i8* %0, i64 %idxprom1 + %2 = load i8* %arrayidx2, align 1 + %cmp = icmp eq i8 %1, %2 + br i1 %cmp, label %return, label %if.then + +if.then: ; preds = %entry + %cmp7 = icmp ugt i8 %1, %2 + %conv9 = zext i1 %cmp7 to i8 + br label %return + +return: ; preds = %entry, %if.then + %retval.0 = phi i8 [ %conv9, %if.then ], [ 1, %entry ] + ret i8 %retval.0 +} diff --git a/test/CodeGen/AArch64/arm64-ccmp-heuristics.ll b/test/CodeGen/AArch64/arm64-ccmp-heuristics.ll new file mode 100644 index 00000000000..664a26cafe4 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-ccmp-heuristics.ll @@ -0,0 +1,190 @@ +; RUN: llc < %s -mcpu=cyclone -verify-machineinstrs -aarch64-ccmp | FileCheck %s +target triple = "arm64-apple-ios7.0.0" + +@channelColumns = external global i64 +@channelTracks = external global i64 +@mazeRoute = external hidden unnamed_addr global i8*, align 8 +@TOP = external global i64* +@BOT = external global i64* +@netsAssign = external global i64* + +; Function from yacr2/maze.c +; The branch at the end of %if.then is driven by %cmp5 and %cmp6. +; Isel converts the and i1 into two branches, and arm64-ccmp should not convert +; it back again. %cmp6 has much higher latency than %cmp5. +; CHECK: Maze1 +; CHECK: %if.then +; CHECK: cmp x{{[0-9]+}}, #2 +; CHECK-NEXT b.cc +; CHECK: %if.then +; CHECK: cmp x{{[0-9]+}}, #2 +; CHECK-NEXT b.cc +define i32 @Maze1() nounwind ssp { +entry: + %0 = load i64* @channelColumns, align 8, !tbaa !0 + %cmp90 = icmp eq i64 %0, 0 + br i1 %cmp90, label %for.end, label %for.body + +for.body: ; preds = %for.inc, %entry + %1 = phi i64 [ %0, %entry ], [ %37, %for.inc ] + %i.092 = phi i64 [ 1, %entry ], [ %inc53, %for.inc ] + %numLeft.091 = phi i32 [ 0, %entry ], [ %numLeft.1, %for.inc ] + %2 = load i8** @mazeRoute, align 8, !tbaa !3 + %arrayidx = getelementptr inbounds i8* %2, i64 %i.092 + %3 = load i8* %arrayidx, align 1, !tbaa !1 + %tobool = icmp eq i8 %3, 0 + br i1 %tobool, label %for.inc, label %if.then + +if.then: ; preds = %for.body + %4 = load i64** @TOP, align 8, !tbaa !3 + %arrayidx1 = getelementptr inbounds i64* %4, i64 %i.092 + %5 = load i64* %arrayidx1, align 8, !tbaa !0 + %6 = load i64** @netsAssign, align 8, !tbaa !3 + %arrayidx2 = getelementptr inbounds i64* %6, i64 %5 + %7 = load i64* %arrayidx2, align 8, !tbaa !0 + %8 = load i64** @BOT, align 8, !tbaa !3 + %arrayidx3 = getelementptr inbounds i64* %8, i64 %i.092 + %9 = load i64* %arrayidx3, align 8, !tbaa !0 + %arrayidx4 = getelementptr inbounds i64* %6, i64 %9 + %10 = load i64* %arrayidx4, align 8, !tbaa !0 + %cmp5 = icmp ugt i64 %i.092, 1 + %cmp6 = icmp ugt i64 %10, 1 + %or.cond = and i1 %cmp5, %cmp6 + br i1 %or.cond, label %land.lhs.true7, label %if.else + +land.lhs.true7: ; preds = %if.then + %11 = load i64* @channelTracks, align 8, !tbaa !0 + %add = add i64 %11, 1 + %call = tail call fastcc i32 @Maze1Mech(i64 %i.092, i64 %add, i64 %10, i64 0, i64 %7, i32 -1, i32 -1) + %tobool8 = icmp eq i32 %call, 0 + br i1 %tobool8, label %land.lhs.true7.if.else_crit_edge, label %if.then9 + +land.lhs.true7.if.else_crit_edge: ; preds = %land.lhs.true7 + %.pre = load i64* @channelColumns, align 8, !tbaa !0 + br label %if.else + +if.then9: ; preds = 
%land.lhs.true7 + %12 = load i8** @mazeRoute, align 8, !tbaa !3 + %arrayidx10 = getelementptr inbounds i8* %12, i64 %i.092 + store i8 0, i8* %arrayidx10, align 1, !tbaa !1 + %13 = load i64** @TOP, align 8, !tbaa !3 + %arrayidx11 = getelementptr inbounds i64* %13, i64 %i.092 + %14 = load i64* %arrayidx11, align 8, !tbaa !0 + tail call fastcc void @CleanNet(i64 %14) + %15 = load i64** @BOT, align 8, !tbaa !3 + %arrayidx12 = getelementptr inbounds i64* %15, i64 %i.092 + %16 = load i64* %arrayidx12, align 8, !tbaa !0 + tail call fastcc void @CleanNet(i64 %16) + br label %for.inc + +if.else: ; preds = %land.lhs.true7.if.else_crit_edge, %if.then + %17 = phi i64 [ %.pre, %land.lhs.true7.if.else_crit_edge ], [ %1, %if.then ] + %cmp13 = icmp ult i64 %i.092, %17 + %or.cond89 = and i1 %cmp13, %cmp6 + br i1 %or.cond89, label %land.lhs.true16, label %if.else24 + +land.lhs.true16: ; preds = %if.else + %18 = load i64* @channelTracks, align 8, !tbaa !0 + %add17 = add i64 %18, 1 + %call18 = tail call fastcc i32 @Maze1Mech(i64 %i.092, i64 %add17, i64 %10, i64 0, i64 %7, i32 1, i32 -1) + %tobool19 = icmp eq i32 %call18, 0 + br i1 %tobool19, label %if.else24, label %if.then20 + +if.then20: ; preds = %land.lhs.true16 + %19 = load i8** @mazeRoute, align 8, !tbaa !3 + %arrayidx21 = getelementptr inbounds i8* %19, i64 %i.092 + store i8 0, i8* %arrayidx21, align 1, !tbaa !1 + %20 = load i64** @TOP, align 8, !tbaa !3 + %arrayidx22 = getelementptr inbounds i64* %20, i64 %i.092 + %21 = load i64* %arrayidx22, align 8, !tbaa !0 + tail call fastcc void @CleanNet(i64 %21) + %22 = load i64** @BOT, align 8, !tbaa !3 + %arrayidx23 = getelementptr inbounds i64* %22, i64 %i.092 + %23 = load i64* %arrayidx23, align 8, !tbaa !0 + tail call fastcc void @CleanNet(i64 %23) + br label %for.inc + +if.else24: ; preds = %land.lhs.true16, %if.else + br i1 %cmp5, label %land.lhs.true26, label %if.else36 + +land.lhs.true26: ; preds = %if.else24 + %24 = load i64* @channelTracks, align 8, !tbaa !0 + %cmp27 = icmp ult i64 %7, %24 + br i1 %cmp27, label %land.lhs.true28, label %if.else36 + +land.lhs.true28: ; preds = %land.lhs.true26 + %add29 = add i64 %24, 1 + %call30 = tail call fastcc i32 @Maze1Mech(i64 %i.092, i64 0, i64 %7, i64 %add29, i64 %10, i32 -1, i32 1) + %tobool31 = icmp eq i32 %call30, 0 + br i1 %tobool31, label %if.else36, label %if.then32 + +if.then32: ; preds = %land.lhs.true28 + %25 = load i8** @mazeRoute, align 8, !tbaa !3 + %arrayidx33 = getelementptr inbounds i8* %25, i64 %i.092 + store i8 0, i8* %arrayidx33, align 1, !tbaa !1 + %26 = load i64** @TOP, align 8, !tbaa !3 + %arrayidx34 = getelementptr inbounds i64* %26, i64 %i.092 + %27 = load i64* %arrayidx34, align 8, !tbaa !0 + tail call fastcc void @CleanNet(i64 %27) + %28 = load i64** @BOT, align 8, !tbaa !3 + %arrayidx35 = getelementptr inbounds i64* %28, i64 %i.092 + %29 = load i64* %arrayidx35, align 8, !tbaa !0 + tail call fastcc void @CleanNet(i64 %29) + br label %for.inc + +if.else36: ; preds = %land.lhs.true28, %land.lhs.true26, %if.else24 + %30 = load i64* @channelColumns, align 8, !tbaa !0 + %cmp37 = icmp ult i64 %i.092, %30 + br i1 %cmp37, label %land.lhs.true38, label %if.else48 + +land.lhs.true38: ; preds = %if.else36 + %31 = load i64* @channelTracks, align 8, !tbaa !0 + %cmp39 = icmp ult i64 %7, %31 + br i1 %cmp39, label %land.lhs.true40, label %if.else48 + +land.lhs.true40: ; preds = %land.lhs.true38 + %add41 = add i64 %31, 1 + %call42 = tail call fastcc i32 @Maze1Mech(i64 %i.092, i64 0, i64 %7, i64 %add41, i64 %10, i32 1, i32 1) + %tobool43 = icmp eq i32 
%call42, 0 + br i1 %tobool43, label %if.else48, label %if.then44 + +if.then44: ; preds = %land.lhs.true40 + %32 = load i8** @mazeRoute, align 8, !tbaa !3 + %arrayidx45 = getelementptr inbounds i8* %32, i64 %i.092 + store i8 0, i8* %arrayidx45, align 1, !tbaa !1 + %33 = load i64** @TOP, align 8, !tbaa !3 + %arrayidx46 = getelementptr inbounds i64* %33, i64 %i.092 + %34 = load i64* %arrayidx46, align 8, !tbaa !0 + tail call fastcc void @CleanNet(i64 %34) + %35 = load i64** @BOT, align 8, !tbaa !3 + %arrayidx47 = getelementptr inbounds i64* %35, i64 %i.092 + %36 = load i64* %arrayidx47, align 8, !tbaa !0 + tail call fastcc void @CleanNet(i64 %36) + br label %for.inc + +if.else48: ; preds = %land.lhs.true40, %land.lhs.true38, %if.else36 + %inc = add nsw i32 %numLeft.091, 1 + br label %for.inc + +for.inc: ; preds = %if.else48, %if.then44, %if.then32, %if.then20, %if.then9, %for.body + %numLeft.1 = phi i32 [ %numLeft.091, %if.then9 ], [ %numLeft.091, %if.then20 ], [ %numLeft.091, %if.then32 ], [ %numLeft.091, %if.then44 ], [ %inc, %if.else48 ], [ %numLeft.091, %for.body ] + %inc53 = add i64 %i.092, 1 + %37 = load i64* @channelColumns, align 8, !tbaa !0 + %cmp = icmp ugt i64 %inc53, %37 + br i1 %cmp, label %for.end, label %for.body + +for.end: ; preds = %for.inc, %entry + %numLeft.0.lcssa = phi i32 [ 0, %entry ], [ %numLeft.1, %for.inc ] + ret i32 %numLeft.0.lcssa +} + +; Materializable +declare hidden fastcc i32 @Maze1Mech(i64, i64, i64, i64, i64, i32, i32) nounwind ssp + +; Materializable +declare hidden fastcc void @CleanNet(i64) nounwind ssp + +!0 = metadata !{metadata !"long", metadata !1} +!1 = metadata !{metadata !"omnipotent char", metadata !2} +!2 = metadata !{metadata !"Simple C/C++ TBAA"} +!3 = metadata !{metadata !"any pointer", metadata !1} diff --git a/test/CodeGen/AArch64/arm64-ccmp.ll b/test/CodeGen/AArch64/arm64-ccmp.ll new file mode 100644 index 00000000000..63965f9538b --- /dev/null +++ b/test/CodeGen/AArch64/arm64-ccmp.ll @@ -0,0 +1,289 @@ +; RUN: llc < %s -mcpu=cyclone -verify-machineinstrs -aarch64-ccmp -aarch64-stress-ccmp | FileCheck %s +target triple = "arm64-apple-ios" + +; CHECK: single_same +; CHECK: cmp w0, #5 +; CHECK-NEXT: ccmp w1, #17, #4, ne +; CHECK-NEXT: b.ne +; CHECK: %if.then +; CHECK: bl _foo +; CHECK: %if.end +define i32 @single_same(i32 %a, i32 %b) nounwind ssp { +entry: + %cmp = icmp eq i32 %a, 5 + %cmp1 = icmp eq i32 %b, 17 + %or.cond = or i1 %cmp, %cmp1 + br i1 %or.cond, label %if.then, label %if.end + +if.then: + %call = tail call i32 @foo() nounwind + br label %if.end + +if.end: + ret i32 7 +} + +; Different condition codes for the two compares. +; CHECK: single_different +; CHECK: cmp w0, #6 +; CHECK-NEXT: ccmp w1, #17, #0, ge +; CHECK-NEXT: b.eq +; CHECK: %if.then +; CHECK: bl _foo +; CHECK: %if.end +define i32 @single_different(i32 %a, i32 %b) nounwind ssp { +entry: + %cmp = icmp sle i32 %a, 5 + %cmp1 = icmp ne i32 %b, 17 + %or.cond = or i1 %cmp, %cmp1 + br i1 %or.cond, label %if.then, label %if.end + +if.then: + %call = tail call i32 @foo() nounwind + br label %if.end + +if.end: + ret i32 7 +} + +; Second block clobbers the flags, can't convert (easily). 
+; CHECK: single_flagclobber +; CHECK: cmp +; CHECK: b.eq +; CHECK: cmp +; CHECK: b.gt +define i32 @single_flagclobber(i32 %a, i32 %b) nounwind ssp { +entry: + %cmp = icmp eq i32 %a, 5 + br i1 %cmp, label %if.then, label %lor.lhs.false + +lor.lhs.false: ; preds = %entry + %cmp1 = icmp slt i32 %b, 7 + %mul = shl nsw i32 %b, 1 + %add = add nsw i32 %b, 1 + %cond = select i1 %cmp1, i32 %mul, i32 %add + %cmp2 = icmp slt i32 %cond, 17 + br i1 %cmp2, label %if.then, label %if.end + +if.then: ; preds = %lor.lhs.false, %entry + %call = tail call i32 @foo() nounwind + br label %if.end + +if.end: ; preds = %if.then, %lor.lhs.false + ret i32 7 +} + +; Second block clobbers the flags and ends with a tbz terminator. +; CHECK: single_flagclobber_tbz +; CHECK: cmp +; CHECK: b.eq +; CHECK: cmp +; CHECK: tbz +define i32 @single_flagclobber_tbz(i32 %a, i32 %b) nounwind ssp { +entry: + %cmp = icmp eq i32 %a, 5 + br i1 %cmp, label %if.then, label %lor.lhs.false + +lor.lhs.false: ; preds = %entry + %cmp1 = icmp slt i32 %b, 7 + %mul = shl nsw i32 %b, 1 + %add = add nsw i32 %b, 1 + %cond = select i1 %cmp1, i32 %mul, i32 %add + %and = and i32 %cond, 8 + %cmp2 = icmp ne i32 %and, 0 + br i1 %cmp2, label %if.then, label %if.end + +if.then: ; preds = %lor.lhs.false, %entry + %call = tail call i32 @foo() nounwind + br label %if.end + +if.end: ; preds = %if.then, %lor.lhs.false + ret i32 7 +} + +; Speculatively execute division by zero. +; The sdiv/udiv instructions do not trap when the divisor is zero, so they are +; safe to speculate. +; CHECK: speculate_division +; CHECK-NOT: cmp +; CHECK: sdiv +; CHECK: cmp +; CHECK-NEXT: ccmp +define i32 @speculate_division(i32 %a, i32 %b) nounwind ssp { +entry: + %cmp = icmp sgt i32 %a, 0 + br i1 %cmp, label %land.lhs.true, label %if.end + +land.lhs.true: + %div = sdiv i32 %b, %a + %cmp1 = icmp slt i32 %div, 17 + br i1 %cmp1, label %if.then, label %if.end + +if.then: + %call = tail call i32 @foo() nounwind + br label %if.end + +if.end: + ret i32 7 +} + +; Floating point compare. +; CHECK: single_fcmp +; CHECK: cmp +; CHECK-NOT: b. +; CHECK: fccmp {{.*}}, #8, ge +; CHECK: b.lt +define i32 @single_fcmp(i32 %a, float %b) nounwind ssp { +entry: + %cmp = icmp sgt i32 %a, 0 + br i1 %cmp, label %land.lhs.true, label %if.end + +land.lhs.true: + %conv = sitofp i32 %a to float + %div = fdiv float %b, %conv + %cmp1 = fcmp oge float %div, 1.700000e+01 + br i1 %cmp1, label %if.then, label %if.end + +if.then: + %call = tail call i32 @foo() nounwind + br label %if.end + +if.end: + ret i32 7 +} + +; Chain multiple compares. +; CHECK: multi_different +; CHECK: cmp +; CHECK: ccmp +; CHECK: ccmp +; CHECK: b. +define void @multi_different(i32 %a, i32 %b, i32 %c) nounwind ssp { +entry: + %cmp = icmp sgt i32 %a, %b + br i1 %cmp, label %land.lhs.true, label %if.end + +land.lhs.true: + %div = sdiv i32 %b, %a + %cmp1 = icmp eq i32 %div, 5 + %cmp4 = icmp sgt i32 %div, %c + %or.cond = and i1 %cmp1, %cmp4 + br i1 %or.cond, label %if.then, label %if.end + +if.then: + %call = tail call i32 @foo() nounwind + br label %if.end + +if.end: + ret void +} + +; Convert a cbz in the head block. +; CHECK: cbz_head +; CHECK: cmp w0, #0 +; CHECK: ccmp +define i32 @cbz_head(i32 %a, i32 %b) nounwind ssp { +entry: + %cmp = icmp eq i32 %a, 0 + %cmp1 = icmp ne i32 %b, 17 + %or.cond = or i1 %cmp, %cmp1 + br i1 %or.cond, label %if.then, label %if.end + +if.then: + %call = tail call i32 @foo() nounwind + br label %if.end + +if.end: + ret i32 7 +} + +; Check that the immediate operand is in range. 
The ccmp instruction encodes a +; smaller range of immediates than subs/adds. +; The ccmp immediates must be in the range 0-31. +; CHECK: immediate_range +; CHECK-NOT: ccmp +define i32 @immediate_range(i32 %a, i32 %b) nounwind ssp { +entry: + %cmp = icmp eq i32 %a, 5 + %cmp1 = icmp eq i32 %b, 32 + %or.cond = or i1 %cmp, %cmp1 + br i1 %or.cond, label %if.then, label %if.end + +if.then: + %call = tail call i32 @foo() nounwind + br label %if.end + +if.end: + ret i32 7 +} + +; Convert a cbz in the second block. +; CHECK: cbz_second +; CHECK: cmp w0, #0 +; CHECK: ccmp w1, #0, #0, ne +; CHECK: b.eq +define i32 @cbz_second(i32 %a, i32 %b) nounwind ssp { +entry: + %cmp = icmp eq i32 %a, 0 + %cmp1 = icmp ne i32 %b, 0 + %or.cond = or i1 %cmp, %cmp1 + br i1 %or.cond, label %if.then, label %if.end + +if.then: + %call = tail call i32 @foo() nounwind + br label %if.end + +if.end: + ret i32 7 +} + +; Convert a cbnz in the second block. +; CHECK: cbnz_second +; CHECK: cmp w0, #0 +; CHECK: ccmp w1, #0, #4, ne +; CHECK: b.ne +define i32 @cbnz_second(i32 %a, i32 %b) nounwind ssp { +entry: + %cmp = icmp eq i32 %a, 0 + %cmp1 = icmp eq i32 %b, 0 + %or.cond = or i1 %cmp, %cmp1 + br i1 %or.cond, label %if.then, label %if.end + +if.then: + %call = tail call i32 @foo() nounwind + br label %if.end + +if.end: + ret i32 7 +} +declare i32 @foo() + +%str1 = type { %str2 } +%str2 = type { [24 x i8], i8*, i32, %str1*, i32, [4 x i8], %str1*, %str1*, %str1*, %str1*, %str1*, %str1*, %str1*, %str1*, %str1*, i8*, i8, i8*, %str1*, i8* } + +; Test case distilled from 126.gcc. +; The phi in sw.bb.i.i gets multiple operands for the %entry predecessor. +; CHECK: build_modify_expr +define void @build_modify_expr() nounwind ssp { +entry: + switch i32 undef, label %sw.bb.i.i [ + i32 69, label %if.end85 + i32 70, label %if.end85 + i32 71, label %if.end85 + i32 72, label %if.end85 + i32 73, label %if.end85 + i32 105, label %if.end85 + i32 106, label %if.end85 + ] + +if.end85: + ret void + +sw.bb.i.i: + %ref.tr.i.i = phi %str1* [ %0, %sw.bb.i.i ], [ undef, %entry ] + %operands.i.i = getelementptr inbounds %str1* %ref.tr.i.i, i64 0, i32 0, i32 2 + %arrayidx.i.i = bitcast i32* %operands.i.i to %str1** + %0 = load %str1** %arrayidx.i.i, align 8 + %code1.i.i.phi.trans.insert = getelementptr inbounds %str1* %0, i64 0, i32 0, i32 0, i64 16 + br label %sw.bb.i.i +} diff --git a/test/CodeGen/AArch64/arm64-clrsb.ll b/test/CodeGen/AArch64/arm64-clrsb.ll new file mode 100644 index 00000000000..042e52e5e78 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-clrsb.ll @@ -0,0 +1,36 @@ +; RUN: llc < %s -march=arm64 | FileCheck %s + +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "arm64-apple-ios7.0.0" + +; Function Attrs: nounwind readnone +declare i32 @llvm.ctlz.i32(i32, i1) #0 +declare i64 @llvm.ctlz.i64(i64, i1) #1 + +; Function Attrs: nounwind ssp +define i32 @clrsb32(i32 %x) #2 { +entry: + %shr = ashr i32 %x, 31 + %xor = xor i32 %shr, %x + %mul = shl i32 %xor, 1 + %add = or i32 %mul, 1 + %0 = tail call i32 @llvm.ctlz.i32(i32 %add, i1 false) + + ret i32 %0 +; CHECK-LABEL: clrsb32 +; CHECK: cls [[TEMP:w[0-9]+]], [[TEMP]] +} + +; Function Attrs: nounwind ssp +define i64 @clrsb64(i64 %x) #3 { +entry: + %shr = ashr i64 %x, 63 + %xor = xor i64 %shr, %x + %mul = shl nsw i64 %xor, 1 + %add = or i64 %mul, 1 + %0 = tail call i64 @llvm.ctlz.i64(i64 %add, i1 false) + + ret i64 %0 +; CHECK-LABEL: clrsb64 +; CHECK: cls [[TEMP:x[0-9]+]], [[TEMP]] +} diff --git a/test/CodeGen/AArch64/arm64-coalesce-ext.ll 
b/test/CodeGen/AArch64/arm64-coalesce-ext.ll new file mode 100644 index 00000000000..9420bf3bb59 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-coalesce-ext.ll @@ -0,0 +1,17 @@ +; RUN: llc -march=arm64 -mtriple=arm64-apple-darwin < %s | FileCheck %s +; Check that the peephole optimizer knows about sext and zext instructions. +; CHECK: test1sext +define i32 @test1sext(i64 %A, i64 %B, i32* %P, i64 *%P2) nounwind { + %C = add i64 %A, %B + ; CHECK: add x[[SUM:[0-9]+]], x0, x1 + %D = trunc i64 %C to i32 + %E = shl i64 %C, 32 + %F = ashr i64 %E, 32 + ; CHECK: sxtw x[[EXT:[0-9]+]], w[[SUM]] + store volatile i64 %F, i64 *%P2 + ; CHECK: str x[[EXT]] + store volatile i32 %D, i32* %P + ; Reuse low bits of extended register, don't extend live range of SUM. + ; CHECK: str w[[SUM]] + ret i32 %D +} diff --git a/test/CodeGen/AArch64/arm64-code-model-large-abs.ll b/test/CodeGen/AArch64/arm64-code-model-large-abs.ll new file mode 100644 index 00000000000..264da2da25b --- /dev/null +++ b/test/CodeGen/AArch64/arm64-code-model-large-abs.ll @@ -0,0 +1,72 @@ +; RUN: llc -mtriple=arm64-none-linux-gnu -code-model=large < %s | FileCheck %s + +@var8 = global i8 0 +@var16 = global i16 0 +@var32 = global i32 0 +@var64 = global i64 0 + +define i8* @global_addr() { +; CHECK-LABEL: global_addr: + ret i8* @var8 + ; The movz/movk calculation should end up returned directly in x0. +; CHECK: movz x0, #:abs_g3:var8 +; CHECK: movk x0, #:abs_g2_nc:var8 +; CHECK: movk x0, #:abs_g1_nc:var8 +; CHECK: movk x0, #:abs_g0_nc:var8 +; CHECK-NEXT: ret +} + +define i8 @global_i8() { +; CHECK-LABEL: global_i8: + %val = load i8* @var8 + ret i8 %val +; CHECK: movz x[[ADDR_REG:[0-9]+]], #:abs_g3:var8 +; CHECK: movk x[[ADDR_REG]], #:abs_g2_nc:var8 +; CHECK: movk x[[ADDR_REG]], #:abs_g1_nc:var8 +; CHECK: movk x[[ADDR_REG]], #:abs_g0_nc:var8 +; CHECK: ldrb w0, [x[[ADDR_REG]]] +} + +define i16 @global_i16() { +; CHECK-LABEL: global_i16: + %val = load i16* @var16 + ret i16 %val +; CHECK: movz x[[ADDR_REG:[0-9]+]], #:abs_g3:var16 +; CHECK: movk x[[ADDR_REG]], #:abs_g2_nc:var16 +; CHECK: movk x[[ADDR_REG]], #:abs_g1_nc:var16 +; CHECK: movk x[[ADDR_REG]], #:abs_g0_nc:var16 +; CHECK: ldrh w0, [x[[ADDR_REG]]] +} + +define i32 @global_i32() { +; CHECK-LABEL: global_i32: + %val = load i32* @var32 + ret i32 %val +; CHECK: movz x[[ADDR_REG:[0-9]+]], #:abs_g3:var32 +; CHECK: movk x[[ADDR_REG]], #:abs_g2_nc:var32 +; CHECK: movk x[[ADDR_REG]], #:abs_g1_nc:var32 +; CHECK: movk x[[ADDR_REG]], #:abs_g0_nc:var32 +; CHECK: ldr w0, [x[[ADDR_REG]]] +} + +define i64 @global_i64() { +; CHECK-LABEL: global_i64: + %val = load i64* @var64 + ret i64 %val +; CHECK: movz x[[ADDR_REG:[0-9]+]], #:abs_g3:var64 +; CHECK: movk x[[ADDR_REG]], #:abs_g2_nc:var64 +; CHECK: movk x[[ADDR_REG]], #:abs_g1_nc:var64 +; CHECK: movk x[[ADDR_REG]], #:abs_g0_nc:var64 +; CHECK: ldr x0, [x[[ADDR_REG]]] +} + +define <2 x i64> @constpool() { +; CHECK-LABEL: constpool: + ret <2 x i64> + +; CHECK: movz x[[ADDR_REG:[0-9]+]], #:abs_g3:[[CPADDR:.LCPI[0-9]+_[0-9]+]] +; CHECK: movk x[[ADDR_REG]], #:abs_g2_nc:[[CPADDR]] +; CHECK: movk x[[ADDR_REG]], #:abs_g1_nc:[[CPADDR]] +; CHECK: movk x[[ADDR_REG]], #:abs_g0_nc:[[CPADDR]] +; CHECK: ldr q0, [x[[ADDR_REG]]] +} diff --git a/test/CodeGen/AArch64/arm64-collect-loh-garbage-crash.ll b/test/CodeGen/AArch64/arm64-collect-loh-garbage-crash.ll new file mode 100644 index 00000000000..81cee38420a --- /dev/null +++ b/test/CodeGen/AArch64/arm64-collect-loh-garbage-crash.ll @@ -0,0 +1,37 @@ +; RUN: llc -mtriple=arm64-apple-ios -O3 -aarch64-collect-loh 
-aarch64-collect-loh-bb-only=true -aarch64-collect-loh-pre-collect-register=false < %s -o - | FileCheck %s +; Check that the LOH analysis does not crash when the analysed chain +; contains instructions that are filtered out. +; +; Before the fix for , these cases were removed +; from the main container. Now, the deterministic container does not allow +; to remove arbitrary values, so we have to live with garbage values. +; + +%"class.H4ISP::H4ISPDevice" = type { i32 (%"class.H4ISP::H4ISPDevice"*, i32, i8*, i8*)*, i8*, i32*, %"class.H4ISP::H4ISPCameraManager"* } + +%"class.H4ISP::H4ISPCameraManager" = type opaque + +declare i32 @_ZN5H4ISP11H4ISPDevice32ISP_SelectBestMIPIFrequencyIndexEjPj(%"class.H4ISP::H4ISPDevice"*) + +@pH4ISPDevice = hidden global %"class.H4ISP::H4ISPDevice"* null, align 8 + +; CHECK-LABEL: _foo: +; CHECK: ret +; CHECK-NOT: .loh AdrpLdrGotLdr +define void @foo() { +entry: + br label %if.then83 +if.then83: ; preds = %if.end81 + %tmp = load %"class.H4ISP::H4ISPDevice"** @pH4ISPDevice, align 8 + %call84 = call i32 @_ZN5H4ISP11H4ISPDevice32ISP_SelectBestMIPIFrequencyIndexEjPj(%"class.H4ISP::H4ISPDevice"* %tmp) #19 + tail call void asm sideeffect "", "~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27}"() + %tmp2 = load %"class.H4ISP::H4ISPDevice"** @pH4ISPDevice, align 8 + tail call void asm sideeffect "", "~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x28}"() + %pCameraManager.i268 = getelementptr inbounds %"class.H4ISP::H4ISPDevice"* %tmp2, i64 0, i32 3 + %tmp3 = load %"class.H4ISP::H4ISPCameraManager"** %pCameraManager.i268, align 8 + %tobool.i269 = icmp eq %"class.H4ISP::H4ISPCameraManager"* %tmp3, null + br i1 %tobool.i269, label %if.then83, label %end +end: + ret void +} + diff --git a/test/CodeGen/AArch64/arm64-collect-loh-str.ll b/test/CodeGen/AArch64/arm64-collect-loh-str.ll new file mode 100644 index 00000000000..d7bc00e318f --- /dev/null +++ b/test/CodeGen/AArch64/arm64-collect-loh-str.ll @@ -0,0 +1,23 @@ +; RUN: llc -mtriple=arm64-apple-ios -O2 -aarch64-collect-loh -aarch64-collect-loh-bb-only=false < %s -o - | FileCheck %s +; Test case for . +; AdrpAddStr cannot be used when the store uses the same +; register as address and value. Indeed, the related +; if applied, may completely remove the definition or +; at least provide a wrong one (with the offset folded +; into the definition). 
+ +%struct.anon = type { i32*, i32** } + +@pptp_wan_head = internal global %struct.anon zeroinitializer, align 8 + +; CHECK-LABEL: _pptp_wan_init +; CHECK: ret +; CHECK-NOT: AdrpAddStr +define i32 @pptp_wan_init() { +entry: + store i32* null, i32** getelementptr inbounds (%struct.anon* @pptp_wan_head, i64 0, i32 0), align 8 + store i32** getelementptr inbounds (%struct.anon* @pptp_wan_head, i64 0, i32 0), i32*** getelementptr inbounds (%struct.anon* @pptp_wan_head, i64 0, i32 1), align 8 + ret i32 0 +} + + diff --git a/test/CodeGen/AArch64/arm64-collect-loh.ll b/test/CodeGen/AArch64/arm64-collect-loh.ll new file mode 100644 index 00000000000..6d73daac620 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-collect-loh.ll @@ -0,0 +1,53 @@ +; RUN: llc -mtriple=arm64-apple-ios -O2 -aarch64-collect-loh -aarch64-collect-loh-bb-only=false < %s -o - | FileCheck %s +; RUN: llc -mtriple=arm64-linux-gnu -O2 -aarch64-collect-loh -aarch64-collect-loh-bb-only=false < %s -o - | FileCheck %s --check-prefix=CHECK-ELF + +; CHECK-ELF-NOT: .loh +; CHECK-ELF-NOT: AdrpAdrp +; CHECK-ELF-NOT: AdrpAdd +; CHECK-ELF-NOT: AdrpLdrGot + +@a = internal unnamed_addr global i32 0, align 4 +@b = external global i32 + +; Function Attrs: noinline nounwind ssp +define void @foo(i32 %t) { +entry: + %tmp = load i32* @a, align 4 + %add = add nsw i32 %tmp, %t + store i32 %add, i32* @a, align 4 + ret void +} + +; Function Attrs: nounwind ssp +; Testcase for , AdrpAdrp reuse is valid only when the first adrp +; dominates the second. +; The first adrp comes from the loading of 'a' and the second from the loading of 'b'. +; 'a' is loaded in if.then, 'b' in if.end4, if.then does not dominate if.end4. +; CHECK-LABEL: _test +; CHECK: ret +; CHECK-NOT: .loh AdrpAdrp +define i32 @test(i32 %t) { +entry: + %cmp = icmp sgt i32 %t, 5 + br i1 %cmp, label %if.then, label %if.end4 + +if.then: ; preds = %entry + %tmp = load i32* @a, align 4 + %add = add nsw i32 %tmp, %t + %cmp1 = icmp sgt i32 %add, 12 + br i1 %cmp1, label %if.then2, label %if.end4 + +if.then2: ; preds = %if.then + tail call void @foo(i32 %add) + %tmp1 = load i32* @a, align 4 + br label %if.end4 + +if.end4: ; preds = %if.then2, %if.then, %entry + %t.addr.0 = phi i32 [ %tmp1, %if.then2 ], [ %t, %if.then ], [ %t, %entry ] + %tmp2 = load i32* @b, align 4 + %add5 = add nsw i32 %tmp2, %t.addr.0 + tail call void @foo(i32 %add5) + %tmp3 = load i32* @b, align 4 + %add6 = add nsw i32 %tmp3, %t.addr.0 + ret i32 %add6 +} diff --git a/test/CodeGen/AArch64/arm64-complex-copy-noneon.ll b/test/CodeGen/AArch64/arm64-complex-copy-noneon.ll new file mode 100644 index 00000000000..f65b1161282 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-complex-copy-noneon.ll @@ -0,0 +1,21 @@ +; RUN: llc -mtriple=arm64-none-linux-gnu -mattr=-neon < %s + +; The DAG combiner decided to use a vector load/store for this struct copy +; previously. This probably shouldn't happen without NEON, but the most +; important thing is that it compiles. 
+ +define void @store_combine() nounwind { + %src = alloca { double, double }, align 8 + %dst = alloca { double, double }, align 8 + + %src.realp = getelementptr inbounds { double, double }* %src, i32 0, i32 0 + %src.real = load double* %src.realp + %src.imagp = getelementptr inbounds { double, double }* %src, i32 0, i32 1 + %src.imag = load double* %src.imagp + + %dst.realp = getelementptr inbounds { double, double }* %dst, i32 0, i32 0 + %dst.imagp = getelementptr inbounds { double, double }* %dst, i32 0, i32 1 + store double %src.real, double* %dst.realp + store double %src.imag, double* %dst.imagp + ret void +} diff --git a/test/CodeGen/AArch64/arm64-complex-ret.ll b/test/CodeGen/AArch64/arm64-complex-ret.ll new file mode 100644 index 00000000000..93d50a59861 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-complex-ret.ll @@ -0,0 +1,7 @@ +; RUN: llc -march=arm64 -o - %s | FileCheck %s + +define { i192, i192, i21, i192 } @foo(i192) { +; CHECK-LABEL: foo: +; CHECK: stp xzr, xzr, [x8] + ret { i192, i192, i21, i192 } {i192 0, i192 1, i21 2, i192 3} +} diff --git a/test/CodeGen/AArch64/arm64-const-addr.ll b/test/CodeGen/AArch64/arm64-const-addr.ll new file mode 100644 index 00000000000..c55a9226cc7 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-const-addr.ll @@ -0,0 +1,23 @@ +; RUN: llc -mtriple=arm64-darwin-unknown < %s | FileCheck %s + +%T = type { i32, i32, i32, i32 } + +; Test if the constant base address gets only materialized once. +define i32 @test1() nounwind { +; CHECK-LABEL: test1 +; CHECK: movz w8, #0x40f, lsl #16 +; CHECK-NEXT: movk w8, #0xc000 +; CHECK-NEXT: ldp w9, w10, [x8, #4] +; CHECK: ldr w8, [x8, #12] + %at = inttoptr i64 68141056 to %T* + %o1 = getelementptr %T* %at, i32 0, i32 1 + %t1 = load i32* %o1 + %o2 = getelementptr %T* %at, i32 0, i32 2 + %t2 = load i32* %o2 + %a1 = add i32 %t1, %t2 + %o3 = getelementptr %T* %at, i32 0, i32 3 + %t3 = load i32* %o3 + %a2 = add i32 %a1, %t3 + ret i32 %a2 +} + diff --git a/test/CodeGen/AArch64/arm64-convert-v2f64-v2i32.ll b/test/CodeGen/AArch64/arm64-convert-v2f64-v2i32.ll new file mode 100644 index 00000000000..d862b1e1943 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-convert-v2f64-v2i32.ll @@ -0,0 +1,24 @@ +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s + +; CHECK: fptosi_1 +; CHECK: fcvtzs.2d +; CHECK: xtn.2s +; CHECK: ret +define void @fptosi_1() nounwind noinline ssp { +entry: + %0 = fptosi <2 x double> undef to <2 x i32> + store <2 x i32> %0, <2 x i32>* undef, align 8 + ret void +} + +; CHECK: fptoui_1 +; CHECK: fcvtzu.2d +; CHECK: xtn.2s +; CHECK: ret +define void @fptoui_1() nounwind noinline ssp { +entry: + %0 = fptoui <2 x double> undef to <2 x i32> + store <2 x i32> %0, <2 x i32>* undef, align 8 + ret void +} + diff --git a/test/CodeGen/AArch64/arm64-convert-v2i32-v2f64.ll b/test/CodeGen/AArch64/arm64-convert-v2i32-v2f64.ll new file mode 100644 index 00000000000..daaf1e0f87d --- /dev/null +++ b/test/CodeGen/AArch64/arm64-convert-v2i32-v2f64.ll @@ -0,0 +1,29 @@ +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s + +define <2 x double> @f1(<2 x i32> %v) nounwind readnone { +; CHECK-LABEL: f1: +; CHECK: sshll.2d v0, v0, #0 +; CHECK-NEXT: scvtf.2d v0, v0 +; CHECK-NEXT: ret + %conv = sitofp <2 x i32> %v to <2 x double> + ret <2 x double> %conv +} +define <2 x double> @f2(<2 x i32> %v) nounwind readnone { +; CHECK-LABEL: f2: +; CHECK: ushll.2d v0, v0, #0 +; CHECK-NEXT: ucvtf.2d v0, v0 +; CHECK-NEXT: ret + %conv = uitofp <2 x i32> %v to <2 x double> + ret <2 x double> %conv +} + +; 
CHECK: autogen_SD19655 +; CHECK: scvtf +; CHECK: ret +define void @autogen_SD19655() { + %T = load <2 x i64>* undef + %F = sitofp <2 x i64> undef to <2 x float> + store <2 x float> %F, <2 x float>* undef + ret void +} + diff --git a/test/CodeGen/AArch64/arm64-copy-tuple.ll b/test/CodeGen/AArch64/arm64-copy-tuple.ll new file mode 100644 index 00000000000..1803787d729 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-copy-tuple.ll @@ -0,0 +1,146 @@ +; RUN: llc -mtriple=arm64-apple-ios -o - %s | FileCheck %s + +; The main purpose of this test is to find out whether copyPhysReg can deal with +; the memmove-like situation arising in tuples, where an early copy can clobber +; the value needed by a later one if the tuples overlap. + +; We use dummy inline asm to force LLVM to generate a COPY between the registers +; we want by clobbering all the others. + +define void @test_D1D2_from_D0D1(i8* %addr) #0 { +; CHECK-LABEL: test_D1D2_from_D0D1: +; CHECK: mov.8b v2, v1 +; CHECK: mov.8b v1, v0 +entry: + %addr_v8i8 = bitcast i8* %addr to <8 x i8>* + %vec = tail call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0v8i8(<8 x i8>* %addr_v8i8) + %vec0 = extractvalue { <8 x i8>, <8 x i8> } %vec, 0 + %vec1 = extractvalue { <8 x i8>, <8 x i8> } %vec, 1 + tail call void asm sideeffect "", "~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() + tail call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr) + + tail call void asm sideeffect "", "~{v0},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() + tail call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr) + ret void +} + +define void @test_D0D1_from_D1D2(i8* %addr) #0 { +; CHECK-LABEL: test_D0D1_from_D1D2: +; CHECK: mov.8b v0, v1 +; CHECK: mov.8b v1, v2 +entry: + %addr_v8i8 = bitcast i8* %addr to <8 x i8>* + %vec = tail call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0v8i8(<8 x i8>* %addr_v8i8) + %vec0 = extractvalue { <8 x i8>, <8 x i8> } %vec, 0 + %vec1 = extractvalue { <8 x i8>, <8 x i8> } %vec, 1 + tail call void asm sideeffect "", "~{v0},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() + tail call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr) + + tail call void asm sideeffect "", "~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() + tail call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr) + ret void +} + +define void @test_D0D1_from_D31D0(i8* %addr) #0 { +; CHECK-LABEL: test_D0D1_from_D31D0: +; CHECK: mov.8b v1, v0 +; CHECK: mov.8b v0, v31 +entry: + %addr_v8i8 = bitcast i8* %addr to <8 x i8>* + %vec = tail call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0v8i8(<8 x i8>* %addr_v8i8) + %vec0 = extractvalue { <8 x i8>, <8 x i8> } %vec, 0 + %vec1 = extractvalue { <8 x i8>, <8 x i8> } %vec, 1 + tail call void asm sideeffect "", 
"~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30}"() + tail call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr) + + tail call void asm sideeffect "", "~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() + tail call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr) + ret void +} + +define void @test_D31D0_from_D0D1(i8* %addr) #0 { +; CHECK-LABEL: test_D31D0_from_D0D1: +; CHECK: mov.8b v31, v0 +; CHECK: mov.8b v0, v1 +entry: + %addr_v8i8 = bitcast i8* %addr to <8 x i8>* + %vec = tail call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0v8i8(<8 x i8>* %addr_v8i8) + %vec0 = extractvalue { <8 x i8>, <8 x i8> } %vec, 0 + %vec1 = extractvalue { <8 x i8>, <8 x i8> } %vec, 1 + tail call void asm sideeffect "", "~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() + tail call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr) + + tail call void asm sideeffect "", "~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30}"() + tail call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr) + ret void +} + +define void @test_D2D3D4_from_D0D1D2(i8* %addr) #0 { +; CHECK-LABEL: test_D2D3D4_from_D0D1D2: +; CHECK: mov.8b v4, v2 +; CHECK: mov.8b v3, v1 +; CHECK: mov.8b v2, v0 +entry: + %addr_v8i8 = bitcast i8* %addr to <8 x i8>* + %vec = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0v8i8(<8 x i8>* %addr_v8i8) + %vec0 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vec, 0 + %vec1 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vec, 1 + %vec2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vec, 2 + + tail call void asm sideeffect "", "~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() + tail call void @llvm.aarch64.neon.st3.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, <8 x i8> %vec2, i8* %addr) + + tail call void asm sideeffect "", "~{v0},~{v1},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() + tail call void @llvm.aarch64.neon.st3.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, <8 x i8> %vec2, i8* %addr) + ret void +} + +define void @test_Q0Q1Q2_from_Q1Q2Q3(i8* %addr) #0 { +; CHECK-LABEL: test_Q0Q1Q2_from_Q1Q2Q3: +; CHECK: mov.16b v0, v1 +; CHECK: mov.16b v1, v2 +; CHECK: mov.16b v2, v3 +entry: + %addr_v16i8 = bitcast i8* %addr to <16 x i8>* + %vec = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0v16i8(<16 x i8>* %addr_v16i8) + %vec0 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vec, 0 + %vec1 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vec, 1 + %vec2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vec, 2 + tail call void asm sideeffect "", 
"~{v0},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() + tail call void @llvm.aarch64.neon.st3.v16i8.p0i8(<16 x i8> %vec0, <16 x i8> %vec1, <16 x i8> %vec2, i8* %addr) + + tail call void asm sideeffect "", "~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() + tail call void @llvm.aarch64.neon.st3.v16i8.p0i8(<16 x i8> %vec0, <16 x i8> %vec1, <16 x i8> %vec2, i8* %addr) + ret void +} + +define void @test_Q1Q2Q3Q4_from_Q30Q31Q0Q1(i8* %addr) #0 { +; CHECK-LABEL: test_Q1Q2Q3Q4_from_Q30Q31Q0Q1: +; CHECK: mov.16b v4, v1 +; CHECK: mov.16b v3, v0 +; CHECK: mov.16b v2, v31 +; CHECK: mov.16b v1, v30 + %addr_v16i8 = bitcast i8* %addr to <16 x i8>* + %vec = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0v16i8(<16 x i8>* %addr_v16i8) + %vec0 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vec, 0 + %vec1 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vec, 1 + %vec2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vec, 2 + %vec3 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vec, 3 + + tail call void asm sideeffect "", "~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29}"() + tail call void @llvm.aarch64.neon.st4.v16i8.p0i8(<16 x i8> %vec0, <16 x i8> %vec1, <16 x i8> %vec2, <16 x i8> %vec3, i8* %addr) + + tail call void asm sideeffect "", "~{v0},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() + tail call void @llvm.aarch64.neon.st4.v16i8.p0i8(<16 x i8> %vec0, <16 x i8> %vec1, <16 x i8> %vec2, <16 x i8> %vec3, i8* %addr) + ret void +} + +declare { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0v8i8(<8 x i8>*) +declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0v8i8(<8 x i8>*) +declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0v16i8(<16 x i8>*) +declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0v16i8(<16 x i8>*) + +declare void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8>, <8 x i8>, i8*) +declare void @llvm.aarch64.neon.st3.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, i8*) +declare void @llvm.aarch64.neon.st3.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i8*) +declare void @llvm.aarch64.neon.st4.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i8*) diff --git a/test/CodeGen/AArch64/arm64-crc32.ll b/test/CodeGen/AArch64/arm64-crc32.ll new file mode 100644 index 00000000000..d3099e6bb13 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-crc32.ll @@ -0,0 +1,71 @@ +; RUN: llc -march=arm64 -mattr=+crc -o - %s | FileCheck %s + +define i32 @test_crc32b(i32 %cur, i8 %next) { +; CHECK-LABEL: test_crc32b: +; CHECK: crc32b w0, w0, w1 + %bits = zext i8 %next to i32 + %val = call i32 @llvm.aarch64.crc32b(i32 %cur, i32 %bits) + ret i32 %val +} + +define i32 @test_crc32h(i32 %cur, i16 %next) { +; CHECK-LABEL: test_crc32h: +; CHECK: crc32h w0, w0, w1 + %bits = zext i16 %next to i32 + %val = call i32 @llvm.aarch64.crc32h(i32 %cur, i32 %bits) + ret i32 %val +} + +define i32 @test_crc32w(i32 
%cur, i32 %next) { +; CHECK-LABEL: test_crc32w: +; CHECK: crc32w w0, w0, w1 + %val = call i32 @llvm.aarch64.crc32w(i32 %cur, i32 %next) + ret i32 %val +} + +define i32 @test_crc32x(i32 %cur, i64 %next) { +; CHECK-LABEL: test_crc32x: +; CHECK: crc32x w0, w0, x1 + %val = call i32 @llvm.aarch64.crc32x(i32 %cur, i64 %next) + ret i32 %val +} + +define i32 @test_crc32cb(i32 %cur, i8 %next) { +; CHECK-LABEL: test_crc32cb: +; CHECK: crc32cb w0, w0, w1 + %bits = zext i8 %next to i32 + %val = call i32 @llvm.aarch64.crc32cb(i32 %cur, i32 %bits) + ret i32 %val +} + +define i32 @test_crc32ch(i32 %cur, i16 %next) { +; CHECK-LABEL: test_crc32ch: +; CHECK: crc32ch w0, w0, w1 + %bits = zext i16 %next to i32 + %val = call i32 @llvm.aarch64.crc32ch(i32 %cur, i32 %bits) + ret i32 %val +} + +define i32 @test_crc32cw(i32 %cur, i32 %next) { +; CHECK-LABEL: test_crc32cw: +; CHECK: crc32cw w0, w0, w1 + %val = call i32 @llvm.aarch64.crc32cw(i32 %cur, i32 %next) + ret i32 %val +} + +define i32 @test_crc32cx(i32 %cur, i64 %next) { +; CHECK-LABEL: test_crc32cx: +; CHECK: crc32cx w0, w0, x1 + %val = call i32 @llvm.aarch64.crc32cx(i32 %cur, i64 %next) + ret i32 %val +} + +declare i32 @llvm.aarch64.crc32b(i32, i32) +declare i32 @llvm.aarch64.crc32h(i32, i32) +declare i32 @llvm.aarch64.crc32w(i32, i32) +declare i32 @llvm.aarch64.crc32x(i32, i64) + +declare i32 @llvm.aarch64.crc32cb(i32, i32) +declare i32 @llvm.aarch64.crc32ch(i32, i32) +declare i32 @llvm.aarch64.crc32cw(i32, i32) +declare i32 @llvm.aarch64.crc32cx(i32, i64) diff --git a/test/CodeGen/AArch64/arm64-crypto.ll b/test/CodeGen/AArch64/arm64-crypto.ll new file mode 100644 index 00000000000..2908b336b1b --- /dev/null +++ b/test/CodeGen/AArch64/arm64-crypto.ll @@ -0,0 +1,135 @@ +; RUN: llc -march=arm64 -mattr=crypto -aarch64-neon-syntax=apple -o - %s | FileCheck %s + +declare <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %data, <16 x i8> %key) +declare <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> %data, <16 x i8> %key) +declare <16 x i8> @llvm.aarch64.crypto.aesmc(<16 x i8> %data) +declare <16 x i8> @llvm.aarch64.crypto.aesimc(<16 x i8> %data) + +define <16 x i8> @test_aese(<16 x i8> %data, <16 x i8> %key) { +; CHECK-LABEL: test_aese: +; CHECK: aese.16b v0, v1 + %res = call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %data, <16 x i8> %key) + ret <16 x i8> %res +} + +define <16 x i8> @test_aesd(<16 x i8> %data, <16 x i8> %key) { +; CHECK-LABEL: test_aesd: +; CHECK: aesd.16b v0, v1 + %res = call <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> %data, <16 x i8> %key) + ret <16 x i8> %res +} + +define <16 x i8> @test_aesmc(<16 x i8> %data) { +; CHECK-LABEL: test_aesmc: +; CHECK: aesmc.16b v0, v0 + %res = call <16 x i8> @llvm.aarch64.crypto.aesmc(<16 x i8> %data) + ret <16 x i8> %res +} + +define <16 x i8> @test_aesimc(<16 x i8> %data) { +; CHECK-LABEL: test_aesimc: +; CHECK: aesimc.16b v0, v0 + %res = call <16 x i8> @llvm.aarch64.crypto.aesimc(<16 x i8> %data) + ret <16 x i8> %res +} + +declare <4 x i32> @llvm.aarch64.crypto.sha1c(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) +declare <4 x i32> @llvm.aarch64.crypto.sha1p(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) +declare <4 x i32> @llvm.aarch64.crypto.sha1m(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) +declare i32 @llvm.aarch64.crypto.sha1h(i32 %hash_e) +declare <4 x i32> @llvm.aarch64.crypto.sha1su0(<4 x i32> %wk0_3, <4 x i32> %wk4_7, <4 x i32> %wk8_11) +declare <4 x i32> @llvm.aarch64.crypto.sha1su1(<4 x i32> %wk0_3, <4 x i32> %wk12_15) + +define <4 x i32> @test_sha1c(<4 x i32> %hash_abcd, i32 
%hash_e, <4 x i32> %wk) { +; CHECK-LABEL: test_sha1c: +; CHECK: fmov [[HASH_E:s[0-9]+]], w0 +; CHECK: sha1c.4s q0, [[HASH_E]], v1 + %res = call <4 x i32> @llvm.aarch64.crypto.sha1c(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) + ret <4 x i32> %res +} + +; Incomplete removal of unnecessary FMOV instructions in intrinsic SHA1 +define <4 x i32> @test_sha1c_in_a_row(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) { +; CHECK-LABEL: test_sha1c_in_a_row: +; CHECK: fmov [[HASH_E:s[0-9]+]], w0 +; CHECK: sha1c.4s q[[SHA1RES:[0-9]+]], [[HASH_E]], v1 +; CHECK-NOT: fmov +; CHECK: sha1c.4s q0, s[[SHA1RES]], v1 + %res = call <4 x i32> @llvm.aarch64.crypto.sha1c(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) + %extract = extractelement <4 x i32> %res, i32 0 + %res2 = call <4 x i32> @llvm.aarch64.crypto.sha1c(<4 x i32> %hash_abcd, i32 %extract, <4 x i32> %wk) + ret <4 x i32> %res2 +} + +define <4 x i32> @test_sha1p(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) { +; CHECK-LABEL: test_sha1p: +; CHECK: fmov [[HASH_E:s[0-9]+]], w0 +; CHECK: sha1p.4s q0, [[HASH_E]], v1 + %res = call <4 x i32> @llvm.aarch64.crypto.sha1p(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) + ret <4 x i32> %res +} + +define <4 x i32> @test_sha1m(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) { +; CHECK-LABEL: test_sha1m: +; CHECK: fmov [[HASH_E:s[0-9]+]], w0 +; CHECK: sha1m.4s q0, [[HASH_E]], v1 + %res = call <4 x i32> @llvm.aarch64.crypto.sha1m(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) + ret <4 x i32> %res +} + +define i32 @test_sha1h(i32 %hash_e) { +; CHECK-LABEL: test_sha1h: +; CHECK: fmov [[HASH_E:s[0-9]+]], w0 +; CHECK: sha1h [[RES:s[0-9]+]], [[HASH_E]] +; CHECK: fmov w0, [[RES]] + %res = call i32 @llvm.aarch64.crypto.sha1h(i32 %hash_e) + ret i32 %res +} + +define <4 x i32> @test_sha1su0(<4 x i32> %wk0_3, <4 x i32> %wk4_7, <4 x i32> %wk8_11) { +; CHECK-LABEL: test_sha1su0: +; CHECK: sha1su0.4s v0, v1, v2 + %res = call <4 x i32> @llvm.aarch64.crypto.sha1su0(<4 x i32> %wk0_3, <4 x i32> %wk4_7, <4 x i32> %wk8_11) + ret <4 x i32> %res +} + +define <4 x i32> @test_sha1su1(<4 x i32> %wk0_3, <4 x i32> %wk12_15) { +; CHECK-LABEL: test_sha1su1: +; CHECK: sha1su1.4s v0, v1 + %res = call <4 x i32> @llvm.aarch64.crypto.sha1su1(<4 x i32> %wk0_3, <4 x i32> %wk12_15) + ret <4 x i32> %res +} + +declare <4 x i32> @llvm.aarch64.crypto.sha256h(<4 x i32> %hash_abcd, <4 x i32> %hash_efgh, <4 x i32> %wk) +declare <4 x i32> @llvm.aarch64.crypto.sha256h2(<4 x i32> %hash_efgh, <4 x i32> %hash_abcd, <4 x i32> %wk) +declare <4 x i32> @llvm.aarch64.crypto.sha256su0(<4 x i32> %w0_3, <4 x i32> %w4_7) +declare <4 x i32> @llvm.aarch64.crypto.sha256su1(<4 x i32> %w0_3, <4 x i32> %w8_11, <4 x i32> %w12_15) + +define <4 x i32> @test_sha256h(<4 x i32> %hash_abcd, <4 x i32> %hash_efgh, <4 x i32> %wk) { +; CHECK-LABEL: test_sha256h: +; CHECK: sha256h.4s q0, q1, v2 + %res = call <4 x i32> @llvm.aarch64.crypto.sha256h(<4 x i32> %hash_abcd, <4 x i32> %hash_efgh, <4 x i32> %wk) + ret <4 x i32> %res +} + +define <4 x i32> @test_sha256h2(<4 x i32> %hash_efgh, <4 x i32> %hash_abcd, <4 x i32> %wk) { +; CHECK-LABEL: test_sha256h2: +; CHECK: sha256h2.4s q0, q1, v2 + + %res = call <4 x i32> @llvm.aarch64.crypto.sha256h2(<4 x i32> %hash_efgh, <4 x i32> %hash_abcd, <4 x i32> %wk) + ret <4 x i32> %res +} + +define <4 x i32> @test_sha256su0(<4 x i32> %w0_3, <4 x i32> %w4_7) { +; CHECK-LABEL: test_sha256su0: +; CHECK: sha256su0.4s v0, v1 + %res = call <4 x i32> @llvm.aarch64.crypto.sha256su0(<4 x i32> %w0_3, <4 x i32> %w4_7) + ret <4 x i32> %res +} + +define 
<4 x i32> @test_sha256su1(<4 x i32> %w0_3, <4 x i32> %w8_11, <4 x i32> %w12_15) { +; CHECK-LABEL: test_sha256su1: +; CHECK: sha256su1.4s v0, v1, v2 + %res = call <4 x i32> @llvm.aarch64.crypto.sha256su1(<4 x i32> %w0_3, <4 x i32> %w8_11, <4 x i32> %w12_15) + ret <4 x i32> %res +} diff --git a/test/CodeGen/AArch64/arm64-cse.ll b/test/CodeGen/AArch64/arm64-cse.ll new file mode 100644 index 00000000000..bb14c895504 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-cse.ll @@ -0,0 +1,59 @@ +; RUN: llc -O3 < %s | FileCheck %s +target triple = "arm64-apple-ios" + +; rdar://12462006 +; CSE between "icmp reg reg" and "sub reg reg". +; Both can be in the same basic block or in different basic blocks. +define i8* @t1(i8* %base, i32* nocapture %offset, i32 %size) nounwind { +entry: +; CHECK-LABEL: t1: +; CHECK: subs +; CHECK-NOT: cmp +; CHECK-NOT: sub +; CHECK: b.ge +; CHECK: sub +; CHECK: sub +; CHECK-NOT: sub +; CHECK: ret + %0 = load i32* %offset, align 4 + %cmp = icmp slt i32 %0, %size + %s = sub nsw i32 %0, %size + br i1 %cmp, label %return, label %if.end + +if.end: + %sub = sub nsw i32 %0, %size + %s2 = sub nsw i32 %s, %size + %s3 = sub nsw i32 %sub, %s2 + store i32 %s3, i32* %offset, align 4 + %add.ptr = getelementptr inbounds i8* %base, i32 %sub + br label %return + +return: + %retval.0 = phi i8* [ %add.ptr, %if.end ], [ null, %entry ] + ret i8* %retval.0 +} + +; CSE between "icmp reg imm" and "sub reg imm". +define i8* @t2(i8* %base, i32* nocapture %offset) nounwind { +entry: +; CHECK-LABEL: t2: +; CHECK: subs +; CHECK-NOT: cmp +; CHECK-NOT: sub +; CHECK: b.lt +; CHECK-NOT: sub +; CHECK: ret + %0 = load i32* %offset, align 4 + %cmp = icmp slt i32 %0, 1 + br i1 %cmp, label %return, label %if.end + +if.end: + %sub = sub nsw i32 %0, 1 + store i32 %sub, i32* %offset, align 4 + %add.ptr = getelementptr inbounds i8* %base, i32 %sub + br label %return + +return: + %retval.0 = phi i8* [ %add.ptr, %if.end ], [ null, %entry ] + ret i8* %retval.0 +} diff --git a/test/CodeGen/AArch64/arm64-csel.ll b/test/CodeGen/AArch64/arm64-csel.ll new file mode 100644 index 00000000000..98eba30f119 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-csel.ll @@ -0,0 +1,230 @@ +; RUN: llc -O3 < %s | FileCheck %s +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64" +target triple = "arm64-unknown-unknown" + +; CHECK-LABEL: foo1 +; CHECK: cinc w{{[0-9]+}}, w{{[0-9]+}}, ne +define i32 @foo1(i32 %b, i32 %c) nounwind readnone ssp { +entry: + %not.tobool = icmp ne i32 %c, 0 + %add = zext i1 %not.tobool to i32 + %b.add = add i32 %c, %b + %add1 = add i32 %b.add, %add + ret i32 %add1 +} + +; CHECK-LABEL: foo2 +; CHECK: cneg w{{[0-9]+}}, w{{[0-9]+}}, ne +define i32 @foo2(i32 %b, i32 %c) nounwind readnone ssp { +entry: + %mul = sub i32 0, %b + %tobool = icmp eq i32 %c, 0 + %b.mul = select i1 %tobool, i32 %b, i32 %mul + %add = add nsw i32 %b.mul, %c + ret i32 %add +} + +; CHECK-LABEL: foo3 +; CHECK: cinv w{{[0-9]+}}, w{{[0-9]+}}, ne +define i32 @foo3(i32 %b, i32 %c) nounwind readnone ssp { +entry: + %not.tobool = icmp ne i32 %c, 0 + %xor = sext i1 %not.tobool to i32 + %b.xor = xor i32 %xor, %b + %add = add nsw i32 %b.xor, %c + ret i32 %add +} + +; rdar://11632325 +define i32@foo4(i32 %a) nounwind ssp { +; CHECK-LABEL: foo4 +; CHECK: cneg +; CHECK-NEXT: ret + %cmp = icmp sgt i32 %a, -1 + %neg = sub nsw i32 0, %a + %cond = select i1 %cmp, i32 %a, i32 %neg + ret i32 %cond +} + +define i32@foo5(i32 %a, i32 %b) nounwind ssp { +entry: +; CHECK-LABEL: foo5 +; 
CHECK: subs +; CHECK-NEXT: cneg +; CHECK-NEXT: ret + %sub = sub nsw i32 %a, %b + %cmp = icmp sgt i32 %sub, -1 + %sub3 = sub nsw i32 0, %sub + %cond = select i1 %cmp, i32 %sub, i32 %sub3 + ret i32 %cond +} + +; make sure we can handle branch instruction in optimizeCompare. +define i32@foo6(i32 %a, i32 %b) nounwind ssp { +; CHECK-LABEL: foo6 +; CHECK: b + %sub = sub nsw i32 %a, %b + %cmp = icmp sgt i32 %sub, 0 + br i1 %cmp, label %l.if, label %l.else + +l.if: + ret i32 1 + +l.else: + ret i32 %sub +} + +; If CPSR is used multiple times and V flag is used, we don't remove cmp. +define i32 @foo7(i32 %a, i32 %b) nounwind { +entry: +; CHECK-LABEL: foo7: +; CHECK: sub +; CHECK-next: adds +; CHECK-next: csneg +; CHECK-next: b + %sub = sub nsw i32 %a, %b + %cmp = icmp sgt i32 %sub, -1 + %sub3 = sub nsw i32 0, %sub + %cond = select i1 %cmp, i32 %sub, i32 %sub3 + br i1 %cmp, label %if.then, label %if.else + +if.then: + %cmp2 = icmp slt i32 %sub, -1 + %sel = select i1 %cmp2, i32 %cond, i32 %a + ret i32 %sel + +if.else: + ret i32 %cond +} + +define i32 @foo8(i32 %v, i32 %a, i32 %b) nounwind readnone ssp { +entry: +; CHECK-LABEL: foo8: +; CHECK: cmp w0, #0 +; CHECK: csinv w0, w1, w2, ne + %tobool = icmp eq i32 %v, 0 + %neg = xor i32 -1, %b + %cond = select i1 %tobool, i32 %neg, i32 %a + ret i32 %cond +} + +define i32 @foo9(i32 %v) nounwind readnone optsize ssp { +entry: +; CHECK-LABEL: foo9: +; CHECK: cmp w0, #0 +; CHECK: orr w[[REG:[0-9]+]], wzr, #0x4 +; CHECK: cinv w0, w[[REG]], eq + %tobool = icmp ne i32 %v, 0 + %cond = select i1 %tobool, i32 4, i32 -5 + ret i32 %cond +} + +define i64 @foo10(i64 %v) nounwind readnone optsize ssp { +entry: +; CHECK-LABEL: foo10: +; CHECK: cmp x0, #0 +; CHECK: orr w[[REG:[0-9]+]], wzr, #0x4 +; CHECK: cinv x0, x[[REG]], eq + %tobool = icmp ne i64 %v, 0 + %cond = select i1 %tobool, i64 4, i64 -5 + ret i64 %cond +} + +define i32 @foo11(i32 %v) nounwind readnone optsize ssp { +entry: +; CHECK-LABEL: foo11: +; CHECK: cmp w0, #0 +; CHECK: orr w[[REG:[0-9]+]], wzr, #0x4 +; CHECK: cneg w0, w[[REG]], eq + %tobool = icmp ne i32 %v, 0 + %cond = select i1 %tobool, i32 4, i32 -4 + ret i32 %cond +} + +define i64 @foo12(i64 %v) nounwind readnone optsize ssp { +entry: +; CHECK-LABEL: foo12: +; CHECK: cmp x0, #0 +; CHECK: orr w[[REG:[0-9]+]], wzr, #0x4 +; CHECK: cneg x0, x[[REG]], eq + %tobool = icmp ne i64 %v, 0 + %cond = select i1 %tobool, i64 4, i64 -4 + ret i64 %cond +} + +define i32 @foo13(i32 %v, i32 %a, i32 %b) nounwind readnone optsize ssp { +entry: +; CHECK-LABEL: foo13: +; CHECK: cmp w0, #0 +; CHECK: csneg w0, w1, w2, ne + %tobool = icmp eq i32 %v, 0 + %sub = sub i32 0, %b + %cond = select i1 %tobool, i32 %sub, i32 %a + ret i32 %cond +} + +define i64 @foo14(i64 %v, i64 %a, i64 %b) nounwind readnone optsize ssp { +entry: +; CHECK-LABEL: foo14: +; CHECK: cmp x0, #0 +; CHECK: csneg x0, x1, x2, ne + %tobool = icmp eq i64 %v, 0 + %sub = sub i64 0, %b + %cond = select i1 %tobool, i64 %sub, i64 %a + ret i64 %cond +} + +define i32 @foo15(i32 %a, i32 %b) nounwind readnone optsize ssp { +entry: +; CHECK-LABEL: foo15: +; CHECK: cmp w0, w1 +; CHECK: orr w[[REG:[0-9]+]], wzr, #0x1 +; CHECK: cinc w0, w[[REG]], gt + %cmp = icmp sgt i32 %a, %b + %. = select i1 %cmp, i32 2, i32 1 + ret i32 %. +} + +define i32 @foo16(i32 %a, i32 %b) nounwind readnone optsize ssp { +entry: +; CHECK-LABEL: foo16: +; CHECK: cmp w0, w1 +; CHECK: orr w[[REG:[0-9]+]], wzr, #0x1 +; CHECK: cinc w0, w[[REG]], le + %cmp = icmp sgt i32 %a, %b + %. = select i1 %cmp, i32 1, i32 2 + ret i32 %. 
+} + +define i64 @foo17(i64 %a, i64 %b) nounwind readnone optsize ssp { +entry: +; CHECK-LABEL: foo17: +; CHECK: cmp x0, x1 +; CHECK: orr w[[REG:[0-9]+]], wzr, #0x1 +; CHECK: cinc x0, x[[REG]], gt + %cmp = icmp sgt i64 %a, %b + %. = select i1 %cmp, i64 2, i64 1 + ret i64 %. +} + +define i64 @foo18(i64 %a, i64 %b) nounwind readnone optsize ssp { +entry: +; CHECK-LABEL: foo18: +; CHECK: cmp x0, x1 +; CHECK: orr w[[REG:[0-9]+]], wzr, #0x1 +; CHECK: cinc x0, x[[REG]], le + %cmp = icmp sgt i64 %a, %b + %. = select i1 %cmp, i64 1, i64 2 + ret i64 %. +} + +define i64 @foo19(i64 %a, i64 %b, i64 %c) { +entry: +; CHECK-LABEL: foo19: +; CHECK: cinc x0, x2 +; CHECK-NOT: add + %cmp = icmp ult i64 %a, %b + %inc = zext i1 %cmp to i64 + %inc.c = add i64 %inc, %c + ret i64 %inc.c +} diff --git a/test/CodeGen/AArch64/arm64-cvt.ll b/test/CodeGen/AArch64/arm64-cvt.ll new file mode 100644 index 00000000000..420a8bc0483 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-cvt.ll @@ -0,0 +1,401 @@ +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s + +; +; Floating-point scalar convert to signed integer (to nearest with ties to away) +; +define i32 @fcvtas_1w1s(float %A) nounwind { +;CHECK-LABEL: fcvtas_1w1s: +;CHECK: fcvtas w0, s0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.aarch64.neon.fcvtas.i32.f32(float %A) + ret i32 %tmp3 +} + +define i64 @fcvtas_1x1s(float %A) nounwind { +;CHECK-LABEL: fcvtas_1x1s: +;CHECK: fcvtas x0, s0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.aarch64.neon.fcvtas.i64.f32(float %A) + ret i64 %tmp3 +} + +define i32 @fcvtas_1w1d(double %A) nounwind { +;CHECK-LABEL: fcvtas_1w1d: +;CHECK: fcvtas w0, d0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.aarch64.neon.fcvtas.i32.f64(double %A) + ret i32 %tmp3 +} + +define i64 @fcvtas_1x1d(double %A) nounwind { +;CHECK-LABEL: fcvtas_1x1d: +;CHECK: fcvtas x0, d0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.aarch64.neon.fcvtas.i64.f64(double %A) + ret i64 %tmp3 +} + +declare i32 @llvm.aarch64.neon.fcvtas.i32.f32(float) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtas.i64.f32(float) nounwind readnone +declare i32 @llvm.aarch64.neon.fcvtas.i32.f64(double) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtas.i64.f64(double) nounwind readnone + +; +; Floating-point scalar convert to unsigned integer +; +define i32 @fcvtau_1w1s(float %A) nounwind { +;CHECK-LABEL: fcvtau_1w1s: +;CHECK: fcvtau w0, s0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.aarch64.neon.fcvtau.i32.f32(float %A) + ret i32 %tmp3 +} + +define i64 @fcvtau_1x1s(float %A) nounwind { +;CHECK-LABEL: fcvtau_1x1s: +;CHECK: fcvtau x0, s0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.aarch64.neon.fcvtau.i64.f32(float %A) + ret i64 %tmp3 +} + +define i32 @fcvtau_1w1d(double %A) nounwind { +;CHECK-LABEL: fcvtau_1w1d: +;CHECK: fcvtau w0, d0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.aarch64.neon.fcvtau.i32.f64(double %A) + ret i32 %tmp3 +} + +define i64 @fcvtau_1x1d(double %A) nounwind { +;CHECK-LABEL: fcvtau_1x1d: +;CHECK: fcvtau x0, d0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.aarch64.neon.fcvtau.i64.f64(double %A) + ret i64 %tmp3 +} + +declare i32 @llvm.aarch64.neon.fcvtau.i32.f32(float) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtau.i64.f32(float) nounwind readnone +declare i32 @llvm.aarch64.neon.fcvtau.i32.f64(double) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtau.i64.f64(double) nounwind readnone + +; +; Floating-point scalar convert to signed integer (toward -Inf) +; +define i32 @fcvtms_1w1s(float %A) nounwind { +;CHECK-LABEL: fcvtms_1w1s: +;CHECK: fcvtms w0, 
s0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.aarch64.neon.fcvtms.i32.f32(float %A) + ret i32 %tmp3 +} + +define i64 @fcvtms_1x1s(float %A) nounwind { +;CHECK-LABEL: fcvtms_1x1s: +;CHECK: fcvtms x0, s0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.aarch64.neon.fcvtms.i64.f32(float %A) + ret i64 %tmp3 +} + +define i32 @fcvtms_1w1d(double %A) nounwind { +;CHECK-LABEL: fcvtms_1w1d: +;CHECK: fcvtms w0, d0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.aarch64.neon.fcvtms.i32.f64(double %A) + ret i32 %tmp3 +} + +define i64 @fcvtms_1x1d(double %A) nounwind { +;CHECK-LABEL: fcvtms_1x1d: +;CHECK: fcvtms x0, d0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.aarch64.neon.fcvtms.i64.f64(double %A) + ret i64 %tmp3 +} + +declare i32 @llvm.aarch64.neon.fcvtms.i32.f32(float) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtms.i64.f32(float) nounwind readnone +declare i32 @llvm.aarch64.neon.fcvtms.i32.f64(double) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtms.i64.f64(double) nounwind readnone + +; +; Floating-point scalar convert to unsigned integer (toward -Inf) +; +define i32 @fcvtmu_1w1s(float %A) nounwind { +;CHECK-LABEL: fcvtmu_1w1s: +;CHECK: fcvtmu w0, s0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.aarch64.neon.fcvtmu.i32.f32(float %A) + ret i32 %tmp3 +} + +define i64 @fcvtmu_1x1s(float %A) nounwind { +;CHECK-LABEL: fcvtmu_1x1s: +;CHECK: fcvtmu x0, s0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.aarch64.neon.fcvtmu.i64.f32(float %A) + ret i64 %tmp3 +} + +define i32 @fcvtmu_1w1d(double %A) nounwind { +;CHECK-LABEL: fcvtmu_1w1d: +;CHECK: fcvtmu w0, d0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.aarch64.neon.fcvtmu.i32.f64(double %A) + ret i32 %tmp3 +} + +define i64 @fcvtmu_1x1d(double %A) nounwind { +;CHECK-LABEL: fcvtmu_1x1d: +;CHECK: fcvtmu x0, d0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.aarch64.neon.fcvtmu.i64.f64(double %A) + ret i64 %tmp3 +} + +declare i32 @llvm.aarch64.neon.fcvtmu.i32.f32(float) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtmu.i64.f32(float) nounwind readnone +declare i32 @llvm.aarch64.neon.fcvtmu.i32.f64(double) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtmu.i64.f64(double) nounwind readnone + +; +; Floating-point scalar convert to signed integer (to nearest with ties to even) +; +define i32 @fcvtns_1w1s(float %A) nounwind { +;CHECK-LABEL: fcvtns_1w1s: +;CHECK: fcvtns w0, s0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.aarch64.neon.fcvtns.i32.f32(float %A) + ret i32 %tmp3 +} + +define i64 @fcvtns_1x1s(float %A) nounwind { +;CHECK-LABEL: fcvtns_1x1s: +;CHECK: fcvtns x0, s0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.aarch64.neon.fcvtns.i64.f32(float %A) + ret i64 %tmp3 +} + +define i32 @fcvtns_1w1d(double %A) nounwind { +;CHECK-LABEL: fcvtns_1w1d: +;CHECK: fcvtns w0, d0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.aarch64.neon.fcvtns.i32.f64(double %A) + ret i32 %tmp3 +} + +define i64 @fcvtns_1x1d(double %A) nounwind { +;CHECK-LABEL: fcvtns_1x1d: +;CHECK: fcvtns x0, d0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.aarch64.neon.fcvtns.i64.f64(double %A) + ret i64 %tmp3 +} + +declare i32 @llvm.aarch64.neon.fcvtns.i32.f32(float) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtns.i64.f32(float) nounwind readnone +declare i32 @llvm.aarch64.neon.fcvtns.i32.f64(double) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtns.i64.f64(double) nounwind readnone + +; +; Floating-point scalar convert to unsigned integer (to nearest with ties to even) +; +define i32 @fcvtnu_1w1s(float %A) nounwind { +;CHECK-LABEL: fcvtnu_1w1s: +;CHECK: fcvtnu w0, s0 +;CHECK-NEXT: ret + 
%tmp3 = call i32 @llvm.aarch64.neon.fcvtnu.i32.f32(float %A) + ret i32 %tmp3 +} + +define i64 @fcvtnu_1x1s(float %A) nounwind { +;CHECK-LABEL: fcvtnu_1x1s: +;CHECK: fcvtnu x0, s0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.aarch64.neon.fcvtnu.i64.f32(float %A) + ret i64 %tmp3 +} + +define i32 @fcvtnu_1w1d(double %A) nounwind { +;CHECK-LABEL: fcvtnu_1w1d: +;CHECK: fcvtnu w0, d0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.aarch64.neon.fcvtnu.i32.f64(double %A) + ret i32 %tmp3 +} + +define i64 @fcvtnu_1x1d(double %A) nounwind { +;CHECK-LABEL: fcvtnu_1x1d: +;CHECK: fcvtnu x0, d0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.aarch64.neon.fcvtnu.i64.f64(double %A) + ret i64 %tmp3 +} + +declare i32 @llvm.aarch64.neon.fcvtnu.i32.f32(float) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtnu.i64.f32(float) nounwind readnone +declare i32 @llvm.aarch64.neon.fcvtnu.i32.f64(double) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtnu.i64.f64(double) nounwind readnone + +; +; Floating-point scalar convert to signed integer (toward +Inf) +; +define i32 @fcvtps_1w1s(float %A) nounwind { +;CHECK-LABEL: fcvtps_1w1s: +;CHECK: fcvtps w0, s0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.aarch64.neon.fcvtps.i32.f32(float %A) + ret i32 %tmp3 +} + +define i64 @fcvtps_1x1s(float %A) nounwind { +;CHECK-LABEL: fcvtps_1x1s: +;CHECK: fcvtps x0, s0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.aarch64.neon.fcvtps.i64.f32(float %A) + ret i64 %tmp3 +} + +define i32 @fcvtps_1w1d(double %A) nounwind { +;CHECK-LABEL: fcvtps_1w1d: +;CHECK: fcvtps w0, d0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.aarch64.neon.fcvtps.i32.f64(double %A) + ret i32 %tmp3 +} + +define i64 @fcvtps_1x1d(double %A) nounwind { +;CHECK-LABEL: fcvtps_1x1d: +;CHECK: fcvtps x0, d0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.aarch64.neon.fcvtps.i64.f64(double %A) + ret i64 %tmp3 +} + +declare i32 @llvm.aarch64.neon.fcvtps.i32.f32(float) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtps.i64.f32(float) nounwind readnone +declare i32 @llvm.aarch64.neon.fcvtps.i32.f64(double) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtps.i64.f64(double) nounwind readnone + +; +; Floating-point scalar convert to unsigned integer (toward +Inf) +; +define i32 @fcvtpu_1w1s(float %A) nounwind { +;CHECK-LABEL: fcvtpu_1w1s: +;CHECK: fcvtpu w0, s0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.aarch64.neon.fcvtpu.i32.f32(float %A) + ret i32 %tmp3 +} + +define i64 @fcvtpu_1x1s(float %A) nounwind { +;CHECK-LABEL: fcvtpu_1x1s: +;CHECK: fcvtpu x0, s0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.aarch64.neon.fcvtpu.i64.f32(float %A) + ret i64 %tmp3 +} + +define i32 @fcvtpu_1w1d(double %A) nounwind { +;CHECK-LABEL: fcvtpu_1w1d: +;CHECK: fcvtpu w0, d0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.aarch64.neon.fcvtpu.i32.f64(double %A) + ret i32 %tmp3 +} + +define i64 @fcvtpu_1x1d(double %A) nounwind { +;CHECK-LABEL: fcvtpu_1x1d: +;CHECK: fcvtpu x0, d0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.aarch64.neon.fcvtpu.i64.f64(double %A) + ret i64 %tmp3 +} + +declare i32 @llvm.aarch64.neon.fcvtpu.i32.f32(float) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtpu.i64.f32(float) nounwind readnone +declare i32 @llvm.aarch64.neon.fcvtpu.i32.f64(double) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtpu.i64.f64(double) nounwind readnone + +; +; Floating-point scalar convert to signed integer (toward zero) +; +define i32 @fcvtzs_1w1s(float %A) nounwind { +;CHECK-LABEL: fcvtzs_1w1s: +;CHECK: fcvtzs w0, s0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.aarch64.neon.fcvtzs.i32.f32(float %A) + 
ret i32 %tmp3 +} + +define i64 @fcvtzs_1x1s(float %A) nounwind { +;CHECK-LABEL: fcvtzs_1x1s: +;CHECK: fcvtzs x0, s0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.aarch64.neon.fcvtzs.i64.f32(float %A) + ret i64 %tmp3 +} + +define i32 @fcvtzs_1w1d(double %A) nounwind { +;CHECK-LABEL: fcvtzs_1w1d: +;CHECK: fcvtzs w0, d0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.aarch64.neon.fcvtzs.i32.f64(double %A) + ret i32 %tmp3 +} + +define i64 @fcvtzs_1x1d(double %A) nounwind { +;CHECK-LABEL: fcvtzs_1x1d: +;CHECK: fcvtzs x0, d0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.aarch64.neon.fcvtzs.i64.f64(double %A) + ret i64 %tmp3 +} + +declare i32 @llvm.aarch64.neon.fcvtzs.i32.f32(float) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtzs.i64.f32(float) nounwind readnone +declare i32 @llvm.aarch64.neon.fcvtzs.i32.f64(double) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtzs.i64.f64(double) nounwind readnone + +; +; Floating-point scalar convert to unsigned integer (toward zero) +; +define i32 @fcvtzu_1w1s(float %A) nounwind { +;CHECK-LABEL: fcvtzu_1w1s: +;CHECK: fcvtzu w0, s0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.aarch64.neon.fcvtzu.i32.f32(float %A) + ret i32 %tmp3 +} + +define i64 @fcvtzu_1x1s(float %A) nounwind { +;CHECK-LABEL: fcvtzu_1x1s: +;CHECK: fcvtzu x0, s0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.aarch64.neon.fcvtzu.i64.f32(float %A) + ret i64 %tmp3 +} + +define i32 @fcvtzu_1w1d(double %A) nounwind { +;CHECK-LABEL: fcvtzu_1w1d: +;CHECK: fcvtzu w0, d0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.aarch64.neon.fcvtzu.i32.f64(double %A) + ret i32 %tmp3 +} + +define i64 @fcvtzu_1x1d(double %A) nounwind { +;CHECK-LABEL: fcvtzu_1x1d: +;CHECK: fcvtzu x0, d0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.aarch64.neon.fcvtzu.i64.f64(double %A) + ret i64 %tmp3 +} + +declare i32 @llvm.aarch64.neon.fcvtzu.i32.f32(float) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtzu.i64.f32(float) nounwind readnone +declare i32 @llvm.aarch64.neon.fcvtzu.i32.f64(double) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtzu.i64.f64(double) nounwind readnone diff --git a/test/CodeGen/AArch64/arm64-dagcombiner-convergence.ll b/test/CodeGen/AArch64/arm64-dagcombiner-convergence.ll new file mode 100644 index 00000000000..a45e31320de --- /dev/null +++ b/test/CodeGen/AArch64/arm64-dagcombiner-convergence.ll @@ -0,0 +1,19 @@ +; RUN: llc < %s -o /dev/null +; rdar://10795250 +; DAGCombiner should converge. 
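+; There is no FileCheck here: the RUN line sends the output to /dev/null, so
+; the test passes as long as llc terminates on the i128 shift/trunc/mask
+; arithmetic below rather than looping in the combiner.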
+ +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64" +target triple = "arm64-apple-macosx10.8.0" + +define i64 @foo(i128 %Params.coerce, i128 %SelLocs.coerce) { +entry: + %tmp = lshr i128 %Params.coerce, 61 + %.tr38.i = trunc i128 %tmp to i64 + %mul.i = and i64 %.tr38.i, 4294967288 + %tmp1 = lshr i128 %SelLocs.coerce, 62 + %.tr.i = trunc i128 %tmp1 to i64 + %mul7.i = and i64 %.tr.i, 4294967292 + %add.i = add i64 %mul7.i, %mul.i + %conv.i.i = and i64 %add.i, 4294967292 + ret i64 %conv.i.i +} diff --git a/test/CodeGen/AArch64/arm64-dagcombiner-dead-indexed-load.ll b/test/CodeGen/AArch64/arm64-dagcombiner-dead-indexed-load.ll new file mode 100644 index 00000000000..2cf01357324 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-dagcombiner-dead-indexed-load.ll @@ -0,0 +1,29 @@ +; RUN: llc -mcpu=cyclone < %s | FileCheck %s + +target datalayout = "e-i64:64-n32:64-S128" +target triple = "arm64-apple-ios" + +%"struct.SU" = type { i32, %"struct.SU"*, i32*, i32, i32, %"struct.BO", i32, [5 x i8] } +%"struct.BO" = type { %"struct.RE" } + +%"struct.RE" = type { i32, i32, i32, i32 } + +; This is a read-modify-write of some bifields combined into an i48. It gets +; legalized into i32 and i16 accesses. Only a single store of zero to the low +; i32 part should be live. + +; CHECK-LABEL: test: +; CHECK-NOT: ldr +; CHECK: str wzr +; CHECK-NOT: str +define void @test(%"struct.SU"* nocapture %su) { +entry: + %r1 = getelementptr inbounds %"struct.SU"* %su, i64 1, i32 5 + %r2 = bitcast %"struct.BO"* %r1 to i48* + %r3 = load i48* %r2, align 8 + %r4 = and i48 %r3, -4294967296 + %r5 = or i48 0, %r4 + store i48 %r5, i48* %r2, align 8 + + ret void +} diff --git a/test/CodeGen/AArch64/arm64-dagcombiner-indexed-load.ll b/test/CodeGen/AArch64/arm64-dagcombiner-indexed-load.ll new file mode 100644 index 00000000000..2e4b658f1c9 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-dagcombiner-indexed-load.ll @@ -0,0 +1,46 @@ +; RUN: llc -O3 < %s | FileCheck %s +; RUN: llc -O3 -addr-sink-using-gep=1 < %s | FileCheck %s +; Test case for a DAG combiner bug where we combined an indexed load +; with an extension (sext, zext, or any) into a regular extended load, +; i.e., dropping the indexed value. 
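+; (An indexed load produces two results, the loaded value and the updated
+; base pointer; folding the extension into the load has to preserve that
+; second result.)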
+; + +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "arm64-apple-ios" + +%class.A = type { i64, i64 } +%class.C = type { i64 } + +; CHECK-LABEL: XX: +; CHECK: ldr +define void @XX(%class.A* %K) { +entry: + br i1 undef, label %if.then, label %lor.rhs.i + +lor.rhs.i: ; preds = %entry + %tmp = load i32* undef, align 4 + %y.i.i.i = getelementptr inbounds %class.A* %K, i64 0, i32 1 + %tmp1 = load i64* %y.i.i.i, align 8 + %U.sroa.3.8.extract.trunc.i = trunc i64 %tmp1 to i32 + %div11.i = sdiv i32 %U.sroa.3.8.extract.trunc.i, 17 + %add12.i = add nsw i32 0, %div11.i + %U.sroa.3.12.extract.shift.i = lshr i64 %tmp1, 32 + %U.sroa.3.12.extract.trunc.i = trunc i64 %U.sroa.3.12.extract.shift.i to i32 + %div15.i = sdiv i32 %U.sroa.3.12.extract.trunc.i, 13 + %add16.i = add nsw i32 %add12.i, %div15.i + %rem.i.i = srem i32 %add16.i, %tmp + %idxprom = sext i32 %rem.i.i to i64 + %arrayidx = getelementptr inbounds %class.C** undef, i64 %idxprom + %tobool533 = icmp eq %class.C* undef, null + br i1 %tobool533, label %while.end, label %while.body + +if.then: ; preds = %entry + unreachable + +while.body: ; preds = %lor.rhs.i + unreachable + +while.end: ; preds = %lor.rhs.i + %tmp3 = load %class.C** %arrayidx, align 8 + unreachable +} diff --git a/test/CodeGen/AArch64/arm64-dagcombiner-load-slicing.ll b/test/CodeGen/AArch64/arm64-dagcombiner-load-slicing.ll new file mode 100644 index 00000000000..0679014e59a --- /dev/null +++ b/test/CodeGen/AArch64/arm64-dagcombiner-load-slicing.ll @@ -0,0 +1,102 @@ +; RUN: llc -mtriple arm64-apple-ios -O3 -o - < %s | FileCheck %s +; + +%class.Complex = type { float, float } +%class.Complex_int = type { i32, i32 } +%class.Complex_long = type { i64, i64 } + +; CHECK-LABEL: @test +; CHECK: add [[BASE:x[0-9]+]], x0, x1, lsl #3 +; CHECK: ldp [[CPLX1_I:s[0-9]+]], [[CPLX1_R:s[0-9]+]], {{\[}}[[BASE]]] +; CHECK: ldp [[CPLX2_I:s[0-9]+]], [[CPLX2_R:s[0-9]+]], {{\[}}[[BASE]], #64] +; CHECK: fadd {{s[0-9]+}}, [[CPLX2_I]], [[CPLX1_I]] +; CHECK: fadd {{s[0-9]+}}, [[CPLX2_R]], [[CPLX1_R]] +; CHECK: ret +define void @test(%class.Complex* nocapture %out, i64 %out_start) { +entry: + %arrayidx = getelementptr inbounds %class.Complex* %out, i64 %out_start + %0 = bitcast %class.Complex* %arrayidx to i64* + %1 = load i64* %0, align 4 + %t0.sroa.0.0.extract.trunc = trunc i64 %1 to i32 + %2 = bitcast i32 %t0.sroa.0.0.extract.trunc to float + %t0.sroa.2.0.extract.shift = lshr i64 %1, 32 + %t0.sroa.2.0.extract.trunc = trunc i64 %t0.sroa.2.0.extract.shift to i32 + %3 = bitcast i32 %t0.sroa.2.0.extract.trunc to float + %add = add i64 %out_start, 8 + %arrayidx2 = getelementptr inbounds %class.Complex* %out, i64 %add + %i.i = getelementptr inbounds %class.Complex* %arrayidx2, i64 0, i32 0 + %4 = load float* %i.i, align 4 + %add.i = fadd float %4, %2 + %retval.sroa.0.0.vec.insert.i = insertelement <2 x float> undef, float %add.i, i32 0 + %r.i = getelementptr inbounds %class.Complex* %arrayidx2, i64 0, i32 1 + %5 = load float* %r.i, align 4 + %add5.i = fadd float %5, %3 + %retval.sroa.0.4.vec.insert.i = insertelement <2 x float> %retval.sroa.0.0.vec.insert.i, float %add5.i, i32 1 + %ref.tmp.sroa.0.0.cast = bitcast %class.Complex* %arrayidx to <2 x float>* + store <2 x float> %retval.sroa.0.4.vec.insert.i, <2 x float>* %ref.tmp.sroa.0.0.cast, align 4 + ret void +} + +; CHECK-LABEL: @test_int +; CHECK: add [[BASE:x[0-9]+]], x0, x1, lsl #3 +; CHECK: ldp [[CPLX1_I:w[0-9]+]], [[CPLX1_R:w[0-9]+]], {{\[}}[[BASE]]] +; CHECK: ldp [[CPLX2_I:w[0-9]+]], [[CPLX2_R:w[0-9]+]], {{\[}}[[BASE]], #64] +; 
CHECK: add {{w[0-9]+}}, [[CPLX2_I]], [[CPLX1_I]] +; CHECK: add {{w[0-9]+}}, [[CPLX2_R]], [[CPLX1_R]] +; CHECK: ret +define void @test_int(%class.Complex_int* nocapture %out, i64 %out_start) { +entry: + %arrayidx = getelementptr inbounds %class.Complex_int* %out, i64 %out_start + %0 = bitcast %class.Complex_int* %arrayidx to i64* + %1 = load i64* %0, align 4 + %t0.sroa.0.0.extract.trunc = trunc i64 %1 to i32 + %2 = bitcast i32 %t0.sroa.0.0.extract.trunc to i32 + %t0.sroa.2.0.extract.shift = lshr i64 %1, 32 + %t0.sroa.2.0.extract.trunc = trunc i64 %t0.sroa.2.0.extract.shift to i32 + %3 = bitcast i32 %t0.sroa.2.0.extract.trunc to i32 + %add = add i64 %out_start, 8 + %arrayidx2 = getelementptr inbounds %class.Complex_int* %out, i64 %add + %i.i = getelementptr inbounds %class.Complex_int* %arrayidx2, i64 0, i32 0 + %4 = load i32* %i.i, align 4 + %add.i = add i32 %4, %2 + %retval.sroa.0.0.vec.insert.i = insertelement <2 x i32> undef, i32 %add.i, i32 0 + %r.i = getelementptr inbounds %class.Complex_int* %arrayidx2, i64 0, i32 1 + %5 = load i32* %r.i, align 4 + %add5.i = add i32 %5, %3 + %retval.sroa.0.4.vec.insert.i = insertelement <2 x i32> %retval.sroa.0.0.vec.insert.i, i32 %add5.i, i32 1 + %ref.tmp.sroa.0.0.cast = bitcast %class.Complex_int* %arrayidx to <2 x i32>* + store <2 x i32> %retval.sroa.0.4.vec.insert.i, <2 x i32>* %ref.tmp.sroa.0.0.cast, align 4 + ret void +} + +; CHECK-LABEL: @test_long +; CHECK: add [[BASE:x[0-9]+]], x0, x1, lsl #4 +; CHECK: ldp [[CPLX1_I:x[0-9]+]], [[CPLX1_R:x[0-9]+]], {{\[}}[[BASE]]] +; CHECK: ldp [[CPLX2_I:x[0-9]+]], [[CPLX2_R:x[0-9]+]], {{\[}}[[BASE]], #128] +; CHECK: add {{x[0-9]+}}, [[CPLX2_I]], [[CPLX1_I]] +; CHECK: add {{x[0-9]+}}, [[CPLX2_R]], [[CPLX1_R]] +; CHECK: ret +define void @test_long(%class.Complex_long* nocapture %out, i64 %out_start) { +entry: + %arrayidx = getelementptr inbounds %class.Complex_long* %out, i64 %out_start + %0 = bitcast %class.Complex_long* %arrayidx to i128* + %1 = load i128* %0, align 4 + %t0.sroa.0.0.extract.trunc = trunc i128 %1 to i64 + %2 = bitcast i64 %t0.sroa.0.0.extract.trunc to i64 + %t0.sroa.2.0.extract.shift = lshr i128 %1, 64 + %t0.sroa.2.0.extract.trunc = trunc i128 %t0.sroa.2.0.extract.shift to i64 + %3 = bitcast i64 %t0.sroa.2.0.extract.trunc to i64 + %add = add i64 %out_start, 8 + %arrayidx2 = getelementptr inbounds %class.Complex_long* %out, i64 %add + %i.i = getelementptr inbounds %class.Complex_long* %arrayidx2, i32 0, i32 0 + %4 = load i64* %i.i, align 4 + %add.i = add i64 %4, %2 + %retval.sroa.0.0.vec.insert.i = insertelement <2 x i64> undef, i64 %add.i, i32 0 + %r.i = getelementptr inbounds %class.Complex_long* %arrayidx2, i32 0, i32 1 + %5 = load i64* %r.i, align 4 + %add5.i = add i64 %5, %3 + %retval.sroa.0.4.vec.insert.i = insertelement <2 x i64> %retval.sroa.0.0.vec.insert.i, i64 %add5.i, i32 1 + %ref.tmp.sroa.0.0.cast = bitcast %class.Complex_long* %arrayidx to <2 x i64>* + store <2 x i64> %retval.sroa.0.4.vec.insert.i, <2 x i64>* %ref.tmp.sroa.0.0.cast, align 4 + ret void +} diff --git a/test/CodeGen/AArch64/arm64-dead-def-frame-index.ll b/test/CodeGen/AArch64/arm64-dead-def-frame-index.ll new file mode 100644 index 00000000000..9bb4b712076 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-dead-def-frame-index.ll @@ -0,0 +1,18 @@ +; RUN: llc -march=arm64 < %s | FileCheck %s + +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "arm64-apple-ios7.0.0" + +; Function Attrs: nounwind ssp uwtable +define i32 @test1() #0 { + %tmp1 = alloca i8 + %tmp2 = alloca i32, i32 4096 + %tmp3 = icmp 
eq i8* %tmp1, null + %tmp4 = zext i1 %tmp3 to i32 + + ret i32 %tmp4 + + ; CHECK-LABEL: test1 + ; CHECK: adds [[TEMP:[a-z0-9]+]], sp, #4, lsl #12 + ; CHECK: adds [[TEMP]], [[TEMP]], #15 +} diff --git a/test/CodeGen/AArch64/arm64-dead-register-def-bug.ll b/test/CodeGen/AArch64/arm64-dead-register-def-bug.ll new file mode 100644 index 00000000000..1bbcf50ba73 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-dead-register-def-bug.ll @@ -0,0 +1,32 @@ +; RUN: llc -mtriple="arm64-apple-ios" < %s | FileCheck %s +; +; Check that the dead register definition pass is considering implicit defs. +; When rematerializing through truncates, the coalescer may produce instructions +; with dead defs, but live implicit-defs of subregs: +; E.g. %X1 = MOVi64imm 2, %W1; %X1:GPR64, %W1:GPR32 +; These instructions are live, and their definitions should not be rewritten. +; +; + +define void @testcase() { +; CHECK: testcase: +; CHECK-NOT: orr xzr, xzr, #0x2 + +bb1: + %tmp1 = tail call float @ceilf(float 2.000000e+00) + %tmp2 = fptoui float %tmp1 to i64 + br i1 undef, label %bb2, label %bb3 + +bb2: + tail call void @foo() + br label %bb3 + +bb3: + %tmp3 = trunc i64 %tmp2 to i32 + tail call void @bar(i32 %tmp3) + ret void +} + +declare void @foo() +declare void @bar(i32) +declare float @ceilf(float) nounwind readnone diff --git a/test/CodeGen/AArch64/arm64-dup.ll b/test/CodeGen/AArch64/arm64-dup.ll new file mode 100644 index 00000000000..0c56b46c417 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-dup.ll @@ -0,0 +1,323 @@ +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s + +define <8 x i8> @v_dup8(i8 %A) nounwind { +;CHECK-LABEL: v_dup8: +;CHECK: dup.8b + %tmp1 = insertelement <8 x i8> zeroinitializer, i8 %A, i32 0 + %tmp2 = insertelement <8 x i8> %tmp1, i8 %A, i32 1 + %tmp3 = insertelement <8 x i8> %tmp2, i8 %A, i32 2 + %tmp4 = insertelement <8 x i8> %tmp3, i8 %A, i32 3 + %tmp5 = insertelement <8 x i8> %tmp4, i8 %A, i32 4 + %tmp6 = insertelement <8 x i8> %tmp5, i8 %A, i32 5 + %tmp7 = insertelement <8 x i8> %tmp6, i8 %A, i32 6 + %tmp8 = insertelement <8 x i8> %tmp7, i8 %A, i32 7 + ret <8 x i8> %tmp8 +} + +define <4 x i16> @v_dup16(i16 %A) nounwind { +;CHECK-LABEL: v_dup16: +;CHECK: dup.4h + %tmp1 = insertelement <4 x i16> zeroinitializer, i16 %A, i32 0 + %tmp2 = insertelement <4 x i16> %tmp1, i16 %A, i32 1 + %tmp3 = insertelement <4 x i16> %tmp2, i16 %A, i32 2 + %tmp4 = insertelement <4 x i16> %tmp3, i16 %A, i32 3 + ret <4 x i16> %tmp4 +} + +define <2 x i32> @v_dup32(i32 %A) nounwind { +;CHECK-LABEL: v_dup32: +;CHECK: dup.2s + %tmp1 = insertelement <2 x i32> zeroinitializer, i32 %A, i32 0 + %tmp2 = insertelement <2 x i32> %tmp1, i32 %A, i32 1 + ret <2 x i32> %tmp2 +} + +define <2 x float> @v_dupfloat(float %A) nounwind { +;CHECK-LABEL: v_dupfloat: +;CHECK: dup.2s + %tmp1 = insertelement <2 x float> zeroinitializer, float %A, i32 0 + %tmp2 = insertelement <2 x float> %tmp1, float %A, i32 1 + ret <2 x float> %tmp2 +} + +define <16 x i8> @v_dupQ8(i8 %A) nounwind { +;CHECK-LABEL: v_dupQ8: +;CHECK: dup.16b + %tmp1 = insertelement <16 x i8> zeroinitializer, i8 %A, i32 0 + %tmp2 = insertelement <16 x i8> %tmp1, i8 %A, i32 1 + %tmp3 = insertelement <16 x i8> %tmp2, i8 %A, i32 2 + %tmp4 = insertelement <16 x i8> %tmp3, i8 %A, i32 3 + %tmp5 = insertelement <16 x i8> %tmp4, i8 %A, i32 4 + %tmp6 = insertelement <16 x i8> %tmp5, i8 %A, i32 5 + %tmp7 = insertelement <16 x i8> %tmp6, i8 %A, i32 6 + %tmp8 = insertelement <16 x i8> %tmp7, i8 %A, i32 7 + %tmp9 = insertelement <16 x i8> %tmp8, i8 
%A, i32 8 + %tmp10 = insertelement <16 x i8> %tmp9, i8 %A, i32 9 + %tmp11 = insertelement <16 x i8> %tmp10, i8 %A, i32 10 + %tmp12 = insertelement <16 x i8> %tmp11, i8 %A, i32 11 + %tmp13 = insertelement <16 x i8> %tmp12, i8 %A, i32 12 + %tmp14 = insertelement <16 x i8> %tmp13, i8 %A, i32 13 + %tmp15 = insertelement <16 x i8> %tmp14, i8 %A, i32 14 + %tmp16 = insertelement <16 x i8> %tmp15, i8 %A, i32 15 + ret <16 x i8> %tmp16 +} + +define <8 x i16> @v_dupQ16(i16 %A) nounwind { +;CHECK-LABEL: v_dupQ16: +;CHECK: dup.8h + %tmp1 = insertelement <8 x i16> zeroinitializer, i16 %A, i32 0 + %tmp2 = insertelement <8 x i16> %tmp1, i16 %A, i32 1 + %tmp3 = insertelement <8 x i16> %tmp2, i16 %A, i32 2 + %tmp4 = insertelement <8 x i16> %tmp3, i16 %A, i32 3 + %tmp5 = insertelement <8 x i16> %tmp4, i16 %A, i32 4 + %tmp6 = insertelement <8 x i16> %tmp5, i16 %A, i32 5 + %tmp7 = insertelement <8 x i16> %tmp6, i16 %A, i32 6 + %tmp8 = insertelement <8 x i16> %tmp7, i16 %A, i32 7 + ret <8 x i16> %tmp8 +} + +define <4 x i32> @v_dupQ32(i32 %A) nounwind { +;CHECK-LABEL: v_dupQ32: +;CHECK: dup.4s + %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %A, i32 0 + %tmp2 = insertelement <4 x i32> %tmp1, i32 %A, i32 1 + %tmp3 = insertelement <4 x i32> %tmp2, i32 %A, i32 2 + %tmp4 = insertelement <4 x i32> %tmp3, i32 %A, i32 3 + ret <4 x i32> %tmp4 +} + +define <4 x float> @v_dupQfloat(float %A) nounwind { +;CHECK-LABEL: v_dupQfloat: +;CHECK: dup.4s + %tmp1 = insertelement <4 x float> zeroinitializer, float %A, i32 0 + %tmp2 = insertelement <4 x float> %tmp1, float %A, i32 1 + %tmp3 = insertelement <4 x float> %tmp2, float %A, i32 2 + %tmp4 = insertelement <4 x float> %tmp3, float %A, i32 3 + ret <4 x float> %tmp4 +} + +; Check to make sure it works with shuffles, too. + +define <8 x i8> @v_shuffledup8(i8 %A) nounwind { +;CHECK-LABEL: v_shuffledup8: +;CHECK: dup.8b + %tmp1 = insertelement <8 x i8> undef, i8 %A, i32 0 + %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> zeroinitializer + ret <8 x i8> %tmp2 +} + +define <4 x i16> @v_shuffledup16(i16 %A) nounwind { +;CHECK-LABEL: v_shuffledup16: +;CHECK: dup.4h + %tmp1 = insertelement <4 x i16> undef, i16 %A, i32 0 + %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer + ret <4 x i16> %tmp2 +} + +define <2 x i32> @v_shuffledup32(i32 %A) nounwind { +;CHECK-LABEL: v_shuffledup32: +;CHECK: dup.2s + %tmp1 = insertelement <2 x i32> undef, i32 %A, i32 0 + %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer + ret <2 x i32> %tmp2 +} + +define <2 x float> @v_shuffledupfloat(float %A) nounwind { +;CHECK-LABEL: v_shuffledupfloat: +;CHECK: dup.2s + %tmp1 = insertelement <2 x float> undef, float %A, i32 0 + %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> zeroinitializer + ret <2 x float> %tmp2 +} + +define <16 x i8> @v_shuffledupQ8(i8 %A) nounwind { +;CHECK-LABEL: v_shuffledupQ8: +;CHECK: dup.16b + %tmp1 = insertelement <16 x i8> undef, i8 %A, i32 0 + %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> zeroinitializer + ret <16 x i8> %tmp2 +} + +define <8 x i16> @v_shuffledupQ16(i16 %A) nounwind { +;CHECK-LABEL: v_shuffledupQ16: +;CHECK: dup.8h + %tmp1 = insertelement <8 x i16> undef, i16 %A, i32 0 + %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> zeroinitializer + ret <8 x i16> %tmp2 +} + +define <4 x i32> @v_shuffledupQ32(i32 %A) nounwind { +;CHECK-LABEL: v_shuffledupQ32: +;CHECK: dup.4s + %tmp1 = insertelement <4 x i32> undef, i32 %A, i32 0 + %tmp2 = 
shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> zeroinitializer + ret <4 x i32> %tmp2 +} + +define <4 x float> @v_shuffledupQfloat(float %A) nounwind { +;CHECK-LABEL: v_shuffledupQfloat: +;CHECK: dup.4s + %tmp1 = insertelement <4 x float> undef, float %A, i32 0 + %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> zeroinitializer + ret <4 x float> %tmp2 +} + +define <8 x i8> @vduplane8(<8 x i8>* %A) nounwind { +;CHECK-LABEL: vduplane8: +;CHECK: dup.8b + %tmp1 = load <8 x i8>* %A + %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 > + ret <8 x i8> %tmp2 +} + +define <4 x i16> @vduplane16(<4 x i16>* %A) nounwind { +;CHECK-LABEL: vduplane16: +;CHECK: dup.4h + %tmp1 = load <4 x i16>* %A + %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 > + ret <4 x i16> %tmp2 +} + +define <2 x i32> @vduplane32(<2 x i32>* %A) nounwind { +;CHECK-LABEL: vduplane32: +;CHECK: dup.2s + %tmp1 = load <2 x i32>* %A + %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> < i32 1, i32 1 > + ret <2 x i32> %tmp2 +} + +define <2 x float> @vduplanefloat(<2 x float>* %A) nounwind { +;CHECK-LABEL: vduplanefloat: +;CHECK: dup.2s + %tmp1 = load <2 x float>* %A + %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> < i32 1, i32 1 > + ret <2 x float> %tmp2 +} + +define <16 x i8> @vduplaneQ8(<8 x i8>* %A) nounwind { +;CHECK-LABEL: vduplaneQ8: +;CHECK: dup.16b + %tmp1 = load <8 x i8>* %A + %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <16 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 > + ret <16 x i8> %tmp2 +} + +define <8 x i16> @vduplaneQ16(<4 x i16>* %A) nounwind { +;CHECK-LABEL: vduplaneQ16: +;CHECK: dup.8h + %tmp1 = load <4 x i16>* %A + %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 > + ret <8 x i16> %tmp2 +} + +define <4 x i32> @vduplaneQ32(<2 x i32>* %A) nounwind { +;CHECK-LABEL: vduplaneQ32: +;CHECK: dup.4s + %tmp1 = load <2 x i32>* %A + %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 > + ret <4 x i32> %tmp2 +} + +define <4 x float> @vduplaneQfloat(<2 x float>* %A) nounwind { +;CHECK-LABEL: vduplaneQfloat: +;CHECK: dup.4s + %tmp1 = load <2 x float>* %A + %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 > + ret <4 x float> %tmp2 +} + +define <2 x i64> @foo(<2 x i64> %arg0_int64x1_t) nounwind readnone { +;CHECK-LABEL: foo: +;CHECK: dup.2d +entry: + %0 = shufflevector <2 x i64> %arg0_int64x1_t, <2 x i64> undef, <2 x i32> + ret <2 x i64> %0 +} + +define <2 x i64> @bar(<2 x i64> %arg0_int64x1_t) nounwind readnone { +;CHECK-LABEL: bar: +;CHECK: dup.2d +entry: + %0 = shufflevector <2 x i64> %arg0_int64x1_t, <2 x i64> undef, <2 x i32> + ret <2 x i64> %0 +} + +define <2 x double> @baz(<2 x double> %arg0_int64x1_t) nounwind readnone { +;CHECK-LABEL: baz: +;CHECK: dup.2d +entry: + %0 = shufflevector <2 x double> %arg0_int64x1_t, <2 x double> undef, <2 x i32> + ret <2 x double> %0 +} + +define <2 x double> @qux(<2 x double> %arg0_int64x1_t) nounwind readnone { +;CHECK-LABEL: qux: +;CHECK: dup.2d +entry: + %0 = shufflevector <2 x double> %arg0_int64x1_t, <2 x double> undef, <2 x i32> + ret <2 x double> %0 +} + +define <2 x i32> @f(i32 %a, i32 %b) nounwind readnone { +; CHECK-LABEL: f: +; CHECK-NEXT: fmov s0, w0 +; 
CHECK-NEXT: ins.s v0[1], w1 +; CHECK-NEXT: ret + %vecinit = insertelement <2 x i32> undef, i32 %a, i32 0 + %vecinit1 = insertelement <2 x i32> %vecinit, i32 %b, i32 1 + ret <2 x i32> %vecinit1 +} + +define <4 x i32> @g(i32 %a, i32 %b) nounwind readnone { +; CHECK-LABEL: g: +; CHECK-NEXT: fmov s0, w0 +; CHECK-NEXT: ins.s v0[1], w1 +; CHECK-NEXT: ins.s v0[2], w1 +; CHECK-NEXT: ins.s v0[3], w0 +; CHECK-NEXT: ret + %vecinit = insertelement <4 x i32> undef, i32 %a, i32 0 + %vecinit1 = insertelement <4 x i32> %vecinit, i32 %b, i32 1 + %vecinit2 = insertelement <4 x i32> %vecinit1, i32 %b, i32 2 + %vecinit3 = insertelement <4 x i32> %vecinit2, i32 %a, i32 3 + ret <4 x i32> %vecinit3 +} + +define <2 x i64> @h(i64 %a, i64 %b) nounwind readnone { +; CHECK-LABEL: h: +; CHECK-NEXT: fmov d0, x0 +; CHECK-NEXT: ins.d v0[1], x1 +; CHECK-NEXT: ret + %vecinit = insertelement <2 x i64> undef, i64 %a, i32 0 + %vecinit1 = insertelement <2 x i64> %vecinit, i64 %b, i32 1 + ret <2 x i64> %vecinit1 +} + +; We used to spot this as a BUILD_VECTOR implementable by dup, but assume that +; the single value needed was of the same type as the vector. This is false if +; the scalar corresponding to the vector type is illegal (e.g. a <4 x i16> +; BUILD_VECTOR will have an i32 as its source). In that case, the operation is +; not a simple "dup vD.4h, vN.h[idx]" after all, and we crashed. +; +; *However*, it is a dup vD.4h, vN.h[2*idx]. +define <4 x i16> @test_build_illegal(<4 x i32> %in) { +; CHECK-LABEL: test_build_illegal: +; CHECK: dup.4h v0, v0[6] + %val = extractelement <4 x i32> %in, i32 3 + %smallval = trunc i32 %val to i16 + %vec = insertelement <4x i16> undef, i16 %smallval, i32 3 + + ret <4 x i16> %vec +} + +; We used to inherit an already extract_subvectored v4i16 from +; SelectionDAGBuilder here. We then added a DUPLANE on top of that, preventing +; the formation of an indexed-by-7 MLS. +define <4 x i16> @test_high_splat(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 { +; CHECK-LABEL: test_high_splat: +; CHECK: mls.4h v0, v1, v2[7] +entry: + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> + %mul = mul <4 x i16> %shuffle, %b + %sub = sub <4 x i16> %a, %mul + ret <4 x i16> %sub +} diff --git a/test/CodeGen/AArch64/arm64-early-ifcvt.ll b/test/CodeGen/AArch64/arm64-early-ifcvt.ll new file mode 100644 index 00000000000..17d783a488f --- /dev/null +++ b/test/CodeGen/AArch64/arm64-early-ifcvt.ll @@ -0,0 +1,423 @@ +; RUN: llc < %s -stress-early-ifcvt | FileCheck %s +target triple = "arm64-apple-macosx" + +; CHECK: mm2 +define i32 @mm2(i32* nocapture %p, i32 %n) nounwind uwtable readonly ssp { +entry: + br label %do.body + +; CHECK: do.body +; Loop body has no branches before the backedge. 
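+; Machine basic-block labels are printed with an LBB prefix on this target,
+; so the CHECK-NOT below verifies that no extra block labels (and hence no
+; branches) appear between here and the backedge.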
+; CHECK-NOT: LBB +do.body: + %max.0 = phi i32 [ 0, %entry ], [ %max.1, %do.cond ] + %min.0 = phi i32 [ 0, %entry ], [ %min.1, %do.cond ] + %n.addr.0 = phi i32 [ %n, %entry ], [ %dec, %do.cond ] + %p.addr.0 = phi i32* [ %p, %entry ], [ %incdec.ptr, %do.cond ] + %incdec.ptr = getelementptr inbounds i32* %p.addr.0, i64 1 + %0 = load i32* %p.addr.0, align 4 + %cmp = icmp sgt i32 %0, %max.0 + br i1 %cmp, label %do.cond, label %if.else + +if.else: + %cmp1 = icmp slt i32 %0, %min.0 + %.min.0 = select i1 %cmp1, i32 %0, i32 %min.0 + br label %do.cond + +do.cond: + %max.1 = phi i32 [ %0, %do.body ], [ %max.0, %if.else ] + %min.1 = phi i32 [ %min.0, %do.body ], [ %.min.0, %if.else ] +; CHECK: cbnz + %dec = add i32 %n.addr.0, -1 + %tobool = icmp eq i32 %dec, 0 + br i1 %tobool, label %do.end, label %do.body + +do.end: + %sub = sub nsw i32 %max.1, %min.1 + ret i32 %sub +} + +; CHECK-LABEL: fold_inc_true_32: +; CHECK: {{subs.*wzr,|cmp}} w2, #1 +; CHECK-NEXT: csinc w0, w1, w0, eq +; CHECK-NEXT: ret +define i32 @fold_inc_true_32(i32 %x, i32 %y, i32 %c) nounwind ssp { +entry: + %tobool = icmp eq i32 %c, 1 + %inc = add nsw i32 %x, 1 + br i1 %tobool, label %eq_bb, label %done + +eq_bb: + br label %done + +done: + %cond = phi i32 [ %y, %eq_bb ], [ %inc, %entry ] + ret i32 %cond +} + +; CHECK-LABEL: fold_inc_true_64: +; CHECK: {{subs.*xzr,|cmp}} x2, #1 +; CHECK-NEXT: csinc x0, x1, x0, eq +; CHECK-NEXT: ret +define i64 @fold_inc_true_64(i64 %x, i64 %y, i64 %c) nounwind ssp { +entry: + %tobool = icmp eq i64 %c, 1 + %inc = add nsw i64 %x, 1 + br i1 %tobool, label %eq_bb, label %done + +eq_bb: + br label %done + +done: + %cond = phi i64 [ %y, %eq_bb ], [ %inc, %entry ] + ret i64 %cond +} + +; CHECK-LABEL: fold_inc_false_32: +; CHECK: {{subs.*wzr,|cmp}} w2, #1 +; CHECK-NEXT: csinc w0, w1, w0, ne +; CHECK-NEXT: ret +define i32 @fold_inc_false_32(i32 %x, i32 %y, i32 %c) nounwind ssp { +entry: + %tobool = icmp eq i32 %c, 1 + %inc = add nsw i32 %x, 1 + br i1 %tobool, label %eq_bb, label %done + +eq_bb: + br label %done + +done: + %cond = phi i32 [ %inc, %eq_bb ], [ %y, %entry ] + ret i32 %cond +} + +; CHECK-LABEL: fold_inc_false_64: +; CHECK: {{subs.*xzr,|cmp}} x2, #1 +; CHECK-NEXT: csinc x0, x1, x0, ne +; CHECK-NEXT: ret +define i64 @fold_inc_false_64(i64 %x, i64 %y, i64 %c) nounwind ssp { +entry: + %tobool = icmp eq i64 %c, 1 + %inc = add nsw i64 %x, 1 + br i1 %tobool, label %eq_bb, label %done + +eq_bb: + br label %done + +done: + %cond = phi i64 [ %inc, %eq_bb ], [ %y, %entry ] + ret i64 %cond +} + +; CHECK-LABEL: fold_inv_true_32: +; CHECK: {{subs.*wzr,|cmp}} w2, #1 +; CHECK-NEXT: csinv w0, w1, w0, eq +; CHECK-NEXT: ret +define i32 @fold_inv_true_32(i32 %x, i32 %y, i32 %c) nounwind ssp { +entry: + %tobool = icmp eq i32 %c, 1 + %inv = xor i32 %x, -1 + br i1 %tobool, label %eq_bb, label %done + +eq_bb: + br label %done + +done: + %cond = phi i32 [ %y, %eq_bb ], [ %inv, %entry ] + ret i32 %cond +} + +; CHECK-LABEL: fold_inv_true_64: +; CHECK: {{subs.*xzr,|cmp}} x2, #1 +; CHECK-NEXT: csinv x0, x1, x0, eq +; CHECK-NEXT: ret +define i64 @fold_inv_true_64(i64 %x, i64 %y, i64 %c) nounwind ssp { +entry: + %tobool = icmp eq i64 %c, 1 + %inv = xor i64 %x, -1 + br i1 %tobool, label %eq_bb, label %done + +eq_bb: + br label %done + +done: + %cond = phi i64 [ %y, %eq_bb ], [ %inv, %entry ] + ret i64 %cond +} + +; CHECK-LABEL: fold_inv_false_32: +; CHECK: {{subs.*wzr,|cmp}} w2, #1 +; CHECK-NEXT: csinv w0, w1, w0, ne +; CHECK-NEXT: ret +define i32 @fold_inv_false_32(i32 %x, i32 %y, i32 %c) nounwind ssp { +entry: + %tobool = icmp 
eq i32 %c, 1 + %inv = xor i32 %x, -1 + br i1 %tobool, label %eq_bb, label %done + +eq_bb: + br label %done + +done: + %cond = phi i32 [ %inv, %eq_bb ], [ %y, %entry ] + ret i32 %cond +} + +; CHECK-LABEL: fold_inv_false_64: +; CHECK: {{subs.*xzr,|cmp}} x2, #1 +; CHECK-NEXT: csinv x0, x1, x0, ne +; CHECK-NEXT: ret +define i64 @fold_inv_false_64(i64 %x, i64 %y, i64 %c) nounwind ssp { +entry: + %tobool = icmp eq i64 %c, 1 + %inv = xor i64 %x, -1 + br i1 %tobool, label %eq_bb, label %done + +eq_bb: + br label %done + +done: + %cond = phi i64 [ %inv, %eq_bb ], [ %y, %entry ] + ret i64 %cond +} + +; CHECK-LABEL: fold_neg_true_32: +; CHECK: {{subs.*wzr,|cmp}} w2, #1 +; CHECK-NEXT: csneg w0, w1, w0, eq +; CHECK-NEXT: ret +define i32 @fold_neg_true_32(i32 %x, i32 %y, i32 %c) nounwind ssp { +entry: + %tobool = icmp eq i32 %c, 1 + %neg = sub nsw i32 0, %x + br i1 %tobool, label %eq_bb, label %done + +eq_bb: + br label %done + +done: + %cond = phi i32 [ %y, %eq_bb ], [ %neg, %entry ] + ret i32 %cond +} + +; CHECK-LABEL: fold_neg_true_64: +; CHECK: {{subs.*xzr,|cmp}} x2, #1 +; CHECK-NEXT: csneg x0, x1, x0, eq +; CHECK-NEXT: ret +define i64 @fold_neg_true_64(i64 %x, i64 %y, i64 %c) nounwind ssp { +entry: + %tobool = icmp eq i64 %c, 1 + %neg = sub nsw i64 0, %x + br i1 %tobool, label %eq_bb, label %done + +eq_bb: + br label %done + +done: + %cond = phi i64 [ %y, %eq_bb ], [ %neg, %entry ] + ret i64 %cond +} + +; CHECK-LABEL: fold_neg_false_32: +; CHECK: {{subs.*wzr,|cmp}} w2, #1 +; CHECK-NEXT: csneg w0, w1, w0, ne +; CHECK-NEXT: ret +define i32 @fold_neg_false_32(i32 %x, i32 %y, i32 %c) nounwind ssp { +entry: + %tobool = icmp eq i32 %c, 1 + %neg = sub nsw i32 0, %x + br i1 %tobool, label %eq_bb, label %done + +eq_bb: + br label %done + +done: + %cond = phi i32 [ %neg, %eq_bb ], [ %y, %entry ] + ret i32 %cond +} + +; CHECK-LABEL: fold_neg_false_64: +; CHECK: {{subs.*xzr,|cmp}} x2, #1 +; CHECK-NEXT: csneg x0, x1, x0, ne +; CHECK-NEXT: ret +define i64 @fold_neg_false_64(i64 %x, i64 %y, i64 %c) nounwind ssp { +entry: + %tobool = icmp eq i64 %c, 1 + %neg = sub nsw i64 0, %x + br i1 %tobool, label %eq_bb, label %done + +eq_bb: + br label %done + +done: + %cond = phi i64 [ %neg, %eq_bb ], [ %y, %entry ] + ret i64 %cond +} + +; CHECK: cbnz_32 +; CHECK: {{subs.*wzr,|cmp}} w2, #0 +; CHECK-NEXT: csel w0, w1, w0, ne +; CHECK-NEXT: ret +define i32 @cbnz_32(i32 %x, i32 %y, i32 %c) nounwind ssp { +entry: + %tobool = icmp eq i32 %c, 0 + br i1 %tobool, label %eq_bb, label %done + +eq_bb: + br label %done + +done: + %cond = phi i32 [ %x, %eq_bb ], [ %y, %entry ] + ret i32 %cond +} + +; CHECK: cbnz_64 +; CHECK: {{subs.*xzr,|cmp}} x2, #0 +; CHECK-NEXT: csel x0, x1, x0, ne +; CHECK-NEXT: ret +define i64 @cbnz_64(i64 %x, i64 %y, i64 %c) nounwind ssp { +entry: + %tobool = icmp eq i64 %c, 0 + br i1 %tobool, label %eq_bb, label %done + +eq_bb: + br label %done + +done: + %cond = phi i64 [ %x, %eq_bb ], [ %y, %entry ] + ret i64 %cond +} + +; CHECK: cbz_32 +; CHECK: {{subs.*wzr,|cmp}} w2, #0 +; CHECK-NEXT: csel w0, w1, w0, eq +; CHECK-NEXT: ret +define i32 @cbz_32(i32 %x, i32 %y, i32 %c) nounwind ssp { +entry: + %tobool = icmp ne i32 %c, 0 + br i1 %tobool, label %ne_bb, label %done + +ne_bb: + br label %done + +done: + %cond = phi i32 [ %x, %ne_bb ], [ %y, %entry ] + ret i32 %cond +} + +; CHECK: cbz_64 +; CHECK: {{subs.*xzr,|cmp}} x2, #0 +; CHECK-NEXT: csel x0, x1, x0, eq +; CHECK-NEXT: ret +define i64 @cbz_64(i64 %x, i64 %y, i64 %c) nounwind ssp { +entry: + %tobool = icmp ne i64 %c, 0 + br i1 %tobool, label %ne_bb, label %done + 
+ne_bb: + br label %done + +done: + %cond = phi i64 [ %x, %ne_bb ], [ %y, %entry ] + ret i64 %cond +} + +; CHECK: tbnz_32 +; CHECK: {{ands.*xzr,|tst}} w2, #0x80 +; CHECK-NEXT: csel w0, w1, w0, ne +; CHECK-NEXT: ret +define i32 @tbnz_32(i32 %x, i32 %y, i32 %c) nounwind ssp { +entry: + %mask = and i32 %c, 128 + %tobool = icmp eq i32 %mask, 0 + br i1 %tobool, label %eq_bb, label %done + +eq_bb: + br label %done + +done: + %cond = phi i32 [ %x, %eq_bb ], [ %y, %entry ] + ret i32 %cond +} + +; CHECK: tbnz_64 +; CHECK: {{ands.*xzr,|tst}} x2, #0x8000000000000000 +; CHECK-NEXT: csel x0, x1, x0, ne +; CHECK-NEXT: ret +define i64 @tbnz_64(i64 %x, i64 %y, i64 %c) nounwind ssp { +entry: + %mask = and i64 %c, 9223372036854775808 + %tobool = icmp eq i64 %mask, 0 + br i1 %tobool, label %eq_bb, label %done + +eq_bb: + br label %done + +done: + %cond = phi i64 [ %x, %eq_bb ], [ %y, %entry ] + ret i64 %cond +} + +; CHECK: tbz_32 +; CHECK: {{ands.*xzr,|tst}} w2, #0x80 +; CHECK-NEXT: csel w0, w1, w0, eq +; CHECK-NEXT: ret +define i32 @tbz_32(i32 %x, i32 %y, i32 %c) nounwind ssp { +entry: + %mask = and i32 %c, 128 + %tobool = icmp ne i32 %mask, 0 + br i1 %tobool, label %ne_bb, label %done + +ne_bb: + br label %done + +done: + %cond = phi i32 [ %x, %ne_bb ], [ %y, %entry ] + ret i32 %cond +} + +; CHECK: tbz_64 +; CHECK: {{ands.*xzr,|tst}} x2, #0x8000000000000000 +; CHECK-NEXT: csel x0, x1, x0, eq +; CHECK-NEXT: ret +define i64 @tbz_64(i64 %x, i64 %y, i64 %c) nounwind ssp { +entry: + %mask = and i64 %c, 9223372036854775808 + %tobool = icmp ne i64 %mask, 0 + br i1 %tobool, label %ne_bb, label %done + +ne_bb: + br label %done + +done: + %cond = phi i64 [ %x, %ne_bb ], [ %y, %entry ] + ret i64 %cond +} + +; This function from 175.vpr folds an ADDWri into a CSINC. +; Remember to clear the kill flag on the ADDWri. 
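+; (After the fold the add's source register is still read by the csinc, so a
+; stale kill flag on that operand would mislead liveness tracking in later
+; passes.)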
+define i32 @get_ytrack_to_xtracks() nounwind ssp { +entry: + br label %for.body + +for.body: + %x0 = load i32* undef, align 4 + br i1 undef, label %if.then.i146, label %is_sbox.exit155 + +if.then.i146: + %add8.i143 = add nsw i32 0, %x0 + %rem.i144 = srem i32 %add8.i143, %x0 + %add9.i145 = add i32 %rem.i144, 1 + br label %is_sbox.exit155 + +is_sbox.exit155: ; preds = %if.then.i146, %for.body + %seg_offset.0.i151 = phi i32 [ %add9.i145, %if.then.i146 ], [ undef, %for.body ] + %idxprom15.i152 = sext i32 %seg_offset.0.i151 to i64 + %arrayidx18.i154 = getelementptr inbounds i32* null, i64 %idxprom15.i152 + %x1 = load i32* %arrayidx18.i154, align 4 + br i1 undef, label %for.body51, label %for.body + +for.body51: ; preds = %is_sbox.exit155 + call fastcc void @get_switch_type(i32 %x1, i32 undef, i16 signext undef, i16 signext undef, i16* undef) + unreachable +} +declare fastcc void @get_switch_type(i32, i32, i16 signext, i16 signext, i16* nocapture) nounwind ssp diff --git a/test/CodeGen/AArch64/arm64-elf-calls.ll b/test/CodeGen/AArch64/arm64-elf-calls.ll new file mode 100644 index 00000000000..8c4020327b9 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-elf-calls.ll @@ -0,0 +1,20 @@ +; RUN: llc -mtriple=arm64-linux-gnu -o - %s | FileCheck %s +; RUN: llc -mtriple=arm64-linux-gnu -filetype=obj -o - %s | llvm-objdump -triple=arm64-linux-gnu - -r | FileCheck %s --check-prefix=CHECK-OBJ + +declare void @callee() + +define void @caller() { + call void @callee() + ret void +; CHECK-LABEL: caller: +; CHECK: bl callee +; CHECK-OBJ: R_AARCH64_CALL26 callee +} + +define void @tail_caller() { + tail call void @callee() + ret void +; CHECK-LABEL: tail_caller: +; CHECK: b callee +; CHECK-OBJ: R_AARCH64_JUMP26 callee +} diff --git a/test/CodeGen/AArch64/arm64-elf-constpool.ll b/test/CodeGen/AArch64/arm64-elf-constpool.ll new file mode 100644 index 00000000000..95d334376b7 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-elf-constpool.ll @@ -0,0 +1,13 @@ +; RUN: llc -mtriple=arm64-linux-gnu -o - %s | FileCheck %s +; RUN: llc -mtriple=arm64-linux-gnu -O0 -o - %s | FileCheck %s + +; O0 checked for fastisel purposes. It has a separate path which +; creates a constpool entry for floating values. 
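+; Both RUN lines share the same CHECK prefix: whichever path materialises the
+; constant, the expected pattern is an adrp of the .LCPI label followed by an
+; ldr from its :lo12: offset.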
+ +define double @needs_const() { + ret double 3.14159 +; CHECK: .LCPI0_0: + +; CHECK: adrp {{x[0-9]+}}, .LCPI0_0 +; CHECK: ldr d0, [{{x[0-9]+}}, :lo12:.LCPI0_0] +} diff --git a/test/CodeGen/AArch64/arm64-elf-globals.ll b/test/CodeGen/AArch64/arm64-elf-globals.ll new file mode 100644 index 00000000000..4ed44e7c17a --- /dev/null +++ b/test/CodeGen/AArch64/arm64-elf-globals.ll @@ -0,0 +1,115 @@ +; RUN: llc -mtriple=arm64-linux-gnu -o - %s -mcpu=cyclone | FileCheck %s +; RUN: llc -mtriple=arm64-linux-gnu -o - %s -O0 -mcpu=cyclone | FileCheck %s --check-prefix=CHECK-FAST +; RUN: llc -mtriple=arm64-linux-gnu -relocation-model=pic -o - %s -mcpu=cyclone | FileCheck %s --check-prefix=CHECK-PIC +; RUN: llc -mtriple=arm64-linux-gnu -O0 -relocation-model=pic -o - %s -mcpu=cyclone | FileCheck %s --check-prefix=CHECK-FAST-PIC + +@var8 = external global i8, align 1 +@var16 = external global i16, align 2 +@var32 = external global i32, align 4 +@var64 = external global i64, align 8 + +define i8 @test_i8(i8 %new) { + %val = load i8* @var8, align 1 + store i8 %new, i8* @var8 + ret i8 %val +; CHECK-LABEL: test_i8: +; CHECK: adrp x[[HIREG:[0-9]+]], var8 +; CHECK: ldrb {{w[0-9]+}}, [x[[HIREG]], :lo12:var8] +; CHECK: strb {{w[0-9]+}}, [x[[HIREG]], :lo12:var8] + +; CHECK-PIC-LABEL: test_i8: +; CHECK-PIC: adrp x[[HIREG:[0-9]+]], :got:var8 +; CHECK-PIC: ldr x[[VAR_ADDR:[0-9]+]], [x[[HIREG]], :got_lo12:var8] +; CHECK-PIC: ldrb {{w[0-9]+}}, [x[[VAR_ADDR]]] + +; CHECK-FAST: adrp x[[HIREG:[0-9]+]], var8 +; CHECK-FAST: ldrb {{w[0-9]+}}, [x[[HIREG]], :lo12:var8] + +; CHECK-FAST-PIC: adrp x[[HIREG:[0-9]+]], :got:var8 +; CHECK-FAST-PIC: ldr x[[VARADDR:[0-9]+]], [x[[HIREG]], :got_lo12:var8] +; CHECK-FAST-PIC: ldr {{w[0-9]+}}, [x[[VARADDR]]] +} + +define i16 @test_i16(i16 %new) { + %val = load i16* @var16, align 2 + store i16 %new, i16* @var16 + ret i16 %val +; CHECK-LABEL: test_i16: +; CHECK: adrp x[[HIREG:[0-9]+]], var16 +; CHECK: ldrh {{w[0-9]+}}, [x[[HIREG]], :lo12:var16] +; CHECK: strh {{w[0-9]+}}, [x[[HIREG]], :lo12:var16] + +; CHECK-FAST: adrp x[[HIREG:[0-9]+]], var16 +; CHECK-FAST: ldrh {{w[0-9]+}}, [x[[HIREG]], :lo12:var16] +} + +define i32 @test_i32(i32 %new) { + %val = load i32* @var32, align 4 + store i32 %new, i32* @var32 + ret i32 %val +; CHECK-LABEL: test_i32: +; CHECK: adrp x[[HIREG:[0-9]+]], var32 +; CHECK: ldr {{w[0-9]+}}, [x[[HIREG]], :lo12:var32] +; CHECK: str {{w[0-9]+}}, [x[[HIREG]], :lo12:var32] + +; CHECK-FAST: adrp x[[HIREG:[0-9]+]], var32 +; CHECK-FAST: add {{x[0-9]+}}, x[[HIREG]], :lo12:var32 +} + +define i64 @test_i64(i64 %new) { + %val = load i64* @var64, align 8 + store i64 %new, i64* @var64 + ret i64 %val +; CHECK-LABEL: test_i64: +; CHECK: adrp x[[HIREG:[0-9]+]], var64 +; CHECK: ldr {{x[0-9]+}}, [x[[HIREG]], :lo12:var64] +; CHECK: str {{x[0-9]+}}, [x[[HIREG]], :lo12:var64] + +; CHECK-FAST: adrp x[[HIREG:[0-9]+]], var64 +; CHECK-FAST: add {{x[0-9]+}}, x[[HIREG]], :lo12:var64 +} + +define i64* @test_addr() { + ret i64* @var64 +; CHECK-LABEL: test_addr: +; CHECK: adrp [[HIREG:x[0-9]+]], var64 +; CHECK: add x0, [[HIREG]], :lo12:var64 + +; CHECK-FAST: adrp [[HIREG:x[0-9]+]], var64 +; CHECK-FAST: add x0, [[HIREG]], :lo12:var64 +} + +@hiddenvar = hidden global i32 0, align 4 +@protectedvar = protected global i32 0, align 4 + +define i32 @test_vis() { + %lhs = load i32* @hiddenvar, align 4 + %rhs = load i32* @protectedvar, align 4 + %ret = add i32 %lhs, %rhs + ret i32 %ret +; CHECK-PIC: adrp {{x[0-9]+}}, hiddenvar +; CHECK-PIC: ldr {{w[0-9]+}}, [{{x[0-9]+}}, :lo12:hiddenvar] +; CHECK-PIC: adrp 
{{x[0-9]+}}, protectedvar +; CHECK-PIC: ldr {{w[0-9]+}}, [{{x[0-9]+}}, :lo12:protectedvar] +} + +@var_default = external global [2 x i32] + +define i32 @test_default_align() { + %addr = getelementptr [2 x i32]* @var_default, i32 0, i32 0 + %val = load i32* %addr + ret i32 %val +; CHECK-LABEL: test_default_align: +; CHECK: adrp x[[HIREG:[0-9]+]], var_default +; CHECK: ldr w0, [x[[HIREG]], :lo12:var_default] +} + +define i64 @test_default_unaligned() { + %addr = bitcast [2 x i32]* @var_default to i64* + %val = load i64* %addr + ret i64 %val +; CHECK-LABEL: test_default_unaligned: +; CHECK: adrp [[HIREG:x[0-9]+]], var_default +; CHECK: add x[[ADDR:[0-9]+]], [[HIREG]], :lo12:var_default +; CHECK: ldr x0, [x[[ADDR]]] +} diff --git a/test/CodeGen/AArch64/arm64-ext.ll b/test/CodeGen/AArch64/arm64-ext.ll new file mode 100644 index 00000000000..67860de51b0 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-ext.ll @@ -0,0 +1,118 @@ +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s + +define <8 x i8> @test_vextd(<8 x i8>* %A, <8 x i8>* %B) nounwind { +;CHECK-LABEL: test_vextd: +;CHECK: {{ext.8b.*#3}} + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> + ret <8 x i8> %tmp3 +} + +define <8 x i8> @test_vextRd(<8 x i8>* %A, <8 x i8>* %B) nounwind { +;CHECK-LABEL: test_vextRd: +;CHECK: {{ext.8b.*#5}} + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> + ret <8 x i8> %tmp3 +} + +define <16 x i8> @test_vextq(<16 x i8>* %A, <16 x i8>* %B) nounwind { +;CHECK-LABEL: test_vextq: +;CHECK: {{ext.16b.*3}} + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> + ret <16 x i8> %tmp3 +} + +define <16 x i8> @test_vextRq(<16 x i8>* %A, <16 x i8>* %B) nounwind { +;CHECK-LABEL: test_vextRq: +;CHECK: {{ext.16b.*7}} + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> + ret <16 x i8> %tmp3 +} + +define <4 x i16> @test_vextd16(<4 x i16>* %A, <4 x i16>* %B) nounwind { +;CHECK-LABEL: test_vextd16: +;CHECK: {{ext.8b.*#6}} + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> + ret <4 x i16> %tmp3 +} + +define <4 x i32> @test_vextq32(<4 x i32>* %A, <4 x i32>* %B) nounwind { +;CHECK-LABEL: test_vextq32: +;CHECK: {{ext.16b.*12}} + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> + ret <4 x i32> %tmp3 +} + +; Undef shuffle indices should not prevent matching to VEXT: + +define <8 x i8> @test_vextd_undef(<8 x i8>* %A, <8 x i8>* %B) nounwind { +;CHECK-LABEL: test_vextd_undef: +;CHECK: {{ext.8b.*}} + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> + ret <8 x i8> %tmp3 +} + +define <8 x i8> @test_vextd_undef2(<8 x i8>* %A, <8 x i8>* %B) nounwind { +;CHECK-LABEL: test_vextd_undef2: +;CHECK: {{ext.8b.*#6}} + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> + ret <8 x i8> %tmp3 +} + +define <16 x i8> @test_vextRq_undef(<16 x i8>* %A, <16 x i8>* %B) nounwind { +;CHECK-LABEL: test_vextRq_undef: +;CHECK: {{ext.16b.*#7}} + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> + ret <16 x 
+}
+
+define <8 x i16> @test_vextRq_undef2(<8 x i16>* %A) nounwind {
+;CHECK-LABEL: test_vextRq_undef2:
+;CHECK: {{ext.16b.*#10}}
+ %tmp1 = load <8 x i16>* %A
+ %vext = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32>
+ ret <8 x i16> %vext;
+}
+
+; Tests for ReconstructShuffle function. Indices have to be carefully
+; chosen to reach lowering phase as a BUILD_VECTOR.
+
+; One vector needs vext, the other can be handled by extract_subvector
+; Also checks interleaving of sources is handled correctly.
+; Essence: a vext is used on %A and something saner than stack load/store for final result.
+define <4 x i16> @test_interleaved(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: test_interleaved:
+;CHECK: ext.8b
+;CHECK: zip1.4h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <4 x i32>
+ ret <4 x i16> %tmp3
+}
+
+; An undef in the shuffle list should still be optimizable
+define <4 x i16> @test_undef(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: test_undef:
+;CHECK: zip1.4h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <4 x i32>
+ ret <4 x i16> %tmp3
+}
diff --git a/test/CodeGen/AArch64/arm64-extend-int-to-fp.ll b/test/CodeGen/AArch64/arm64-extend-int-to-fp.ll
new file mode 100644
index 00000000000..048fdb083a4
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-extend-int-to-fp.ll
@@ -0,0 +1,19 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
+define <4 x float> @foo(<4 x i16> %a) nounwind {
+; CHECK-LABEL: foo:
+; CHECK: ushll.4s v0, v0, #0
+; CHECK-NEXT: ucvtf.4s v0, v0
+; CHECK-NEXT: ret
+ %vcvt.i = uitofp <4 x i16> %a to <4 x float>
+ ret <4 x float> %vcvt.i
+}
+
+define <4 x float> @bar(<4 x i16> %a) nounwind {
+; CHECK-LABEL: bar:
+; CHECK: sshll.4s v0, v0, #0
+; CHECK-NEXT: scvtf.4s v0, v0
+; CHECK-NEXT: ret
+ %vcvt.i = sitofp <4 x i16> %a to <4 x float>
+ ret <4 x float> %vcvt.i
+}
diff --git a/test/CodeGen/AArch64/arm64-extend.ll b/test/CodeGen/AArch64/arm64-extend.ll
new file mode 100644
index 00000000000..afcaca2c492
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-extend.ll
@@ -0,0 +1,15 @@
+; RUN: llc < %s -mtriple=arm64-apple-ios | FileCheck %s
+@array = external global [0 x i32]
+
+define i64 @foo(i32 %i) {
+; CHECK: foo
+; CHECK: adrp x[[REG:[0-9]+]], _array@GOTPAGE
+; CHECK: ldr x[[REG1:[0-9]+]], [x[[REG]], _array@GOTPAGEOFF]
+; CHECK: ldrsw x0, [x[[REG1]], w0, sxtw #2]
+; CHECK: ret
+ %idxprom = sext i32 %i to i64
+ %arrayidx = getelementptr inbounds [0 x i32]* @array, i64 0, i64 %idxprom
+ %tmp1 = load i32* %arrayidx, align 4
+ %conv = sext i32 %tmp1 to i64
+ ret i64 %conv
+}
diff --git a/test/CodeGen/AArch64/arm64-extern-weak.ll b/test/CodeGen/AArch64/arm64-extern-weak.ll
new file mode 100644
index 00000000000..a239403befa
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-extern-weak.ll
@@ -0,0 +1,51 @@
+; RUN: llc -mtriple=arm64-none-linux-gnu -o - < %s | FileCheck %s
+; RUN: llc -mtriple=arm64-none-linux-gnu -code-model=large -o - < %s | FileCheck --check-prefix=CHECK-LARGE %s
+
+declare extern_weak i32 @var()
+
+define i32()* @foo() {
+; The usual ADRP/ADD pair can't be used for a weak reference because it must
+; evaluate to 0 if the symbol is undefined. We use a GOT entry instead.
+ ret i32()* @var
+
+; CHECK: adrp x[[VAR:[0-9]+]], :got:var
+; CHECK: ldr x0, [x[[VAR]], :got_lo12:var]
+
+ ; In the large model, the usual relocations are absolute and can
+ ; materialise 0.
+; CHECK-LARGE: movz x0, #:abs_g3:var
+; CHECK-LARGE: movk x0, #:abs_g2_nc:var
+; CHECK-LARGE: movk x0, #:abs_g1_nc:var
+; CHECK-LARGE: movk x0, #:abs_g0_nc:var
+}
+
+
+@arr_var = extern_weak global [10 x i32]
+
+define i32* @bar() {
+ %addr = getelementptr [10 x i32]* @arr_var, i32 0, i32 5
+; CHECK: adrp x[[ARR_VAR_HI:[0-9]+]], :got:arr_var
+; CHECK: ldr [[ARR_VAR:x[0-9]+]], [x[[ARR_VAR_HI]], :got_lo12:arr_var]
+; CHECK: add x0, [[ARR_VAR]], #20
+ ret i32* %addr
+
+ ; In the large model, the usual relocations are absolute and can
+ ; materialise 0.
+; CHECK-LARGE: movz [[ARR_VAR:x[0-9]+]], #:abs_g3:arr_var
+; CHECK-LARGE: movk [[ARR_VAR]], #:abs_g2_nc:arr_var
+; CHECK-LARGE: movk [[ARR_VAR]], #:abs_g1_nc:arr_var
+; CHECK-LARGE: movk [[ARR_VAR]], #:abs_g0_nc:arr_var
+}
+
+@defined_weak_var = internal unnamed_addr global i32 0
+
+define i32* @wibble() {
+ ret i32* @defined_weak_var
+; CHECK: adrp [[BASE:x[0-9]+]], defined_weak_var
+; CHECK: add x0, [[BASE]], :lo12:defined_weak_var
+
+; CHECK-LARGE: movz x0, #:abs_g3:defined_weak_var
+; CHECK-LARGE: movk x0, #:abs_g2_nc:defined_weak_var
+; CHECK-LARGE: movk x0, #:abs_g1_nc:defined_weak_var
+; CHECK-LARGE: movk x0, #:abs_g0_nc:defined_weak_var
+}
diff --git a/test/CodeGen/AArch64/arm64-extload-knownzero.ll b/test/CodeGen/AArch64/arm64-extload-knownzero.ll
new file mode 100644
index 00000000000..14e5fd310d7
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-extload-knownzero.ll
@@ -0,0 +1,28 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
+; rdar://12771555
+
+define void @foo(i16* %ptr, i32 %a) nounwind {
+entry:
+; CHECK-LABEL: foo:
+ %tmp1 = icmp ult i32 %a, 100
+ br i1 %tmp1, label %bb1, label %bb2
+bb1:
+; CHECK: %bb1
+; CHECK: ldrh [[REG:w[0-9]+]]
+ %tmp2 = load i16* %ptr, align 2
+ br label %bb2
+bb2:
+; CHECK: %bb2
+; CHECK-NOT: and {{w[0-9]+}}, [[REG]], #0xffff
+; CHECK: cmp [[REG]], #23
+ %tmp3 = phi i16 [ 0, %entry ], [ %tmp2, %bb1 ]
+ %cmp = icmp ult i16 %tmp3, 24
+ br i1 %cmp, label %bb3, label %exit
+bb3:
+ call void @bar() nounwind
+ br label %exit
+exit:
+ ret void
+}
+
+declare void @bar ()
diff --git a/test/CodeGen/AArch64/arm64-extract.ll b/test/CodeGen/AArch64/arm64-extract.ll
new file mode 100644
index 00000000000..01984662d23
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-extract.ll
@@ -0,0 +1,58 @@
+; RUN: llc -aarch64-extr-generation=true -verify-machineinstrs < %s \
+; RUN: -march=arm64 | FileCheck %s
+
+define i64 @ror_i64(i64 %in) {
+; CHECK-LABEL: ror_i64:
+ %left = shl i64 %in, 19
+ %right = lshr i64 %in, 45
+ %val5 = or i64 %left, %right
+; CHECK: ror {{x[0-9]+}}, x0, #45
+ ret i64 %val5
+}
+
+define i32 @ror_i32(i32 %in) {
+; CHECK-LABEL: ror_i32:
+ %left = shl i32 %in, 9
+ %right = lshr i32 %in, 23
+ %val5 = or i32 %left, %right
+; CHECK: ror {{w[0-9]+}}, w0, #23
+ ret i32 %val5
+}
+
+define i32 @extr_i32(i32 %lhs, i32 %rhs) {
+; CHECK-LABEL: extr_i32:
+ %left = shl i32 %lhs, 6
+ %right = lshr i32 %rhs, 26
+ %val = or i32 %left, %right
+ ; Order of lhs and rhs matters here. Regalloc would have to be very odd to use
+ ; something other than w0 and w1.
+; CHECK: extr {{w[0-9]+}}, w0, w1, #26
+
+ ret i32 %val
+}
+
+define i64 @extr_i64(i64 %lhs, i64 %rhs) {
+; CHECK-LABEL: extr_i64:
+ %right = lshr i64 %rhs, 40
+ %left = shl i64 %lhs, 24
+ %val = or i64 %right, %left
+ ; Order of lhs and rhs matters here. Regalloc would have to be very odd to use
+ ; something other than w0 and w1.
+; CHECK: extr {{x[0-9]+}}, x0, x1, #40
+
+ ret i64 %val
+}
+
+; Regression test: a bad experimental pattern crept into git which optimised
+; this pattern to a single EXTR.
+define i32 @extr_regress(i32 %a, i32 %b) {
+; CHECK-LABEL: extr_regress:
+
+ %sh1 = shl i32 %a, 14
+ %sh2 = lshr i32 %b, 14
+ %val = or i32 %sh2, %sh1
+; CHECK-NOT: extr {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, #{{[0-9]+}}
+
+ ret i32 %val
+; CHECK: ret
+}
diff --git a/test/CodeGen/AArch64/arm64-extract_subvector.ll b/test/CodeGen/AArch64/arm64-extract_subvector.ll
new file mode 100644
index 00000000000..8b15a6453b2
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-extract_subvector.ll
@@ -0,0 +1,51 @@
+; RUN: llc -march=arm64 -aarch64-neon-syntax=apple < %s | FileCheck %s
+
+; Extract of an upper half of a vector is an "ext.16b v0, v0, v0, #8" insn.
+
+define <8 x i8> @v8i8(<16 x i8> %a) nounwind {
+; CHECK: v8i8
+; CHECK: ext.16b v0, v0, v0, #8
+; CHECK: ret
+ %ret = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32>
+ ret <8 x i8> %ret
+}
+
+define <4 x i16> @v4i16(<8 x i16> %a) nounwind {
+; CHECK-LABEL: v4i16:
+; CHECK: ext.16b v0, v0, v0, #8
+; CHECK: ret
+ %ret = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32>
+ ret <4 x i16> %ret
+}
+
+define <2 x i32> @v2i32(<4 x i32> %a) nounwind {
+; CHECK-LABEL: v2i32:
+; CHECK: ext.16b v0, v0, v0, #8
+; CHECK: ret
+ %ret = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32>
+ ret <2 x i32> %ret
+}
+
+define <1 x i64> @v1i64(<2 x i64> %a) nounwind {
+; CHECK-LABEL: v1i64:
+; CHECK: ext.16b v0, v0, v0, #8
+; CHECK: ret
+ %ret = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32>
+ ret <1 x i64> %ret
+}
+
+define <2 x float> @v2f32(<4 x float> %a) nounwind {
+; CHECK-LABEL: v2f32:
+; CHECK: ext.16b v0, v0, v0, #8
+; CHECK: ret
+ %ret = shufflevector <4 x float> %a, <4 x float> %a, <2 x i32>
+ ret <2 x float> %ret
+}
+
+define <1 x double> @v1f64(<2 x double> %a) nounwind {
+; CHECK-LABEL: v1f64:
+; CHECK: ext.16b v0, v0, v0, #8
+; CHECK: ret
+ %ret = shufflevector <2 x double> %a, <2 x double> %a, <1 x i32>
+ ret <1 x double> %ret
+}
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-addr-offset.ll b/test/CodeGen/AArch64/arm64-fast-isel-addr-offset.ll
new file mode 100644
index 00000000000..ebd847e0f72
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-fast-isel-addr-offset.ll
@@ -0,0 +1,47 @@
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+
+@sortlist = common global [5001 x i32] zeroinitializer, align 16
+@sortlist2 = common global [5001 x i64] zeroinitializer, align 16
+
+; Load an address with an offset larger than LDR imm can handle
+define i32 @foo() nounwind {
+entry:
+; CHECK: @foo
+; CHECK: adrp x[[REG:[0-9]+]], _sortlist@GOTPAGE
+; CHECK: ldr x[[REG1:[0-9]+]], [x[[REG]], _sortlist@GOTPAGEOFF]
+; CHECK: movz x[[REG2:[0-9]+]], #0x4e20
+; CHECK: add x[[REG3:[0-9]+]], x[[REG1]], x[[REG2]]
+; CHECK: ldr w0, [x[[REG3]]]
+; CHECK: ret
+ %0 = load i32* getelementptr inbounds ([5001 x i32]* @sortlist, i32 0, i64 5000), align 4
+ ret i32 %0
+}
+
+define i64 @foo2() nounwind {
+entry:
+; CHECK: @foo2
+; CHECK: adrp x[[REG:[0-9]+]], _sortlist2@GOTPAGE
+; CHECK: ldr x[[REG1:[0-9]+]], [x[[REG]], _sortlist2@GOTPAGEOFF]
+; CHECK: movz x[[REG2:[0-9]+]], #0x9c40
+; CHECK: add x[[REG3:[0-9]+]], x[[REG1]], x[[REG2]]
+; CHECK: ldr x0, [x[[REG3]]]
+; CHECK: ret
+ %0 = load i64* getelementptr inbounds ([5001 x i64]* @sortlist2, i32 0, i64 5000), align 4
+ ret i64 %0
+}
+
+; Load an address with a ridiculously large offset.
+; rdar://12505553
+@pd2 = common global i8* null, align 8
+
+define signext i8 @foo3() nounwind ssp {
+entry:
+; CHECK: @foo3
+; CHECK: movz x[[REG:[0-9]+]], #0xb3a, lsl #32
+; CHECK: movk x[[REG]], #0x73ce, lsl #16
+; CHECK: movk x[[REG]], #0x2ff2
+ %0 = load i8** @pd2, align 8
+ %arrayidx = getelementptr inbounds i8* %0, i64 12345678901234
+ %1 = load i8* %arrayidx, align 1
+ ret i8 %1
+}
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-alloca.ll b/test/CodeGen/AArch64/arm64-fast-isel-alloca.ll
new file mode 100644
index 00000000000..1706e9eba2b
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-fast-isel-alloca.ll
@@ -0,0 +1,25 @@
+; This test should cause the TargetMaterializeAlloca to be invoked
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+
+%struct.S1Ty = type { i64 }
+%struct.S2Ty = type { %struct.S1Ty, %struct.S1Ty }
+
+define void @takeS1(%struct.S1Ty* %V) nounwind {
+entry:
+ %V.addr = alloca %struct.S1Ty*, align 8
+ store %struct.S1Ty* %V, %struct.S1Ty** %V.addr, align 8
+ ret void
+}
+
+define void @main() nounwind {
+entry:
+; CHECK: main
+; CHECK: mov x29, sp
+; CHECK: mov x[[REG:[0-9]+]], sp
+; CHECK-NEXT: orr x[[REG1:[0-9]+]], xzr, #0x8
+; CHECK-NEXT: add x0, x[[REG]], x[[REG1]]
+ %E = alloca %struct.S2Ty, align 4
+ %B = getelementptr inbounds %struct.S2Ty* %E, i32 0, i32 1
+ call void @takeS1(%struct.S1Ty* %B)
+ ret void
+}
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-br.ll b/test/CodeGen/AArch64/arm64-fast-isel-br.ll
new file mode 100644
index 00000000000..37a8295c893
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-fast-isel-br.ll
@@ -0,0 +1,155 @@
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin -mcpu=cyclone | FileCheck %s
+
+define void @branch1() nounwind uwtable ssp {
+ %x = alloca i32, align 4
+ store i32 0, i32* %x, align 4
+ %1 = load i32* %x, align 4
+ %2 = icmp ne i32 %1, 0
+ br i1 %2, label %3, label %4
+
+;